## Note: Elasticsearch for Nearest Neighbour Query / Proximity Search

In [176]:
%%capture
%pip install elasticsearch

In [177]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import pandas as pd
import json

Let's load configuration data from a file

In [179]:
from dotenv import dotenv_values
config_filename = 'es_kibana.cfg'
config = dotenv_values(config_filename)
if config['ELASTIC_PASSWORD'] is None:
    print(f"No {config_filename} file not found!")


Let's connect to Elasticsearch

In [12]:
#
client = Elasticsearch([{'host': 'localhost', 'port': 9200, "scheme": "http"}]
                       , basic_auth=("elastic", ELASTIC_PASSWORD)
                       )

Let's print the metadata attached to Elasticsearch client object

In [None]:
client.info()

In [181]:
answer = client.info()
print(type(answer))

# Custom made; elastic_transport.ObjectApiResponse is a dict
for k in answer.keys():
    if k == 'version':
        print(f"{k} :")
        for k2 in answer[k]:
            print(f"\t{k2} : {answer[k][k2]}")
    else:
        print(f"{k} : {answer[k]}")

<class 'elastic_transport.ObjectApiResponse'>
name : 8bb798e2995b
cluster_name : docker-cluster
cluster_uuid : bvhUZ6w4T2Wv-eDmKDDmBQ
version :
	number : 9.1.5
	build_flavor : default
	build_type : docker
	build_hash : 90ee222e7e0136dd8ddbb34015538f3a00c129b7
	build_date : 2025-10-02T22:07:12.966975992Z
	build_snapshot : False
	lucene_version : 10.2.2
	minimum_wire_compatibility_version : 8.19.0
	minimum_index_compatibility_version : 8.0.0
tagline : You Know, for Search


Test that we can create an index.

In [192]:
response = None
try:
    response = client.indices.create(index="my-index")
    print(response)
except Exception as e:
    print(e)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'my-index'}


In [193]:
response=client.indices.delete(index="my-index", ignore_unavailable=True)
print(response)

{'acknowledged': True}


Let's create restaurants index

In [194]:
response=client.indices.delete(index="restaurants_index", ignore_unavailable=True)
print(response)

{'acknowledged': True}


In [201]:
columns = ["Field Name","Field Data Type","Notes"]
data = [ {"Field Name":"rid","Field Data Type":"integer","Notes":""},
         {"Field Name":"name","Field Data Type":"text","Notes":"searchable text content, search by name"},
         {"Field Name":"cuisine","Field Data Type":"text","Notes":"searchable text content, search by cuisine"},
         {"Field Name":"location","Field Data Type":"geo_point","Notes":"latitude/longitude coordinates, search by nearby"},
         {"Field Name":"description","Field Data Type":"text","Notes":"searchable text content"},
         ]
df = pd.DataFrame(data=data,columns = columns)
df

Unnamed: 0,Field Name,Field Data Type,Notes
0,rid,integer,
1,name,text,"searchable text content, search by name"
2,cuisine,text,"searchable text content, search by cuisine"
3,location,geo_point,"latitude/longitude coordinates, search by nearby"
4,description,text,searchable text content


Mapping specifies what an application will store in the index. Restaurant Identifier (restaurant_id) comes from restaurants table.

In [56]:
restaurants_mappings = {
 "properties": {
     "rid" : {
         "type": "integer",
        },
      "name": {
        "type": "text",      ## searchable text content, search by name
      },
      "cuisine": {
        "type": "text",      ## searchable text content, search by cuisine
      },
      "location": {
        "type": "geo_point",  ## latitude/longitude coordinates, search by nearby
      },
      "description": {
        "type": "text",      ## searchable text content
      },
    },
}

Let's create an index by using the mappings.

In [57]:
# Create the index
client.indices.create(index="restaurants_index", mappings=restaurants_mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'restaurants_index'})

In [60]:
response = client.indices.get(index="restaurants_index")
print(response)

{'restaurants_index': {'aliases': {}, 'mappings': {'properties': {'cuisine': {'type': 'text'}, 'description': {'type': 'text'}, 'location': {'type': 'geo_point'}, 'name': {'type': 'text'}, 'rid': {'type': 'integer'}}}, 'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}}, 'number_of_shards': '1', 'provided_name': 'restaurants_index', 'creation_date': '1760718853447', 'number_of_replicas': '1', 'uuid': 'KftyA8LvQ4y2HfxagBj6WA', 'version': {'created': '9033000'}}}}}


Let's load restaurants data from CSV file into Pandas Data Frame.

In [203]:
filename="./restaurants.csv"
df = pd.read_csv(filename)
df[:10]

Unnamed: 0,Name,Lon,Lat
0,Morris Park Bake Shop,-73.856077,40.848447
1,Wendy'S,-73.961704,40.662942
2,Riviera Caterer,-73.98242,40.579505
3,Tov Kosher Kitchen,-73.860115,40.731174
4,Brunos On The Boulevard,-73.880383,40.764312
5,Dj Reynolds Pub And Restaurant,-73.985136,40.767692
6,Wilken'S Fine Food,-73.906851,40.619903
7,Regina Caterers,-74.005289,40.628886
8,Taste The Tropics Ice Cream,-73.948261,40.640827
9,Kosher Island,-74.137729,40.611957


Each row contains the name of restaurant and its location. Let's add Rid column for this exercise.

In [209]:
df['Rid'] = df.index
printdf = df[:10]
print(printdf.to_string(index=False))


                          Name        Lon       Lat  Rid
         Morris Park Bake Shop -73.856077 40.848447    0
                       Wendy'S -73.961704 40.662942    1
               Riviera Caterer -73.982420 40.579505    2
            Tov Kosher Kitchen -73.860115 40.731174    3
       Brunos On The Boulevard -73.880383 40.764312    4
Dj Reynolds Pub And Restaurant -73.985136 40.767692    5
            Wilken'S Fine Food -73.906851 40.619903    6
               Regina Caterers -74.005289 40.628886    7
   Taste The Tropics Ice Cream -73.948261 40.640827    8
                 Kosher Island -74.137729 40.611957    9


Let's prepare a document representing a restaurant records in Elasticsearch.
<br> location follows geo_point syntax. See [Geo_Point specification](https://www.elastic.co/docs/reference/elasticsearch/mapping-reference/geo-point) for details.

In [210]:
rst = {
    "rid": int(df.at[0,"Rid"]),
    "name": df.at[0,"Name"],
    "cuisine": "italian",
    "location": {
        "lat": float("%f" % df.at[0,"Lat"]),
        "lon": float("%f" % df.at[0,"Lon"]),
    },
    "description": "searchable text content",
}
rst

{'rid': 0,
 'name': 'Morris Park Bake Shop',
 'cuisine': 'italian',
 'location': {'lat': 40.848447, 'lon': -73.856077},
 'description': 'searchable text content'}

In [211]:
response = client.index(index="restaurants_index", body=rst)
print(response)

{'_index': 'restaurants_index', '_id': 'JIVI9JkBAy2C4wOUwVJY', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}


Let's query what we have added/indexed.
<br> The first restaurant record's rid is 0, therefore, it will look for entries having 0 as rid value.

In [212]:
query = {
    'query': {
        'match': {
            'rid': 0
        }
    },
    'size': 10
}
response = client.search(index="restaurants_index", body=query)
print(response)

{'took': 20, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'restaurants_index', '_id': 'JIVI9JkBAy2C4wOUwVJY', '_score': 1.0, '_source': {'rid': 0, 'name': 'Morris Park Bake Shop', 'cuisine': 'italian', 'location': {'lat': 40.848447, 'lon': -73.856077}, 'description': 'searchable text content'}}]}}


Response has metadata and matching documents (in _source).

In [213]:
for r in response['hits']['hits']:
    print(f"_index : {r['_index']}")
    print(f"_id    : {r['_id']}")
    print(f"_score : {r['_score']}")
    print( "source : ")
    for k in r['_source'].keys():
        print(f"\t{k} : {r['_source'][k]}")

_index : restaurants_index
_id    : JIVI9JkBAy2C4wOUwVJY
_score : 1.0
source : 
	rid : 0
	name : Morris Park Bake Shop
	cuisine : italian
	location : {'lat': 40.848447, 'lon': -73.856077}
	description : searchable text content


Add all data in dataframe to index in Elasticsearch.

In [214]:
df.iloc[-1]

Name    Capital Grille
Lon         -73.974723
Lat          40.751244
Rid               4999
Name: 4999, dtype: object

In [215]:
df[-3:]

Unnamed: 0,Name,Lon,Lat,Rid
4997,Ellen Deli & Grocery,-74.00781,40.725708,4997
4998,Crepes On Columbus,-73.961831,40.801052,4998
4999,Capital Grille,-73.974723,40.751244,4999


Let's add cuisine field randomly.
<br>Sampled from [List Of Cuisines](https://en.wikipedia.org/wiki/List_of_cuisines)

In [216]:
import random

cuisine = ['italian', 'chinese', 'french', 'zambian', 'egyptian', 'canadian', 'mexican', 'vietnamese', 'cajun', 'korean', 'thai', 'brazilian','colombian','peruvian','ecuadorian', 'japanese','indian','malaysian','russian', 'indonesian']

for i in range(df.shape[0]):
    df.at[i,'Cuisine'] = cuisine[random.randint(0,len(cuisine)-1)]
df[-4:]

Unnamed: 0,Name,Lon,Lat,Rid,Cuisine
4996,Wagner College - Hawk' Nest,-74.092853,40.615121,4996,zambian
4997,Ellen Deli & Grocery,-74.00781,40.725708,4997,italian
4998,Crepes On Columbus,-73.961831,40.801052,4998,french
4999,Capital Grille,-73.974723,40.751244,4999,peruvian


In [101]:
def restaurants_doc_builder(r):
    rst = {
        "rid": int(r["rid"]),
        "name": r["name"],
    }
    return rst

def restaurants_index_add(r):
    doc = {
        "rid": int(r["Rid"]),
        "name": r["Name"],
        "cuisine": r["Cuisine"],
        "location": {
            "lat": float("%f" % r["Lat"]),
            "lon": float("%f" % r["Lon"]),
        },
        "description": "searchable text content",
    }
    response = client.index(index="restaurants_index", body=doc)
    return response['result']



In [103]:
df3 = df[1:10]
df3

Unnamed: 0,Name,Lon,Lat,Rid,Cuisine
1,Wendy'S,-73.961704,40.662942,1,vietnamese
2,Riviera Caterer,-73.98242,40.579505,2,korean
3,Tov Kosher Kitchen,-73.860115,40.731174,3,indian
4,Brunos On The Boulevard,-73.880383,40.764312,4,russian
5,Dj Reynolds Pub And Restaurant,-73.985136,40.767692,5,indian
6,Wilken'S Fine Food,-73.906851,40.619903,6,chinese
7,Regina Caterers,-74.005289,40.628886,7,malaysian
8,Taste The Tropics Ice Cream,-73.948261,40.640827,8,malaysian
9,Kosher Island,-74.137729,40.611957,9,russian


In [104]:

df3['Result'] = df3.apply(restaurants_index_add, axis=1)
df3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['Result'] = df3.apply(restaurants_index_add, axis=1)


Unnamed: 0,Name,Lon,Lat,Rid,Cuisine,Result
1,Wendy'S,-73.961704,40.662942,1,vietnamese,created
2,Riviera Caterer,-73.98242,40.579505,2,korean,created
3,Tov Kosher Kitchen,-73.860115,40.731174,3,indian,created
4,Brunos On The Boulevard,-73.880383,40.764312,4,russian,created
5,Dj Reynolds Pub And Restaurant,-73.985136,40.767692,5,indian,created
6,Wilken'S Fine Food,-73.906851,40.619903,6,chinese,created
7,Regina Caterers,-74.005289,40.628886,7,malaysian,created
8,Taste The Tropics Ice Cream,-73.948261,40.640827,8,malaysian,created
9,Kosher Island,-74.137729,40.611957,9,russian,created


In [144]:
# response['hits']['hits']
def print_es_response_hits_hits(response):

    for r in response:
        print("========================")
        print(f"_index : {r['_index']}")
        print(f"_id    : {r['_id']}")
        print(f"_score : {r['_score']}")
        print("source : ")
        for k in r['_source'].keys():
            print(f"\t{k} : {r['_source'][k]}")

In [150]:
# Range Query
query = {
    'query': {
        'range': {
            'rid': {
                'gte': 0,
                'lte': 10
            }
        }
    },
    'size': 10
}
response = client.search(index="restaurants_index", body=query)
print(response)

{'took': 8, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 10, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'restaurants_index', '_id': 'GoUM85kBAy2C4wOUm1IA', '_score': 1.0, '_source': {'rid': 0, 'name': 'Morris Park Bake Shop', 'cuisine': 'italian', 'location': {'lat': 40.848447, 'lon': -73.856077}, 'description': 'searchable text content'}}, {'_index': 'restaurants_index', '_id': 'G4VQ85kBAy2C4wOUU1IZ', '_score': 1.0, '_source': {'rid': 1, 'name': "Wendy'S", 'cuisine': 'vietnamese', 'location': {'lat': 40.662942, 'lon': -73.961704}, 'description': 'searchable text content'}}, {'_index': 'restaurants_index', '_id': 'HIVQ85kBAy2C4wOUU1Jn', '_score': 1.0, '_source': {'rid': 2, 'name': 'Riviera Caterer', 'cuisine': 'korean', 'location': {'lat': 40.579505, 'lon': -73.98242}, 'description': 'searchable text content'}}, {'_index': 'restaurants_index', '_id': 'HYVQ85kBAy2C4wOUU1Kr', '_score': 1.0, '_source': {

In [151]:
print_es_response_hits_hits(response['hits']['hits'])

_index : restaurants_index
_id    : GoUM85kBAy2C4wOUm1IA
_score : 1.0
source : 
	rid : 0
	name : Morris Park Bake Shop
	cuisine : italian
	location : {'lat': 40.848447, 'lon': -73.856077}
	description : searchable text content
_index : restaurants_index
_id    : G4VQ85kBAy2C4wOUU1IZ
_score : 1.0
source : 
	rid : 1
	name : Wendy'S
	cuisine : vietnamese
	location : {'lat': 40.662942, 'lon': -73.961704}
	description : searchable text content
_index : restaurants_index
_id    : HIVQ85kBAy2C4wOUU1Jn
_score : 1.0
source : 
	rid : 2
	name : Riviera Caterer
	cuisine : korean
	location : {'lat': 40.579505, 'lon': -73.98242}
	description : searchable text content
_index : restaurants_index
_id    : HYVQ85kBAy2C4wOUU1Kr
_score : 1.0
source : 
	rid : 3
	name : Tov Kosher Kitchen
	cuisine : indian
	location : {'lat': 40.731174, 'lon': -73.860115}
	description : searchable text content
_index : restaurants_index
_id    : HoVQ85kBAy2C4wOUU1Ls
_score : 1.0
source : 
	rid : 4
	name : Brunos On The Boul

Run NNQ

In [155]:
NNQ_LON=-73.961704
NNQ_LAT= 40.662942
NNQ_RADIUS = 160

query = {
    'query': {
        'bool': {
            'filter': {
                'geo_distance': {
                    "distance": "5km",
                    "location": {
                    "lat": float(f"{NNQ_LAT}"),
                    "lon": float(f"{NNQ_LON}"),
                    }
                }
            }
        }
    },
    "sort":[ {"_score": {"order": "desc"}}],
    'size': 10
}
print(query)
print(json.dumps(query, indent=2))


{'query': {'bool': {'filter': {'geo_distance': {'distance': '5km', 'location': {'lat': 40.662942, 'lon': -73.961704}}}}}, 'sort': [{'_score': {'order': 'desc'}}], 'size': 10}
{
  "query": {
    "bool": {
      "filter": {
        "geo_distance": {
          "distance": "5km",
          "location": {
            "lat": 40.662942,
            "lon": -73.961704
          }
        }
      }
    }
  },
  "sort": [
    {
      "_score": {
        "order": "desc"
      }
    }
  ],
  "size": 10
}


In [148]:
response = client.search(index="restaurants_index", body=query)
print(response)

{'took': 9, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': 0.0, 'hits': [{'_index': 'restaurants_index', '_id': 'G4VQ85kBAy2C4wOUU1IZ', '_score': 0.0, '_source': {'rid': 1, 'name': "Wendy'S", 'cuisine': 'vietnamese', 'location': {'lat': 40.662942, 'lon': -73.961704}, 'description': 'searchable text content'}}, {'_index': 'restaurants_index', '_id': 'IoVQ85kBAy2C4wOUVFLd', '_score': 0.0, '_source': {'rid': 8, 'name': 'Taste The Tropics Ice Cream', 'cuisine': 'malaysian', 'location': {'lat': 40.640827, 'lon': -73.948261}, 'description': 'searchable text content'}}]}}


In [157]:
print(json.dumps(dict(response), indent=2))

{
  "took": 8,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 10,
      "relation": "eq"
    },
    "max_score": 1.0,
    "hits": [
      {
        "_index": "restaurants_index",
        "_id": "GoUM85kBAy2C4wOUm1IA",
        "_score": 1.0,
        "_source": {
          "rid": 0,
          "name": "Morris Park Bake Shop",
          "cuisine": "italian",
          "location": {
            "lat": 40.848447,
            "lon": -73.856077
          },
          "description": "searchable text content"
        }
      },
      {
        "_index": "restaurants_index",
        "_id": "G4VQ85kBAy2C4wOUU1IZ",
        "_score": 1.0,
        "_source": {
          "rid": 1,
          "name": "Wendy'S",
          "cuisine": "vietnamese",
          "location": {
            "lat": 40.662942,
            "lon": -73.961704
          },
          "description": "searchable text content"
      

In [158]:
print_es_response_hits_hits(response['hits']['hits'])

_index : restaurants_index
_id    : GoUM85kBAy2C4wOUm1IA
_score : 1.0
source : 
	rid : 0
	name : Morris Park Bake Shop
	cuisine : italian
	location : {'lat': 40.848447, 'lon': -73.856077}
	description : searchable text content
_index : restaurants_index
_id    : G4VQ85kBAy2C4wOUU1IZ
_score : 1.0
source : 
	rid : 1
	name : Wendy'S
	cuisine : vietnamese
	location : {'lat': 40.662942, 'lon': -73.961704}
	description : searchable text content
_index : restaurants_index
_id    : HIVQ85kBAy2C4wOUU1Jn
_score : 1.0
source : 
	rid : 2
	name : Riviera Caterer
	cuisine : korean
	location : {'lat': 40.579505, 'lon': -73.98242}
	description : searchable text content
_index : restaurants_index
_id    : HYVQ85kBAy2C4wOUU1Kr
_score : 1.0
source : 
	rid : 3
	name : Tov Kosher Kitchen
	cuisine : indian
	location : {'lat': 40.731174, 'lon': -73.860115}
	description : searchable text content
_index : restaurants_index
_id    : HoVQ85kBAy2C4wOUU1Ls
_score : 1.0
source : 
	rid : 4
	name : Brunos On The Boul

In [165]:
# Helper
def get_data_frame(r_src, r_score):
    my_row = {
        'rid': r_src['rid'],
        'name': r_src['name'] ,
        'cuisine': r_src['cuisine'],
        'location_lat': r_src['location']['lat'],
        'location_lon': r_src['location']['lon'],
        'description': r_src['description'],
        '_score': r_score,
    }
    return dict(my_row)


columns=['rid', 'name', 'cuisine', 'location_lat', 'location_lon', 'description']
df_qresponse = pd.DataFrame(columns=columns)
for r in response['hits']['hits']:
    new_row = pd.DataFrame.from_dict(data=[get_data_frame(r['_source'],r['_score'])], orient='columns')
    df_qresponse = pd.concat([df_qresponse, new_row], ignore_index=True)

df_qresponse

  df_qresponse = pd.concat([df_qresponse, new_row], ignore_index=True)


Unnamed: 0,rid,name,cuisine,location_lat,location_lon,description,_score
0,0,Morris Park Bake Shop,italian,40.848447,-73.856077,searchable text content,1.0
1,1,Wendy'S,vietnamese,40.662942,-73.961704,searchable text content,1.0
2,2,Riviera Caterer,korean,40.579505,-73.98242,searchable text content,1.0
3,3,Tov Kosher Kitchen,indian,40.731174,-73.860115,searchable text content,1.0
4,4,Brunos On The Boulevard,russian,40.764312,-73.880383,searchable text content,1.0
5,5,Dj Reynolds Pub And Restaurant,indian,40.767692,-73.985136,searchable text content,1.0
6,6,Wilken'S Fine Food,chinese,40.619903,-73.906851,searchable text content,1.0
7,7,Regina Caterers,malaysian,40.628886,-74.005289,searchable text content,1.0
8,8,Taste The Tropics Ice Cream,malaysian,40.640827,-73.948261,searchable text content,1.0
9,9,Kosher Island,russian,40.611957,-74.137729,searchable text content,1.0


In [173]:
query = {
    'query': {
        'bool': {
            "must": [
                {"match": {"cuisine": "malaysian"}},
                {"geo_distance": {
                    "distance": "5km",
                    "location": {
                        "lat": float(f"{NNQ_LAT}"),
                        "lon": float(f"{NNQ_LON}"),
                    }
                }
                },
            ]
        }
    },
    "sort": [{"_score": {"order": "desc"}}],
    'size': 10
}

print(json.dumps(dict(query), indent=2))

{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "cuisine": "malaysian"
          }
        },
        {
          "geo_distance": {
            "distance": "5km",
            "location": {
              "lat": 40.662942,
              "lon": -73.961704
            }
          }
        }
      ]
    }
  },
  "sort": [
    {
      "_score": {
        "order": "desc"
      }
    }
  ],
  "size": 10
}


In [175]:
response = client.search(index="restaurants_index", body=query)
print(response)

{'took': 7, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1, 'relation': 'eq'}, 'max_score': 2.4816046, 'hits': [{'_index': 'restaurants_index', '_id': 'IoVQ85kBAy2C4wOUVFLd', '_score': 2.4816046, '_source': {'rid': 8, 'name': 'Taste The Tropics Ice Cream', 'cuisine': 'malaysian', 'location': {'lat': 40.640827, 'lon': -73.948261}, 'description': 'searchable text content'}}]}}


In [145]:
print_es_response_hits_hits(response['hits']['hits'])

_index : restaurants_index
_id    : IoVQ85kBAy2C4wOUVFLd
_score : 2.4816046
source : 
	rid : 8
	name : Taste The Tropics Ice Cream
	cuisine : malaysian
	location : {'lat': 40.640827, 'lon': -73.948261}
	description : searchable text content


## Summary

## Appendix

### Bulk Index

In [None]:

operation = []

client.bulk(operation)

### DSL

https://www.elastic.co/docs/reference/elasticsearch/clients/python/configuration
https://www.elastic.co/docs/reference/elasticsearch/mapping-reference/geo-point