In [1]:
import requests
import os
from datetime import datetime
from dotenv import load_dotenv
from pathlib import Path
import numpy as np
import pandas as pd 

# Load settings from a local .env file into the process environment.
dotenv_path = Path('.env')
load_dotenv(dotenv_path=dotenv_path)

# Read the three required Elasticsearch settings in one pass.
ELASTIC_PORT, ELASTIC_USERNAME, ELASTIC_PASSWORD = (
    os.getenv(setting_name)
    for setting_name in ("ELASTIC_PORT", "ELASTIC_USERNAME", "ELASTIC_PASSWORD")
)

# Fail fast, naming whichever variable is missing.
assert ELASTIC_PORT is not None, "ELASTIC_PORT is not set"
assert ELASTIC_USERNAME is not None, "ELASTIC_USERNAME is not set"
assert ELASTIC_PASSWORD is not None, "ELASTIC_PASSWORD is not set"

# Global config handed to ElasticProcessor; INDEX is filled in later,
# right before the processor is created.
config = dict(
    HOST="0.0.0.0",
    PORT=ELASTIC_PORT,
    USERNAME=ELASTIC_USERNAME,
    PASSWORD=ELASTIC_PASSWORD,
    INDEX=None,
    RETURN_SIZE=10,
    CACHE_DIR=".cache/",
    DIMENSION=2,
)

## Prepare DB

In [2]:
import pandas as pd 
from tqdm import tqdm 
# Location of the metadata CSV. The original machine-specific path is kept as the
# default so existing runs are unchanged; set the LSC23_CSV environment variable
# (or add it to .env, which is loaded in the first cell) to read from elsewhere.
LSC23_CSV = Path(os.environ.get(
    "LSC23_CSV",
    '/home/vbs/vbs23/FIRST-Server/search_services/examples/elasticsearch/lsc23/lsc23.csv',
))
df = pd.read_csv(LSC23_CSV)

  df = pd.read_csv('lsc23.csv')


In [7]:
# Replace every NaN in the frame with an empty string. This applies to all
# columns, not only the text ones, so a numeric column that contains NaN ends up
# holding a mix of numbers and '' (object dtype).
# NOTE(review): CaptionScore is mapped as 'text' in the generated mapping further
# down, presumably as a result of this fill — confirm whether it should stay numeric.
df = df.fillna('')
# Summary statistics; only the columns that are still numeric are reported.
df.describe()

Unnamed: 0,new_lat,new_lng,movement_prob
count,723329.0,723329.0,723329.0
mean,50.232795,2.246,0.86908
std,10.934981,28.858598,0.16918
min,-37.824879,-75.702031,0.0
25%,53.38544,-6.257253,0.79252
50%,53.386041,-6.174619,0.956895
75%,53.389969,-6.145784,0.991074
max,60.390785,144.953833,0.999939


In [8]:
# Show the first five rows to sanity-check the columns after the NaN fill.
df.head()

Unnamed: 0,minute_id,stop,new_lat,new_lng,semantic_name,foursquare_id,original_name,categories,parent,movement,movement_prob,city,country,new_timezone,index,ImageID,Tags,OCR,Caption,CaptionScore
0,20190101_1037,True,53.38998,-6.14576,HOME,,,,,Inside,0.981373,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103717_000,20190101_103717_000.jpg,indoor,,a window with a curtain,0.367844
1,20190101_1037,True,53.38998,-6.14576,HOME,,,,,Inside,0.997463,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103749_000,20190101_103749_000.jpg,"wall,indoor,room,furniture",,a room with a shelf and a mirror,0.277621
2,20190101_1038,True,53.38998,-6.14576,HOME,,,,,Inside,0.875969,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103821_000,20190101_103821_000.jpg,"building,window",,a view of a building from a window,0.349112
3,20190101_1038,True,53.38998,-6.14576,HOME,,,,,Inside,0.998937,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103853_000,20190101_103853_000.jpg,"indoor,wall,person,counter,cluttered",,a person in a kitchen,0.491968
4,20190101_1039,True,53.38998,-6.14576,HOME,,,,,Inside,0.756499,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103925_000,20190101_103925_000.jpg,"indoor,wall",,a black rectangular object with a white object...,0.33464


In [13]:
def minute_id_to_datetime(minute_id):
    """Convert a ``YYYYMMDD_HHMM`` minute id (e.g. ``20190101_1037``) to a ``datetime``.

    The five numeric fields are read at fixed character offsets. The character
    at position 8 (the separator) and anything from position 13 onwards are
    not inspected.
    """
    # (start, stop) offsets for year, month, day, hour, minute — in that order.
    field_spans = ((0, 4), (4, 6), (6, 8), (9, 11), (11, 13))
    return datetime(*(int(minute_id[lo:hi]) for lo, hi in field_spans))

def datetime_to_ymd(datetime):
    """Format a date/datetime value as a compact ``YYYYMMDD`` string.

    Note: inside this function the parameter name shadows the imported
    ``datetime`` class; it refers to the value being formatted.
    """
    ymd_format = "%Y%m%d"
    return datetime.strftime(ymd_format)

# Derive a compact YYYYMMDD `date` column from `minute_id`, then drop `minute_id`.
# The guard makes the cell safe to re-run: once `minute_id` has been dropped, an
# unguarded second execution raises KeyError on df['minute_id'].
if 'minute_id' in df.columns:
    df['date'] = df['minute_id'].apply(minute_id_to_datetime).apply(datetime_to_ymd)
    df = df.drop(columns=['minute_id'])
df.head()

Unnamed: 0,stop,new_lat,new_lng,semantic_name,foursquare_id,original_name,categories,parent,movement,movement_prob,city,country,new_timezone,index,ImageID,Tags,OCR,Caption,CaptionScore,date
0,True,53.38998,-6.14576,HOME,,,,,Inside,0.981373,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103717_000,20190101_103717_000.jpg,indoor,,a window with a curtain,0.367844,20190101
1,True,53.38998,-6.14576,HOME,,,,,Inside,0.997463,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103749_000,20190101_103749_000.jpg,"wall,indoor,room,furniture",,a room with a shelf and a mirror,0.277621,20190101
2,True,53.38998,-6.14576,HOME,,,,,Inside,0.875969,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103821_000,20190101_103821_000.jpg,"building,window",,a view of a building from a window,0.349112,20190101
3,True,53.38998,-6.14576,HOME,,,,,Inside,0.998937,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103853_000,20190101_103853_000.jpg,"indoor,wall,person,counter,cluttered",,a person in a kitchen,0.491968,20190101
4,True,53.38998,-6.14576,HOME,,,,,Inside,0.756499,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103925_000,20190101_103925_000.jpg,"indoor,wall",,a black rectangular object with a white object...,0.33464,20190101


## Create index in Elasticsearch

In [14]:
from pysearch.elastic import ElasticProcessor
from pysearch.utils.time import nlp2datetime

In [15]:
# Fill in the target index name, which the config cell left as None.
config['INDEX'] = 'lsc23_db'
# Build the processor from the full config (the output below reports a successful
# connection). NOTE(review): presumably connects using HOST/PORT/USERNAME/PASSWORD
# from `config` — confirm in pysearch.elastic.
proc = ElasticProcessor(config)

Connected to Elasticsearch node


In [16]:
# Inspect which indices the processor reports before creating 'lsc23_db'.
proc.available_indices()

dict_keys(['test'])

In [31]:
# proc.kill('vbs22_db')

In [19]:
from pandas.io.json import build_table_schema 

def df_schema_to_mapping(df_schema, timefield='date'):
    """Translate a pandas Table Schema into Elasticsearch field properties.

    Parameters
    ----------
    df_schema : dict
        Schema with a ``'fields'`` list whose entries carry ``'name'`` and
        ``'type'`` (the shape produced by ``build_table_schema``).
    timefield : str, default 'date'
        Name of the column to index as an Elasticsearch ``date`` using the
        ``basic_date`` format, regardless of its schema type.

    Returns
    -------
    dict
        ``{column_name: {'type': es_type, ...}}`` in schema order.

    Raises
    ------
    KeyError
        If the schema has no ``'fields'`` entry or a field lacks ``'name'``/``'type'``.
    """
    # Table Schema type names that are not valid Elasticsearch field types.
    # 'datetime' used to pass through unchanged, but Elasticsearch has no field
    # type of that name; its date/time type is 'date'. Types not listed here
    # (e.g. 'boolean', 'integer') are passed through as-is.
    es_type_for = {
        'string': 'text',
        'number': 'float',
        'datetime': 'date',
    }

    mapping = {}
    for col in df_schema['fields']:
        col_name = col['name']
        col_type = col['type']
        if col_name == timefield:
            # The time field holds YYYYMMDD strings, which 'basic_date' parses.
            mapping[col_name] = {'type': 'date', 'format': 'basic_date'}
            continue
        mapping[col_name] = {'type': es_type_for.get(col_type, col_type)}
    return mapping

def export_mapping(df):
    """Build the index-creation body for ``df``.

    The frame's Table Schema (as produced by ``build_table_schema`` with its
    default arguments) is converted with ``df_schema_to_mapping`` and wrapped
    as ``{"mappings": {"properties": ...}}``.
    """
    field_properties = df_schema_to_mapping(build_table_schema(df))
    return {"mappings": {"properties": field_properties}}

# Build the index body from the prepared frame and display it for inspection
# before anything is sent to Elasticsearch.
mapping = export_mapping(df)
mapping

{'mappings': {'properties': {'index': {'type': 'text'},
   'stop': {'type': 'boolean'},
   'new_lat': {'type': 'float'},
   'new_lng': {'type': 'float'},
   'semantic_name': {'type': 'text'},
   'foursquare_id': {'type': 'text'},
   'original_name': {'type': 'text'},
   'categories': {'type': 'text'},
   'parent': {'type': 'text'},
   'movement': {'type': 'text'},
   'movement_prob': {'type': 'float'},
   'city': {'type': 'text'},
   'country': {'type': 'text'},
   'new_timezone': {'type': 'text'},
   'ImageID': {'type': 'text'},
   'Tags': {'type': 'text'},
   'OCR': {'type': 'text'},
   'Caption': {'type': 'text'},
   'CaptionScore': {'type': 'text'},
   'date': {'type': 'date', 'format': 'basic_date'}}}}

In [20]:
# Index the frame with the mapping displayed in the previous cell.
# NOTE(review): that mapping lists 'index' as its first field and the search hits
# further down use image ids as _id, which suggests df was re-indexed on its
# 'index' column in a cell that is not part of this notebook — confirm, since a
# fresh Restart & Run All would otherwise not reproduce these outputs.
proc.index_dataframe(df, mapping)

100%|██████████| 723329/723329 [01:14<00:00, 9646.27it/s] 


In [21]:
from pprint import pprint 
# Pretty-print what the processor reports for the index; the output below shows
# the field properties, which can be compared against the mapping that was sent.
pprint(proc.info())

{'properties': {'Caption': {'type': 'text'},
                'CaptionScore': {'type': 'text'},
                'ImageID': {'type': 'text'},
                'OCR': {'type': 'text'},
                'Tags': {'type': 'text'},
                'categories': {'type': 'text'},
                'city': {'type': 'text'},
                'country': {'type': 'text'},
                'date': {'format': 'basic_date', 'type': 'date'},
                'foursquare_id': {'type': 'text'},
                'index': {'type': 'text'},
                'movement': {'type': 'text'},
                'movement_prob': {'type': 'float'},
                'new_lat': {'type': 'float'},
                'new_lng': {'type': 'float'},
                'new_timezone': {'type': 'text'},
                'original_name': {'type': 'text'},
                'parent': {'type': 'text'},
                'semantic_name': {'type': 'text'},
                'stop': {'type': 'boolean'}}}


In [24]:
# Parse a free-form date string into a datetime for the closest-day search below.
# NOTE(review): the displayed result carries a non-midnight time (14:53:28) even
# though only a date was given — confirm where nlp2datetime takes the time from.
date = nlp2datetime('1-1-2019')
date

datetime.datetime(2019, 1, 1, 14, 53, 28)

In [25]:
# Text query for 'indoor', passing the parsed `date` as the timestamp for the
# 'date' field. NOTE(review): presumably hits are ranked or filtered by closeness
# to that day; the meaning of the positional [] argument and of filter=None is
# defined in pysearch — confirm there.
proc.search_text_closestday_pipeline('indoor', [], timefield='date', timestamp=date, filter=None)

Function run elapsed time: 0:00:00.000007


[{'_index': 'lsc23_db',
  '_id': '20190105_094900_000',
  '_score': 3.0192258,
  '_source': {'stop': True,
   'new_lat': 53.38676118,
   'new_lng': -6.1471107,
   'semantic_name': 'Charm Hand & Foot Spa',
   'foursquare_id': '',
   'original_name': '',
   'categories': '',
   'parent': '',
   'movement': 'Inside',
   'movement_prob': 0.9900733232498168,
   'city': 'Dublin, Ireland, Leinster',
   'country': 'Ireland',
   'new_timezone': 'Europe/Dublin',
   'ImageID': '20190105_094900_000.jpg',
   'Tags': 'text,indoor,wall',
   'OCR': 'Setting,the,Heating,Mode,Mode,utten,LPe,on,andh,modes,Comort,mo,he,2,when,the,har,mode,starts,tematically,as,the,dt,R,ndm,thuder,he,s,ton,moe,tarts,Note,mr,b,or,LA,he,r,the,sre,me,thie,s,heating,stas,Aercng,he,cates,that,e,he,aer,ing,emperature,the,acreen,anly,d,y,LA,b,the,heatran,Setting,a,Peraonalised,Heating.ProscamProsK,Setting,the,Temperature,A:and,-Buttons,1.,The,temperature,setting,range,Ao,of,the,heater,is,5Ce,sc,the,comte,button,to,select,the,desi