In [30]:
import requests
import os
from datetime import datetime
from dotenv import load_dotenv
from pathlib import Path
import numpy as np
import pandas as pd 

# Load the Elasticsearch connection settings from the workspace .env file
# into the process environment.
dotenv_path = Path('/root/workspace/.env')
load_dotenv(dotenv_path=dotenv_path)

# Read the three required settings in one pass; a missing variable yields None.
ELASTIC_PORT, ELASTIC_USERNAME, ELASTIC_PASSWORD = (
    os.environ.get(var_name)
    for var_name in ("ELASTIC_PORT", "ELASTIC_USERNAME", "ELASTIC_PASSWORD")
)

# Stop right here if any setting is missing, rather than failing later
# when the Elasticsearch client is built.
assert ELASTIC_PORT is not None, "ELASTIC_PORT is not set"
assert ELASTIC_USERNAME is not None, "ELASTIC_USERNAME is not set"
assert ELASTIC_PASSWORD is not None, "ELASTIC_PASSWORD is not set"

# Global settings handed to the Elasticsearch processor.
# INDEX is left empty here and filled in just before the processor is created.
config = dict(
    HOST="0.0.0.0",
    PORT=ELASTIC_PORT,
    USERNAME=ELASTIC_USERNAME,
    PASSWORD=ELASTIC_PASSWORD,
    INDEX=None,
    RETURN_SIZE=10,
    CACHE_DIR=".cache/",
    DIMENSION=2,
)

## Prepare DB

In [31]:
import pandas as pd 
from tqdm import tqdm 
# Load the LSC23 per-image metadata table.
# NOTE(review): the saved output of this cell echoes this statement, which looks
# like a pandas warning emitted while reading (possibly about mixed column
# dtypes) — confirm, and pass explicit `dtype=` for the affected columns if so.
df = pd.read_csv('/root/workspace/data/meta/lsc23.csv')

  df = pd.read_csv('/root/workspace/data/meta/lsc23.csv')


In [32]:
# Replace every NaN in the frame with an empty string.
# NOTE(review): this applies to all columns, not only free-text ones. The
# mapping printed later in this notebook types CaptionScore as 'text', which
# suggests numeric columns with gaps become string columns here — confirm
# that this is intended.
df = df.fillna('')
df.describe()

Unnamed: 0,new_lat,new_lng,movement_prob
count,723329.0,723329.0,723329.0
mean,50.232795,2.246,0.86908
std,10.934981,28.858598,0.16918
min,-37.824879,-75.702031,0.0
25%,53.38544,-6.257253,0.79252
50%,53.386041,-6.174619,0.956895
75%,53.389969,-6.145784,0.991074
max,60.390785,144.953833,0.999939


In [33]:
df.head()

Unnamed: 0,minute_id,stop,new_lat,new_lng,semantic_name,foursquare_id,original_name,categories,parent,movement,movement_prob,city,country,new_timezone,index,ImageID,Tags,OCR,Caption,CaptionScore
0,20190101_1037,True,53.38998,-6.14576,HOME,,,,,Inside,0.981373,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103717_000,20190101_103717_000.jpg,indoor,,a window with a curtain,0.367844
1,20190101_1037,True,53.38998,-6.14576,HOME,,,,,Inside,0.997463,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103749_000,20190101_103749_000.jpg,"wall,indoor,room,furniture",,a room with a shelf and a mirror,0.277621
2,20190101_1038,True,53.38998,-6.14576,HOME,,,,,Inside,0.875969,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103821_000,20190101_103821_000.jpg,"building,window",,a view of a building from a window,0.349112
3,20190101_1038,True,53.38998,-6.14576,HOME,,,,,Inside,0.998937,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103853_000,20190101_103853_000.jpg,"indoor,wall,person,counter,cluttered",,a person in a kitchen,0.491968
4,20190101_1039,True,53.38998,-6.14576,HOME,,,,,Inside,0.756499,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103925_000,20190101_103925_000.jpg,"indoor,wall",,a black rectangular object with a white object...,0.33464


In [34]:
df.semantic_name.describe()

count                           723329
unique                             734
top       Dublin City University (DCU)
freq                            226825
Name: semantic_name, dtype: object

In [35]:
df.new_lat.isna().sum()

0

In [36]:
df.semantic_name.value_counts()

semantic_name
Dublin City University (DCU)              226825
HOME                                      193201
Car                                        93468
Charm Hand & Foot Spa                      29974
Airplane                                   16117
                                           ...  
Prada                                          1
Bonjour Resto - Beefsteak Hai Ba Trung         1
Boomerang Nightclub                            1
Windsor Avenue                                 1
Glendalough Upper Lake Car Park                1
Name: count, Length: 734, dtype: int64

In [37]:
df.semantic_name.unique()

array(['HOME', 'Car', 'Charm Hand & Foot Spa', 'TOP Oil',
       'Brown Thomas Car Park', 'Brown Thomas', 'Walking Outside',
       'Dubray Books', "Stephen's Green Shopping Centre",
       'Broadridge Ireland', "Eddie Rocket's",
       'Killashee House Hotel & Villa Spa', 'Kildare Village',
       'Watch Station International Kildare', "O'Connors",
       'Naas Retail Park', 'Conlans BMW - Kildare', 'GreenIT', 'B&Q Naas',
       'Kingfisher Indian Restaurant', 'Man Importers Ireland',
       'SuperValu', 'Dublin City University (DCU)', 'Get Fresh',
       'Public Transport', 'Donaghmede Shopping Centre',
       'DCU School of Computing', 'Dublin Airport (DUB)', 'Airplane',
       'Frankfurt Airport (FRA) (Frankfurt Airport)',
       'Thessaloniki International Airport Macedonia', 'Gregalef Loft',
       'Eat Skaste', 'MD Medical Biometric', 'Mediterranean Palace Hotel',
       'Baron Hirsch Hall', 'The Grocery Store of Thessaloniki',
       'Mallioras Kontosouvli', 'Cycling', 'Zeus Co

In [38]:
import pytz
from pysearch.utils.time import hour2part_of_day

In [39]:
def minute_id_to_datetime(minute_id):
    """Turn a ``YYYYMMDD_HHMM`` minute id (e.g. ``20190101_1037``) into a naive datetime.

    Fields are read by fixed character position: the character at position 8
    (the separator) is skipped and anything after position 13 is ignored.
    """
    # (start, stop) offsets of year, month, day, hour and minute within the id.
    field_spans = ((0, 4), (4, 6), (6, 8), (9, 11), (11, 13))
    return datetime(*(int(minute_id[start:stop]) for start, stop in field_spans))


def datetime_to_ymd(datetime):
    """Format a date/datetime as a compact ``YYYYMMDD`` string."""
    ymd_pattern = "%Y%m%d"
    return datetime.strftime(ymd_pattern)

def datetime_to_localtime(datetime, tz='Europe/Dublin'):
    """Return the hour of day (0-23) of ``datetime`` as seen in timezone ``tz``.

    Parameters
    ----------
    datetime : datetime.datetime
        Moment to convert. A timezone-aware value is converted from its own
        offset; a naive value is interpreted as UTC.
    tz : str or None
        Timezone name understood by ``pytz.timezone`` (e.g. ``'Europe/Dublin'``).

    Returns
    -------
    int or None
        The local hour, or ``None`` when ``tz`` is ``None`` or an empty string.
    """
    if tz is None or tz == '':
        return None
    if datetime.tzinfo is None:
        # ``astimezone`` on a naive datetime assumes the *machine's* local zone,
        # so the result would change with the host's TZ setting. Pin it to UTC.
        # NOTE(review): assumes minute ids are expressed in UTC — confirm
        # against the dataset documentation.
        datetime = datetime.replace(tzinfo=pytz.utc)
    return datetime.astimezone(pytz.timezone(tz)).hour

# Calendar day (YYYYMMDD) of every record, derived from its minute id.
parsed_minutes = df['minute_id'].apply(minute_id_to_datetime)
df['timestamp'] = parsed_minutes.apply(datetime_to_ymd)


def _row_local_hour(row):
    """Local hour of day for one metadata row (None when the row has no timezone)."""
    moment = minute_id_to_datetime(row['minute_id'])
    return datetime_to_localtime(moment, tz=row['new_timezone'])


# Hour of day in each record's own timezone.
df['local_time'] = df.apply(_row_local_hour, axis=1)

count    723275.000000
mean         13.969749
std           4.591164
min           0.000000
25%          10.000000
50%          14.000000
75%          18.000000
max          23.000000
Name: local_time, dtype: float64

In [42]:
df['local_time'].isna().sum()

54

In [43]:
# Rows without a timezone have no local hour; carry the previous row's hour
# forward. `.ffill()` replaces `fillna(method='ffill')`, which pandas has
# deprecated; the result is the same.
df['local_time'] = df['local_time'].ffill()
# Label each hour with its part of the day.
df['semantic_time'] = df['local_time'].apply(hour2part_of_day)

In [44]:
# Remove the raw minute id. Reassigning (instead of dropping in place) and
# tolerating an already-missing column keeps this cell safe to re-run;
# the in-place version raised KeyError on a second execution.
df = df.drop(columns=['minute_id'], errors='ignore')
df.head()

Unnamed: 0,stop,new_lat,new_lng,semantic_name,foursquare_id,original_name,categories,parent,movement,movement_prob,...,new_timezone,index,ImageID,Tags,OCR,Caption,CaptionScore,timestamp,local_time,semantic_time
0,True,53.38998,-6.14576,HOME,,,,,Inside,0.981373,...,Europe/Dublin,20190101_103717_000,20190101_103717_000.jpg,indoor,,a window with a curtain,0.367844,20190101,10.0,morning
1,True,53.38998,-6.14576,HOME,,,,,Inside,0.997463,...,Europe/Dublin,20190101_103749_000,20190101_103749_000.jpg,"wall,indoor,room,furniture",,a room with a shelf and a mirror,0.277621,20190101,10.0,morning
2,True,53.38998,-6.14576,HOME,,,,,Inside,0.875969,...,Europe/Dublin,20190101_103821_000,20190101_103821_000.jpg,"building,window",,a view of a building from a window,0.349112,20190101,10.0,morning
3,True,53.38998,-6.14576,HOME,,,,,Inside,0.998937,...,Europe/Dublin,20190101_103853_000,20190101_103853_000.jpg,"indoor,wall,person,counter,cluttered",,a person in a kitchen,0.491968,20190101,10.0,morning
4,True,53.38998,-6.14576,HOME,,,,,Inside,0.756499,...,Europe/Dublin,20190101_103925_000,20190101_103925_000.jpg,"indoor,wall",,a black rectangular object with a white object...,0.33464,20190101,10.0,morning


## Create index in Elasticsearch

In [45]:
from pysearch.elastic import ElasticProcessor
from pysearch.utils.time import nlp2datetime

In [46]:
# Select the target index, then build the processor from the shared config.
config['INDEX'] = 'lsc23_db'
proc = ElasticProcessor(config)

Connected to Elasticsearch node


In [47]:
proc.available_indices()

dict_keys(['lsc23_db'])

In [48]:
# proc.kill('vbs22_db')

In [49]:
from pandas.io.json import build_table_schema 

def df_schema_to_mapping(df_schema, timefield='date'):
    """Translate a pandas Table Schema into Elasticsearch field properties.

    Parameters
    ----------
    df_schema : dict
        Schema with a ``'fields'`` list whose items carry ``'name'`` and
        ``'type'`` keys (the shape produced by ``build_table_schema``).
    timefield : str
        Name of the column to index as a date in ``basic_date`` (yyyyMMdd)
        format, regardless of its schema type.

    Returns
    -------
    dict
        ``{column name: {'type': <elasticsearch type>, ...}}``. If a name
        occurs more than once in the schema, the last occurrence wins.
    """
    # Table Schema type -> Elasticsearch field type. Types not listed here
    # (e.g. 'integer', 'boolean') are passed through unchanged.
    # 'datetime' is translated because Elasticsearch has no such field type;
    # its date type is called 'date'.
    es_type_for = {
        'string': 'text',
        'number': 'float',
        'datetime': 'date',
    }

    mapping = {}
    for col in df_schema['fields']:
        col_name = col['name']
        col_type = col['type']
        if col_name == timefield:
            # The designated time column is always a compact yyyyMMdd date.
            mapping[col_name] = {
                'type': 'date',
                'format': 'basic_date'
            }
            continue
        mapping[col_name] = {
            'type': es_type_for.get(col_type, col_type)
        }
    return mapping

def export_mapping(df):
    """Build the Elasticsearch index-creation body for ``df``.

    The frame's Table Schema is converted to field properties, with the
    ``timestamp`` column treated as the date field, and wrapped under
    ``mappings.properties``.
    """
    properties = df_schema_to_mapping(build_table_schema(df), timefield='timestamp')
    return {"mappings": {"properties": properties}}

mapping = export_mapping(df)
mapping

{'mappings': {'properties': {'index': {'type': 'text'},
   'stop': {'type': 'boolean'},
   'new_lat': {'type': 'float'},
   'new_lng': {'type': 'float'},
   'semantic_name': {'type': 'text'},
   'foursquare_id': {'type': 'text'},
   'original_name': {'type': 'text'},
   'categories': {'type': 'text'},
   'parent': {'type': 'text'},
   'movement': {'type': 'text'},
   'movement_prob': {'type': 'float'},
   'city': {'type': 'text'},
   'country': {'type': 'text'},
   'new_timezone': {'type': 'text'},
   'ImageID': {'type': 'text'},
   'Tags': {'type': 'text'},
   'OCR': {'type': 'text'},
   'Caption': {'type': 'text'},
   'CaptionScore': {'type': 'text'},
   'timestamp': {'type': 'date', 'format': 'basic_date'},
   'local_time': {'type': 'float'},
   'semantic_time': {'type': 'text'}}}}

In [50]:
proc.index_dataframe(df, mapping)

 43%|████▎     | 314509/723329 [00:40<00:53, 7653.69it/s]

In [None]:
from pprint import pprint 
pprint(proc.info())

{'properties': {'Caption': {'type': 'text'},
                'CaptionScore': {'type': 'text'},
                'ImageID': {'type': 'text'},
                'OCR': {'type': 'text'},
                'Tags': {'type': 'text'},
                'categories': {'type': 'text'},
                'city': {'type': 'text'},
                'country': {'type': 'text'},
                'foursquare_id': {'type': 'text'},
                'index': {'type': 'text'},
                'local_time': {'type': 'integer'},
                'movement': {'type': 'text'},
                'movement_prob': {'type': 'float'},
                'new_lat': {'type': 'float'},
                'new_lng': {'type': 'float'},
                'new_timezone': {'type': 'text'},
                'original_name': {'type': 'text'},
                'parent': {'type': 'text'},
                'semantic_name': {'type': 'text'},
                'semantic_time': {'type': 'text'},
                'stop': {'type': 'boolean'},
               

In [None]:
date = nlp2datetime('1-1-2019')
date

datetime.datetime(2019, 1, 1, 22, 19)

In [None]:
proc.search_text_closestday_pipeline('indoor', [], timefield='timestamp', timestamp=date, filter=None)

Function run elapsed time: 0:00:00.000006


[{'_index': 'lsc23_db',
  '_id': '20190105_094900_000',
  '_score': 3.0192258,
  '_source': {'stop': True,
   'new_lat': 53.38676118,
   'new_lng': -6.1471107,
   'semantic_name': 'Charm Hand & Foot Spa',
   'foursquare_id': '',
   'original_name': '',
   'categories': '',
   'parent': '',
   'movement': 'Inside',
   'movement_prob': 0.9900733232498168,
   'city': 'Dublin, Ireland, Leinster',
   'country': 'Ireland',
   'new_timezone': 'Europe/Dublin',
   'ImageID': '20190105_094900_000.jpg',
   'Tags': 'text,indoor,wall',
   'OCR': 'Setting,the,Heating,Mode,Mode,utten,LPe,on,andh,modes,Comort,mo,he,2,when,the,har,mode,starts,tematically,as,the,dt,R,ndm,thuder,he,s,ton,moe,tarts,Note,mr,b,or,LA,he,r,the,sre,me,thie,s,heating,stas,Aercng,he,cates,that,e,he,aer,ing,emperature,the,acreen,anly,d,y,LA,b,the,heatran,Setting,a,Peraonalised,Heating.ProscamProsK,Setting,the,Temperature,A:and,-Buttons,1.,The,temperature,setting,range,Ao,of,the,heater,is,5Ce,sc,the,comte,button,to,select,the,desi