In [1]:
import requests
import os
from datetime import datetime
from dotenv import load_dotenv
from pathlib import Path
import numpy as np
import calendar

import pandas as pd 

dotenv_path = Path('/root/workspace/.env')
load_dotenv(dotenv_path=dotenv_path)

ELASTIC_PORT = os.environ.get("ELASTIC_PORT", None)
ELASTIC_USERNAME = os.environ.get("ELASTIC_USERNAME", None)
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", None)

assert ELASTIC_PORT is not None, "ELASTIC_PORT is not set"
assert ELASTIC_USERNAME is not None, "ELASTIC_USERNAME is not set"
assert ELASTIC_PASSWORD is not None, "ELASTIC_PASSWORD is not set"

config = {
    # Global config
    "HOST": "0.0.0.0",
    "PORT": ELASTIC_PORT,
    "USERNAME": ELASTIC_USERNAME,
    "PASSWORD": ELASTIC_PASSWORD,
    "INDEX": None,
    "RETURN_SIZE": 10,
    "CACHE_DIR": ".cache/",
    "DIMENSION": 2,
}

## Prepapre DB

In [2]:
import pandas as pd 
from tqdm import tqdm 
df = pd.read_csv('/root/workspace/data/meta/lsc23.csv')

  df = pd.read_csv('/root/workspace/data/meta/lsc23.csv')


In [3]:
# fill nan description with empty string
df = df.fillna('')
df.describe()

Unnamed: 0,new_lat,new_lng,movement_prob
count,723329.0,723329.0,723329.0
mean,50.232795,2.246,0.86908
std,10.934981,28.858598,0.16918
min,-37.824879,-75.702031,0.0
25%,53.38544,-6.257253,0.79252
50%,53.386041,-6.174619,0.956895
75%,53.389969,-6.145784,0.991074
max,60.390785,144.953833,0.999939


In [4]:
df.head()

Unnamed: 0,minute_id,stop,new_lat,new_lng,semantic_name,foursquare_id,original_name,categories,parent,movement,movement_prob,city,country,new_timezone,index,ImageID,Tags,OCR,Caption,CaptionScore
0,20190101_1037,True,53.38998,-6.14576,HOME,,,,,Inside,0.981373,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103717_000,20190101_103717_000.jpg,indoor,,a window with a curtain,0.367844
1,20190101_1037,True,53.38998,-6.14576,HOME,,,,,Inside,0.997463,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103749_000,20190101_103749_000.jpg,"wall,indoor,room,furniture",,a room with a shelf and a mirror,0.277621
2,20190101_1038,True,53.38998,-6.14576,HOME,,,,,Inside,0.875969,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103821_000,20190101_103821_000.jpg,"building,window",,a view of a building from a window,0.349112
3,20190101_1038,True,53.38998,-6.14576,HOME,,,,,Inside,0.998937,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103853_000,20190101_103853_000.jpg,"indoor,wall,person,counter,cluttered",,a person in a kitchen,0.491968
4,20190101_1039,True,53.38998,-6.14576,HOME,,,,,Inside,0.756499,"Dublin, Ireland, Leinster",Ireland,Europe/Dublin,20190101_103925_000,20190101_103925_000.jpg,"indoor,wall",,a black rectangular object with a white object...,0.33464


In [5]:
df.duplicated().sum()

0

In [6]:
# drop if index starts with 2000 
df = df[~df['index'].str.startswith('2000')]

In [7]:
df.semantic_name.describe()

count                           723329
unique                             734
top       Dublin City University (DCU)
freq                            226825
Name: semantic_name, dtype: object

In [8]:
df.new_lat.isna().sum()

0

In [9]:
df.semantic_name.value_counts()

semantic_name
Dublin City University (DCU)              226825
HOME                                      193201
Car                                        93468
Charm Hand & Foot Spa                      29974
Airplane                                   16117
                                           ...  
Prada                                          1
Bonjour Resto - Beefsteak Hai Ba Trung         1
Boomerang Nightclub                            1
Windsor Avenue                                 1
Glendalough Upper Lake Car Park                1
Name: count, Length: 734, dtype: int64

In [10]:
df.semantic_name.unique()

array(['HOME', 'Car', 'Charm Hand & Foot Spa', 'TOP Oil',
       'Brown Thomas Car Park', 'Brown Thomas', 'Walking Outside',
       'Dubray Books', "Stephen's Green Shopping Centre",
       'Broadridge Ireland', "Eddie Rocket's",
       'Killashee House Hotel & Villa Spa', 'Kildare Village',
       'Watch Station International Kildare', "O'Connors",
       'Naas Retail Park', 'Conlans BMW - Kildare', 'GreenIT', 'B&Q Naas',
       'Kingfisher Indian Restaurant', 'Man Importers Ireland',
       'SuperValu', 'Dublin City University (DCU)', 'Get Fresh',
       'Public Transport', 'Donaghmede Shopping Centre',
       'DCU School of Computing', 'Dublin Airport (DUB)', 'Airplane',
       'Frankfurt Airport (FRA) (Frankfurt Airport)',
       'Thessaloniki International Airport Macedonia', 'Gregalef Loft',
       'Eat Skaste', 'MD Medical Biometric', 'Mediterranean Palace Hotel',
       'Baron Hirsch Hall', 'The Grocery Store of Thessaloniki',
       'Mallioras Kontosouvli', 'Cycling', 'Zeus Co

In [11]:
import pytz
from pysearch.utils.time import hour2part_of_day

In [12]:
def minute_id_to_datetime(minute_id):
    # 20190101_1037 
    year = int(minute_id[:4])
    month = int(minute_id[4:6])
    day = int(minute_id[6:8])
    hour = int(minute_id[9:11])
    minute = int(minute_id[11:13])
    return datetime(year, month, day, hour, minute)


def datetime_to_ymd(datetime):
    return datetime.strftime("%Y%m%d")

def datetime_to_weekday(datetime):
    return calendar.day_name[datetime.weekday()]

def datetime_to_year(datetime):
    return str(datetime.year)
def datetime_to_month(datetime):
    return str(datetime.month)

def datetime_to_localtime(datetime, tz='Europe/Dublin'):
    if tz is None or tz == '':
        return None
    hour = datetime.astimezone(pytz.timezone(tz)).hour
    return hour

datetime_to_weekday(datetime.now())

'Monday'

In [13]:
df['timestamp'] = df['minute_id'].apply(minute_id_to_datetime).apply(datetime_to_ymd)
df['local_time'] = df.apply(lambda x: datetime_to_localtime(minute_id_to_datetime(x['minute_id']), tz=x['new_timezone']), axis=1)
df['weekday'] = df['minute_id'].apply(minute_id_to_datetime).apply(datetime_to_weekday)
df['year'] = df['minute_id'].apply(minute_id_to_datetime).apply(datetime_to_year)
df['month'] = df['minute_id'].apply(minute_id_to_datetime).apply(datetime_to_month)

In [14]:
df['local_time'].isna().sum()

54

In [15]:
# fill nan local_time with previous value
df['local_time'] = df['local_time'].fillna(method='ffill')
df['semantic_time'] = df['local_time'].apply(hour2part_of_day)

In [16]:
df.drop(columns=['minute_id'], inplace=True)
df.head()

Unnamed: 0,stop,new_lat,new_lng,semantic_name,foursquare_id,original_name,categories,parent,movement,movement_prob,...,Tags,OCR,Caption,CaptionScore,timestamp,local_time,weekday,year,month,semantic_time
0,True,53.38998,-6.14576,HOME,,,,,Inside,0.981373,...,indoor,,a window with a curtain,0.367844,20190101,10.0,Tuesday,2019,1,morning
1,True,53.38998,-6.14576,HOME,,,,,Inside,0.997463,...,"wall,indoor,room,furniture",,a room with a shelf and a mirror,0.277621,20190101,10.0,Tuesday,2019,1,morning
2,True,53.38998,-6.14576,HOME,,,,,Inside,0.875969,...,"building,window",,a view of a building from a window,0.349112,20190101,10.0,Tuesday,2019,1,morning
3,True,53.38998,-6.14576,HOME,,,,,Inside,0.998937,...,"indoor,wall,person,counter,cluttered",,a person in a kitchen,0.491968,20190101,10.0,Tuesday,2019,1,morning
4,True,53.38998,-6.14576,HOME,,,,,Inside,0.756499,...,"indoor,wall",,a black rectangular object with a white object...,0.33464,20190101,10.0,Tuesday,2019,1,morning


## Create index in Elasticsearch

In [2]:
from pysearch.elastic import ElasticProcessor
from pysearch.utils.time import nlp2datetime

In [21]:
from pprint import pprint
pprint(config)

{'CACHE_DIR': '.cache/',
 'DIMENSION': 2,
 'HOST': '0.0.0.0',
 'INDEX': None,
 'PASSWORD': '123456',
 'PORT': '20542',
 'RETURN_SIZE': 10,
 'USERNAME': 'elastic'}


In [5]:
config['INDEX'] = 'lsc23_full_db'
proc = ElasticProcessor(config, max_result_window=1000000)

Connected to Elasticsearch node


In [4]:
proc.available_indices()

dict_keys(['lsc23_db', 'lsc23_full_db'])

In [24]:
proc.info()

{'properties': {'Caption': {'type': 'text'},
  'CaptionScore': {'type': 'text'},
  'ImageID': {'type': 'text'},
  'OCR': {'type': 'text'},
  'Tags': {'type': 'text'},
  'categories': {'type': 'text'},
  'city': {'type': 'text'},
  'country': {'type': 'text'},
  'foursquare_id': {'type': 'text'},
  'index': {'type': 'text'},
  'local_time': {'type': 'float'},
  'month': {'type': 'text'},
  'movement': {'type': 'text'},
  'movement_prob': {'type': 'float'},
  'new_lat': {'type': 'float'},
  'new_lng': {'type': 'float'},
  'new_timezone': {'type': 'text'},
  'original_name': {'type': 'text'},
  'parent': {'type': 'text'},
  'semantic_name': {'type': 'text'},
  'semantic_time': {'type': 'text'},
  'stop': {'type': 'boolean'},
  'timestamp': {'type': 'date', 'format': 'basic_date'},
  'weekday': {'type': 'text'},
  'year': {'type': 'text'}}}

In [20]:
# proc.kill('lsc23_full_db')

In [21]:
from pandas.io.json import build_table_schema 

def df_schema_to_mapping(df_schema,timefield='date'):
    mapping = {}
    for col in df_schema['fields']:
        col_name = col['name']
        col_type = col['type']
        if col_type == 'string':
            col_type = 'text'
        if col_type == 'number':
            col_type = 'float'
        if col_name == timefield:
            col_type = 'date'
            mapping[col_name] = {
                'type': col_type,
                'format': 'basic_date'
            }
            continue
        mapping[col_name] = {
            'type': col_type
        }
    return mapping

def export_mapping(df):
    df_schema = build_table_schema(df)
    mapping = df_schema_to_mapping(df_schema, timefield='timestamp')
    mapping = {
        "mappings": {
            "properties": mapping
        }
    }
    return mapping

mapping = export_mapping(df)
mapping

{'mappings': {'properties': {'index': {'type': 'text'},
   'stop': {'type': 'boolean'},
   'new_lat': {'type': 'float'},
   'new_lng': {'type': 'float'},
   'semantic_name': {'type': 'text'},
   'foursquare_id': {'type': 'text'},
   'original_name': {'type': 'text'},
   'categories': {'type': 'text'},
   'parent': {'type': 'text'},
   'movement': {'type': 'text'},
   'movement_prob': {'type': 'float'},
   'city': {'type': 'text'},
   'country': {'type': 'text'},
   'new_timezone': {'type': 'text'},
   'ImageID': {'type': 'text'},
   'Tags': {'type': 'text'},
   'OCR': {'type': 'text'},
   'Caption': {'type': 'text'},
   'CaptionScore': {'type': 'text'},
   'timestamp': {'type': 'date', 'format': 'basic_date'},
   'local_time': {'type': 'float'},
   'weekday': {'type': 'text'},
   'year': {'type': 'text'},
   'month': {'type': 'text'},
   'semantic_time': {'type': 'text'}}}}

In [22]:
proc.index_dataframe(df, mapping)

  0%|          | 0/723329 [00:00<?, ?it/s]

100%|██████████| 723329/723329 [01:36<00:00, 7499.59it/s]


In [23]:
from pprint import pprint 
pprint(proc.info())

{'properties': {'Caption': {'type': 'text'},
                'CaptionScore': {'type': 'text'},
                'ImageID': {'type': 'text'},
                'OCR': {'type': 'text'},
                'Tags': {'type': 'text'},
                'categories': {'type': 'text'},
                'city': {'type': 'text'},
                'country': {'type': 'text'},
                'foursquare_id': {'type': 'text'},
                'index': {'type': 'text'},
                'local_time': {'type': 'float'},
                'month': {'type': 'text'},
                'movement': {'type': 'text'},
                'movement_prob': {'type': 'float'},
                'new_lat': {'type': 'float'},
                'new_lng': {'type': 'float'},
                'new_timezone': {'type': 'text'},
                'original_name': {'type': 'text'},
                'parent': {'type': 'text'},
                'semantic_name': {'type': 'text'},
                'semantic_time': {'type': 'text'},
                'st

In [13]:
date = nlp2datetime('12-13-2019')
date

datetime.datetime(2019, 12, 13, 9, 44, 49)

In [33]:
def check_duplicate_response(responses):
    hits_name = [hit['_id'] for hit in responses]
    assert len(hits_name) == len(set(hits_name)), "Duplicate response"

In [14]:
proc.get_document_by_id(['20200509_133031_000'])

Function get_document_by_id elapsed time: 0:00:00.031337


[{'_index': 'lsc23_full_db',
  '_id': '20200509_133031_000',
  '_score': 2.0,
  '_source': {'stop': False,
   'new_lat': 53.76520193,
   'new_lng': -8.39711666,
   'semantic_name': 'Car',
   'foursquare_id': '',
   'original_name': '',
   'categories': '',
   'parent': '',
   'movement': 'Inside',
   'movement_prob': 0.8909477591514587,
   'city': 'Roscommon, Ireland, Roscommon Municipal District',
   'country': 'Ireland',
   'new_timezone': 'Europe/Dublin',
   'ImageID': '20200509_133031_000.jpg',
   'Tags': 'tree,outdoor,plant',
   'OCR': '',
   'Caption': 'a house with trees around it',
   'CaptionScore': 0.3369276523590088,
   'timestamp': '20200509',
   'local_time': 14.0,
   'weekday': 'Saturday',
   'year': '2020',
   'month': '5',
   'semantic_time': 'early afternoon'}}]

In [37]:
check_duplicate_response(proc.compose_pipeline({'text': {'fields': ['month'], 'must': '9', 'should': None }}))

Function run elapsed time: 0:00:00.000005


In [34]:
check_duplicate_response(proc.search_text_closestday_pipeline('indoor', [], timefield='timestamp', timestamp=date, filter=None))

Function run elapsed time: 0:00:00.000009


In [12]:
proc.search_text_closestday_pipeline('indoor', [], timefield='timestamp', timestamp=date, filter=None, topk=5)

NameError: name 'date' is not defined

In [36]:
check_duplicate_response(proc.compose_pipeline({'text': {'fields': ['year'], 'must': '2019', 'should': None }}))

Function run elapsed time: 0:00:00.000004


In [27]:
ELASTIC_PORT

'20542'

In [30]:


# es = Elasticsearch([f'http://0.0.0.0:{self.port}'],
#                 timeout=100, \
#                 connection_class=RequestsHttpConnection, 
#                 http_auth=(self.username, self.password), 
#                 use_ssl=False, 
#                 verify_certs=False)

# convert this to curl command

# change max_result_window to 1000000
!curl -XPUT 'http://0.0.0.0:20542/_all/_settings?preserve_existing=true' -d '{"index.max_result_window" : "1000" }'

{"error":"Content-Type header [application/x-www-form-urlencoded] is not supported","status":406}

In [6]:
len(proc.compose_pipeline({'text': {'fields': ['year'], 'must': '2019', 'should': None }}, topk=10001))

Function run elapsed time: 0:00:00.000005


10001

In [28]:
proc.compose_pipeline({'text': {'fields': ['month'], 'must': '9', 'should': None }})

Function run elapsed time: 0:00:00.000005


[{'_index': 'lsc23_full_db',
  '_id': '20190901_083352_000',
  '_score': 2.6428761,
  '_source': {'stop': True,
   'new_lat': 53.3899717,
   'new_lng': -6.145808,
   'semantic_name': 'HOME',
   'foursquare_id': '',
   'original_name': '',
   'categories': '',
   'parent': '',
   'movement': 'Inside',
   'movement_prob': 0.9903006553649902,
   'city': 'Dublin, Ireland, Leinster',
   'country': 'Ireland',
   'new_timezone': 'Europe/Dublin',
   'ImageID': '20190901_083352_000.jpg',
   'Tags': 'indoor,wall,bed,bedroom',
   'OCR': '',
   'Caption': 'a bed with a white pillow',
   'CaptionScore': 0.3492282032966614,
   'timestamp': '20190901',
   'local_time': 9.0,
   'weekday': 'Sunday',
   'year': '2019',
   'month': '9',
   'semantic_time': 'morning'}},
 {'_index': 'lsc23_full_db',
  '_id': '20190901_083426_000',
  '_score': 2.6428761,
  '_source': {'stop': True,
   'new_lat': 53.3899717,
   'new_lng': -6.145808,
   'semantic_name': 'HOME',
   'foursquare_id': '',
   'original_name': '',


In [29]:
proc.compose_pipeline({'text': {'fields': ['semantic_name', 'OCR', 'Caption', 'country', 'Tags'], 'must': "Zeus Conference", 'should': None }})

Function run elapsed time: 0:00:00.000005


[{'_index': 'lsc23_full_db',
  '_id': '20190109_150833_000',
  '_score': 20.574127,
  '_source': {'stop': True,
   'new_lat': 40.6359851,
   'new_lng': 22.9355687,
   'semantic_name': 'Zeus Conference Rooms',
   'foursquare_id': '5bbd89fffd9d73002cb24fee',
   'original_name': 'Zeus Conference Rooms',
   'categories': 'Office',
   'parent': 'Mediterranean Palace Hotel',
   'movement': 'Inside',
   'movement_prob': 0.993350088596344,
   'city': 'Thessaloniki Regional Unit, Thessaloniki Municipal Unit, Macedonia and Thrace, Greece',
   'country': 'Greece',
   'new_timezone': 'Europe/Athens',
   'ImageID': '20190109_150833_000.jpg',
   'Tags': 'text,indoor,wall,ceiling,screen',
   'OCR': '',
   'Caption': 'a few people in a conference room',
   'CaptionScore': 0.4219984114170074,
   'timestamp': '20190109',
   'local_time': 17.0,
   'weekday': 'Wednesday',
   'year': '2019',
   'month': '1',
   'semantic_time': 'early evening'}},
 {'_index': 'lsc23_full_db',
  '_id': '20190108_121824_000',

In [30]:
proc.compose_pipeline({'text': {'fields': ['semantic_name', 'OCR', 'Caption', 'country', 'Tags'], 'must': None, 'should': "Zeus Conference" }})

Function run elapsed time: 0:00:00.000005


[{'_index': 'lsc23_full_db',
  '_id': '20190109_150833_000',
  '_score': 20.574127,
  '_source': {'stop': True,
   'new_lat': 40.6359851,
   'new_lng': 22.9355687,
   'semantic_name': 'Zeus Conference Rooms',
   'foursquare_id': '5bbd89fffd9d73002cb24fee',
   'original_name': 'Zeus Conference Rooms',
   'categories': 'Office',
   'parent': 'Mediterranean Palace Hotel',
   'movement': 'Inside',
   'movement_prob': 0.993350088596344,
   'city': 'Thessaloniki Regional Unit, Thessaloniki Municipal Unit, Macedonia and Thrace, Greece',
   'country': 'Greece',
   'new_timezone': 'Europe/Athens',
   'ImageID': '20190109_150833_000.jpg',
   'Tags': 'text,indoor,wall,ceiling,screen',
   'OCR': '',
   'Caption': 'a few people in a conference room',
   'CaptionScore': 0.4219984114170074,
   'timestamp': '20190109',
   'local_time': 17.0,
   'weekday': 'Wednesday',
   'year': '2019',
   'month': '1',
   'semantic_time': 'early evening'}},
 {'_index': 'lsc23_full_db',
  '_id': '20190108_121824_000',