In [28]:
import requests
import os
from datetime import datetime
from dotenv import load_dotenv
from pathlib import Path
import numpy as np
import pandas as pd 

# Load the .env file (path relative to the working directory), then read the
# Elasticsearch settings from the environment.
dotenv_path = Path('.env')
load_dotenv(dotenv_path=dotenv_path)

# A variable that is absent comes back as None and is rejected just below.
ELASTIC_PORT, ELASTIC_USERNAME, ELASTIC_PASSWORD = map(
    os.environ.get,
    ("ELASTIC_PORT", "ELASTIC_USERNAME", "ELASTIC_PASSWORD"),
)

# Fail early, with the name of the missing variable, instead of passing None on.
assert ELASTIC_PORT is not None, "ELASTIC_PORT is not set"
assert ELASTIC_USERNAME is not None, "ELASTIC_USERNAME is not set"
assert ELASTIC_PASSWORD is not None, "ELASTIC_PASSWORD is not set"

# Global config; INDEX starts empty and is filled in before ElasticProcessor is created.
config = dict(
    HOST="0.0.0.0",
    PORT=ELASTIC_PORT,
    USERNAME=ELASTIC_USERNAME,
    PASSWORD=ELASTIC_PASSWORD,
    INDEX=None,
    RETURN_SIZE=10,
    CACHE_DIR=".cache/",
    DIMENSION=2,
)

## Prepare DB

In [29]:
import pandas as pd 
from tqdm import tqdm 
# Load the metadata CSV into a dataframe; its columns are previewed in the next cell.
df = pd.read_csv('vbs23_meta.csv')

In [30]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,id,location,camera_model,thumbnails,selected_frames,created_time,date,year,month,day_of_week,part_of_day,local_time,captions,width,height,ext,fps,index
0,1,Dapang,DJI Osmo 3,information/thumbnails/Dapang_Jul2022/0005_000...,information/selected_frames/Dapang_Jul2022/000...,2022:08:27 07:48:37,20220827,2022,August,Saturday,early morning,07:48,underwater view of a coral reef.,1920,1080,mp4,30.0,Dapang_Jul2022_5_1
1,2,Dapang,DJI Osmo 3,information/thumbnails/Dapang_Jul2022/0005_000...,information/selected_frames/Dapang_Jul2022/000...,2022:08:27 07:48:37,20220827,2022,August,Saturday,early morning,07:48,the reef is a bit smaller than the ones we saw.,1920,1080,mp4,30.0,Dapang_Jul2022_5_2
2,3,Dapang,DJI Osmo 3,information/thumbnails/Dapang_Jul2022/0005_000...,information/selected_frames/Dapang_Jul2022/000...,2022:08:27 07:48:37,20220827,2022,August,Saturday,early morning,07:48,the reef is a bit more shallow than the beach.,1920,1080,mp4,30.0,Dapang_Jul2022_5_3
3,4,Dapang,DJI Osmo 3,information/thumbnails/Dapang_Jul2022/0005_000...,information/selected_frames/Dapang_Jul2022/000...,2022:08:27 07:48:37,20220827,2022,August,Saturday,early morning,07:48,a small group of fish swim.,1920,1080,mp4,30.0,Dapang_Jul2022_5_4
4,5,Dapang,DJI Osmo 3,information/thumbnails/Dapang_Jul2022/0005_000...,information/selected_frames/Dapang_Jul2022/000...,2022:08:27 07:48:37,20220827,2022,August,Saturday,early morning,07:48,a small group of fish swim around a coral reef.,1920,1080,mp4,30.0,Dapang_Jul2022_5_5


In [40]:
# Overwrite the CSV's string 'index' values (e.g. 'Dapang_Jul2022_5_1') with the numeric id.
df['index'] = df['id']

## Create index in Elasticsearch

In [41]:
from pysearch.elastic import ElasticProcessor
from pysearch.utils.time import nlp2datetime

In [42]:
# Point the shared config at the 'vbs23_db' index, then build the processor from it
# (the recorded output reports a successful connection to the Elasticsearch node).
config.update(INDEX='vbs23_db')
proc = ElasticProcessor(config)

Connected to Elasticsearch node


In [43]:
# Show the index names the processor reports; 'vbs23_db' should be among them.
proc.available_indices()

dict_keys(['vbs22_db', 'vbs23_db', 'test_index'])

In [44]:
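# Kept commented out. NOTE(review): presumably removes the 'vbs22_db' index listed
# above — confirm what kill() does before enabling.
# proc.kill('vbs22_db')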

In [45]:
# Explicit Elasticsearch mapping handed to proc.index_dataframe together with the dataframe.
# NOTE(review): several fields declared here (vimeo_id, url, title, description, channel,
# tags, categories, upload_date) are not among the columns df.head() shows for
# vbs23_meta.csv — confirm this mapping is the one intended for this dataset.
df_structure = dict(
    mappings=dict(
        properties={
            # Fields required for pysearch.
            "index": dict(type="integer"),
            "date": dict(type="date", format="basic_date"),
            # Record identity and descriptive text fields.
            "id": dict(type="integer"),
            "vimeo_id": dict(type="text"),
            "url": dict(type="text"),
            "title": dict(type="text"),
            "description": dict(type="text"),
            "channel": dict(type="text"),
            "tags": dict(type="text"),
            "categories": dict(type="text"),
            "upload_date": dict(type="text"),
            # Calendar breakdown of the capture time.
            "year": dict(type="integer"),
            "month": dict(type="text"),
            "day_of_week": dict(type="text"),
            "part_of_day": dict(type="text"),
            "local_time": dict(type="text"),
            # Frame dimensions.
            "width": dict(type="integer"),
            "height": dict(type="integer"),
        }
    )
)

In [46]:
# The mapping declares 'index' as an integer field required for pysearch, while the CSV
# ships a string 'index' column, so overwrite it with the numeric id right before indexing.
# Repeating the assignment here keeps this cell correct on a freshly loaded frame.
df['index'] = df['id']
# Push the dataframe into the index using the explicit mapping in df_structure.
proc.index_dataframe(df, df_structure)

100%|██████████| 43797/43797 [00:04<00:00, 10466.19it/s]


In [47]:
from pprint import pprint 
# Pretty-print what proc.info() returns; the recorded output is the index's field mapping,
# including fields that were not declared in df_structure.
pprint(proc.info())

{'properties': {'camera_model': {'fields': {'keyword': {'ignore_above': 256,
                                                        'type': 'keyword'}},
                                 'type': 'text'},
                'captions': {'fields': {'keyword': {'ignore_above': 256,
                                                    'type': 'keyword'}},
                             'type': 'text'},
                'categories': {'type': 'text'},
                'channel': {'type': 'text'},
                'created_time': {'fields': {'keyword': {'ignore_above': 256,
                                                        'type': 'keyword'}},
                                 'type': 'text'},
                'date': {'format': 'basic_date', 'type': 'date'},
                'day_of_week': {'type': 'text'},
                'description': {'type': 'text'},
                'ext': {'fields': {'keyword': {'ignore_above': 256,
                                               'type': 'keyword'}},
       

In [50]:
# Turn a free-form date string into a datetime with pysearch's nlp2datetime.
# The recorded output shows '11/1/2022' read month-first, i.e. 1 November 2022.
# NOTE(review): the input has no time-of-day, yet the result carries one — confirm where it
# comes from, since it may make reruns produce a different value.
date = nlp2datetime('11/1/2022')
date

datetime.datetime(2022, 11, 1, 5, 54, 10)

In [51]:
# Run the text + closest-day search pipeline for the query 'Swim Tuesday', using the
# 'date' field and the timestamp parsed in the previous cell; no filter is applied.
proc.search_text_closestday_pipeline(
    'Swim Tuesday',
    [],
    timefield='date',
    timestamp=date,
    filter=None,
)

Function run elapsed time: 0:00:00.000006


[{'_index': 'vbs23_db',
  '_id': '212',
  '_score': 0.0,
  '_source': {'id': 212,
   'location': 'Silverstrand',
   'camera_model': 'HERO10 Black',
   'thumbnails': 'information/thumbnails/Silverstrand_Aug2022/0007_00212.jpg',
   'selected_frames': 'information/selected_frames/Silverstrand_Aug2022/0007_00212.jpg',
   'created_time': '2022:08:22 22:56:34',
   'date': 20220822,
   'year': 2022,
   'month': 'August',
   'day_of_week': 'Monday',
   'part_of_day': 'late afternoon',
   'local_time': '22:56',
   'captions': 'a small group of fish swim around a reef.',
   'width': 3840,
   'height': 2160,
   'ext': 'mp4',
   'fps': 30.0}},
 {'_index': 'vbs23_db',
  '_id': '240',
  '_score': 0.0,
  '_source': {'id': 240,
   'location': 'Silverstrand',
   'camera_model': 'HERO10 Black',
   'thumbnails': 'information/thumbnails/Silverstrand_Aug2022/0007_00240.jpg',
   'selected_frames': 'information/selected_frames/Silverstrand_Aug2022/0007_00240.jpg',
   'created_time': '2022:08:22 22:56:34',
  