In [24]:
import requests
import os
from datetime import datetime
from dotenv import load_dotenv
from pathlib import Path
import numpy as np
import pandas as pd 

# Read the Elasticsearch connection settings from a local .env file so the
# credentials are not written into the notebook.
dotenv_path = Path('.env')
load_dotenv(dotenv_path=dotenv_path)

# os.environ.get yields None for a missing variable; the asserts below turn
# that into an immediate failure that names the missing setting.
ELASTIC_PORT = os.environ.get("ELASTIC_PORT")
ELASTIC_USERNAME = os.environ.get("ELASTIC_USERNAME")
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD")

assert ELASTIC_PORT is not None, "ELASTIC_PORT is not set"
assert ELASTIC_USERNAME is not None, "ELASTIC_USERNAME is not set"
assert ELASTIC_PASSWORD is not None, "ELASTIC_PASSWORD is not set"

# Settings dictionary later passed to ElasticProcessor.
config = dict(
    # Global config
    HOST="0.0.0.0",
    PORT=ELASTIC_PORT,  # kept as the string read from the environment
    USERNAME=ELASTIC_USERNAME,
    PASSWORD=ELASTIC_PASSWORD,
    INDEX=None,  # assigned later, once the target index is chosen
    RETURN_SIZE=10,
    CACHE_DIR=".cache/",
    DIMENSION=2,
)

## Prepare DB

In [25]:
import pandas as pd 
from tqdm import tqdm 
# Load the VBS22 video metadata table (path is relative to the working directory).
metadata_csv = 'vbs22_meta.csv'
df = pd.read_csv(metadata_csv)

In [26]:
# Replace missing (NaN) descriptions with an empty string.
description_filled = df['description'].fillna('')
df['description'] = description_filled

In [27]:
# Preview the first rows to sanity-check the loaded metadata.
df.head()

Unnamed: 0,id,vimeo_id,url,title,description,channel,tags,categories,upload_date,date,month,year,day_of_week,part_of_day,local_time,width,height,index
0,14693,129402440,https://vimeo.com/129402440,Airplay in Palau,,user23567282,"['Airplay', 'scuba diving', 'palau', 'underwat...","['/categories/travel', '/categories/sports']",2015-06-01 01:13:22,20150601,June,2015,Monday,night,01:13,1280,720,14693
1,12130,231182522,https://vimeo.com/231182522,"Slacklining in Varistonpuisto, Vantaa, Finland...",Beginner level slacklining in public park. Loc...,user24728254,"['vantaa', 'slacklining', 'slackline', 'finland']","['/categories/sports/outdoorsports/videos', '/...",2017-08-26 05:43:35,20170826,August,2017,Saturday,early morning,05:43,1920,1080,12130
2,14114,123250553,https://vimeo.com/123250553,Rick Braun,,caseyacaster,"['Rick Braun', 'music', 'music doc', 'jazz', '...","['/categories/music', '/categories/documentary']",2015-03-25 17:55:06,20150325,March,2015,Wednesday,early evening,17:55,1920,1080,14114
3,7848,103994354,https://vimeo.com/103994354,"Hotel Villa Pigna - Ascoli Piceno, Marche. Italy","So easy, so friendly.\nMaggiori informazioni s...",hotelvillapigna,"['Marche', 'Ascoli Piceno', 'Hotel', 'Villa Pi...","['/categories/travel', '/categories/personal',...",2014-08-21 06:30:02,20140821,August,2014,Thursday,early morning,06:30,1280,720,7848
4,9693,29606634,https://vimeo.com/29606634,Dream - 25 Years Celebration of Baizid Steel,"This video was shot in Chittagong, Bangladesh,...",augustrock,"['factory', 'mills', 'Trance', '5D Mark II', '...","['/categories/hd/canon/videos', '/categories/n...",2011-09-26 10:05:26,20110926,September,2011,Monday,morning,10:05,1280,720,9693


## Create index in Elasticsearch

In [28]:
from pysearch.elastic import ElasticProcessor
from pysearch.utils.time import nlp2datetime

In [29]:
# Point the shared config at the target index, then build the processor
# (the recorded output shows it connecting to the Elasticsearch node).
config.update(INDEX='vbs22_db')
proc = ElasticProcessor(config)

Connected to Elasticsearch node


In [30]:
# List the index names the processor can see on the node.
proc.available_indices()

dict_keys(['vbs22_db', 'test_index'])

In [31]:
# Optional reset before re-indexing (presumably removes the existing index; confirm in pysearch): proc.kill('vbs22_db')

In [32]:
# Elasticsearch mapping for the metadata index: one entry per document field.
# Keyword order is preserved, so the resulting dict matches a literal written
# in the same order.
df_structure = dict(
    mappings=dict(
        properties=dict(
            index=dict(type="integer"),  # required for pysearch
            # required for pysearch; basic_date is the yyyyMMdd form, e.g. 20150601
            date=dict(type="date", format="basic_date"),
            id=dict(type="integer"),
            vimeo_id=dict(type="text"),
            url=dict(type="text"),
            title=dict(type="text"),
            description=dict(type="text"),
            channel=dict(type="text"),
            tags=dict(type="text"),
            categories=dict(type="text"),
            upload_date=dict(type="text"),
            year=dict(type="integer"),
            month=dict(type="text"),
            day_of_week=dict(type="text"),
            part_of_day=dict(type="text"),
            local_time=dict(type="text"),
            width=dict(type="integer"),
            height=dict(type="integer"),
        )
    )
)

In [33]:
# The mapping marks 'index' as required for pysearch; reuse the dataset's own
# id for it so each document keeps a stable identifier.
df['index'] = df['id']
# Send the rows to Elasticsearch using the df_structure mapping.
# (A bare df.head() used to sit between these two statements; a mid-cell
# expression is never displayed, so it was dropped as dead code.)
proc.index_dataframe(df, df_structure)

100%|██████████| 17235/17235 [00:02<00:00, 7171.88it/s]


In [34]:
from pprint import pprint 
# Ask the processor to describe the index and pretty-print the answer;
# the recorded output is the index's field mapping.
index_info = proc.info()
pprint(index_info)

{'properties': {'categories': {'type': 'text'},
                'channel': {'type': 'text'},
                'date': {'format': 'basic_date', 'type': 'date'},
                'day_of_week': {'type': 'text'},
                'description': {'type': 'text'},
                'height': {'type': 'integer'},
                'id': {'type': 'integer'},
                'index': {'type': 'integer'},
                'local_time': {'type': 'text'},
                'month': {'type': 'text'},
                'part_of_day': {'type': 'text'},
                'tags': {'type': 'text'},
                'title': {'type': 'text'},
                'upload_date': {'type': 'text'},
                'url': {'type': 'text'},
                'vimeo_id': {'type': 'text'},
                'width': {'type': 'integer'},
                'year': {'type': 'integer'}}}


In [35]:
# Parse a free-form date string into a datetime for the time-aware search below.
# NOTE(review): the recorded output carries a time-of-day (05:48:30) that is not
# in the input string; presumably it comes from the clock at run time. Confirm in pysearch.
query_date_text = '11/1/2015'
date = nlp2datetime(query_date_text)
date

datetime.datetime(2015, 11, 1, 5, 48, 30)

In [36]:
# Text search for 'Airplay', passing the parsed `date` as the timestamp on the 'date' field.
proc.search_text_closestday_pipeline(
    'Airplay',
    [],
    timefield='date',
    timestamp=date,
    filter=None,
)

Function run elapsed time: 0:00:00.000006


[{'_index': 'vbs22_db',
  '_id': '14693',
  '_score': 0.0,
  '_source': {'id': 14693,
   'vimeo_id': 129402440,
   'url': 'https://vimeo.com/129402440',
   'title': 'Airplay in Palau',
   'description': '',
   'channel': 'user23567282',
   'tags': "['Airplay', 'scuba diving', 'palau', 'underwater']",
   'categories': "['/categories/travel', '/categories/sports']",
   'upload_date': '2015-06-01 01:13:22',
   'date': 20150601,
   'month': 'June',
   'year': 2015,
   'day_of_week': 'Monday',
   'part_of_day': 'night',
   'local_time': '01:13',
   'width': 1280,
   'height': 720}},
 {'_index': 'vbs22_db',
  '_id': '9973',
  '_score': 0.0,
  '_source': {'id': 9973,
   'vimeo_id': 201407845,
   'url': 'https://vimeo.com/201407845',
   'title': 'Sydonia - â\x80\x9cKorn Tourâ\x80\x9d (Tour Video)',
   'description': 'This song has been licensed to VIMEO by DJ Central TV, Blue Pie Records and Planet Blue Pictures.\nTour video of Sydonia during their "Korn Tour"\nSydonia was an alternative rock/