In [1]:
import datetime
import json

import numpy as np
import pandas as pd
import requests
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

In [2]:
# Connectivity check against the local Elasticsearch node.
# A timeout keeps the cell from hanging forever when the node is unreachable,
# and raise_for_status() turns an HTTP error response into an exception rather
# than displaying an error payload as if it were the cluster banner.
res = requests.get('http://localhost:9200', timeout=5)
res.raise_for_status()
res.content

b'{\n  "name" : "610ce580113a",\n  "cluster_name" : "docker-cluster",\n  "cluster_uuid" : "g5JRzLAmQfmbTyAawGKGfQ",\n  "version" : {\n    "number" : "7.9.2",\n    "build_flavor" : "default",\n    "build_type" : "docker",\n    "build_hash" : "d34da0ea4a966c4e49417f2da2f244e3e97b4e6e",\n    "build_date" : "2020-09-23T00:45:33.626720Z",\n    "build_snapshot" : false,\n    "lucene_version" : "8.6.2",\n    "minimum_wire_compatibility_version" : "6.8.0",\n    "minimum_index_compatibility_version" : "6.0.0-beta1"\n  },\n  "tagline" : "You Know, for Search"\n}\n'

In [3]:
# Set to True to skip index creation and mapping setup in the cells below.
SKIP_INDEX_MAPPING_CREATION = False

# Client for the local Elasticsearch node.
es = Elasticsearch(hosts=[dict(host='localhost', port=9200)])

In [4]:

# --- Movies -----------------------------------------------------------------
# Cast `title` to str first so the vectorised string ops below always see text.
df_movies = pd.read_csv('movies.csv').astype({'title': 'str'})
df_movies['title'] = df_movies['title'].str.strip()
# `genres` is a pipe-delimited string in the CSV; turn it into a list per row.
df_movies['genres'] = df_movies['genres'].str.split('|')
# df_movies['year'] = df_movies['title'].map(lambda x: x[-6:][1:-1])

# --- Ratings ----------------------------------------------------------------
df_ratings = pd.read_csv('ratings.csv')
# `timestamp` holds epoch seconds. pd.to_datetime(unit='s') yields naive UTC
# datetimes, so the result does not depend on the timezone of the machine
# running the notebook (datetime.fromtimestamp would use local time).
df_ratings['datetime'] = pd.to_datetime(df_ratings['timestamp'], unit='s')

# `df` stays bound to the ratings frame, as it was before, in case any other
# cell still refers to it.
df = df_ratings


In [5]:
# Create the `movies` index and register its mapping, unless setup is skipped.
if not SKIP_INDEX_MAPPING_CREATION:
    # ignore=400 makes the client tolerate an HTTP 400 reply to the create call.
    es.indices.create(index='movies', ignore=400)

    # Only `title` is mapped explicitly (as full-text).
    mapping = {'properties': {'title': {'type': 'text'}}}
    es.indices.put_mapping(
        index='movies',
        doc_type='movie',
        body=json.dumps(mapping),
        include_type_name=True,
    )

In [6]:
# Index every movie with the bulk helper instead of issuing one HTTP request
# per row. The generator is lazy, so actions are built as the helper consumes
# them. Each document keeps the same index, type, id and fields as before.
movie_actions = (
    {
        '_index': 'movies',
        '_type': 'movie',
        '_id': row.movieId,
        '_source': {
            'title': row.title,
            'genres': row.genres,
        },
    }
    for row in df_movies.itertuples(index=False)
)

# Displays (number of successfully indexed documents, list of errors).
bulk(es, movie_actions)

In [None]:
# Mapping for the `ratings` index: `datetime` is stored as a date parsed with
# the zone-less pattern below, which is the shape the indexing cell produces
# via str() on each row's datetime value.
# (A `_parent` link to the `movie` type was sketched here earlier and left
# disabled.)
mapping = {
    'properties': {
        'datetime': {
            'type': 'date',
            'format': 'yyyy-MM-dd HH:mm:ss'
        },
    },
}

if not SKIP_INDEX_MAPPING_CREATION:
    # ignore=400 makes the client tolerate an HTTP 400 reply to the create call.
    es.indices.create(index='ratings', ignore=400)
    # Pass the body by keyword, matching the `movies` mapping cell, so the call
    # does not depend on the positional parameter order of put_mapping.
    es.indices.put_mapping(
        body=json.dumps(mapping),
        index='ratings',
        doc_type='rating',
        include_type_name=True,
    )

In [None]:
# Index every rating with the bulk helper instead of issuing one HTTP request
# per row. Document ids are "<userId>-<movieId>", so re-running the cell
# overwrites existing documents rather than duplicating them.
rating_actions = (
    {
        '_index': 'ratings',
        '_type': 'rating',
        '_id': f'{row.userId}-{row.movieId}',
        '_source': {
            # str() renders the value as "YYYY-MM-DD HH:MM:SS", the pattern
            # declared for this field in the ratings mapping.
            'datetime': str(row.datetime),
            'rating': row.rating,
            'userId': row.userId,
            'movieId': row.movieId,
        },
    }
    for row in df_ratings.itertuples(index=False)
)

# Displays (number of successfully indexed documents, list of errors).
bulk(es, rating_actions)