In [1]:
# Load every dependency used by the notebook in one guarded block.
try:
    import elasticsearch
    from elasticsearch import Elasticsearch

    import pandas as pd
    import json
    from ast import literal_eval
    from tqdm import tqdm
    import datetime
    import os
    import sys
    import numpy as np
    
    from elasticsearch import helpers

    print("all Modules Loaded ! ")
except Exception as e:
    # The failure is only printed, not re-raised: execution continues and any
    # name whose import did not complete stays undefined for later cells.
    print("Some Modules are Missing {}".format(e))

all Modules Loaded ! 


#### Read the Dataset

In [2]:
df = pd.read_csv("netflix_titles.csv")

In [3]:
df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...


In [4]:
df.shape

(6234, 12)

In [5]:
df["show_id"].nunique()

6234

#### Creating an Elasticsearch Instance

In [6]:
# Address of the Elasticsearch node this notebook talks to.
ENDPOINT = "http://localhost:9200/"

# Shared client handle used by the ping, bulk-upload and index-creation cells.
# timeout=600 is presumably in seconds — confirm against the client docs.
es = Elasticsearch(hosts=ENDPOINT, timeout=600)

In [7]:
es.ping()

True

#### Clean the data before uploading

In [8]:
df.isna().sum()

show_id            0
type               0
title              0
director        1969
cast             570
country          476
date_added        11
release_year       0
rating            10
duration           0
listed_in          0
description        0
dtype: int64

In [9]:
# Discard every row that has a missing value in any column.
# NOTE(review): the bulk-action generator below only reads show_id, title,
# director, description, duration and cast, so rows removed solely because
# country/date_added/rating is missing may be dropped unnecessarily — confirm.
df = df.dropna()

In [10]:
df.isna().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [11]:
df.shape

(3774, 12)

#### Convert the data into an ES format

In [12]:
df2 = df.to_dict('records')

In [13]:
df2[0]

{'show_id': 81145628,
 'type': 'Movie',
 'title': 'Norm of the North: King Sized Adventure',
 'director': 'Richard Finn, Tim Maltby',
 'cast': 'Alan Marriott, Andrew Toth, Brian Dobson, Cole Howard, Jennifer Cameron, Jonathan Holmes, Lee Tockar, Lisa Durupt, Maya Kay, Michael Dobson',
 'country': 'United States, India, South Korea, China',
 'date_added': 'September 9, 2019',
 'release_year': 2019,
 'rating': 'TV-PG',
 'duration': '90 min',
 'listed_in': 'Children & Family Movies, Comedies',
 'description': 'Before planning an awesome wedding for his grandfather, a polar bear king must take back a stolen artifact from an evil archaeologist first.'}

In [22]:
def generator(df2, index='myelkfirst'):
    """Yield one bulk-action dict per record.

    Parameters
    ----------
    df2 : iterable of dict
        Records to convert. Each record is read with ``.get``, so missing
        keys fall back to the defaults shown below instead of raising.
    index : str, optional
        Value written to ``_index`` of every action. Defaults to
        ``'myelkfirst'``, the name the notebook used originally.

    Yields
    ------
    dict
        An action with ``_index``, ``_type``, ``_id`` (the record's
        ``show_id`` or ``None``) and a ``_source`` holding ``title``,
        ``director``, ``description``, ``duration`` and ``cast``.
    """
    # Iterate directly: the position of a record is never used.
    for line in df2:
        yield {
            '_index': index,
            # NOTE(review): explicit mapping types are reportedly deprecated in
            # newer Elasticsearch releases — confirm against the server version.
            '_type': '_doc',
            '_id': line.get("show_id", None),
            '_source': {
                'title': line.get('title', ""),
                'director': line.get('director', ""),
                'description': line.get('description', ""),
                'duration': line.get('duration', None),
                'cast': line.get('cast', None)
            }
        }

In [23]:
mycustom = generator(df2)

In [24]:
mycustom

<generator object generator at 0x7f84f6b4f7b0>

In [25]:
#next(mycustom)
# Preview the first bulk action as indented JSON to check its shape.
# Note: next() consumes that item from `mycustom`; re-running this cell shows
# the following record instead.
# `json` is imported again here although the first cell already imports it.
import json
json.dumps(next(mycustom), indent=3)

'{\n   "_index": "myelkfirst",\n   "_type": "_doc",\n   "_id": 81145628,\n   "_source": {\n      "title": "Norm of the North: King Sized Adventure",\n      "director": "Richard Finn, Tim Maltby",\n      "description": "Before planning an awesome wedding for his grandfather, a polar bear king must take back a stolen artifact from an evil archaeologist first.",\n      "duration": "90 min",\n      "cast": "Alan Marriott, Andrew Toth, Brian Dobson, Cole Howard, Jennifer Cameron, Jonathan Holmes, Lee Tockar, Lisa Durupt, Maya Kay, Michael Dobson"\n   }\n}'

In [26]:
df2[300]

{'show_id': 80161921,
 'type': 'Movie',
 'title': 'Security',
 'director': 'Alain Desrochers',
 'cast': 'Antonio Banderas, Ben Kingsley, Liam McIntyre, Cung Le, Katherine de la Rocha, Chad Lindberg, Jiro Wang, Gabriella Wright, Shari Watson, John Strong',
 'country': 'United States',
 'date_added': 'October 3, 2017',
 'release_year': 2017,
 'rating': 'R',
 'duration': '92 min',
 'listed_in': 'Action & Adventure',
 'description': "After taking a job as a mall security guard, a former Marine steps up to protect a young girl who's being hunted by heavily armed mercenaries."}

### Upload the data into ES

In [27]:
# Send every record to Elasticsearch through the bulk helper, using a fresh
# generator over df2. Any exception is caught and printed rather than raised,
# so this cell always runs to completion.
try:
    print("Testing...")
    res = helpers.bulk(es, generator(df2))
    print("Working")
except Exception as err:
    print("Error {}".format(err))

Testing...




Working


### Settings or Mappings

In [None]:
# Request body for index creation: shard/replica counts (kept as strings)
# plus an explicit "text" mapping for the `director` field.
Settings = dict(
    settings=dict(
        number_of_shards="1",
        number_of_replicas="1",
    ),
    mappings=dict(
        properties=dict(
            director=dict(type="text"),
        ),
    ),
)


In [None]:
# Create the 'myelkfirst' index with the Settings body; the response is kept in `my`.
# ignore=[400, 404] presumably suppresses errors for those HTTP statuses —
# confirm against the client docs.
# NOTE(review): this cell sits after the bulk upload. If the index already
# exists by then, the create call may fail quietly and these settings would
# never be applied — confirm the intended cell order.
my = es.indices.create(index='myelkfirst', body=Settings, ignore=[400, 404])