In [1]:
## Notes from https://github.com/dylanjcastillo/random/blob/main/elasticsearch.ipynb, https://dylancastillo.co/elasticsearch-python/

In [2]:
## Index Management:
## Get a list of all indexes on Elasticsearch Server (via command line): curl http://localhost:9200/_aliases
## Delete an Index on Elasticsearch Server (via command line): curl -XDELETE localhost:9200/index_name

In [3]:
#### Step 1: Run an ES cluster via Docker
## Open a terminal and run this code to start a single-node ES cluster you can use for local development:
## docker run --rm -p 9200:9200 -p 9300:9300 -e "xpack.security.enabled=false" -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:8.3.3
#### Broken Down:
## docker run                        # command you use to run an image inside a container
## --rm                              # parameter lets Docker know to clean up the container and remove the file system when the container exits
## -p 9200:9200 -p 9300:9300         # tells Docker which ports to open on the container's network interface
## -e "xpack.security.enabled=false" # tells Docker to start with the security features disabled (this parameter should be set to true (or excluded) when running in production)
## -e "discovery.type=single-node"   # tells Docker to create a cluster with a single node

In [4]:
#### Step 2: Connect to ES cluster
from elasticsearch import Elasticsearch, RequestError

# Address of the local Elasticsearch node started in Step 1 (plain HTTP, port 9200).
ES_URL = "http://localhost:9200"

# Client object used by every later cell to talk to the cluster.
es = Elasticsearch(ES_URL)
# es.info().body  # uncomment to display the cluster info returned by the server

In [5]:
#### Step 3: Import Dataset
import pandas as pd
from pandas import json_normalize

# Read the CSV, drop every row that has a missing value, then keep a
# reproducible random sample of 5000 rows (the seed is fixed at 42).
df = (
    pd.read_csv('dataset.csv')
    .dropna()
    .sample(n=5000, random_state=42)
)
df.info()  # prints column names, dtypes and non-null counts
df.head()  # last expression: rendered as a preview of the first rows

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 10286 to 23728
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      5000 non-null   int64 
 1   Title             5000 non-null   object
 2   Origin/Ethnicity  5000 non-null   object
 3   Director          5000 non-null   object
 4   Cast              5000 non-null   object
 5   Genre             5000 non-null   object
 6   Wiki Page         5000 non-null   object
 7   Plot              5000 non-null   object
dtypes: int64(1), object(7)
memory usage: 351.6+ KB


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
10286,1984,Songwriter,American,Alan Rudolph,"Willie Nelson, Kris Kristofferson, Melinda Dillon",drama,https://en.wikipedia.org/wiki/Songwriter_(1984...,"The film concerns Doc Jenkins, (Willie Nelson)..."
28335,2012,Da Thadiya (ഡാ തടിയാ),Malayalam,Aashiq Abu,"Shekhar Menon, Ann Augustine, Sreenath Bhasi, ...","romance, comedy",https://en.wikipedia.org/wiki/Da_Thadiya,The film tells the love story of an obese yout...
10742,1987,Ironweed,American,Héctor Babenco,"Jack Nicholson, Meryl Streep, Tom Waits, Fred ...",drama,https://en.wikipedia.org/wiki/Ironweed_(film),Francis Phelan (Jack Nicholson) is a washed-up...
28533,2015,Kaliyachan,Malayalam,Farooq Abdul Rahman,"Manoj K Jayan, and Vaiga",drama,https://en.wikipedia.org/wiki/Kaliyachan,"The film is about the Kathakali actor, Kunhira..."
24808,1971,Kati Patang,Bollywood,Shakti Samanta,"Rajesh Khanna, Asha Parekh",romance,https://en.wikipedia.org/wiki/Kati_Patang_(197...,"Madhavi ""Madhu"" (Asha Parekh) is an orphan liv..."


In [6]:
#### Step 4: Create an Index
def _text_field(analyzer):
    """Return the mapping for a full-text field that uses `analyzer`."""
    return {'type': 'text', 'analyzer': analyzer}


# Index mapping: one entry per document field. 'title' and 'plot' use the
# 'english' analyzer, the other text fields use 'standard'; 'year' is an
# integer and 'wiki_page' is stored as an exact-value keyword.
mappings = {
    'properties': {
        'title': _text_field('english'),
        'ethnicity': _text_field('standard'),
        'director': _text_field('standard'),
        'cast': _text_field('standard'),
        'genre': _text_field('standard'),
        'plot': _text_field('english'),
        'year': {'type': 'integer'},
        'wiki_page': {'type': 'keyword'},
    }
}

# Name of the index every later cell reads from and writes to.
index_name = 'movies-index'

# Creating an index that already exists makes the server reject the request,
# so this cell would fail on a second run. Only create the index when it is
# missing, which keeps the cell safe under "Restart & Run All" against a
# cluster that still holds the index from a previous session.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, mappings=mappings)

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x7fa970ff8d00>: Failed to establish a new connection: [Errno 61] Connection refused))

In [None]:
#### Step 5: Add Dataset to Index
## Note: You can use .index() or bulk() to add data to an index
##       .index() adds one item at a time 
##        .bulk() lets you add multiple items at the same time
##############################################################
## Using .index()
# for index, row in df.iterrows():
#     doc = {
#         'title': row['Title'],
#         'ethnicity': row['Origin/Ethnicity'],
#         'director': row['Director'],
#         'cast': row['Cast'],
#         'genre': row['Genre'],
#         'plot': row['Plot'],
#         'year': row['Release Year'],
#         'wiki_page': row['Wiki Page']
#     }
            
#     es.index(index=index_name, id=index, document=doc)
###################################################
## Using .bulk()
from elasticsearch.helpers import bulk
# Document field name -> dataset column that supplies its value.
# The order here is the order of the keys in each indexed document.
field_to_column = {
    'title': 'Title',
    'ethnicity': 'Origin/Ethnicity',
    'director': 'Director',
    'cast': 'Cast',
    'genre': 'Genre',
    'plot': 'Plot',
    'year': 'Release Year',
    'wiki_page': 'Wiki Page',
}

# One bulk action per DataFrame row; the row's index label is used as the
# document id.
bulk_data = [
    {
        '_index': index_name,
        '_id': row_id,
        '_source': {
            field: record[column] for field, column in field_to_column.items()
        },
    }
    for row_id, record in df.iterrows()
]

# Send all actions to the cluster through the bulk helper.
bulk(es, bulk_data)
########################################
## Check the number of documents indexed
# Refresh the index before counting so the documents just sent by the bulk
# call are taken into account, then ask the cat API for the count as JSON.
es.indices.refresh(index=index_name)
doc_count = es.cat.count(index=index_name, format='json')
doc_count

In [None]:
#### Step 6: Make searches in your ES index
# Search the index for movies whose cast matches "jack nicholson", while
# filtering out every movie whose director matches "roman polanski".
# The director name must be spelled the way it is indexed: a misspelled
# surname term never matches any token in the 'director' field.
resp = es.search(
    index=index_name,
    query={
        'bool': {
            # Scoring clause: the cast field has to match the query text.
            'must': {
                'match': {
                    'cast': {'query': 'jack nicholson'},
                }
            },
            # Non-scoring clause: drop documents whose director matches.
            'filter': {'bool': {'must_not': {'match': {'director': 'roman polanski'}}}},
        },
    },
)

In [None]:
import ast # Allows pretty print
import json
# Take the response's `.body` (the same attribute the Step 2 cell reads from
# es.info()) instead of round-tripping through str() and ast.literal_eval(),
# which only works while the response's repr is a valid Python literal.
# getattr keeps the cell re-runnable: on a second run `resp` is already the
# plain dict and is used as-is.
resp = getattr(resp, 'body', resp)
print(json.dumps(resp, indent=2, sort_keys=True))

In [None]:
## Hits (Results) to DataFrame:
# Flatten the list of hits into a DataFrame. It is stored under its own name
# so the source dataset in `df` is not overwritten; the indexing cell reads
# `df` and must still work if it is run again after this cell.
hits_df = json_normalize(resp['hits']['hits'])
hits_df.head()