In [1]:
## Notes from https://github.com/dylanjcastillo/random/blob/main/elasticsearch.ipynb, https://dylancastillo.co/elasticsearch-python/

In [2]:
## Index Management:
## Get a list of all indexes on Elasticsearch Server (via command line): curl http://localhost:9200/_aliases
## Delete an Index on Elasticsearch Server (via command line): curl -XDELETE localhost:9200/index_name

In [3]:
#### Step 1: Run an ES cluster via Docker
## Open a terminal and run this code to start a single-node ES cluster you can use for local development:
## docker run --rm -p 9200:9200 -p 9300:9300 -e "xpack.security.enabled=false" -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:8.3.3
#### Broken Down:
## docker run                        # command you use to run an image inside a container
## --rm                              # parameter lets Docker know to clean up the container and remove the file system when the container exits
## -p 9200:9200 -p 9300:9300         # tells Docker which ports to open on the container's network interface
## -e "xpack.security.enabled=false" # tells Docker to start with the security features disabled (this parameter should be set to true (or excluded) when running in production)
## -e "discovery.type=single-node"   # tells Docker to create a cluster with a single node

In [4]:
#### Step 2: Connect to ES cluster
from elasticsearch import Elasticsearch, RequestError

# Endpoint of the local single-node cluster started in Step 1.
ES_URL = "http://localhost:9200"
es = Elasticsearch(ES_URL)

# Fetch the cluster metadata and display its payload to confirm the connection.
cluster_info = es.info()
cluster_info.body

{'name': '1b55e5baa9bb',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'Uz-IFyzQQAycRbesto7qfA',
 'version': {'number': '8.3.3',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '801fed82df74dbe537f89b71b098ccaff88d2c56',
  'build_date': '2022-07-23T19:30:09.227964828Z',
  'build_snapshot': False,
  'lucene_version': '9.2.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [5]:
#### Step 3: Import Dataset
import pandas as pd
from pandas import json_normalize

# Read the movie dataset, drop rows containing any missing value, and keep a
# reproducible 5000-row random sample (fixed seed).
df = (
    pd.read_csv('dataset.csv')
    .dropna()
    .sample(n=5000, random_state=42)
)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 10286 to 23728
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      5000 non-null   int64 
 1   Title             5000 non-null   object
 2   Origin/Ethnicity  5000 non-null   object
 3   Director          5000 non-null   object
 4   Cast              5000 non-null   object
 5   Genre             5000 non-null   object
 6   Wiki Page         5000 non-null   object
 7   Plot              5000 non-null   object
dtypes: int64(1), object(7)
memory usage: 351.6+ KB


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
10286,1984,Songwriter,American,Alan Rudolph,"Willie Nelson, Kris Kristofferson, Melinda Dillon",drama,https://en.wikipedia.org/wiki/Songwriter_(1984...,"The film concerns Doc Jenkins, (Willie Nelson)..."
28335,2012,Da Thadiya (ഡാ തടിയാ),Malayalam,Aashiq Abu,"Shekhar Menon, Ann Augustine, Sreenath Bhasi, ...","romance, comedy",https://en.wikipedia.org/wiki/Da_Thadiya,The film tells the love story of an obese yout...
10742,1987,Ironweed,American,Héctor Babenco,"Jack Nicholson, Meryl Streep, Tom Waits, Fred ...",drama,https://en.wikipedia.org/wiki/Ironweed_(film),Francis Phelan (Jack Nicholson) is a washed-up...
28533,2015,Kaliyachan,Malayalam,Farooq Abdul Rahman,"Manoj K Jayan, and Vaiga",drama,https://en.wikipedia.org/wiki/Kaliyachan,"The film is about the Kathakali actor, Kunhira..."
24808,1971,Kati Patang,Bollywood,Shakti Samanta,"Rajesh Khanna, Asha Parekh",romance,https://en.wikipedia.org/wiki/Kati_Patang_(197...,"Madhavi ""Madhu"" (Asha Parekh) is an orphan liv..."


In [6]:
#### Step 4: Create an Index
# Field mappings for the movie index:
# - title and plot are text fields using the 'english' analyzer;
# - ethnicity, director, cast and genre are text fields using the 'standard' analyzer;
# - year is an integer and wiki_page is a keyword field.
mappings = {
    'properties': {
        'title': {'type': 'text', 'analyzer': 'english'},
        'ethnicity': {'type': 'text', 'analyzer': 'standard'},
        'director': {'type': 'text', 'analyzer': 'standard'},
        'cast': {'type': 'text', 'analyzer': 'standard'},
        'genre': {'type': 'text', 'analyzer': 'standard'},
        'plot': {'type': 'text', 'analyzer': 'english'},
        'year': {'type': 'integer'},
        'wiki_page': {'type': 'keyword'},
    }
}

index_name = 'movies-index'

# indices.create() raises if the index already exists, which makes this cell
# fail when the notebook is re-run against a cluster that is still up.
# Only create the index when it is missing so the cell is safe to re-run.
if es.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists; skipping creation.")
else:
    print(es.indices.create(index=index_name, mappings=mappings))

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'movies-index'})

In [7]:
#### Step 5: Add Dataset to Index
## Note: You can use .index() or bulk() to add data to an index
##       .index() adds one item at a time 
##        .bulk() lets you add multiple items at the same time
##############################################################
## Using .index()
# for index, row in df.iterrows():
#     doc = {
#         'title': row['Title'],
#         'ethnicity': row['Origin/Ethnicity'],
#         'director': row['Director'],
#         'cast': row['Cast'],
#         'genre': row['Genre'],
#         'plot': row['Plot'],
#         'year': row['Release Year'],
#         'wiki_page': row['Wiki Page']
#     }
            
#     es.index(index=index_name, id=index, document=doc)
###################################################
## Using .bulk()
from elasticsearch.helpers import bulk
# Elasticsearch field name -> DataFrame column that supplies its value.
field_to_column = {
    'title': 'Title',
    'ethnicity': 'Origin/Ethnicity',
    'director': 'Director',
    'cast': 'Cast',
    'genre': 'Genre',
    'plot': 'Plot',
    'year': 'Release Year',
    'wiki_page': 'Wiki Page',
}

# One bulk action per DataFrame row; the row's index label is used as the document id.
bulk_data = [
    {
        '_index': index_name,
        '_id': row_id,
        '_source': {
            field: record[column] for field, column in field_to_column.items()
        },
    }
    for row_id, record in df.iterrows()
]
bulk(es, bulk_data)
########################################
## Check the number of documents indexed
# Refresh first so the freshly indexed documents are visible to the count.
es.indices.refresh(index=index_name)
es.cat.count(index=index_name, format='json')

ListApiResponse([{'epoch': '1660612072', 'timestamp': '01:07:52', 'count': '5000'}])

In [8]:
#### Step 6: Make searches in your ES index
# Find movies featuring Jack Nicholson that were NOT directed by Roman Polanski.
# - 'must' scores documents by how well `cast` matches the query terms.
# - 'filter' + 'must_not' removes the excluded director's movies.
# The exclusion uses match_phrase with the correctly spelled name: a plain
# `match` ORs its terms, so the previous misspelled query ('roman polanksi')
# only worked through the "roman" token and would also have dropped every
# other director whose name contains "roman".
resp = es.search(
    index=index_name,
    query={
        'bool': {
            'must': {
                'match': {
                    'cast': {'query': 'jack nicholson'},
                }
            },
            'filter': {
                'bool': {
                    'must_not': {'match_phrase': {'director': 'roman polanski'}},
                }
            },
        },
    },
)

In [10]:
import ast # Allows pretty print
import json
# Pretty-print the search response as JSON.
# The client's response object exposes the parsed payload as `.body`; use it
# directly instead of round-tripping through str() + ast.literal_eval(), which
# only works while the string form happens to be a valid Python literal.
# getattr() keeps the cell re-runnable once `resp` is already a plain dict.
resp = getattr(resp, 'body', resp)
print(json.dumps(resp, indent=2, sort_keys=True))

{
  "_shards": {
    "failed": 0,
    "skipped": 0,
    "successful": 1,
    "total": 1
  },
  "hits": {
    "hits": [
      {
        "_id": "8812",
        "_index": "movies-index",
        "_score": 10.9237385,
        "_source": {
          "cast": "Jack Nicholson, Ellen Burstyn, Bruce Dern",
          "director": "Bob Rafelson",
          "ethnicity": "American",
          "genre": "drama",
          "plot": "David and Jason are estranged brothers, the former a depressive living with his grandfather in Philadelphia where he runs a late-night radio talk show and the latter an extrovert con man working for gang boss Lewis in Atlantic City, where he lives with the manic-depressive Sally, former beauty queen and prostitute, and her stepdaughter Jessica. Begging David to come to Atlantic City and bail him out of jail, Jason once freed persuades him to stay on in his hotel suite with the two women.\r\nTensions grow between the four as Jason pursues a ludicrous dream of conning a Japanes

In [11]:
## Hits (Results) to DataFrame:
# Flatten the list of hits into a table: one row per hit, with the nested
# `_source` fields expanded into `_source.<field>` columns.
# Stored under its own name so the dataset loaded in Step 3 (`df`) is not
# overwritten and the indexing step can still be re-run afterwards.
hits_df = json_normalize(resp['hits']['hits'])
hits_df.head()

Unnamed: 0,_index,_id,_score,_source.title,_source.ethnicity,_source.director,_source.cast,_source.genre,_source.plot,_source.year,_source.wiki_page
0,movies-index,8812,10.923739,The King of Marvin Gardens,American,Bob Rafelson,"Jack Nicholson, Ellen Burstyn, Bruce Dern",drama,"David and Jason are estranged brothers, the fo...",1972,https://en.wikipedia.org/wiki/The_King_of_Marv...
1,movies-index,9140,10.923739,The Fortune,American,Mike Nichols,"Warren Beatty, Jack Nicholson, Stockard Channing",comedy,Nicky Wilson (Beatty) and Oscar Sullivan (Nich...,1975,https://en.wikipedia.org/wiki/The_Fortune
2,movies-index,10871,10.287027,The Witches of Eastwick,American,George Miller,"Susan Sarandon, Cher, Michelle Pfeiffer, Jack ...",comedy,"Alexandra Medford (Cher), Jane Spofford (Susan...",1987,https://en.wikipedia.org/wiki/The_Witches_of_E...
3,movies-index,10742,7.965581,Ironweed,American,Héctor Babenco,"Jack Nicholson, Meryl Streep, Tom Waits, Fred ...",drama,Francis Phelan (Jack Nicholson) is a washed-up...,1987,https://en.wikipedia.org/wiki/Ironweed_(film)
4,movies-index,14362,7.965581,Something's Gotta Give,American,Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Am...",romantic comedy,Harry Sanborn is a wealthy New York music mogu...,2003,https://en.wikipedia.org/wiki/Something%27s_Go...
