<a href="https://colab.research.google.com/github/mostafa-ja/Information-Retrieval/blob/main/python_elasticsearch_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture

!pip install elasticsearch==7.14.0
!apt install default-jdk > /dev/null

In [2]:
try:
  import os
  import elasticsearch
  from elasticsearch import Elasticsearch
  import numpy as np
  import pandas as pd
  import sys
  import json
  from ast import literal_eval
  from tqdm import tqdm 
  import datetime
  from elasticsearch import helpers
  
except Exception as e:
  print(f"error: {e}")

In [3]:
# Download & extract Elasticsearch 7.0.0

!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.0.0-linux-x86_64.tar.gz -q
!tar -xzf elasticsearch-7.0.0-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-7.0.0

In [4]:
# Creating daemon instance of elasticsearch
import os
from subprocess import Popen, PIPE, STDOUT
es_server = Popen(['elasticsearch-7.0.0/bin/elasticsearch'], 
                  stdout=PIPE, stderr=STDOUT,
                  preexec_fn=lambda: os.setuid(1)  # as daemon
                 )

In [5]:
# This part is important, since it takes a little amount of time for instance to load
import time
time.sleep(20)

In [6]:
%%bash
# If you get 1 root & 2 daemon process then Elasticsearch instance has started successfully
ps -ef | grep elasticsearch

daemon       207      74 99 22:50 ?        00:00:21 /content/elasticsearch-7.0.0/jdk/bin/java -Xms1g -Xmx1g -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=75 -XX:+UseCMSInitiatingOccupancyOnly -Des.networkaddress.cache.ttl=60 -Des.networkaddress.cache.negative.ttl=10 -XX:+AlwaysPreTouch -Xss1m -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Djna.nosys=true -XX:-OmitStackTraceInFastThrow -Dio.netty.noUnsafe=true -Dio.netty.noKeySetOptimization=true -Dio.netty.recycler.maxCapacityPerThread=0 -Dlog4j.shutdownHookEnabled=false -Dlog4j2.disable.jmx=true -Djava.io.tmpdir=/tmp/elasticsearch-13325590698578125707 -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=data -XX:ErrorFile=logs/hs_err_pid%p.log -Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m -Djava.locale.providers=COMPAT -Dio.netty.allocator.type=unpooled -Des.path.home=/content/elasticsearch-7.0.0 -Des.path.conf=/content/elasticsearch-7.0.0/config -Des.distribution.flavor=default

In [7]:
# Check if elasticsearch is running
!curl -sX GET "localhost:9200/"

{
  "name" : "d1c34cef11da",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "vTt4U0ZGTTOYkh6IOEaO4Q",
  "version" : {
    "number" : "7.0.0",
    "build_flavor" : "default",
    "build_type" : "tar",
    "build_hash" : "b7e28a7",
    "build_date" : "2019-04-05T22:55:32.697037Z",
    "build_snapshot" : false,
    "lucene_version" : "8.0.0",
    "minimum_wire_compatibility_version" : "6.7.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [8]:
from elasticsearch import Elasticsearch

# Adds the HTTP header 'Authorization: Basic <base64 username:password>'
es = Elasticsearch(
    "http://localhost:9200",
    #ca_certs="/path/to/http_ca.crt",
  
)

print(es.ping())


True


In [9]:
es.info()


{'name': 'd1c34cef11da',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'vTt4U0ZGTTOYkh6IOEaO4Q',
 'version': {'number': '7.0.0',
  'build_flavor': 'default',
  'build_type': 'tar',
  'build_hash': 'b7e28a7',
  'build_date': '2019-04-05T22:55:32.697037Z',
  'build_snapshot': False,
  'lucene_version': '8.0.0',
  'minimum_wire_compatibility_version': '6.7.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [10]:
# 3.Install Kaggle API.
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
# 4.Run the following code to configure the path to “kaggle.json”
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content"

In [12]:
!kaggle datasets download -d jrobischon/wikipedia-movie-plots

Downloading wikipedia-movie-plots.zip to /content
 97% 29.0M/29.9M [00:00<00:00, 98.1MB/s]
100% 29.9M/29.9M [00:00<00:00, 94.4MB/s]


In [13]:
import zipfile
with zipfile.ZipFile('/content/wikipedia-movie-plots.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

In [14]:
import pandas as pd
df = pd.read_csv('wiki_movie_plots_deduped.csv').dropna().sample(5000, random_state=42)
# .dropna() : Remove missing values.
#.sample() : Return n(here 5000) random sample of items from an axis of object.

In [15]:
df.head(2)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
10286,1984,Songwriter,American,Alan Rudolph,"Willie Nelson, Kris Kristofferson, Melinda Dillon",drama,https://en.wikipedia.org/wiki/Songwriter_(1984...,"The film concerns Doc Jenkins, (Willie Nelson)..."
28335,2012,Da Thadiya (ഡാ തടിയാ),Malayalam,Aashiq Abu,"Shekhar Menon, Ann Augustine, Sreenath Bhasi, ...","romance, comedy",https://en.wikipedia.org/wiki/Da_Thadiya,The film tells the love story of an obese yout...


In [16]:
df.shape


(5000, 8)

In [17]:
df['Genre']

10286              drama
28335    romance, comedy
10742              drama
28533              drama
24808            romance
              ...       
23671     action / crime
14874              drama
3015              comedy
15350             comedy
23728    romantic-comedy
Name: Genre, Length: 5000, dtype: object

In [19]:
request_body = {
	    

	    'mappings': {
	        "properties": {
            "title": {"type": "text", "analyzer": "english"},
            "ethnicity": {"type": "text", "analyzer": "standard"},
            "director": {"type": "text", "analyzer": "standard"},
            "cast": {"type": "text", "analyzer": "standard"},
            "genre": {"type": "text", "analyzer": "standard"},
            "plot": {"type": "text", "analyzer": "english"},
            "year": {"type": "integer"},
            "wiki_page": {"type": "keyword"}
      }}
	}

es.indices.create(index = 'example_index', body = request_body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'example_index'}

In [20]:
for i, row in df.iterrows():
  print(i,'\n')
  print(row)
  break

10286 

Release Year                                                     1984
Title                                                      Songwriter
Origin/Ethnicity                                             American
Director                                                 Alan Rudolph
Cast                Willie Nelson, Kris Kristofferson, Melinda Dillon
Genre                                                           drama
Wiki Page           https://en.wikipedia.org/wiki/Songwriter_(1984...
Plot                The film concerns Doc Jenkins, (Willie Nelson)...
Name: 10286, dtype: object


In [21]:
type(row)

pandas.core.series.Series

In [22]:
row['Title']

'Songwriter'

In [23]:
for i, row in df.iterrows():
  doc = {
        "title": row["Title"],
        "ethnicity": row["Origin/Ethnicity"],
        "director": row["Director"],
        "cast": row["Cast"],
        "genre": row["Genre"],
        "plot": row["Plot"],
        "year": row["Release Year"],
        "wiki_page": row["Wiki Page"]
    }

  es.index(index="movies",id=i,body=doc)


In [32]:
es.indices.get(index='movies')

{'movies': {'aliases': {},
  'mappings': {'properties': {'cast': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'director': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'ethnicity': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'genre': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'plot': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'title': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'wiki_page': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'year': {'type': 'long'}}},
  'settings': {'index': {'creation_date': '1666997470216',
    'number_of_shards': '1',
    'number_of_replicas': '1',
    'uuid': 'GqFi9-YkRxyNqMSlzRgENQ',
    'version': {'created': '7000099'},
   

In [24]:
es.indices.create(index='tutorial3')

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'tutorial3'}

In [25]:
indices = es.indices.get_alias() 
print(indices)

{'tutorial3': {'aliases': {}}, 'movies': {'aliases': {}}, 'example_index': {'aliases': {}}}


In [26]:
es.search(index='tutorial3')


{'took': 35,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

In [27]:
!wget 'https://gist.githubusercontent.com/lynnkwong/c5ee4a0f4963d8c2c3281fecf32b5dae/raw/e3e1a243c69bc9653cb020360b86af4f9b5ba04b/b9ed00e0fdf0-laptops-data.csv'

--2022-10-28 22:51:52--  https://gist.githubusercontent.com/lynnkwong/c5ee4a0f4963d8c2c3281fecf32b5dae/raw/e3e1a243c69bc9653cb020360b86af4f9b5ba04b/b9ed00e0fdf0-laptops-data.csv
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11875 (12K) [text/plain]
Saving to: ‘b9ed00e0fdf0-laptops-data.csv’


2022-10-28 22:51:53 (98.7 MB/s) - ‘b9ed00e0fdf0-laptops-data.csv’ saved [11875/11875]



In [28]:
import csv
import json

colums = ["id", "name", "price", "brand", "cpu", "memory", "storage"]
index_name = "laptops-demo"

with open("b9ed00e0fdf0-laptops-data.csv", "r") as fi:
    reader = csv.DictReader(
        fi, fieldnames=colums, delimiter=",", quotechar='"'
    )

    # This skips the first row which is the header of the CSV file.
    next(reader)

    actions = []
    for row in reader:
        action = {"index": {"_index": index_name, "_id": int(row["id"])}}
        doc = {
            "id": int(row["id"]),
            "name": row["name"],
            "price": float(row["price"]),
            "brand": row["brand"],
            "attributes": [
                {"attribute_name": "cpu", "attribute_value": row["cpu"]},
                {"attribute_name": "memory", "attribute_value": row["memory"]},
                {
                    "attribute_name": "storage",
                    "attribute_value": row["storage"],
                },
            ],
        }
        actions.append(json.dumps(action))
        actions.append(json.dumps(doc))

    with open("laptops_demo.json", "w") as fo:
        fo.write("\n".join(actions))

    es.bulk(body="\n".join(actions))


In [29]:
import requests, json, os

In [30]:
import zipfile
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
    zip_ref.extractall(directory_to_extract_to)

NameError: ignored

In [None]:
directory = '/path/to/files/'

In [None]:
i=1
for filename in os.listdir(directory):
    if filename.endswith(".json"):
        f = open(filename)
        docket_content = f.read()
        # Send the data into es
        es.index(index='myindex', ignore=400, doc_type='docket', 
        id=i, body=json.loads(docket_content))
        i = i + 1

In [None]:
# Importing test dataset
!git clone https://github.com/HamidRezaAttar/Elasticsearch-Jupyter-Colab
dataset = pd.read_csv("/content/Elasticsearch-Jupyter-Colab/data/test.csv.gz", compression="gzip")
dataset.drop("id", axis=1, inplace=True)
print(f"shape of dataset: {dataset.shape}")
dataset.head()

In [None]:
# Define settings & mappings of Elasticsearch index
Settings = {
    "settings":{
        "number_of_shards":1,
        "number_of_replicas":0
    },
    "mappings":{
        "properties":{
            "article":{
                "type":"text"
            },
            "highlights":{
                "type":"text"
            }
        }
    }
}

In [None]:
def json_formatter(dataset, index_name, index_type='_doc'):
    """
    This function is used to create JSON formatted dictionaries for Elasticsearch.

    Args:
      dataset: The dataset you want to apply this function.
      index_name: Name of the index in Elasticsearch
      index_type: Type of the index in Elasticsearch.
      Note: It is suggested to keep index_type as '_doc' since it is deprecated from version 6.
      Note: This function formats all columns of your dataset, if you want to apply this to special columns only,
      you can delete the second for loop and add your custom fields.
    """
    try:
        List = []
        columns = dataset.columns
        for idx, row in dataset.iterrows():
            dic = {}
            dic['_index'] = index_name
            dic['_type'] = index_type
            source = {}
            for i in dataset.columns:
                source[i] = row[i]
            dic['_source'] = source
            List.append(dic)
        return List
    
    except Exception as e:
        print("There is a problem: {}".format(e))

In [None]:
MY_INDEX = es.indices.create(index="news_index", ignore=[400,404], body=Settings)
MY_INDEX

In [None]:
json_Formatted_dataset = json_formatter(dataset=dataset, index_name='news_index', index_type='_doc')
json_Formatted_dataset[0]

In [None]:
# For importing Data to elasticsearch we use elasticsearch's bulk API from elasticsearch.helpers
try:
    res = helpers.bulk(es, json_Formatted_dataset)
    print("successfully imported to elasticsearch.")
except Exception as e:
    print(f"error: {e}")

In [None]:
# Get 10 sample of data
query = es.search(
    index="news_index",
    body={
      "size":10,
      "query": {
        "match_all":{}
      }
    }
)

output = pd.json_normalize((query['hits']['hits']))
output

In [None]:
# Complicated query
query = es.search(
    index="news_index",
    body={
        "size":20,
        "query":{
            "bool":{
                "must":[
                        {"match":{"article":"teenage boy"}}
                ],
                "should":[
                        {"match":{"highlights":"drunk"}}
                ]
            }
        }
    }
)

output = pd.json_normalize((query['hits']['hits']))
output

In [None]:
# More complicated query
query = es.search( 
    index="news_index",
    body={
        "size":20,
        "query":{
            "bool":{
                "must":[
                        {"multi_match":{
                            "query":"The Hunger Games",
                            "fields":["article","highlights"]
                        }}
                ]
            }
        }
    }
)

output = pd.json_normalize((query['hits']['hits']))
output