In [39]:
import pandas as pd
import re
import json
import time
import requests
from requests.auth import HTTPBasicAuth
from elasticsearch import Elasticsearch
import elasticsearch
from elasticsearch.helpers import bulk

print(elasticsearch.__version__)

(8, 11, 0)


In [40]:
# !!! CUSTOMIZE THIS SECTION WITH YOUR CREDENTIALS !!!

USER = 'elastic'
PWD = 'mysecurepassword'
index_name = 'books'
ES_ENDPOINT = 'https://localhost:9200'

path_to_ca_certificates = 'N:\elesticsearch\http_ca.crt'

  path_to_ca_certificates = 'N:\elesticsearch\http_ca.crt'


### Read data

In [41]:
df = pd.read_csv('books.csv')
df = df.set_index('id')
df.head()

Unnamed: 0_level_0,author_name,title,country,language,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
14602826,"Yearsley, Ann",Poems on several occasions,England,English,1786
14602830,"A, T.",A Satyr against Vertue. (A poem: supposed to b...,England,English,1679
14602831,"A, T.","The Aeronaut, a poem; founded almost entirely,...",Ireland,English,1816
14602832,"Albert, Prince Consort, consort of Victoria, Q...","The Prince Albert, a poem",Ireland,English,1868
14602833,"Anslow, Robert","The Defeat of the Spanish Armada, A.D. 1588. A...",England,English,1888


In [42]:
#transform dataframe into json format
docs = df.to_dict(orient='records')
doc_ids = df.index
print(doc_ids[0])
print(docs[0])

14602826
{'author_name': 'Yearsley, Ann', 'title': 'Poems on several occasions', 'country': 'England', 'language': 'English', 'year': 1786}



### Elasticsearch Python wrapper

In [43]:
def create_index(es, index_name, settings=None):
    """
    Create an Elasticsearch index
    @param es: an Elasticsearch object
    @param index_name: the name of the new index to be created
    @param settings: the index settings
    @return whether the index was created
    """
    is_created = False
    try:
        if es.indices.exists(index=[index_name]):
            es.indices.delete(index=[index_name], ignore=[404])
        es.indices.create(index=index_name, settings=settings)
        is_created = True
    except Exception as ex:
        print(str(ex))
    return is_created

In [44]:
# Index settings
settings_basic = {
        "number_of_shards": 4,
        "number_of_replicas": 2,
        "analysis": {
            "analyzer": {"std_english": {"type": "standard", "stopwords": "_english_" }}
        }
}

In [45]:
#connect to the local elasticsearch node and authenticate
es = Elasticsearch(hosts=[ES_ENDPOINT], ca_certs=path_to_ca_certificates, basic_auth=(USER, PWD))
#create an index
is_created = create_index(es, index_name, settings=settings_basic)
print(f'Index creation: {is_created}')

Index creation: True


In [46]:
#loops over the first 10 documents
for i, doc in zip(doc_ids[0:10], docs[0:10]):
    #index the documents with corresponding ids
    res = es.index(index=index_name, id=i, document=doc)
    print(res)
# see also the bulk functions for importing under: elasticsearch.helpers

{'_index': 'books', '_id': '14602826', '_version': 1, 'result': 'created', '_shards': {'total': 3, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
{'_index': 'books', '_id': '14602830', '_version': 1, 'result': 'created', '_shards': {'total': 3, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
{'_index': 'books', '_id': '14602831', '_version': 1, 'result': 'created', '_shards': {'total': 3, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
{'_index': 'books', '_id': '14602832', '_version': 1, 'result': 'created', '_shards': {'total': 3, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
{'_index': 'books', '_id': '14602833', '_version': 1, 'result': 'created', '_shards': {'total': 3, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
{'_index': 'books', '_id': '14602834', '_version': 1, 'result': 'created', '_shards': {'total': 3, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
{'_index': 'book

In [47]:
actions = [
  {
    "_index": index_name,
    "_id": doc_id,
    "_source": doc
  }
  for doc_id, doc in list(zip(doc_ids, docs))
]

# send actions in bulk (the API takes care of chunking them optimally)
bulk(es, actions)

(52695, [])

In [48]:
# get settings info of the selected index
es.indices.get_settings(index=index_name)

ObjectApiResponse({'books': {'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}}, 'number_of_shards': '4', 'provided_name': 'books', 'creation_date': '1771337237671', 'analysis': {'analyzer': {'std_english': {'type': 'standard', 'stopwords': '_english_'}}}, 'number_of_replicas': '2', 'uuid': 'OJisKU1XQ4OfzgPoqTDRMw', 'version': {'created': '8512000'}}}}})

In [49]:
# retrieve a document with a given ID
es.get(index=index_name, id=doc_ids[1])

ObjectApiResponse({'_index': 'books', '_id': '14602830', '_version': 2, '_seq_no': 3, '_primary_term': 1, 'found': True, '_source': {'author_name': 'A, T.', 'title': 'A Satyr against Vertue. (A poem: supposed to be spoken by a Town-Hector )', 'country': 'England', 'language': 'English', 'year': 1679}})

In [50]:
# this is how you would delete the index
es.indices.delete(index=index_name, ignore=404)

  es.indices.delete(index=index_name, ignore=404)


ObjectApiResponse({'acknowledged': True})

### Elasticsearch with python cURL (Requests)

In [51]:
class Elastic:
    """
    A convenience object to send HTTP requests to Elasticsearch
    """
    def __init__(self, endpoint, username, password, path_to_ca_certificates):
        """
        @param endpoint: the URL of the Elasticsearch instance
        @param username: the Elasticsearch username 
        @param password: the Elasticsearch password
        """
        self.header = {'Content-Type': 'application/json', 'charset':'UTF-8'}
        #self.header={'Content-Type': '--data-binary application/x-ndjson'}
        self.endpoint = endpoint
        self.username = username
        self.password = password
        self.path_to_ca_certificates = path_to_ca_certificates
        self.methods_mapping = {'get': requests.get, 
                                'put':requests.put, 
                                'post':requests.post, 
                                'delete':requests.delete}
        
    def curl(self, method, handle, json=None):
        """
        Sends an HTTP request to the Elasticsearch instance
        @param method: can be 'get', 'put', 'post', 'delete'
        @param handle: the API handle to be appended to the Elasticsearch url
        @param json: the json payload of the HTTP request
        """
        http_method = self.methods_mapping[method.lower()]
        r = http_method(f'{self.endpoint}/{handle}', auth=HTTPBasicAuth(USER, PWD), 
                        headers=self.header, json=json,
                        verify = self.path_to_ca_certificates)
        return r

In [52]:
e = Elastic(ES_ENDPOINT, USER, PWD, path_to_ca_certificates)

In [53]:
# create another index jsut as an example. in the following, we will keep using the book index
# created using the Elasticsearch API

create_index_json={
  "mappings" : {
      "properties" : {
        "author_name" : {
          "type" : "text"
        },
        "country" : {
          "type" : "keyword"
        },
        "language" : {
          "type" : "keyword"
        },
        "title" : {
          "type" : "text"
        },
        "year" : {
          "type" : "long"
        }
      }
  },
  "settings": {
    "number_of_shards": 4, 
    "number_of_replicas": 2, 
    "index.max_result_window": 20000,
    "index" : {
        "similarity" : {
          "default" : {
            "type" : "BM25", "b": 0.75, "k1": 1.2
          }
        }
    },
    "analysis": {
      "analyzer": {
        "std_english": {"type": "standard", "stopwords": "_english_" }
      }
    }
  }
}

# create an index
r = e.curl('put', index_name, json=create_index_json)
r.json()

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'books'}

In [54]:
# get the index details and settings
r = e.curl('get', index_name)
r.json()

{'books': {'aliases': {},
  'mappings': {'properties': {'author_name': {'type': 'text'},
    'country': {'type': 'keyword'},
    'language': {'type': 'keyword'},
    'title': {'type': 'text'},
    'year': {'type': 'long'}}},
  'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}},
    'number_of_shards': '4',
    'provided_name': 'books',
    'similarity': {'default': {'type': 'BM25', 'b': '0.75', 'k1': '1.2'}},
    'max_result_window': '20000',
    'creation_date': '1771337243605',
    'analysis': {'analyzer': {'std_english': {'type': 'standard',
       'stopwords': '_english_'}}},
    'number_of_replicas': '2',
    'uuid': 'qy3eMy9gTUqSfvjhENuDkw',
    'version': {'created': '8512000'}}}}}

In [55]:
# deactivate refresh in preparation of data indexing
r = e.curl('put', 'books/_settings', {'index' : {'refresh_interval' : -1}})
r.json()

{'acknowledged': True}

In [56]:
# index documents with their individual ids (use bulk below for speedup)
for doc_id, doc in list(zip(doc_ids, docs))[0:10]:
    r = e.curl('post', f'books/_doc/{doc_id}', json=doc)
r.json()

{'_index': 'books',
 '_id': '14602838',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 3, 'successful': 1, 'failed': 0},
 '_seq_no': 2,
 '_primary_term': 1}

In [57]:
# bulk indexing (via official API)

#connect to the local elasticsearch node and authenticate
es = Elasticsearch([ES_ENDPOINT], ca_certs=path_to_ca_certificates, basic_auth=(USER, PWD))

actions = [
  {
    "_index": index_name,
    "_id": doc_id,
    "_source": doc
  }
  for doc_id, doc in list(zip(doc_ids, docs))
]

# send actions in bulk (the API takes care of chunking them optimally)
bulk(es, actions)

(52695, [])

In [58]:
# reset the refresh interval to 2 seconds
r = e.curl('put', 'books/_settings', {'index' : {'refresh_interval' : '2s'}})
r.json()

{'acknowledged': True}

In [59]:
r = e.curl('get', f'books/_doc/{doc_ids[42]}')
r.json()

{'_index': 'books',
 '_id': '14602871',
 '_version': 1,
 '_seq_no': 12,
 '_primary_term': 1,
 'found': True,
 '_source': {'author_name': 'Smedley, Edward, Fellow of Sidney Sussex College, Cambridge',
  'title': 'The Death of Saul and Jonathan. A poem',
  'country': 'England',
  'language': 'English',
  'year': 1814}}

In [60]:
# this is how you would delete the index
r = e.curl('delete', 'books')
r.json()

{'acknowledged': True}

### Search queries [EXERCISES]

#### Empty query

In [61]:
# empty query
r = e.curl('get', f'books/_search')
r.json()

{'error': {'root_cause': [{'type': 'index_not_found_exception',
    'reason': 'no such index [books]',
    'resource.type': 'index_or_alias',
    'resource.id': 'books',
    'index_uuid': '_na_',
    'index': 'books'}],
  'type': 'index_not_found_exception',
  'reason': 'no such index [books]',
  'resource.type': 'index_or_alias',
  'resource.id': 'books',
  'index_uuid': '_na_',
  'index': 'books'},
 'status': 404}

#### Aggregation query

In [62]:
""" 
EXERCISE A: 
execute an aggregation query to count the number of books writte in each country
"""

' \nEXERCISE A: \nexecute an aggregation query to count the number of books writte in each country\n'

In [63]:
# SOLUTION TO EXERCISE A

# aggregation query
query = {
    "aggregations": {
        "by_category": {
            "terms": {
                "field": "country",
                "size":100
            }
        }
    }
}
r = e.curl('get', f'books/_search', query)
r.json()

{'error': {'root_cause': [{'type': 'index_not_found_exception',
    'reason': 'no such index [books]',
    'resource.type': 'index_or_alias',
    'resource.id': 'books',
    'index_uuid': '_na_',
    'index': 'books'}],
  'type': 'index_not_found_exception',
  'reason': 'no such index [books]',
  'resource.type': 'index_or_alias',
  'resource.id': 'books',
  'index_uuid': '_na_',
  'index': 'books'},
 'status': 404}

#### Full-text query

In [64]:
""" 
EXERCISE B: 
execute a full-text query for the query "love magic"
"""

' \nEXERCISE B: \nexecute a full-text query for the query "love magic"\n'

In [65]:
# SOLUTION TO EXERCISE B

# full-text query
query = {
    "query": {
        "match": {
            "title": {
                "query": "love magic"
            }
        }
    }
}
r = e.curl('get', f'books/_search', query)
r.json()

{'error': {'root_cause': [{'type': 'index_not_found_exception',
    'reason': 'no such index [books]',
    'resource.type': 'index_or_alias',
    'resource.id': 'books',
    'index_uuid': '_na_',
    'index': 'books'}],
  'type': 'index_not_found_exception',
  'reason': 'no such index [books]',
  'resource.type': 'index_or_alias',
  'resource.id': 'books',
  'index_uuid': '_na_',
  'index': 'books'},
 'status': 404}

#### Exact match query

In [66]:
""" 
EXERCISE C: 
execute an exact-match query for the query "magic love"
"""

' \nEXERCISE C: \nexecute an exact-match query for the query "magic love"\n'

In [67]:
# SOLUTION TO EXERCISE C

# exact match query
query = {
  "query": {
    "match_phrase": {
      "title": {
        "query": "magic love"
      }
    }
  }
}
r = e.curl('get', f'books/_search', query)
r.json()

{'error': {'root_cause': [{'type': 'index_not_found_exception',
    'reason': 'no such index [books]',
    'resource.type': 'index_or_alias',
    'resource.id': 'books',
    'index_uuid': '_na_',
    'index': 'books'}],
  'type': 'index_not_found_exception',
  'reason': 'no such index [books]',
  'resource.type': 'index_or_alias',
  'resource.id': 'books',
  'index_uuid': '_na_',
  'index': 'books'},
 'status': 404}

#### Multi-field full-text query with field boosting

In [68]:
""" 
EXERCISE D: 
Execute a query that searches both on tile and author
Weights the importance of matches on the author field twice as much as matches on the title
Sse "shakespeare" as query term
"""

' \nEXERCISE D: \nExecute a query that searches both on tile and author\nWeights the importance of matches on the author field twice as much as matches on the title\nSse "shakespeare" as query term\n'

In [69]:
# SOLUTION TO EXERCISE D

# Full text query, multiple fields with boosting
query = {
"query": {
    "multi_match": {
            "query": "shakespeare",
            "fields": ["title", "author_name^2"],
            "type": "phrase"
        }
    }
}
r = e.curl('get', f'books/_search', query)
r.json()

{'error': {'root_cause': [{'type': 'index_not_found_exception',
    'reason': 'no such index [books]',
    'resource.type': 'index_or_alias',
    'resource.id': 'books',
    'index_uuid': '_na_',
    'index': 'books'}],
  'type': 'index_not_found_exception',
  'reason': 'no such index [books]',
  'resource.type': 'index_or_alias',
  'resource.id': 'books',
  'index_uuid': '_na_',
  'index': 'books'},
 'status': 404}

#### Combining different queries

In [70]:
"""
EXERCISE E:
execute a *single* boolean query returning books that:
- have the "queen mary" in the title and were written in England
- should NOT have been published in the range of years [1850-1913]
"""

'\nEXERCISE E:\nexecute a *single* boolean query returning books that:\n- have the "queen mary" in the title and were written in England\n- should NOT have been published in the range of years [1850-1913]\n'

In [71]:
# SOLUTION TO EXERCISE E

# One or more queries can be specified in each of the clauses
# All the clauses are optional
# MUST: A document must match all of the queries
# MUST_NOT: A document must not match any of the queries
# SHOULD: A document does not have to match the queries, but it is considered more relevant if it does
# FILTER: Filters with yes/no categories
query = {
  "size" : 100,
  "query": {
    "bool": {
      "must": [
        {
          "match": {
                "title": "queen mary"
          }
        },
        {
          "match": {
              "country": "England"
          }
        }
      ],
      "must_not": [
        {
          "range": {
            "year": {
                "gte": 1850,
                "lte": 1913
            }
          }
        }
      ]
    }
  }
}
r = e.curl('get', f'books/_search', query)
r.json()

{'error': {'root_cause': [{'type': 'index_not_found_exception',
    'reason': 'no such index [books]',
    'resource.type': 'index_or_alias',
    'resource.id': 'books',
    'index_uuid': '_na_',
    'index': 'books'}],
  'type': 'index_not_found_exception',
  'reason': 'no such index [books]',
  'resource.type': 'index_or_alias',
  'resource.id': 'books',
  'index_uuid': '_na_',
  'index': 'books'},
 'status': 404}

#### Fuzzy queries

In [72]:
"""
EXERCISE F:
execute a fuzzy query for the query "comander" with at most 50 expansions 
and considering transpositions
"""

'\nEXERCISE F:\nexecute a fuzzy query for the query "comander" with at most 50 expansions \nand considering transpositions\n'

In [73]:
# SOLUTION TO EXERCISE F

query ={
  "query": {
    "fuzzy": {
      "title": {
        "value": "comander",
        "fuzziness": "AUTO",
        "max_expansions": 50,
        "prefix_length": 0,
        "transpositions": True,
        "rewrite": "constant_score"
      }
    }
  }
}
r = e.curl('get', f'books/_search', query)
r.json()

{'error': {'root_cause': [{'type': 'index_not_found_exception',
    'reason': 'no such index [books]',
    'resource.type': 'index_or_alias',
    'resource.id': 'books',
    'index_uuid': '_na_',
    'index': 'books'}],
  'type': 'index_not_found_exception',
  'reason': 'no such index [books]',
  'resource.type': 'index_or_alias',
  'resource.id': 'books',
  'index_uuid': '_na_',
  'index': 'books'},
 'status': 404}

#### Get TF of terms in document

In [74]:
"""
EXERCISE G:
get the term frequencies (in the title) of document with id = 4200
consider only words with a minimum length of 4 and a minimum term frequency of 2
"""

'\nEXERCISE G:\nget the term frequencies (in the title) of document with id = 4200\nconsider only words with a minimum length of 4 and a minimum term frequency of 2\n'

In [75]:
# SOLUTION TO EXERCISE G

# https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-termvectors.html
query = {
    "fields" : ["title"],
    "term_statistics" : True,
    "field_statistics" : True,
    "positions": True,
    "filter": {
        "min_word_length": 4,
        "min_term_freq" : 2
  }
}
#r = e.curl('get', f'books/_doc/{doc_ids[4200]}')
r = e.curl('get', f'books/_termvectors/{doc_ids[4200]}', query)
r.json()

{'error': {'root_cause': [{'type': 'index_not_found_exception',
    'reason': 'no such index [books]',
    'resource.type': 'index_or_alias',
    'resource.id': 'books',
    'index_uuid': '_na_',
    'index': 'books'}],
  'type': 'index_not_found_exception',
  'reason': 'no such index [books]',
  'resource.type': 'index_or_alias',
  'resource.id': 'books',
  'index_uuid': '_na_',
  'index': 'books'},
 'status': 404}