# Prerequisites
 Create an Elasticsearch domain using the Elasticsearch service in AWS.
 
 I changed the following settings and kept the rest at their defaults.
 * From *Choose deployment type* chose *Development and testing*
 * You can give the domain any name, as long as it satisfies the constraints given on that page
 * From *Network configuration* chose *Public*
 * From *Fine–grained access control – powered by Open Distro for Elasticsearch* chose *Create master user* and give any Master name and password. 
 * From *Access policy* select *Allow open access to the domain*
 
 In the _settings.py file, set the master username and password (this notebook reads them as `MASTER` and `PASSWORD`).

In [1]:
# make sure following are installed before importing, if not install using pip
import requests
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
import boto3
from _settings import *
import json

In [16]:
# Endpoint of the domain you created, exactly as shown in the AWS console
# but without the leading "https://".
host = "search-irspring2021-5dfrmee6j25yj4rv65kudho3a4.us-west-2.es.amazonaws.com"

# AWS region of the domain. It is shown at the top right of the AWS console
# and is also part of the endpoint URL (us-west-2 in this case).
region = "us-west-2"

In [17]:
# only needed when AWS access keys are used for authentication
#service = 'es'
#credentials = boto3.Session().get_credentials()
#awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token)


In [18]:
# Client for the domain endpoint, talking HTTPS on port 443 with certificate
# verification enabled. Authentication uses the master user credentials from
# _settings; pass `awsauth` as http_auth instead when authenticating with
# AWS access keys.
es = Elasticsearch(
    hosts=[dict(host=host, port=443)],
    connection_class=RequestsHttpConnection,
    use_ssl=True,
    verify_certs=True,
    http_auth=(MASTER, PASSWORD),
)

In [19]:
# Sample employee documents that are indexed into Elasticsearch below.
# All three share the last name "smith" and mention "rock" in `about`,
# which makes them useful for comparing match and match_phrase queries.
employee1 = dict(
    first_name='john',
    last_name='smith',
    age=25,
    about='I love rock climbing',
    interests=['sports', 'music'],
)

employee2 = dict(
    first_name='jane',
    last_name='smith',
    age=32,
    about='I like to collect rock albums',
    interests=['music'],
)

employee3 = dict(
    first_name='jenny',
    last_name='smith',
    age=20,
    about='I love rock climbing and playing football',
    interests=['sports', 'music'],
)

In [20]:
# Start from a clean slate: drop the 'megacorp' index if it already exists.
# Status codes 400 and 404 are passed via `ignore` so the call does not fail
# when there is no such index yet.
es.indices.delete(
    index="megacorp",
    ignore=[400, 404],
)

{'acknowledged': True}

In [21]:
# Store the three employee documents in the 'megacorp' index under the
# 'employee' type, using explicit ids "1", "2" and "3". The calls are kept
# separate so the response of the last one is what the notebook displays.
es.index(index='megacorp', id='1', body=employee1, doc_type='employee')
es.index(index='megacorp', id='2', body=employee2, doc_type='employee')
es.index(index='megacorp', id='3', body=employee3, doc_type='employee')

{'_index': 'megacorp',
 '_type': 'employee',
 '_id': '3',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [23]:
# Search the index without a query body to retrieve every document,
# then pretty-print the raw response.
results = es.search(index="megacorp")
print(json.dumps(results, indent=1))

{
 "took": 4,
 "timed_out": false,
 "_shards": {
  "total": 5,
  "successful": 5,
  "skipped": 0,
  "failed": 0
 },
 "hits": {
  "total": {
   "value": 3,
   "relation": "eq"
  },
  "max_score": 1.0,
  "hits": [
   {
    "_index": "megacorp",
    "_type": "employee",
    "_id": "3",
    "_score": 1.0,
    "_source": {
     "first_name": "jenny",
     "last_name": "smith",
     "age": 20,
     "about": "I love rock climbing and playing football",
     "interests": [
      "sports",
      "music"
     ]
    }
   },
   {
    "_index": "megacorp",
    "_type": "employee",
    "_id": "2",
    "_score": 1.0,
    "_source": {
     "first_name": "jane",
     "last_name": "smith",
     "age": 32,
     "about": "I like to collect rock albums",
     "interests": [
      "music"
     ]
    }
   },
   {
    "_index": "megacorp",
    "_type": "employee",
    "_id": "1",
    "_score": 1.0,
    "_source": {
     "first_name": "john",
     "last_name": "smith",
     "age": 25,
     "about": "I love roc

In [24]:
# Fetch a single document directly by its id (here the one indexed as "3")
# and pretty-print the response.
results = es.get(
    index='megacorp',
    doc_type='employee',
    id='3',
)
print(json.dumps(results, indent=1))

{
 "_index": "megacorp",
 "_type": "employee",
 "_id": "3",
 "_version": 1,
 "_seq_no": 0,
 "_primary_term": 1,
 "found": true,
 "_source": {
  "first_name": "jenny",
  "last_name": "smith",
  "age": 20,
  "about": "I love rock climbing and playing football",
  "interests": [
   "sports",
   "music"
  ]
 }
}


In [25]:
# Full-text `match` query on the `about` field. `explain` is switched on so
# each hit carries an `_explanation` section describing how its score was
# computed.
results = es.search(
    index="megacorp",
    explain="true",
    body={"query": {"match": {"about": "rock climbing"}}},
)
print(json.dumps(results, indent=1))

{
 "took": 7,
 "timed_out": false,
 "_shards": {
  "total": 5,
  "successful": 5,
  "skipped": 0,
  "failed": 0
 },
 "hits": {
  "total": {
   "value": 3,
   "relation": "eq"
  },
  "max_score": 0.5753642,
  "hits": [
   {
    "_shard": "[megacorp][0]",
    "_node": "raalgszSQJCD5Q6bihCAbw",
    "_index": "megacorp",
    "_type": "employee",
    "_id": "3",
    "_score": 0.5753642,
    "_source": {
     "first_name": "jenny",
     "last_name": "smith",
     "age": 20,
     "about": "I love rock climbing and playing football",
     "interests": [
      "sports",
      "music"
     ]
    },
    "_explanation": {
     "value": 0.5753642,
     "description": "sum of:",
     "details": [
      {
       "value": 0.2876821,
       "description": "weight(about:rock in 0) [PerFieldSimilarity], result of:",
       "details": [
        {
         "value": 0.2876821,
         "description": "score(freq=1.0), computed as boost * idf * tf from:",
         "details": [
          {
           "value":

In [26]:
# `match_phrase` query on the `about` field for the phrase "rock climbing",
# again with `explain` enabled to expose the score breakdown of each hit.
results = es.search(
    index='megacorp',
    explain='true',
    body={'query': {'match_phrase': {'about': 'rock climbing'}}},
)
print(json.dumps(results, indent=1))

{
 "took": 9,
 "timed_out": false,
 "_shards": {
  "total": 5,
  "successful": 5,
  "skipped": 0,
  "failed": 0
 },
 "hits": {
  "total": {
   "value": 2,
   "relation": "eq"
  },
  "max_score": 0.5753642,
  "hits": [
   {
    "_shard": "[megacorp][0]",
    "_node": "raalgszSQJCD5Q6bihCAbw",
    "_index": "megacorp",
    "_type": "employee",
    "_id": "3",
    "_score": 0.5753642,
    "_source": {
     "first_name": "jenny",
     "last_name": "smith",
     "age": 20,
     "about": "I love rock climbing and playing football",
     "interests": [
      "sports",
      "music"
     ]
    },
    "_explanation": {
     "value": 0.5753642,
     "description": "weight(about:\"rock climbing\" in 0) [PerFieldSimilarity], result of:",
     "details": [
      {
       "value": 0.5753642,
       "description": "score(freq=1.0), computed as boost * idf * tf from:",
       "details": [
        {
         "value": 2.2,
         "description": "boost",
         "details": []
        },
        {
    

Notice that N and n, used when computing the idf, are both 1. This is because these values are calculated separately on each shard, not from the whole dataset.

This is not an issue when the data is large, because the idf computed on each shard is then approximately equal to the idf over all the data.

However, if you want to use the idf of the whole dataset, add the *search_type='dfs_query_then_fetch'* parameter to the search request.

In [27]:
# Same `match` query as before, but with
# search_type='dfs_query_then_fetch' added to the request; `explain` stays
# enabled so the score breakdown can be compared with the earlier run.
results = es.search(
    index="megacorp",
    search_type="dfs_query_then_fetch",
    explain="true",
    body={"query": {"match": {"about": "rock climbing"}}},
)
print(json.dumps(results, indent=1))

{
 "took": 8,
 "timed_out": false,
 "_shards": {
  "total": 5,
  "successful": 5,
  "skipped": 0,
  "failed": 0
 },
 "hits": {
  "total": {
   "value": 3,
   "relation": "eq"
  },
  "max_score": 0.6860854,
  "hits": [
   {
    "_shard": "[megacorp][4]",
    "_node": "raalgszSQJCD5Q6bihCAbw",
    "_index": "megacorp",
    "_type": "employee",
    "_id": "1",
    "_score": 0.6860854,
    "_source": {
     "first_name": "john",
     "last_name": "smith",
     "age": 25,
     "about": "I love rock climbing",
     "interests": [
      "sports",
      "music"
     ]
    },
    "_explanation": {
     "value": 0.6860854,
     "description": "sum of:",
     "details": [
      {
       "value": 0.15179557,
       "description": "weight(about:rock in 0) [PerFieldSimilarity], result of:",
       "details": [
        {
         "value": 0.15179557,
         "description": "score(freq=1.0), computed as boost * idf * tf from:",
         "details": [
          {
           "value": 2.2,
           "de

In [28]:
# Same `match_phrase` query as before, but with
# search_type='dfs_query_then_fetch' added to the request; `explain` stays
# enabled so the score breakdown can be compared with the earlier run.
results = es.search(
    index="megacorp",
    search_type="dfs_query_then_fetch",
    explain="true",
    body={"query": {"match_phrase": {"about": "rock climbing"}}},
)
print(json.dumps(results, indent=1))

{
 "took": 7,
 "timed_out": false,
 "_shards": {
  "total": 5,
  "successful": 5,
  "skipped": 0,
  "failed": 0
 },
 "hits": {
  "total": {
   "value": 2,
   "relation": "eq"
  },
  "max_score": 0.68608546,
  "hits": [
   {
    "_shard": "[megacorp][4]",
    "_node": "raalgszSQJCD5Q6bihCAbw",
    "_index": "megacorp",
    "_type": "employee",
    "_id": "1",
    "_score": 0.68608546,
    "_source": {
     "first_name": "john",
     "last_name": "smith",
     "age": 25,
     "about": "I love rock climbing",
     "interests": [
      "sports",
      "music"
     ]
    },
    "_explanation": {
     "value": 0.68608546,
     "description": "weight(about:\"rock climbing\" in 0) [PerFieldSimilarity], result of:",
     "details": [
      {
       "value": 0.68608546,
       "description": "score(freq=1.0), computed as boost * idf * tf from:",
       "details": [
        {
         "value": 2.2,
         "description": "boost",
         "details": []
        },
        {
         "value": 0.60