# Elasticsearch: indexing and queries

### Resources

* https://marcobonzanini.com/2015/06/22/tuning-relevance-in-elasticsearch-with-custom-boosting/
* https://readthedocs.org/projects/elasticsearch-dsl/downloads/pdf/stable/
* https://www.elastic.co/blog/easier-relevance-tuning-elasticsearch-7-0

In [1]:
# Import dependencies
import os
import json
import sys
from elasticsearch import Elasticsearch
import wikipediaapi
from slugify import slugify
from pprint import pprint
import wikipedia

### Search Wikipedia articles by category and add to Elasticsearch database

In [2]:
client = Elasticsearch("http://localhost:9200")

In [3]:
wiki_wiki = wikipediaapi.Wikipedia('en')

In [4]:
category = "Coronaviridae"

In [5]:
def print_categorymembers(categorymembers, level=0, max_level=1):
        for c in categorymembers.values():
            print("%s: %s (ns: %d)" % ("*" * (level + 1), c.title, c.ns))
            if c.ns == wikipediaapi.Namespace.CATEGORY and level < max_level:
                print_categorymembers(c.categorymembers, level=level + 1, max_level=max_level)


cat = wiki_wiki.page(f"Category:{category}")
print_categorymembers(cat.categorymembers)

*: Coronaviridae (ns: 0)
*: Coronavirus (ns: 0)
*: Bat-borne virus (ns: 0)
*: Chinese coronavirus (ns: 0)
*: Coronavirinae (ns: 0)
*: Coronavirus 3' UTR pseudoknot (ns: 0)
*: Coronavirus frameshifting stimulation element (ns: 0)
*: Coronavirus packaging signal (ns: 0)
*: Coronavirus SL-III cis-acting replication element (ns: 0)
*: Novel coronavirus (ns: 0)
*: Coronavirus disease (ns: 0)
*: SHC014-CoV (ns: 0)
*: Slippery sequence (ns: 0)
*: Category:Alphacoronaviruses (ns: 14)
**: Alphacoronavirus (ns: 0)
**: Canine coronavirus (ns: 0)
**: Duvinacovirus (ns: 0)
**: Feline coronavirus (ns: 0)
**: Feline infectious peritonitis virus (ns: 0)
**: Human coronavirus 229E (ns: 0)
**: Human coronavirus NL63 (ns: 0)
**: Miniopterus bat coronavirus 1 (ns: 0)
**: Miniopterus bat coronavirus HKU8 (ns: 0)
**: Minunacovirus (ns: 0)
**: Pedacovirus (ns: 0)
**: Porcine epidemic diarrhea virus (ns: 0)
**: Rhinacovirus (ns: 0)
**: Rhinolophus bat coronavirus HKU2 (ns: 0)
**: Scotophilus bat coronavirus 5

In [6]:
class Document:
    
    def __init__(self, title, text, source, page_id, category=category):

        self.category = category
        self.title = title
        self.text = text
        self.source = source
        self.page_id = page_id
        
        self.body = {"title": self.title,
                     "text": self.text,
                     "source":self.source,
                     "page_id": self.page_id}

    def insert(self):
        
        slug = slugify(self.category)
        
        if client.search(index=slug, 
                         body={"query": 
                                 {"match": 
                                  {"page_id": self.page_id}
                                 }
                                }) == None:
            try:
        
                
                client.index(index=slug, body=self.body)

            except Exception as error:
                print(f"Could not create a JSON entry for an article {self.source}")
                
        else:
            print(f"Article {self.source} is already in the database")

In [8]:
def search_insert_wiki(category):
    
    if type(category) is not list: category = [ category ]

    wiki_wiki = wikipediaapi.Wikipedia('en')
    
    for c in category:

        cat = wiki_wiki.page(f"Category:{c}")

        for key in cat.categorymembers.keys():
            page = wiki_wiki.page(key)

            if not "Category:" in page.title:
                
                doc = Document(page.title, page.text, page.fullurl, page.pageid, category=c)
#                 for i in doc.text.split('\n')[:1]: print(i)
                doc.insert()
#                 print(f'{doc.title} is entered into elasticsearch database')
                break

In [9]:
search_insert_wiki(category)

NotFoundError: NotFoundError(404, 'index_not_found_exception', 'no such index [coronaviridae]', coronaviridae, index_or_alias)

### Find keywords

In [10]:
print(wikipedia.search("Stan Lee"))

['Stan Lee', 'List of cameo appearances by Stan Lee', 'Stan Lee (disambiguation)', "Stan Lee's Superhumans", 'Stan Lee Media', 'Joan Boocock Lee', 'Stan Lee Foundation', 'Linda Lee Cadwell', "Stan Lee's Mighty 7", 'Stan Lee Meets...']


### Return the list of all indices

In [11]:
client.indices.get_alias("*")

{'kibana_sample_data_ecommerce': {'aliases': {}},
 'american-science-fiction-television-series': {'aliases': {}},
 'marvel-comics-editors-in-chief': {'aliases': {}},
 '.kibana_1': {'aliases': {'.kibana': {}}},
 '.apm-agent-configuration': {'aliases': {}},
 'american-comics-writers': {'aliases': {}},
 'machine-learning': {'aliases': {}},
 'science-fiction-television': {'aliases': {}},
 'presidents-of-the-united-states': {'aliases': {}},
 'marvel-comics': {'aliases': {}},
 '.kibana_task_manager_1': {'aliases': {'.kibana_task_manager': {}}},
 'kibana_sample_data_flights': {'aliases': {}},
 'natural-language-processing': {'aliases': {}}}

### Basic free text search query

In [12]:
# Example document search:

def search(text, index=''):
    return client.search(index=index, 
                         body={"query": 
                                 {"match": 
                                  {"text": text}
                                 }
                                }
    )

### Check if the wikipedia article already exists in the database

In [10]:
def check_page_id(page_id, index=''):
    return client.search(index=index, 
                         body={"query": 
                                 {"match": 
                                  {"page_id": page_id}
                                 }
                                }
    )

In [18]:
def test_pageid():
    res = check_page_id('534366')
    
    print('Relevant articles:')
    print('===================')
    for doc in res['hits']['hits']:
        print(doc['_source']['text'].split('\n\n')[1])
        print(doc['_source']['page_id'])

x = test_pageid()
if x==None:
    print("Caught!")

Relevant articles:
Early life and career
Obama was born on August 4, 1961, at Kapiolani Medical Center for Women and Children in Honolulu, Hawaii. He is the only president born outside the contiguous 48 states. He was born to an American mother of European descent and an African father. His mother, Ann Dunham (1942–1995), was born in Wichita, Kansas; she was mostly of English descent, with some German, Irish, Scottish, Swiss, and Welsh ancestry. (In July 2012, Ancestry.com found a strong likelihood that Stanley Ann Dunham was descended from John Punch, an enslaved African man who lived in the Colony of Virginia during the seventeenth century.) His father, Barack Obama Sr. (1936–1982), was a married Luo Kenyan from Nyang'oma Kogelo. Obama's parents met in 1960 in a Russian language class at the University of Hawaii at Manoa, where his father was a foreign student on a scholarship. The couple married in Wailuku, Hawaii, on February 2, 1961, six months before Obama was born.In late August

### Improving relevance of search results

In [28]:
question = "When Barack Obama was inaugurated?"

In [29]:
def test_search(question):
    res = search(text=question)

    print('Relevant articles:')
    print('===================')

    for i, doc in enumerate(res['hits']['hits']):
        print(f'Result {i}:')
        print(f"Title: {doc['_source']['title']}")    
        print(f"Relevance score {doc['_score']}")

test_search(question)

Relevant articles:
Result 0:
Title: Jeff Mariotte
Relevance score 12.389853
Result 1:
Title: Named entity
Relevance score 12.173271
Result 2:
Title: Ta-Nehisi Coates
Relevance score 11.393703
Result 3:
Title: Amber Benson
Relevance score 9.310086
Result 4:
Title: Eric Millikin
Relevance score 8.667373
Result 5:
Title: Jason Rubin
Relevance score 8.582391
Result 6:
Title: Information extraction
Relevance score 8.356649
Result 7:
Title: Rashida Jones
Relevance score 7.7038317
Result 8:
Title: Barack Obama
Relevance score 7.4432054
Result 9:
Title: Alex Ross
Relevance score 6.7125754


### Dynamic boosting

In [30]:
query_body = \
    {
      "query": {
        "bool": {
          "should": [
            {
              "match": {
                "title": {
                  "query": question,
                  "boost": 3
                }
              }
            },
            {
              "match": { 
                "text": question
              }
            }
          ]
        }
      }
    }

In [31]:
search_result = client.search(index='', body=query_body)
for i, doc in enumerate(search_result['hits']['hits']):
    print(f'Result {i}:')
    print(f"Title: {doc['_source']['title']}")    
    print(f"Relevance score {doc['_score']}")

Result 0:
Title: Barack Obama
Relevance score 31.020813
Result 1:
Title: Jeff Mariotte
Relevance score 12.389853
Result 2:
Title: Named entity
Relevance score 12.173271
Result 3:
Title: Ta-Nehisi Coates
Relevance score 11.393703
Result 4:
Title: Amber Benson
Relevance score 9.310086
Result 5:
Title: Eric Millikin
Relevance score 8.667373
Result 6:
Title: Jason Rubin
Relevance score 8.582391
Result 7:
Title: Information extraction
Relevance score 8.356649
Result 8:
Title: Rashida Jones
Relevance score 7.7038317
Result 9:
Title: Alex Ross
Relevance score 6.7125754


In [32]:
query_body = \
    {
      "query": {
        "bool": {
          "should": [
            {
              "match": {
                "title": {
                  "query": question,
                  "boost": 3
                }
              }
            },
            {
              "match": { 
                "text": question
              }
            }
          ]
        }
      }
    }

search_result = client.search(index='', body=query_body)
for i, doc in enumerate(search_result['hits']['hits']):
    print(f'Result {i}:')
    print(f"Title: {doc['_source']['title']}")    
    print(f"Relevance score {doc['_score']}")

Result 0:
Title: Barack Obama
Relevance score 31.020813
Result 1:
Title: Jeff Mariotte
Relevance score 12.389853
Result 2:
Title: Named entity
Relevance score 12.173271
Result 3:
Title: Ta-Nehisi Coates
Relevance score 11.393703
Result 4:
Title: Amber Benson
Relevance score 9.310086
Result 5:
Title: Eric Millikin
Relevance score 8.667373
Result 6:
Title: Jason Rubin
Relevance score 8.582391
Result 7:
Title: Information extraction
Relevance score 8.356649
Result 8:
Title: Rashida Jones
Relevance score 7.7038317
Result 9:
Title: Alex Ross
Relevance score 6.7125754


### Multi-match search without boosting

In [18]:
query_body = \
{
  "query": {
    "multi_match" : {
      "query":    "Barak Obama", 
      "fields": [ "title", "text" ] 
    }
  }
}

search_result = client.search(index='', body=query_body)
for i, doc in enumerate(search_result['hits']['hits']):
    print(f'Result {i}:')
    print(f"Title: {doc['_source']['title']}")    
    print(f"Relevance score {doc['_score']}")

Result 0:
Title: Ta-Nehisi Coates
Relevance score 6.412328
Result 1:
Title: Bill Clinton
Relevance score 6.4054484
Result 2:
Title: Jeff Mariotte
Relevance score 5.983801
Result 3:
Title: Sean Murphy (artist)
Relevance score 5.7749376
Result 4:
Title: Open information extraction
Relevance score 5.435918
Result 5:
Title: Named entity
Relevance score 4.9300184
Result 6:
Title: Information extraction
Relevance score 3.9993434
Result 7:
Title: Rashida Jones
Relevance score 3.972412
Result 8:
Title: Barack Obama
Relevance score 3.9296012
Result 9:
Title: Amber Benson
Relevance score 3.8804646


### Multi-match search with boosting

In [78]:
query_body = \
{
  "query": {
    "multi_match" : {
      "query":    question, 
      "fields": [ "title^3", "text" ] 
    }
  }
}

search_result = client.search(index='', body=query_body)
for i, doc in enumerate(search_result['hits']['hits']):
    print(f'Result {i}:')
    print(f"Title: {doc['_source']['title']}")    
    print(f"Relevance score {doc['_score']}")

Result 0:
Title: Stan Hart
Relevance score 18.541727
Result 1:
Title: Stan Sakai
Relevance score 18.541727
Result 2:
Title: Lee Falk
Relevance score 15.098925
Result 3:
Title: Jim Lee
Relevance score 15.098925
Result 4:
Title: Elaine Lee
Relevance score 15.098925
Result 5:
Title: Jae Lee
Relevance score 15.098925
Result 6:
Title: Lee Kohse
Relevance score 15.098925
Result 7:
Title: Lee Weeks
Relevance score 15.098925
Result 8:
Title: Stan Lee
Relevance score 14.202862
Result 9:
Title: Jen Lee (cartoonist)
Relevance score 12.673573


In [99]:
### Multi-match term

query_body = \
    {
  "query": {
      "bool": {
          "must": [
              {"match": {"title": "when did stan lee become editor-in chief"}},
              {"match": {"text": "when did stan lee become editor-in chief"}}
              ]
          }
      }
    }
search_result = client.search(index='', body=query_body)
for i, doc in enumerate(search_result['hits']['hits']):
    print(f'Result {i}:')
    print(f"Title: {doc['_source']['title']}")    
    print(f"Relevance score {doc['_score']}")

Result 0:
Title: Jim Lee
Relevance score 16.174313
Result 1:
Title: Lee Falk
Relevance score 15.261215
Result 2:
Title: Ralph Macchio (editor)
Relevance score 13.557625
Result 3:
Title: Stan Hart
Relevance score 11.0070915
Result 4:
Title: Stan Sakai
Relevance score 10.720789
Result 5:
Title: Jae Lee
Relevance score 10.716596
Result 6:
Title: Elaine Lee
Relevance score 10.5024185
Result 7:
Title: Jen Lee (cartoonist)
Relevance score 10.088491
Result 8:
Title: Stan Lee
Relevance score 9.549194
Result 9:
Title: Machine learning in video games
Relevance score 9.171778


In [93]:

query_body = \
    {
      "query": {"match_phrase": 
                    {"title": "stan lee"}
               }
    }
search_result = client.search(index='', body=query_body)
for i, doc in enumerate(search_result['hits']['hits']):
    print(f'Result {i}:')
    print(f"Title: {doc['_source']['title']}")    
    print(f"Relevance score {doc['_score']}")

Result 0:
Title: Stan Lee
Relevance score 4.7342873


In [None]:
query_body = \
    {
        "query": {
            "match_phrase" : {
                "message" : {
                    "query" : "this is a test",
                    "analyzer" : "my_analyzer"
                }
            }
        }
    }