# Information Retrieval with Okapi BM25
### In this notebook we will do the following:
- Create a text corpus by using descriptions of images
- Write a tokenizer (stopping, stemming, lemmatization)
- Use bm25 to rank documents for given queries

# Imports

In [None]:
import html
import re
from pathlib import Path

import datasets
import pandas as pd
import requests
import rich
import tqdm.auto
from IPython.core.display import HTML
from IPython.display import Image, JSON

#from nltk.corpus import stopwords
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

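##### NLTK is a natural language processing toolkit for removing stopwords, stemming and tokenization. Its import is commented out above: in this notebook those steps are performed by Elasticsearch analyzers instead.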

##### We load the dataset

In [None]:
dset = datasets.load_from_disk("../data/processed")

##### The dataset consists of 24995 rows; each row has columns that contain information about the photo, such as "photo description", "size", etc.

In [None]:
dset

##### We use "description_final" field of the photos to create a text corpus

In [None]:
dset['description_final'][:5]

In [None]:
dset[0]

In [None]:
# Connection settings for the Elasticsearch node used throughout the notebook.
ELASTIC_HOST = "localhost"
ELASTIC_INDEX = "unsplash"
ELASTIC_PORT = 9200

# Base URL for the raw REST calls made with `requests`.
ELASTIC_FULL_URL = "http://" + ELASTIC_HOST + ":" + str(ELASTIC_PORT)

## Elastic Search Default Analyzers and Tokenizers

### Elastic Search Analyzer

Elasticsearch ships with many built-in analyzers.

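Analyzers are composed of `tokenizers` and `normalizers` (the Elasticsearch documentation calls the latter *character filters* and *token filters*).

tokenization: breaking a text down into smaller chunks (tokens)

normalizers: transform each token, e.g. lowercasing, removing stop words or stemming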

[ElasticDoc](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-overview.html)

[Documentation for analyzers](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html)

### Built in tokenizers

In [None]:
def elastic_tokenize(tokenizer, text, url=ELASTIC_FULL_URL + "/_analyze"):
    """Send `text` to the `_analyze` endpoint with the named tokenizer and print the reply.

    tokenizer: value sent as the `tokenizer` field of the request body.
    text: value sent as the `text` field of the request body.
    url: endpoint that receives the POST request.

    The decoded JSON response is pretty-printed with `rich`; nothing is returned.
    """
    payload = {"tokenizer": tokenizer, "text": text}
    response = requests.post(url, json=payload)
    rich.print(response.json())
    
    
    

In [None]:
sentence = "<p> ELASTICSEARCH is built on top of the open-source <b>Apache Lucene</b>. </p>"

whitespace tokenizer

In [None]:
elastic_tokenize (tokenizer= "whitespace",  text= sentence)

standard tokenizer

In [None]:
elastic_tokenize (tokenizer= "standard",  text= sentence)

ngram tokenizer

In [None]:
elastic_tokenize (tokenizer= "ngram",  text= "Quick")

### Analyzers

In [None]:
def elastic_analyze(analyzer, text, url=ELASTIC_FULL_URL + "/_analyze"):
    """Send `text` to the `_analyze` endpoint with the named analyzer and print the reply.

    analyzer: value sent as the `analyzer` field of the request body.
    text: value sent as the `text` field of the request body.
    url: endpoint that receives the POST request.

    The decoded JSON response is pretty-printed with `rich`; nothing is returned.
    """
    payload = {"analyzer": analyzer, "text": text}
    response = requests.post(url, json=payload)
    rich.print(response.json())
    

**whitespace analyzer**

The whitespace analyzer breaks text into terms whenever it encounters a whitespace character.



In [None]:
elastic_analyze(analyzer = "whitespace", text = sentence )

**stop analyzer**

breaks text into tokens at any non-letter character    
changes uppercase to lowercase.
also uses _english_ stop words.

In [None]:
elastic_analyze(analyzer = "stop", text = sentence )

**standard analyzer**

default analyzer       
grammar based tokenization
stopword disabled



In [None]:
elastic_analyze(analyzer = "standard", text = sentence )

In [None]:
sentence = "Two dogs playing in the snow"

In [None]:
elastic_analyze(analyzer = "whitespace", text = sentence )

In [None]:
elastic_analyze(analyzer = "stop", text = sentence )

In [None]:
elastic_analyze(analyzer = "standard", text = sentence )

In [None]:
elastic_analyze(analyzer = "simple", text = sentence )

In [None]:
elastic_analyze(analyzer = "english", text = sentence )

## Elastic Search Indexing

### Helper Code

In [None]:
def create_index(client, index: str, num_shards=3):
    """Create a fresh index, dropping any existing index with the same name first.

    client: object exposing the `indices` API (`exists`, `delete`, `create`).
    index: name of the index to (re)create.
    num_shards: value sent as the `number_of_shards` index setting.

    No explicit mapping is passed to `create`.
    """
    indices_api = client.indices

    # Start from a clean slate: remove a previous index with this name.
    already_present = indices_api.exists(index=index)
    if already_present:
        indices_api.delete(index=index)

    index_settings = {"number_of_shards": num_shards}
    indices_api.create(index=index, settings=index_settings)


def _is_missing(value) -> bool:
    """Return True only for scalar missing values (None, NaN, NaT, pd.NA)."""
    if value is None:
        return True
    # Containers (lists, numpy arrays, dicts, ...) are never treated as missing.
    # Calling `pd.isna` on them returns an array, which cannot be used as a
    # single truth value.
    if not pd.api.types.is_scalar(value):
        return False
    return bool(pd.isna(value))


def generate_docs(df:pd.DataFrame):
    """
    Given a dataframe containing photo data, yield one dictionary per row.

    Each dictionary holds the row's column values plus an `_id` key copied from
    the row's `photo_id` column (the Elasticsearch bulk helpers use `_id` as the
    document id). Keys whose value is a scalar missing value are left out, so
    NaN fields are not inserted. List-like values such as numpy arrays are
    always kept.

    Raises:
        KeyError: if `df` has no `photo_id` column.
    """
    for _, row in df.iterrows():
        doc = dict(row)

        # use photo_id as document id
        doc["_id"] = doc["photo_id"]

        # don't insert nan fields
        yield {key: value for key, value in doc.items() if not _is_missing(value)}
        


def fetch_results(client:Elasticsearch, query:str,  num_hits=5, fields = ["description_final"], analyzer ="stop"):
    """
    Run a `multi_match` search for `query` and return the client's response.

    client: Elasticsearch client whose `search` method is called.
    query: free-text query string.
    num_hits: passed to `search` as `size`.
    fields: passed as the `fields` of the `multi_match` clause.
        (Original note: an empty list searches all text fields; not verified here.)
    analyzer: passed as the `analyzer` of the `multi_match` clause.

    No `operator` is set, so the multi-match default (`or`) applies:
    https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html
    """
    match_clause = {"query": query, "fields": fields, "analyzer": analyzer}
    search_query = {"multi_match": match_clause}
    return client.search(query=search_query, size=num_hits)
    

        

### Index Documents


In [None]:
client = Elasticsearch(
    [ELASTIC_FULL_URL]
)

tell elastic search to create an index     
An ES index is a collection of documents. 

ES supports inferring the document schema (mapping) without specifying it beforehand 

In [None]:
# Three primary shards, matching the notebook's text: the shard-level scoring
# comparison (query_then_fetch vs dfs_query_then_fetch) shows no difference with one shard.
create_index(client, index=ELASTIC_INDEX, num_shards=3)

In [None]:
?client.indices.create

In [None]:
requests.get(f"{ELASTIC_FULL_URL}/_all/_settings").json()

The index we created is composed of `3` shards and `1` replica.   

When searching, ES queries each shard independently and then combines the results

In [None]:
len(dset)

In [None]:
df_subset = dset.to_pandas()
number_of_docs = len(df_subset)

Bulk insert all of our documents

In [None]:
df_subset

In [None]:
df_subset.iloc[0].to_dict()

In [None]:
# Stream every document into the index; the bar advances once per bulk result.
successes = 0
with tqdm.auto.tqdm(total=number_of_docs, unit="docs") as pbar:
    for ok, action in streaming_bulk(
        client=client,
        index=ELASTIC_INDEX,
        actions=generate_docs(df_subset),
    ):
        pbar.update(1)
        successes += ok


Inserting `~25k` documents at `3000` docs/sec on a single node is pretty good

In [None]:
rich.print (
    requests.get(f"{ELASTIC_FULL_URL}/_cat/shards/{ELASTIC_INDEX}?v=true").content.decode()
    
)

In [None]:
rich.print (
    requests.get(f"{ELASTIC_FULL_URL}/_cat/nodes?v=true").content.decode()
    
)

In [None]:
rich.print (
    requests.get(f"{ELASTIC_FULL_URL}/{ELASTIC_INDEX}/_mapping").json()
    
)

Note that, by default, the inferred schema stores text content both as full text (`text`) and as a `keyword` sub-field.      
The keyword sub-field is not indexed for values longer than 256 characters (`ignore_above`), not 256 tokens.

[ignore_above reference](https://www.elastic.co/guide/en/elasticsearch/reference/current/ignore-above.html)

## Evaluate

In [None]:
?client.get

getting a specific document by their id

In [None]:
resp = client.get(index=ELASTIC_INDEX, id="XMyPniM9LF0")
resp.body

retrieve a document with a query

In [None]:
query = "Two dogs playing in the snow"

In [None]:
# https://stackoverflow.com/questions/34147471/elasticsearch-how-to-search-for-a-value-in-any-field-across-all-types-in-one


# No `fields` restriction is given for this multi_match query.
resp = client.search(
    query={"multi_match": {"query": query}},
    size=5,
    explain=False,
)

In [None]:
JSON(resp.body, expanded = True)

### Explain the score

In [None]:
query

In [None]:
# Ask for the per-hit score explanation and return only the description field.
resp = client.search(
    query={
        "multi_match": {
            "query": query,
            "fields": ["description_final"],
        }
    },
    size=2,
    explain=True,
    source=["description_final"],
)

In [None]:
JSON (resp.body , expanded=True)

#print ( json.dumps(resp.body, indent=2) )

In `hits.hits[idx]['_explanation']`, we see the individual score computed for each of the components that make up BM25, for example:
```
weight(Title:pandas in 35543) [PerFieldSimilarity], result of:"
```

In [None]:
# resp = client.search(
#     query = {
#         "bool" : {
#           "must" : {
#             "multi_match" : { "query" : query, "fields": ["Title"] }
#           },
#           "filter": {
#             "term" : { "_id" : "55047745" }
#           }
#         }
#       }

#     , size=2
#     , explain=True
#     , source = ["Title"]
# )

### Distributed tf-idf

we are running an elastic search cluster with three shards.

ES has two ways to compute the distributed term frequencies



`query_then_fetch`     
(Default) Distributed term frequencies are calculated locally for each shard running the search.    

We recommend this option for faster searches with potentially less accurate scoring.

`dfs_query_then_fetch`    
Distributed term frequencies are calculated globally, using information gathered from all shards running the search.   
While this option increases the accuracy of scoring, it adds a round-trip to each shard, which can result in slower searches.

taken from ES [docs](https://www.elastic.co/guide/en/elasticsearch/reference/8.4/search-search.html)

searching with the default mode

In [None]:
# Same query as before, with the search type set explicitly to `query_then_fetch`.
resp = client.search(
    query={
        "multi_match": {
            "query": query,
            "fields": ["description_final"],
        }
    },
    size=2,
    # explain=True,
    source=["description_final"],
    search_type="query_then_fetch",
)

In [None]:
JSON (resp.body , expanded=True)


searching with the global dfs mode

In [None]:
# Same query again, this time with the search type set to `dfs_query_then_fetch`.
resp = client.search(
    query={
        "multi_match": {
            "query": query,
            "fields": ["description_final"],
        }
    },
    size=2,
    # explain=True,
    source=["description_final"],
    search_type="dfs_query_then_fetch",
)

In [None]:
JSON (resp.body , expanded=True)


score difference between the different search types

before: 14.775831, 13.5637
    
after:  14.706409, 13.708656

##### Let's go over the method below. It takes the search query and a `k` value, the number of top results to return.
- Stop, stem and tokenize the query
- Get bm25 scores of the documents
- Sort the documents by bm25 scores and get top k

In [None]:
def fetch_results(client:Elasticsearch, query:str,  num_hits=5, fields = ["description_final"], analyzer ="stop"):
    """
    Search with a `multi_match` query and hand back whatever `client.search` returns.

    Note: this cell redefines the `fetch_results` helper declared earlier in the
    notebook with the same signature and request.

    client: Elasticsearch client used to run the search.
    query: text to search for.
    num_hits: maximum number of hits requested (`size`).
    fields: fields listed in the `multi_match` clause.
        (Original note: an empty list searches all text fields; not verified here.)
    analyzer: analyzer named in the `multi_match` clause.

    `operator` is left unset, so multi-match uses its default, `or`:
    https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html
    """
    request = {
        "multi_match": {
            "query": query,
            "fields": fields,
            "analyzer": analyzer,
        },
    }
    resp = client.search(query=request, size=num_hits)
    return resp
    

In [None]:
def find_results(query:str , k =5, analyzer="english"):
    """Search for `query` and render the top `k` photos inline.

    Calls `fetch_results` with the notebook-level `client`, then displays, for
    each hit, the photo, its description, an Unsplash attribution link and the
    hit's relevance score.

    query: free-text search string.
    k: number of hits to request.
    analyzer: analyzer name forwarded to `fetch_results`.

    All text taken from the query or from the indexed documents is HTML-escaped
    before being placed into markup, so characters such as `<` or `&` are shown
    literally instead of being interpreted as HTML.
    """
    top_items = fetch_results(client, query=query, num_hits=k, analyzer=analyzer)

    display(HTML(f"<h1>Query: {html.escape(query)} </h1>"))

    # Iterate over the top k results
    for hit in top_items['hits']['hits']:
        photo_data = hit["_source"]

        # Display the photo
        display(Image(url=photo_data["photo_image_url"] + "?w=200"))

        description = html.escape(str(photo_data["description_final"]))
        username = html.escape(str(photo_data["photographer_username"]))
        # Name parts may be absent from a document, so fall back to "".
        first_name = html.escape(str(photo_data.get("photographer_first_name", "")))
        last_name = html.escape(str(photo_data.get("photographer_last_name", "")))

        # Display the attribution text; `_score` is a relevance score
        # (higher means a better match), not a distance.
        display(HTML(f"""
                     Photo title: {description}   <br/>
                     Photo by <a href="https://unsplash.com/@{username}?utm_source=SearchWorkshop&utm_medium=referral">{first_name} {last_name}</a> on <a href="https://unsplash.com/?utm_source=SearchWorkshop&utm_medium=referral">Unsplash</a> <br/>
                     Score: {hit['_score']}
                     """
                                        ))
        print()

In [None]:
find_results( "Two dogs playing in the snow", analyzer="english")

In [None]:
find_results( "Two dogs playing in the snow", analyzer="whitespace")

In [None]:
find_results( "boy and girl on a beach")

In [None]:
find_results( "image of a man in a desert")

In [None]:
find_results( "light at the end of the tunnel")



In [None]:
%%timeit
search_query = "Two dogs playing in the snow"
k =5 
top_items = fetch_results(client,query=query,num_hits=k)

