# Information Retrieval with Okapi BM25
### In this notebook we will do the following:
- Create a text corpus by using descriptions of images
- Write a tokenizer (stopping, stemming, lemmatization)
- Use bm25 to rank documents for given queries

In [1]:
import html
import re
from pathlib import Path

import datasets
import pandas as pd
import requests
import rich
import tqdm.auto
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk
from IPython.core.display import HTML
from IPython.display import Image, JSON

#from nltk.corpus import stopwords

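##### NLTK is a natural language processing toolkit that can be used for removing stopwords, stemming and tokenization. Its import is commented out above because Elasticsearch's analyzers handle tokenization in this notebook.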

##### We load the dataset

In [2]:
dset = datasets.load_from_disk("../data/processed")

##### The dataset consists of 24995 rows, and each row has columns that contain information about the photo, like "photo description", "size", etc.

In [3]:
dset

Dataset({
    features: ['photo_id', 'photo_url', 'photo_image_url', 'photo_submitted_at', 'photo_featured', 'photo_width', 'photo_height', 'photo_aspect_ratio', 'photo_description', 'photographer_username', 'photographer_first_name', 'photographer_last_name', 'exif_camera_make', 'exif_camera_model', 'exif_iso', 'exif_aperture_value', 'exif_focal_length', 'exif_exposure_time', 'photo_location_name', 'photo_location_latitude', 'photo_location_longitude', 'photo_location_country', 'photo_location_city', 'stats_views', 'stats_downloads', 'ai_description', 'ai_primary_landmark_name', 'ai_primary_landmark_latitude', 'ai_primary_landmark_longitude', 'ai_primary_landmark_confidence', 'blur_hash', 'description_final', 'image'],
    num_rows: 24995
})

##### We use "description_final" field of the photos to create a text corpus

In [4]:
dset['description_final'][:5]

['Woman exploring a forest',
 'Succulents in a terrarium',
 'Rural winter mountainside',
 'Poppy seeds and flowers',
 'Silhouette near dark trees']

In [5]:
dset[0]

{'photo_id': 'XMyPniM9LF0',
 'photo_url': 'https://unsplash.com/photos/XMyPniM9LF0',
 'photo_image_url': 'https://images.unsplash.com/uploads/14119492946973137ce46/f1f2ebf3',
 'photo_submitted_at': '2014-09-29 00:08:38.594364',
 'photo_featured': 't',
 'photo_width': 4272,
 'photo_height': 2848,
 'photo_aspect_ratio': 1.5,
 'photo_description': 'Woman exploring a forest',
 'photographer_username': 'michellespencer77',
 'photographer_first_name': 'Michelle',
 'photographer_last_name': 'Spencer',
 'exif_camera_make': 'Canon',
 'exif_camera_model': 'Canon EOS REBEL T3',
 'exif_iso': 400.0,
 'exif_aperture_value': '1.8',
 'exif_focal_length': '50.0',
 'exif_exposure_time': '1/100',
 'photo_location_name': None,
 'photo_location_latitude': None,
 'photo_location_longitude': None,
 'photo_location_country': None,
 'photo_location_city': None,
 'stats_views': 2375421,
 'stats_downloads': 6967,
 'ai_description': 'woman walking in the middle of forest',
 'ai_primary_landmark_name': None,
 'ai_

In [6]:
# Where the Elasticsearch node lives and which index this notebook works with.
ELASTIC_HOST = "localhost"
ELASTIC_PORT = 9200
ELASTIC_INDEX = "unsplash"

# Base URL used for the raw REST calls and for the Python client.
ELASTIC_FULL_URL = f"http://{ELASTIC_HOST}:{ELASTIC_PORT}"

## Elastic Search Default Analyzers and Tokenizers

### Elastic Search Analyzer

Elasticsearch has many built-in analyzers.

Analyzers are composed of `tokenizers` and `normalizers`.

tokenization: breaking a text down into smaller chunks 

normalizers: format the token

[ElasticDoc](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-overview.html)

[Documentation for analyzers](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html)

### Built in tokenizers

In [7]:
def elastic_tokenize(tokenizer, text, url=ELASTIC_FULL_URL + "/_analyze"):
    """Tokenize `text` with an Elasticsearch tokenizer and pretty-print the response.

    Args:
        tokenizer: Name of the tokenizer to apply (e.g. "whitespace", "standard").
        text: The text to tokenize.
        url: `_analyze` endpoint the request is posted to.

    Returns:
        None; the decoded JSON response is printed with `rich`.
    """
    payload = {"tokenizer": tokenizer, "text": text}
    response = requests.post(url, json=payload)
    rich.print(response.json())
    
    
    

In [8]:
sentence = "<p> ELASTICSEARCH is built on top of the open-source <b>Apache Lucene</b>. </p>"

whitespace tokenizer

In [9]:
elastic_tokenize (tokenizer= "whitespace",  text= sentence)

standard tokenizer

In [10]:
elastic_tokenize (tokenizer= "standard",  text= sentence)

ngram tokenizer

In [11]:
elastic_tokenize (tokenizer= "ngram",  text= "Quick")

### Analyzers

In [12]:
def elastic_analyze(analyzer, text, url=ELASTIC_FULL_URL + "/_analyze"):
    """Run `text` through an Elasticsearch analyzer and pretty-print the response.

    Args:
        analyzer: Name of the analyzer to apply (e.g. "whitespace", "stop", "standard").
        text: The text to analyze.
        url: `_analyze` endpoint the request is posted to.

    Returns:
        None; the decoded JSON response is printed with `rich`.
    """
    payload = {"analyzer": analyzer, "text": text}
    response = requests.post(url, json=payload)
    rich.print(response.json())
    

**whitespace analyzer**

The whitespace analyzer breaks text into terms whenever it encounters a whitespace character.



In [13]:
elastic_analyze(analyzer = "whitespace", text = sentence )

**stop analyzer**

breaks text into tokens at any non-letter character    
changes uppercase to lowercase.
also uses _english_ stop words.

In [14]:
elastic_analyze(analyzer = "stop", text = sentence )

**standard analyzer**

default analyzer       
grammar based tokenization
stopword disabled



In [15]:
elastic_analyze(analyzer = "standard", text = sentence )

## Elastic Search Indexing

### Helper Code

In [46]:
def create_index(client, index: str, num_shards=3):
    """Create the index `index`, first deleting an existing index of the same name.

    Args:
        client: Elasticsearch client; only its `indices` API is used.
        index: Name of the index to (re)create.
        num_shards: Value sent as the `number_of_shards` index setting.
    """
    indices_api = client.indices

    # Remove a previous index with this name, if there is one.
    already_present = indices_api.exists(index=index)
    if already_present:
        indices_api.delete(index=index)

    # Only settings are sent; no explicit mappings are defined here.
    index_settings = {"number_of_shards": num_shards}
    indices_api.create(index=index, settings=index_settings)


def _is_missing_value(value) -> bool:
    """Return True for None and for scalar NaN/NaT/NA values; containers are never missing."""
    if value is None:
        return True
    # pd.isna() works element-wise on array-likes and the result has no single
    # truth value, so it is only consulted for scalars.
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def generate_docs(df: pd.DataFrame):
    """Yield one dictionary per row of `df`, ready to be bulk-indexed.

    Each dictionary holds the row's columns plus an `_id` key copied from the
    row's `photo_id`, so the photo id becomes the Elasticsearch document id.
    Entries whose value is None or a scalar NaN are left out; lists, dicts and
    arrays are always kept as they are.

    Args:
        df: Dataframe of photo metadata; must contain a `photo_id` column.

    Yields:
        dict: Column name -> value for every non-missing field, plus `_id`.

    Raises:
        KeyError: If `df` has no `photo_id` column.
    """
    for _, row in df.iterrows():
        doc = {**row}

        # use the photo id as the document id
        doc["_id"] = doc["photo_id"]

        # don't insert missing (None / NaN) fields
        yield {key: value for key, value in doc.items() if not _is_missing_value(value)}
        


def fetch_results(client: "Elasticsearch", query: str, num_hits=5, fields=None, index=None):
    """
    Search with the passed Elasticsearch client and return the raw response for
    documents matching `query` in the fields given by `fields`.

    A `multi_match` query is used, which by default combines the query terms with `or`:
    https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html

    Args:
        client: Elasticsearch client used to run the search.
        query: Free-text search query.
        num_hits: Maximum number of hits to return.
        fields: Fields to search. Defaults to ["description_final"] when None.
            If an empty list is passed, all text fields are searched.
        index: Optional index name to restrict the search to. None (the default)
            does not restrict the search to a particular index.

    Returns:
        The response object returned by `client.search`.
    """
    # Build the default per call instead of sharing one mutable list across calls.
    search_fields = ["description_final"] if fields is None else fields

    return client.search(
        index=index,
        query={
            "multi_match": {
                "query": query,
                "fields": search_fields,
            }
        },
        size=num_hits,
    )
    

        

### Index Documents


In [17]:
# Python client for the node at ELASTIC_FULL_URL; the indexing and search cells reuse it.
client = Elasticsearch(hosts=[ELASTIC_FULL_URL])

tell elastic search to create an index     
An ES index is a collection of documents. 

ES supports inferring the schema of the documents, so it does not have to be specified beforehand.

In [18]:
create_index(client, index= ELASTIC_INDEX)

In [19]:
?client.indices.create

[0;31mSignature:[0m
[0mclient[0m[0;34m.[0m[0mindices[0m[0;34m.[0m[0mcreate[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maliases[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mMapping[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mMapping[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mAny[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0merror_trace[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mbool[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfilter_path[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mTuple[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0;34m...[0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[

In [20]:
requests.get(f"{ELASTIC_FULL_URL}/_all/_settings").json()

{'unsplash': {'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}},
    'number_of_shards': '3',
    'provided_name': 'unsplash',
    'creation_date': '1668113426018',
    'number_of_replicas': '1',
    'uuid': 'ARGIs_I3TkCoQIIfCAKfQA',
    'version': {'created': '8040399'}}}}}

The index we created is composed of `3` shards and `1` replica.   

When searching, ES queries each shard independently and combines the results.

In [21]:
len(dset)

24995

In [22]:
# Convert the whole dataset to pandas (despite its name, `df_subset` is not a subset here).
df_subset = dset.to_pandas()
# Row count, used as the progress-bar total during bulk indexing.
number_of_docs = len(df_subset)

Bulk insert all of our documents

In [23]:
df_subset

Unnamed: 0,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,photographer_username,...,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence,blur_hash,description_final,image
0,XMyPniM9LF0,https://unsplash.com/photos/XMyPniM9LF0,https://images.unsplash.com/uploads/1411949294...,2014-09-29 00:08:38.594364,t,4272,2848,1.50,Woman exploring a forest,michellespencer77,...,2375421,6967,woman walking in the middle of forest,,,,,L56bVcRRIWMh.gVunlS4SMbsRRxr,Woman exploring a forest,"{'bytes': None, 'path': '../data/raw/images/XM..."
1,rDLBArZUl1c,https://unsplash.com/photos/rDLBArZUl1c,https://images.unsplash.com/photo-141633941111...,2014-11-18 19:36:57.08945,t,3000,4000,0.75,Succulents in a terrarium,ugmonk,...,13784815,82141,succulent plants in clear glass terrarium,,,,,LvI$4txu%2s:_4t6WUj]xat7RPoe,Succulents in a terrarium,"{'bytes': None, 'path': '../data/raw/images/rD..."
2,cNDGZ2sQ3Bo,https://unsplash.com/photos/cNDGZ2sQ3Bo,https://images.unsplash.com/photo-142014251503...,2015-01-01 20:02:02.097036,t,2564,1710,1.50,Rural winter mountainside,johnprice,...,1302461,3428,rocky mountain under gray sky at daytime,,,,,LhMj%NxvM{t7_4t7aeoM%2M{ozj[,Rural winter mountainside,"{'bytes': None, 'path': '../data/raw/images/cN..."
3,iuZ_D1eoq9k,https://unsplash.com/photos/iuZ_D1eoq9k,https://images.unsplash.com/photo-141487280988...,2014-11-01 20:15:13.410073,t,2912,4368,0.67,Poppy seeds and flowers,krisatomic,...,2890238,33704,red common poppy flower selective focus phography,,,,,LSC7DirZAsX7}Br@GEWWmnoLWCnj,Poppy seeds and flowers,"{'bytes': None, 'path': '../data/raw/images/iu..."
4,BeD3vjQ8SI0,https://unsplash.com/photos/BeD3vjQ8SI0,https://images.unsplash.com/photo-141700759404...,2014-11-26 13:13:50.134383,t,4896,3264,1.50,Silhouette near dark trees,jonaseriksson,...,8704860,49662,trees during night time,,,,,L25|_:V@0hxtI=W;odae0ht6=^NG,Silhouette near dark trees,"{'bytes': None, 'path': '../data/raw/images/Be..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24990,c7OrOMxrurA,https://unsplash.com/photos/c7OrOMxrurA,https://images.unsplash.com/photo-159300793778...,2020-06-24 14:12:32.397564,t,4160,6240,0.67,,andyadcon,...,1603469,4757,black metal fence during daytime,,,,,L34d_aJ-I:R*tlxGWUjY1y$i$hsm,black metal fence during daytime,"{'bytes': None, 'path': '../data/raw/images/c7..."
24991,15IuQ5a0Qwg,https://unsplash.com/photos/15IuQ5a0Qwg,https://images.unsplash.com/photo-159296761254...,2020-06-24 03:00:42.603563,t,6000,4000,1.50,Pearl earrings and seashells,contentpixie,...,550016,2544,white and brown seashell on white surface,,,,,LAM%_?_NNIH?xvRPx]kBajRPWAxv,Pearl earrings and seashells,"{'bytes': None, 'path': '../data/raw/images/15..."
24992,w8nrcXz8pwk,https://unsplash.com/photos/w8nrcXz8pwk,https://images.unsplash.com/photo-159299937329...,2020-06-24 11:53:00.668613,t,2584,4592,0.56,,maur1ts,...,500831,3923,leopard on brown tree trunk during daytime,,,,,LlK1wK00M{%MxvV@x[tRM|oyt8t7,leopard on brown tree trunk during daytime,"{'bytes': None, 'path': '../data/raw/images/w8..."
24993,n1jHrRhehUI,https://unsplash.com/photos/n1jHrRhehUI,https://images.unsplash.com/photo-159192792878...,2020-06-12 02:13:04.409162,t,3533,4824,0.73,Floral truck in the streets of Rome,keithalva,...,335692,1734,woman in beige coat and white hat standing on ...,,,,,LOIhKfV@0J%N~WM{sT-=g4M{Mxx],Floral truck in the streets of Rome,"{'bytes': None, 'path': '../data/raw/images/n1..."


In [24]:
df_subset.iloc[0].to_dict()

{'photo_id': 'XMyPniM9LF0',
 'photo_url': 'https://unsplash.com/photos/XMyPniM9LF0',
 'photo_image_url': 'https://images.unsplash.com/uploads/14119492946973137ce46/f1f2ebf3',
 'photo_submitted_at': '2014-09-29 00:08:38.594364',
 'photo_featured': 't',
 'photo_width': 4272,
 'photo_height': 2848,
 'photo_aspect_ratio': 1.5,
 'photo_description': 'Woman exploring a forest',
 'photographer_username': 'michellespencer77',
 'photographer_first_name': 'Michelle',
 'photographer_last_name': 'Spencer',
 'exif_camera_make': 'Canon',
 'exif_camera_model': 'Canon EOS REBEL T3',
 'exif_iso': 400.0,
 'exif_aperture_value': '1.8',
 'exif_focal_length': '50.0',
 'exif_exposure_time': '1/100',
 'photo_location_name': None,
 'photo_location_latitude': nan,
 'photo_location_longitude': nan,
 'photo_location_country': None,
 'photo_location_city': None,
 'stats_views': 2375421,
 'stats_downloads': 6967,
 'ai_description': 'woman walking in the middle of forest',
 'ai_primary_landmark_name': None,
 'ai_pr

In [25]:
# Stream every row into the index with the bulk helper, advancing the bar once per result.
successes = 0  # running count of results reported as ok

with tqdm.auto.tqdm(total=number_of_docs, unit="docs") as pbar:
    for ok, action in streaming_bulk(
        client=client,
        index=ELASTIC_INDEX,
        actions=generate_docs(df_subset),
    ):
        pbar.update(1)
        successes += ok


  0%|          | 0/24995 [00:00<?, ?docs/s]

Inserting `25k` documents at `3000` docs/sec on a single node is pretty good

In [26]:
# Print the cat-shards listing for our index (v=true adds the column headers).
rich.print (
    requests.get(f"{ELASTIC_FULL_URL}/_cat/shards/{ELASTIC_INDEX}?v=true").content.decode()
    
)

In [28]:
# Print the cat-nodes listing for the cluster (v=true adds the column headers).
rich.print (
    requests.get(f"{ELASTIC_FULL_URL}/_cat/nodes?v=true").content.decode()
    
)

In [29]:
# Print the mapping (schema) Elasticsearch currently holds for our index.
rich.print (
    requests.get(f"{ELASTIC_FULL_URL}/{ELASTIC_INDEX}/_mapping").json()
    
)

Note that by default the inferred schema indexes string content both as full text (`text`) and as a `keyword` sub-field.  
The `keyword` sub-field is skipped when the value is longer than 256 characters (`ignore_above`).

[ignore_above reference](https://www.elastic.co/guide/en/elasticsearch/reference/current/ignore-above.html)

## Evaluate

In [30]:
?client.get

[0;31mSignature:[0m
[0mclient[0m[0;34m.[0m[0mget[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mid[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0merror_trace[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mbool[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfilter_path[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mTuple[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0;34m...[0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhuman[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mbool[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpreference[0m[0;34m:

Get a specific document by its id

In [32]:
resp = client.get(index=ELASTIC_INDEX, id="XMyPniM9LF0")
resp.body

{'_index': 'unsplash',
 '_id': 'XMyPniM9LF0',
 '_version': 1,
 '_seq_no': 0,
 '_primary_term': 1,
 'found': True,
 '_source': {'photo_id': 'XMyPniM9LF0',
  'photo_url': 'https://unsplash.com/photos/XMyPniM9LF0',
  'photo_image_url': 'https://images.unsplash.com/uploads/14119492946973137ce46/f1f2ebf3',
  'photo_submitted_at': '2014-09-29 00:08:38.594364',
  'photo_featured': 't',
  'photo_width': 4272,
  'photo_height': 2848,
  'photo_aspect_ratio': 1.5,
  'photo_description': 'Woman exploring a forest',
  'photographer_username': 'michellespencer77',
  'photographer_first_name': 'Michelle',
  'photographer_last_name': 'Spencer',
  'exif_camera_make': 'Canon',
  'exif_camera_model': 'Canon EOS REBEL T3',
  'exif_iso': 400.0,
  'exif_aperture_value': '1.8',
  'exif_focal_length': '50.0',
  'exif_exposure_time': '1/100',
  'stats_views': 2375421,
  'stats_downloads': 6967,
  'ai_description': 'woman walking in the middle of forest',
  'blur_hash': 'L56bVcRRIWMh.gVunlS4SMbsRRxr',
  'descri

retrieve a document with a query

In [33]:
query = "Two dogs playing in the snow"

In [34]:
# multi_match without a "fields" key, so the search is not restricted to a single field.
# See https://stackoverflow.com/questions/34147471/elasticsearch-how-to-search-for-a-value-in-any-field-across-all-types-in-one
resp = client.search(
    query={"multi_match": {"query": query}},
    size=5,
    explain=False,
)

In [35]:
JSON(resp.body, expanded = True)

<IPython.core.display.JSON object>

### Explain the score

In [38]:
# Search only `description_final`; explain=True asks for the score explanation of each hit,
# and only `description_final` is returned in `_source`.
resp = client.search(
    query={"multi_match": {"query": query, "fields": ["description_final"]}},
    size=2,
    explain=True,
    source=["description_final"],
)

In [40]:
JSON (resp.body , expanded=True)

#print ( json.dumps(resp.body, indent=2) )

<IPython.core.display.JSON object>

In `hits.hits[idx]['_explanation']`, we see the individual score computed for each of the components that make up BM25, for example:
```
weight(Title:pandas in 35543) [PerFieldSimilarity], result of:"
```

In [41]:
# resp = client.search(
#     query = {
#         "bool" : {
#           "must" : {
#             "multi_match" : { "query" : query, "fields": ["Title"] }
#           },
#           "filter": {
#             "term" : { "_id" : "55047745" }
#           }
#         }
#       }

#     , size=2
#     , explain=True
#     , source = ["Title"]
# )

### Distributed tf-idf

we are running an elastic search cluster with three shards.

ES has two ways to compute the distributed term frequencies



`query_then_fetch`     
(Default) Distributed term frequencies are calculated locally for each shard running the search.    

We recommend this option for faster searches with potentially less accurate scoring.

`dfs_query_then_fetch`    
Distributed term frequencies are calculated globally, using information gathered from all shards running the search.   
While this option increases the accuracy of scoring, it adds a round-trip to each shard, which can result in slower searches.

taken from ES [docs](https://www.elastic.co/guide/en/elasticsearch/reference/8.4/search-search.html)

searching with the default mode

In [42]:
# Top-2 search on `description_final` with the default search type spelled out explicitly.
resp = client.search(
    query={"multi_match": {"query": query, "fields": ["description_final"]}},
    size=2,
    source=["description_final"],
    search_type="query_then_fetch",
)

In [43]:
JSON (resp.body , expanded=True)


<IPython.core.display.JSON object>

searching with the global dfs mode

In [44]:
# Same top-2 search, but with the `dfs_query_then_fetch` search type.
resp = client.search(
    query={"multi_match": {"query": query, "fields": ["description_final"]}},
    size=2,
    source=["description_final"],
    search_type="dfs_query_then_fetch",
)

In [45]:
JSON (resp.body , expanded=True)


<IPython.core.display.JSON object>

Score difference between the two search types (top two hits):

before (`query_then_fetch`): 14.775831, 13.5637
    
after (`dfs_query_then_fetch`):  14.706409, 13.708656

##### Let's go over the method below. It takes the search query and `k`, the maximum number of results to return.
- Send the query to Elasticsearch, which analyzes (tokenizes) it
- Elasticsearch scores the matching documents with BM25
- The top `k` documents are returned and displayed together with their scores

In [62]:
def find_results(query:str , k =5):
    """Search the index for `query` and render the top `k` hits in the notebook.

    Calls `fetch_results` with the notebook-level `client`, then for each hit
    displays the image (requested with `?w=200`), the photo description, an
    Unsplash attribution link and the hit's Elasticsearch `_score`.

    Args:
        query: Free-text search query.
        k: Maximum number of hits to request and display.

    Returns:
        None; everything is rendered through `display`.
    """
    top_items = fetch_results(client, query=query, num_hits=k)

    # The query is free text, so escape it before embedding it in HTML.
    display(HTML(f"<h1>Query: {html.escape(query)} </h1>"))

    # Iterate over the top k results
    for hit in top_items['hits']['hits']:
        photo_data = hit["_source"]

        def field(name):
            # generate_docs drops empty fields at indexing time, so optional metadata
            # (e.g. a photographer's last name) may be absent from `_source`.
            return html.escape(str(photo_data.get(name, "")))

        # Display the photo
        display(Image(url=photo_data["photo_image_url"] + "?w=200"))

        # Display the attribution text. `_score` is a relevance score (higher is a
        # better match), not a distance, so it is labelled accordingly.
        display(HTML(f"""
                     Photo title: {field("description_final")}   <br/>
                     Photo by <a href="https://unsplash.com/@{field("photographer_username")}?utm_source=SearchWorkshop&utm_medium=referral">{field("photographer_first_name")} {field("photographer_last_name")}</a> on <a href="https://unsplash.com/?utm_source=SearchWorkshop&utm_medium=referral">Unsplash</a> <br/>
                     Score: {hit['_score']}
                     """
                                        ))
        print()

In [63]:
find_results( "Two dogs playing in the snow")
















In [None]:
find_results( "boy and girl on a beach")

In [None]:
find_results( "image of a man in a desert")

In [64]:
%%timeit
search_query = "Two dogs playing in the snow"
k =5 
top_items = fetch_results(client,query=query,num_hits=k)



4.65 ms ± 780 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
