## Semantic Search Tutorial

### BM25

In [None]:
!pip install faiss-gpu
!pip install datasets evaluate transformers[sentencepiece]

In [1]:
import torch
from torch import nn
from torch.functional import tensordot
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.nn import CosineEmbeddingLoss
from torch import Tensor

from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import DPRContextEncoder
from typing import List, Dict

import os
from tqdm import tqdm
import numpy as np
import json
import pandas as pd



2024-07-06 13:54:54.670161: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-06 13:54:54.693335: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-06 13:54:54.693356: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-06 13:54:54.693947: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-06 13:54:54.697889: I tensorflow/core/platform/cpu_feature_guar

In [2]:
#load the data
df = pd.read_csv("imdb_top_10k.csv")
# df = df[['Movie Name', 'Genre', 'Plot', 'Directors']]
# df = df.dropna()
df.head()

Unnamed: 0,ID,Movie Name,Rating,Runtime,Genre,Metascore,Plot,Directors,Stars,Votes,Gross,Link
0,1,The Shawshank Redemption,9.3,142 min,Drama,82.0,"Over the course of several years, two convicts...","['Frank Darabont', 'Tim Robbins', 'Morgan Free...","['Tim Robbins', 'Morgan Freeman', 'Bob Gunton'...",2752419,28341469,https://www.imdb.com/title/tt0111161/
1,2,The Godfather,9.2,175 min,"Crime, Drama",100.0,"Don Vito Corleone, head of a mafia family, dec...","['Francis Ford Coppola', 'Marlon Brando', 'Al ...","['Marlon Brando', 'Al Pacino', 'James Caan', '...",1914751,134966411,https://www.imdb.com/title/tt0068646/
2,3,Ramayana: The Legend of Prince Rama,9.2,135 min,"Animation, Action, Adventure",,An anime adaptation of the Hindu epic the Rama...,"['Ram Mohan', 'Yûgô Sakô', 'Koichi Saski', 'Ar...","['Yûgô Sakô', 'Koichi Saski', 'Arun Govil', 'N...",10108,10108,https://www.imdb.com/title/tt0259534/
3,4,The Chaos Class,9.2,87 min,"Comedy, Drama",,"Lazy, uneducated students share a very close b...","['Ertem Egilmez', 'Kemal Sunal', 'Münir Özkul'...","['Kemal Sunal', 'Münir Özkul', 'Halit Akçatepe...",41785,41785,https://www.imdb.com/title/tt0252487/
4,5,Daman,9.1,121 min,"Adventure, Drama",,"The film is set in 2015. Sid, is a young docto...","['Lenka Debiprasad', 'Vishal Mourya', 'Karan K...","['Vishal Mourya', 'Karan Kandhapan', 'Babushan...",13303,13303,https://www.imdb.com/title/tt17592606/


In [3]:
print(df.shape)

(9999, 12)


In [4]:
def get_contexts(df: pd.DataFrame) -> List[Dict]:
    contexts = []
    for i, row in df.iterrows():
        contexts.append({
            'title': row['Movie Name'],
            'text': row['Plot'],
            'meta': {
                'genre': row['Genre'],
                'director': row['Directors'],
                'votes': row['Votes'],
                'rating': row['Rating'],
                'metascore': row['Metascore'],
            }
        })
    return contexts

contexts = get_contexts(df)

### Reference Implementation of BM25

In [5]:
from rank_bm25 import BM25Okapi

class BM25Search:
    def __init__(self, documents: List[str]):
        self.documents = documents
        self.tokenized_documents = [document.split() for document in documents]
        self.bm25 = BM25Okapi(self.tokenized_documents)
    
    def search(self, query: str, top_k: int = 5) -> List[int]:
        tokenized_query = query.split()
        doc_scores = self.bm25.get_scores(tokenized_query)
        # print(doc_scores)
        sorted_indices = np.argsort(doc_scores)[::-1]
        return sorted_indices[:top_k]
    
bm25_search = BM25Search([context['text'] for context in contexts])
query = "Batman"
retrieved_indices = bm25_search.search(query)
print(retrieved_indices)
for i in retrieved_indices:
    print(contexts[i]['title'], contexts[i]['text'])
    print()

[ 991 5220  855 6344 8178]
Batman: Mask of the Phantasm Batman is wrongly implicated in a series of murders of mob bosses actually done by a new vigilante assassin.

Batman: Gotham by Gaslight In an alternative Victorian Age Gotham City, Batman begins his war on crime while he investigates a new series of murders by Jack the Ripper.

The Batman When a sadistic serial killer begins murdering key political figures in Gotham, Batman is forced to investigate the city's hidden corruption and question his family's involvement.

Batman: The Killing Joke As Batman hunts for the escaped Joker, the Clown Prince of Crime attacks the Gordon family to prove a diabolical point mirroring his own fall into madness.

Batman and Harley Quinn Batman and Nightwing are forced to team with the Joker's sometimes-girlfriend Harley Quinn to stop a global threat brought about by Poison Ivy and Jason Woodrue, the Floronic Man.



You can use the retriever in your favorite package for RAG as well, i.e. LangChain, etc.

In [6]:
from langchain_community.retrievers import BM25Retriever

retriever = BM25Retriever.from_texts([context['text'] for context in contexts])
retrieved_indices = retriever.invoke(query)
retrieved_indices

[Document(page_content='Batman is wrongly implicated in a series of murders of mob bosses actually done by a new vigilante assassin.'),
 Document(page_content='In an alternative Victorian Age Gotham City, Batman begins his war on crime while he investigates a new series of murders by Jack the Ripper.'),
 Document(page_content="When a sadistic serial killer begins murdering key political figures in Gotham, Batman is forced to investigate the city's hidden corruption and question his family's involvement."),
 Document(page_content='As Batman hunts for the escaped Joker, the Clown Prince of Crime attacks the Gordon family to prove a diabolical point mirroring his own fall into madness.')]

### BM25 from scratch

In [7]:
# def term_freq(term: str, document: str) -> int:
#     #TODO implement this function
#     ...

# def inverse_doc_freq(term: str, documents: List[str]) -> float:
#     #TODO implement this function
#     ...

def bm25(term: str, document: str, documents: List[str], k1: float = 1.5, b: float = 0.75) -> float:
    #TODO implement this function
    ...


def bm25_similarity(query: str, document: str, documents: List[str]) -> float:
    query_terms = query.split()
    return sum(bm25(term, document, documents) for term in query_terms)

def get_bm25_topk(query: str, documents: List[str], k: int = 5) -> List[int]:
    scores = [bm25_similarity(query, document, documents) for document in documents]
    return sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]

#test
retrieved_indices = get_bm25_topk(query, [context['text'] for context in contexts])
for i in retrieved_indices:
    print(contexts[i]['title'], contexts[i]['text'])
    print()

Batman: Mask of the Phantasm Batman is wrongly implicated in a series of murders of mob bosses actually done by a new vigilante assassin.

Batman: Gotham by Gaslight In an alternative Victorian Age Gotham City, Batman begins his war on crime while he investigates a new series of murders by Jack the Ripper.

The Batman When a sadistic serial killer begins murdering key political figures in Gotham, Batman is forced to investigate the city's hidden corruption and question his family's involvement.

Batman: The Killing Joke As Batman hunts for the escaped Joker, the Clown Prince of Crime attacks the Gordon family to prove a diabolical point mirroring his own fall into madness.

Batman v Superman: Dawn of Justice Fearing that the actions of Superman are left unchecked, Batman takes on the Man of Steel, while the world wrestles with what kind of a hero it really needs.



### Neural Retrieval

Lets make it a huggingface dataset out of pure convenience

In [7]:
from datasets import Dataset

movie_dataset = Dataset.from_pandas(df)
movie_dataset

Dataset({
    features: ['ID', 'Movie Name', 'Rating', 'Runtime', 'Genre', 'Metascore', 'Plot', 'Directors', 'Stars', 'Votes', 'Gross', 'Link'],
    num_rows: 9999
})

In [8]:
def concatenate_text(data):
    
    return {"text": data['Movie Name'] + " \n" + data['Genre'] + " \n" + data['Plot'] + " \n" + data['Directors'] + " \n" + str(data['Votes']) + " \n" + str(data['Rating']) + " \n" + str(data['Metascore'])}

movie_dataset = movie_dataset.map(concatenate_text)

Map:   0%|          | 0/9999 [00:00<?, ? examples/s]

In [9]:
movie_dataset[1]

{'ID': 2,
 'Movie Name': 'The Godfather',
 'Rating': 9.2,
 'Runtime': '175 min',
 'Genre': 'Crime, Drama',
 'Metascore': 100.0,
 'Plot': 'Don Vito Corleone, head of a mafia family, decides to hand over his empire to his youngest son Michael. However, his decision unintentionally puts the lives of his loved ones in grave danger.',
 'Directors': "['Francis Ford Coppola', 'Marlon Brando', 'Al Pacino', 'James Caan', 'Diane Keaton']",
 'Stars': "['Marlon Brando', 'Al Pacino', 'James Caan', 'Diane Keaton']",
 'Votes': 1914751,
 'Gross': 134966411,
 'Link': 'https://www.imdb.com/title/tt0068646/',
 'text': "The Godfather \nCrime, Drama \nDon Vito Corleone, head of a mafia family, decides to hand over his empire to his youngest son Michael. However, his decision unintentionally puts the lives of his loved ones in grave danger. \n['Francis Ford Coppola', 'Marlon Brando', 'Al Pacino', 'James Caan', 'Diane Keaton'] \n1914751 \n9.2 \n100.0"}

In [10]:
class Transformer_embedder(nn.Module):
    def __init__(self, feat_extractor_name: str = ''):
        """Transformer Embedding model

        Args:
            feat_extractor_name (str, optional): Name of the feature extracator from HF hub or torch Hub.
        """        
        super(Transformer_embedder, self).__init__()
        

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.feat_extractor_name = feat_extractor_name

        if 'dpr' in feat_extractor_name.lower():
            feat_extractor = DPRContextEncoder.from_pretrained(feat_extractor_name)
        else:
            feat_extractor = AutoModel.from_pretrained(feat_extractor_name)
            
        self.tokenizer = AutoTokenizer.from_pretrained(feat_extractor_name)

        
        self.normalize = True
        self.feat_extractor = feat_extractor
        self.embeding_shape = self.get_extractor_output_shape() 
                            

    def get_extractor_output_shape(self):
        last_layer = list(self.feat_extractor.named_children())[-1]

        if hasattr( list(last_layer[1].modules())[1] , 'out_features'):
            shape = list(last_layer[1].modules())[1].out_features
        else:
            shape = self.feat_extractor.config.hidden_size

        return shape
    
    def pool(self, embedding:Tensor, attention_mask:Tensor, pool_type:str = 'mean'):
        #TODO implement this function
        ...


    def __call__(self, input_ids:Tensor, attention_mask:Tensor, labels: Tensor = None, **kwargs):

        #TODO implement this function
        ...
    


In [9]:
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

#The following is a bigger model and might require slight modification in the code
# follow this link for more details: https://huggingface.co/intfloat/e5-mistral-7b-instruct
# model_ckpt = "intfloat/e5-mistral-7b-instruct"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedder = Transformer_embedder(model_ckpt)
embedder = embedder.to(device)

def get_embeddings(text_list):
    encoded_input = embedder.tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    embedder.eval()
    with torch.inference_mode():
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
        model_output = embedder(**encoded_input)
    return model_output

#the reason to save as numpy is for further FAISS indexing
embeddings_dataset = movie_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).cpu().detach().numpy()[0]}
)

model.safetensors.index.json:   0%|          | 0.00/23.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.28G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
#make embeddings numpy array
embeddings_dataset.set_format(type="numpy", columns=["embeddings", "text", "Movie Name", "Genre", "Plot", "Directors", "Votes", "Rating", "Metascore"])

In [14]:
embeddings_dataset[1]["embeddings"].shape

(768,)

In [15]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/10 [00:00<?, ?it/s]

Dataset({
    features: ['ID', 'Movie Name', 'Rating', 'Runtime', 'Genre', 'Metascore', 'Plot', 'Directors', 'Stars', 'Votes', 'Gross', 'Link', 'text', 'embeddings'],
    num_rows: 9999
})

In [16]:
question = "Batman"
question_embedding = get_embeddings([question]).cpu().detach().numpy()[0]
question_embedding.shape

(768,)

In [17]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

#save only the embeddings
embeddings = embeddings_dataset["embeddings"]
np.save("imdb_top_10k_embeddings.npy", embeddings)

#save the dataset
embeddings_dataset.save_to_disk("imdb_top_10k_embeddings_dataset")

In [18]:
samples = {k: v for k, v in samples.items() if k != "embeddings"}

In [19]:
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)


In [20]:
for _, row in samples_df.iterrows():
    print(f"Series Title: {row['Movie Name']}")
    print(f"Overview: {row['Plot']}")
    print(f"Genre: {row['Genre']}")
    print(f"Scores: {row['scores']}")
    print(f"Votes: {row['Votes']}")
    print(f"Rating: {row['Rating']}")
    print(f"Metascore: {row['Metascore']}")
    print(f"Directors: {row['Directors']}")
    print("=" * 50)
    print()

Series Title: Batman: The Movie
Overview: The Dynamic Duo faces four supervillains who plan to hold the world for ransom with the help of a secret invention that instantly dehydrates people.
Genre: Action, Adventure, Comedy
Scores: 0.782235860824585
Votes: 34572
Rating: 6.5
Metascore: 71.0
Directors: ['Leslie H. Martinson', 'Adam West', 'Burt Ward', 'Lee Meriwether', 'Cesar Romero']

Series Title: Batman Returns
Overview: While Batman deals with a deformed man calling himself the Penguin wreaking havoc across Gotham with the help of a cruel businessman, a female employee of the latter becomes the Catwoman with her own vendetta.
Genre: Action, Crime, Fantasy
Scores: 0.7648836374282837
Votes: 314554
Rating: 7.099999904632568
Metascore: 68.0
Directors: ['Tim Burton', 'Michael Keaton', 'Danny DeVito', 'Michelle Pfeiffer', 'Christopher Walken']

Series Title: Batman: Gotham by Gaslight
Overview: In an alternative Victorian Age Gotham City, Batman begins his war on crime while he investigate