# Enhancing Search Engine Relavance using video subtitles (Using Rags)

**Steps**
1. Load a Document
2. Split it into Chunks
3. Create vectors for each chunk and save them in a vector store.

In [2]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

from tqdm import tqdm

## 1. Load a Document

In [3]:
from langchain_community.document_loaders.csv_loader import CSVLoader
import random
import csv

csv.field_size_limit(2**30)  

# Load the data
loader = CSVLoader(file_path='data/subtitles.csv')
documents = loader.load()

# Sample 30% of the data
documents = random.sample(documents, int(0.3 * len(documents)))

In [4]:
print("Type of loaded data:", type(documents))

print("Number of datapoints:", len(documents))

print("Type of each datapoints:", type(documents[0]))

Type of loaded data: <class 'list'>
Number of datapoints: 24749
Type of each datapoints: <class 'langchain_core.documents.base.Document'>


In [5]:
# documents[1]

In [6]:
# print(documents[0].page_content[0:500])

In [7]:
# print(documents[0].metadata)

In [8]:
from langchain_core.documents import Document
import re
from tqdm import tqdm

# Function to extract the relevant metadata from the page_content
def extract_metadata(document, doc_id):
    page_content = document.page_content

    # Function to safely extract matches or return empty string if not found
    def safe_extract(pattern, content):
        match = re.search(pattern, content)
        return match.group(1) if match else ""

    # Extracting each field safely
    name = safe_extract(r'name:\s*(.*)', page_content)
    season = safe_extract(r'season:\s*(.*)', page_content)
    year = safe_extract(r'year:\s*(\d{4})', page_content)
    episode = safe_extract(r'episode:\s*(.*)', page_content)

    # Extract the subtitles (everything else) and clean it
    subtitles = safe_extract(r'subtitles:\s*(.*)', page_content)

    # Creating new metadata dictionary
    metadata = {
        'name': name,
        'season': season,
        'year': year,
        'episode': episode,
    }


    # Creating new document with ID, updated metadata, and page content
    new_document = Document(
        page_content=subtitles.strip(),  # Keeping only the subtitles text in page_content
        metadata=metadata
    )
    
    return new_document

# Process all documents, extract metadata, and assign an id from 0 to len(documents)-1
data = []
for i, doc in enumerate(tqdm(documents, desc="Extracting metadata and assigning IDs")):
    new_doc = extract_metadata(doc, i)
    data.append(new_doc)


Extracting metadata and assigning IDs: 100%|██████████| 24749/24749 [00:00<00:00, 70753.09it/s]


In [11]:
# Checking the result
print(data[100].metadata)
print(data[100].page_content[:500])  # Print first 100 chars of subtitles

{'name': 'northern exposure', 'season': '04', 'year': '1992', 'episode': '06'}
script info title default file scripttype v . wrapstyle playresx playresy scaledborderandshadow yes audio file video file video aspect ratio video zoom video position v style format name fontname fontsize primarycolour secondarycolour outlinecolour backcolour bold italic underline strikeout scalex scaley spacing angle borderstyle outline shadow alignment marginl marginr marginv encoding style dialogue tahoma h fdfdfd h ff h f hc . . style dialogue tahoma h fdfdfd h ff h f hc . . style dialogue t


In [8]:
# data[0]

In [9]:
# print(data[0].page_content[0:500])

In [10]:
# print(data[0].metadata)

In [11]:
# print(data[0].ids)

In [12]:
print('Meta data in first document: ')
print(data[0].metadata)
print()
print('Subtitles in First document: ')
print(data[0].page_content[0:500])

Meta data in first document: 
{'name': 'sexify', 'season': '02', 'year': '2023', 'episode': '04'}

Subtitles in First document: 
we had a meeting yesterday with the people who run sexiguy . you stupid bastard you stole our idea pathetic thief you haven t got any original thought you are disgusting you re disgusting awful and disgusting it wa a very productive talk that we were able to have with them and well we were able to explain our rationale to them . and we also took the chance to hear them out . what did i steal from you you be quiet we re at my place . i ll say what i want be quiet be quiet exhales the craziest thi


## 2. Split it into Chunks

In [13]:
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
# chunks = text_splitter.split_documents(documents)

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

chunks = []
for doc in tqdm(data, desc=" Splitting data", unit="data"):
    chunks.extend(text_splitter.split_documents([doc]))


Splitting data: 100%|██████████| 24749/24749 [01:14<00:00, 330.18data/s]

In [14]:
print("Number of Documents:", len(documents))
print()
print("Total number of documents inside list:", len(chunks))
print()
print("Type of variable:", type(chunks))
print()
print("Type of each object inside the list:", type(chunks[0]))
print()
print("Total number of documents inside list:", len(chunks))
print()
print("Content of first chunk:")
print(chunks[0])

Number of Documents: 24749

Total number of documents inside list: 2751797

Type of variable: <class 'list'>

Type of each object inside the list: <class 'langchain_core.documents.base.Document'>

Total number of documents inside list: 2751797

Content of first chunk:
page_content='we had a meeting yesterday with the people who run sexiguy . you stupid bastard you stole our idea pathetic thief you haven t got any original thought you are disgusting you re disgusting awful and disgusting it wa a very productive talk that we were able to have with them and well we were able to' metadata={'name': 'sexify', 'season': '02', 'year': '2023', 'episode': '04'}


In [15]:
# chunks[0].page_content

In [16]:
type(chunks[0])

langchain_core.documents.base.Document

In [17]:
# type(chunks)

In [18]:
type(chunks[0].metadata)

dict

In [19]:
len(chunks)

2751797

In [20]:
# chunks[0]

## 3. Create vectors for each chunk and save them in a vector store.

In [3]:
f = open('keys/openai_key.txt')

OPENAI_API_KEY = f.read()

In [4]:
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.embeddings.base import Embeddings

# Define an embedding class to wrap SentenceTransformer
class SentenceTransformerEmbedding(Embeddings):
    def __init__(self, model_name):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Generate embeddings for multiple documents."""
        return self.model.encode(texts, convert_to_tensor=False).tolist()

    def embed_query(self, text):
        """Generate an embedding for a single query."""
        return self.model.encode([text], convert_to_tensor=False)[0].tolist()

# Create the embedding function
embedding_function = SentenceTransformerEmbedding('all-MiniLM-L6-v2')



In [5]:
# pip install langchain_chroma

In [6]:
# Initialize ChromaDB with the embedding function
from langchain_chroma import Chroma

db = Chroma(
    collection_name="vector_database",
    embedding_function=embedding_function,
    persist_directory="./chroma_db_"
)

In [24]:
# Define batch size for processing
batch_size = 100  # Adjust based on performance needs

# Ensure chunks is a list of LangChain Document objects
assert isinstance(chunks, list) and all(isinstance(chunk, Document) for chunk in chunks), \
    "Chunks must be a list of LangChain Document objects"

# Adding documents
for i in tqdm(range(0, len(chunks), batch_size), desc="Adding Documents"):
    batch = chunks[i:i + batch_size]
    db.add_documents(batch)

print("Documents successfully added to the collection.")

Adding Documents: 100%|██████████| 27518/27518 [6:00:30<00:00,  1.27it/s]     

Documents successfully added to the collection.





Building an End-to-End RAG Chain
- Step 1: Initialize an embedding_model
- Step 2: Initialize the Chroma DB Connection
- Step 3: Create a Retriever Object
- Step 4: Initialize a Chat Prompt Template
- Step 5: Initialize a Generator (i.e. Chat Model)
- Step 6: Initialize a Output Parser
- Step 7: Define a RAG Chain
- Step 8: Invoke the Chain

# **Retrievers**

## Step 1 - Initialize an embedding_model

In [1]:
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain.embeddings.base import Embeddings

# Define an embedding class to wrap SentenceTransformer
class SentenceTransformerEmbedding(Embeddings):
    def __init__(self, model_name):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Generate embeddings for multiple documents."""
        return self.model.encode(texts, convert_to_tensor=False).tolist()

    def embed_query(self, text):
        """Generate an embedding for a single query."""
        return self.model.encode([text], convert_to_tensor=False)[0].tolist()

# Create the embedding function
embedding_function = SentenceTransformerEmbedding('all-MiniLM-L6-v2')

## Step 2 - Initialize a ChromaDB Connection

In [4]:
from langchain_chroma import Chroma

# Initialize the database connection
# If database exist, it will connect with the collection_name and persist_directory
# Otherwise a new collection will be created
db = Chroma(collection_name = "vector_database", 
            embedding_function = embedding_function, 
            persist_directory = "./chroma_db_")

In [5]:
db

<langchain_chroma.vectorstores.Chroma at 0x17e6bf410>

## Step 3: Create a Retriever Object

In [6]:

# Converting CHROMA db connection to Retriever Object
retriever = db.as_retriever(search_type = "similarity",
                            search_kwargs = {"k": 5})

print(type(retriever))

<class 'langchain_core.vectorstores.base.VectorStoreRetriever'>


## Step 4: Initialize a Chat Prompt Template

In [7]:

from langchain_core.prompts import ChatPromptTemplate

PROMPT_TEMPLATE = """
Answer the question based solely on the following context:
{context}

Based on the given subtitle/dialogues below:
{question}

Provide only the title of the movie.
"""
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

## Step 5: Initialize a Generator (i.e. Chat Model)

In [9]:
f = open('keys/openai_key.txt')
OPENAI_API_KEY = f.read()

In [10]:
# Import OpenAI ChatModel
from langchain_openai import ChatOpenAI

# Set the OpenAI Key and initialize a ChatModel
chat_model = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-4o-mini")

## Step 6: Initialize a Output Parser

In [11]:
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

## Step 7: Define a RAG Chain

In [12]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
    
rag_chain = {"context": retriever | format_docs, "question": RunnablePassthrough()} | prompt_template | chat_model | parser

## Step 8: Invoke the Chain

In [46]:
query = """Big man in a suit of armor, take that off what are you?, 
genius billionaire playboy philanthropist, 
I know guys with none of that worth 10 of you, I’ve seen the footage the only thing you really fight for is yourself, 
you’re not the guy to make the sacrifice play to lay down on a wire or let the other guy crawl over you! 
I think I would just cut the wire.  
Always a way out, you know you may not be a threat but you stop pretending to be a hero! 
A hero like you, you’re a laboratory experiment rogers everything special about came out of a bottle"""

# transcription = client.audio.transcriptions.create(
#     model="whisper-1", 
#     file=open('data/query.wav', "rb"))

# query = transcription.text

In [47]:
%%time

result = rag_chain.invoke(query)

print(result)

Avengers: Endgame
CPU times: user 26.4 ms, sys: 43.6 ms, total: 70 ms
Wall time: 636 ms


## **Testing**

In [26]:
ground_truth_data_long_dialogues = [
    {
        "query": """Big man in a suit of armor, take that off what are you?,
genius billionaire playboy philanthropist,
I know guys with none of that worth 10 of you, I’ve seen the footage the only thing you really fight for is yourself,
you’re not the guy to make the sacrifice play to lay down on a wire or let the other guy crawl over you!
I think I would just cut the wire.
Always a way out, you know you may not be a threat but you stop pretending to be a hero!
A hero like you, you’re a laboratory experiment rogers everything special about came out of a bottle""",
        "expected_title": "The Avengers" # Your example
    },
    {
        "query": """Don't talk like one of them, you're not! Even if you'd like to be. To them, you're just a freak, like me! They need you right now, but when they don't, they'll cast you out, like a leper! See, their morals, their code... it's a bad joke. Dropped at the first sign of trouble. They're only as good as the world allows them to be. I'll show you. When the chips are down, these... these civilized people? They'll eat each other. See, I'm not a monster. I'm just ahead of the curve.""",
        "expected_title": "The Dark Knight"
    },
    {
        "query": """The Matrix is everywhere. It is all around us. Even now, in this very room. You can see it when you look out your window or when you turn on your television. You can feel it when you go to work... when you go to church... when you pay your taxes. It is the world that has been pulled over your eyes to blind you from the truth. That you are a slave, Neo. Like everyone else, you were born into bondage. Born into a prison that you cannot smell or taste or touch. A prison for your mind.""",
        "expected_title": "The Matrix"
    },
    {
        "query": """There's a passage I got memorized. Ezekiel 25:17. 'The path of the righteous man is beset on all sides by the inequities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee.'""",
        "expected_title": "Pulp Fiction"
    },
    {
        "query": """You're not your job. You're not how much money you have in the bank. You're not the car you drive. You're not the contents of your wallet. You're not your f*cking khakis. We're the middle children of history, man. No purpose or place. We have no Great War. No Great Depression. Our Great War's a spiritual war... our Great Depression is our lives.""",
        "expected_title": "Fight Club"
    },
    {
        "query": """It's an energy field created by all living things. It surrounds us and penetrates us; it binds the galaxy together. A Jedi can feel the Force flowing through him. It controls your actions, but it also obeys your commands. You will learn to use it, just as your father did.""", # Adapted from Obi-Wan's explanation
        "expected_title": "Star Wars: Episode IV - A New Hope"
    }
]

In [28]:
from tqdm import tqdm 

results = []

for item in tqdm(ground_truth_data_long_dialogues, desc="Evaluating RAG chain"):
    query = item['query']
    expected_title = item['expected_title']

    try:
        predicted_title = rag_chain.invoke(query)
    except Exception as e:
        predicted_title = f"Error: {str(e)}" # Handle potential errors during invocation

    results.append({
        'query': query,
        'expected_title': expected_title,
        'predicted_title': predicted_title
    })

# Convert results to a Pandas DataFrame for easier analysis
results_df = pd.DataFrame(results)

Evaluating RAG chain: 100%|██████████| 6/6 [00:02<00:00,  2.07it/s]


In [29]:
results_df.head()

Unnamed: 0,query,expected_title,predicted_title
0,"Big man in a suit of armor, take that off what...",The Avengers,Iron Man 2
1,"Don't talk like one of them, you're not! Even ...",The Dark Knight,The Dark Knight
2,The Matrix is everywhere. It is all around us....,The Matrix,The Matrix
3,There's a passage I got memorized. Ezekiel 25:...,Pulp Fiction,Pulp Fiction
4,You're not your job. You're not how much money...,Fight Club,Fight Club


In [32]:
def normalize_title(title):
    if not isinstance(title, str):
        return "" 
    title = title.lower().strip()
    return title

results_df['normalized_expected_title'] = results_df['expected_title'].apply(normalize_title)
results_df['normalized_predicted_title'] = results_df['predicted_title'].apply(normalize_title)

results_df['is_correct'] = (results_df['normalized_predicted_title'] == results_df['normalized_expected_title'])

results_df[['normalized_expected_title', 'normalized_predicted_title', 'is_correct']].head()

Unnamed: 0,normalized_expected_title,normalized_predicted_title,is_correct
0,the avengers,iron man 2,False
1,the dark knight,the dark knight,True
2,the matrix,the matrix,True
3,pulp fiction,pulp fiction,True
4,fight club,fight club,True


In [33]:
# Calculate Exact Match Accuracy
correct_predictions = results_df['is_correct'].sum()
total_queries = len(results_df)
accuracy = correct_predictions / total_queries

print(f"\nTotal Queries: {total_queries}")
print(f"Correct Predictions: {correct_predictions}")
print(f"Exact Match Accuracy: {accuracy:.2%}")


Total Queries: 6
Correct Predictions: 5
Exact Match Accuracy: 83.33%


In [40]:
accuracy = results_df['is_correct'].sum() / len(results_df)
print(accuracy*100)

83.33333333333334
