In [72]:
import pandas as pd

In [1]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

from tqdm import tqdm


## 1. Loading the document

In [3]:
from langchain_community.document_loaders.csv_loader import CSVLoader
import random
import csv

csv.field_size_limit(2**30)  

# Load the data
loader = CSVLoader(file_path='data/subtitles_metadata.csv')
documents = loader.load()

# Sample 20% of the data
documents = random.sample(documents, int(0.20 * len(documents)))

In [4]:
print("Type of loaded data:", type(documents))

print("Number of datapoints:", len(documents))

print("Type of each datapoints:", type(documents[0]))

Type of loaded data: <class 'list'>
Number of datapoints: 16499
Type of each datapoints: <class 'langchain_core.documents.base.Document'>


In [8]:
# documents[0]

In [9]:
print(documents[0].page_content[0:500])

movie_id: 9263779
name: inside man
season: 01
episode: 04
year: 2022
subtitles: this programme contains strong language some violent scene and some scene which some viewer may find upsetting from the start . if i tell you to think of a red bus you just do don t you an image of a red bus just appears in your head right i suppose so yeah so what but instead if i tell you not to think of a blue bicycle you still picture a blue bicycle right yeah . ok. even though i specifically told you not to . is


In [10]:
print(documents[0].metadata)

{'source': 'data/subtitles_metadata.csv', 'row': 20528}


In [12]:
from langchain_core.documents import Document
import re
from tqdm import tqdm

# Function to extract the relevant metadata from the page_content
def extract_metadata(document, doc_id):
    page_content = document.page_content

    # Function to safely extract matches or return empty string if not found
    def safe_extract(pattern, content):
        match = re.search(pattern, content)
        return match.group(1) if match else ""

    # Extracting each field safely
    movie_id = safe_extract(r'movie_id:\s*(\d+)', page_content)
    name = safe_extract(r'name:\s*(.*)', page_content)
    season = safe_extract(r'season:\s*(.*)', page_content)
    episode = safe_extract(r'episode:\s*(.*)', page_content)
    year = safe_extract(r'year:\s*(\d{4})', page_content)

    # Extract the subtitles (everything else) and clean it
    subtitles = safe_extract(r'subtitles:\s*(.*)', page_content)

    # Creating new metadata dictionary
    metadata = {
        'movie_id': movie_id,
        'name': name,
        'season': season,
        'episode': episode,
        'year': year,
    }


    # Creating new document with ID, updated metadata, and page content
    new_document = Document(
        page_content=subtitles.strip(),  # Keeping only the subtitles text in page_content
        metadata=metadata
    )
    
    return new_document

# Process all documents, extract metadata, and assign an id from 0 to len(documents)-1
data = []
for i, doc in enumerate(tqdm(documents, desc="Extracting metadata and assigning IDs")):
    new_doc = extract_metadata(doc, i)
    data.append(new_doc)

Extracting metadata and assigning IDs: 100%|██████████| 16499/16499 [00:00<00:00, 66002.86it/s]


In [28]:
# data[0]

In [26]:
print('Meta data in first document: ')
print(data[0].metadata)
print()
print('-'*80)
print()
print('Subtitles in First document: ')
print(data[0].page_content[0:500])

Meta data in first document: 
{'movie_id': '9263779', 'name': 'inside man', 'season': '01', 'episode': '04', 'year': '2022'}

--------------------------------------------------------------------------------

Subtitles in First document: 
this programme contains strong language some violent scene and some scene which some viewer may find upsetting from the start . if i tell you to think of a red bus you just do don t you an image of a red bus just appears in your head right i suppose so yeah so what but instead if i tell you not to think of a blue bicycle you still picture a blue bicycle right yeah . ok. even though i specifically told you not to . is there a point to this yes it s a demonstration of how you think how everyone th


## 2. Split into chunks

In [31]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

chunks = []
for doc in tqdm(data, desc=" Splitting data", unit="data"):
    chunks.extend(text_splitter.split_documents([doc]))

 Splitting data: 100%|██████████| 16499/16499 [00:40<00:00, 410.95data/s]


In [32]:
print("Number of Documents:", len(documents))
print()
print("Total number of documents inside list:", len(chunks))
print()
print("Type of variable:", type(chunks))
print()
print("Type of each object inside the list:", type(chunks[0]))
print()
print("Total number of documents inside list:", len(chunks))
print()
print("Content of first chunk:")
print(chunks[0])

Number of Documents: 16499

Total number of documents inside list: 1824632

Type of variable: <class 'list'>

Type of each object inside the list: <class 'langchain_core.documents.base.Document'>

Total number of documents inside list: 1824632

Content of first chunk:
page_content='this programme contains strong language some violent scene and some scene which some viewer may find upsetting from the start . if i tell you to think of a red bus you just do don t you an image of a red bus just appears in your head right i suppose so yeah so what but instead if i tell you not to' metadata={'movie_id': '9263779', 'name': 'inside man', 'season': '01', 'episode': '04', 'year': '2022'}


In [33]:
chunks[0].page_content

'this programme contains strong language some violent scene and some scene which some viewer may find upsetting from the start . if i tell you to think of a red bus you just do don t you an image of a red bus just appears in your head right i suppose so yeah so what but instead if i tell you not to'

In [34]:
type(chunks[0])

langchain_core.documents.base.Document

In [35]:
type(chunks[0].metadata)

dict

In [36]:
len(chunks)

1824632

In [37]:
chunks[0]

Document(metadata={'movie_id': '9263779', 'name': 'inside man', 'season': '01', 'episode': '04', 'year': '2022'}, page_content='this programme contains strong language some violent scene and some scene which some viewer may find upsetting from the start . if i tell you to think of a red bus you just do don t you an image of a red bus just appears in your head right i suppose so yeah so what but instead if i tell you not to')

## 3. Create vectors for rach chunk and save them in a vector store

In [39]:
f = open('keys/openai_api_key.txt')

OPENAI_API_KEY = f.read()

In [47]:
# Step 1: Initialize an embedding model

from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.embeddings.base import Embeddings

# Define an embedding class to wrap SentenceTransformer
class SentenceTransformerEmbedding(Embeddings):
    def __init__(self, model_name):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        '''Generate embeddings for multiple documents.'''
        return self.model.encode(texts, convert_to_tensor = False).tolist()

    def embed_query(self, text):
        '''Generate an emedding for a single query.'''
        return self.model.encode([text], convert_to_tensor = False)[0].tolist()

# Create the embedding function
embedding_function = SentenceTransformerEmbedding('all-MiniLM-L6-v2')

In [48]:
# Step 2: Initialize the ChromaDB connection

from langchain_chroma import Chroma

db = Chroma(
    collection_name = 'vector_database',
    embedding_function = embedding_function,
    persist_directory = './chroma_db_'
)

In [49]:
# Define batch size for processing
batch_size = 100 # Adjust based on performance needs

# Ensure chunks is a list of langchain Document objects
assert isinstance(chunks, list) and all(isinstance(chunk, Document) for chunk in chunks), \
    'Chunks must be a list of LangChain Document objects'

# Adding Documents
for i in tqdm(range(0, len(chunks), batch_size), desc = 'Adding Documents'):
    batch = chunks[i : i + batch_size]
    db.add_documents(batch)

print('Documents successfuly added to the collection.')

Adding Documents: 100%|██████████| 18247/18247 [3:32:19<00:00,  1.43it/s]    

Documents successfuly added to the collection.





In [62]:
# Step 3: Create a Retriever Object
# Converting CHROMA db connection to retriever object

retriever = db.as_retriever(
    search_type= 'similarity',
    search_kwargs = {'k': 10}
)

print(type(retriever))

<class 'langchain_core.vectorstores.base.VectorStoreRetriever'>


In [63]:
# Step 4: Initiliaze a Chat Prompt template

from langchain_core.prompts import ChatPromptTemplate

PROMPT_TEMPLATE = """
Answer the question based solely on the following context:
{context}

Based on the given subtitle/dialogues below:
{question}

Provide only the title of the movie.
"""
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

In [64]:
# Step 5: Initialize a Generator (i.e., Chat Model)

f = open('keys/openai_api_key.txt')
OPENAI_API_KEY = f.read()

from langchain_openai import ChatOpenAI

chat_model = ChatOpenAI(api_key = OPENAI_API_KEY, model = 'gpt-4o-mini')

In [65]:
# Step 6: Initialize a Output Parser

from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

In [66]:
# Step 7: Define a RAG Chain

from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = {'context': retriever | format_docs, 'question': RunnablePassthrough()} | prompt_template | chat_model | parser

In [77]:
# Step 8: Invoke the Chain

query_1 = '''Big man in a suit of armor, take that off what are you?, 
genius billionaire playboy philanthropist, 
I know guys with none of that worth 10 of you, I’ve seen the footage the only thing you really fight for is yourself, 
you’re not the guy to make the sacrifice play to lay down on a wire or let the other guy crawl over you! 
I think I would just cut the wire.  
Always a way out, you know you may not be a threat but you stop pretending to be a hero! 
A hero like you, you’re a laboratory experiment rogers everything special about came out of a bottle'''

In [78]:
query_2 = '''Don't talk like one of them, you're not! Even if you'd like to be. 
To them, you're just a freak, like me! They need you right now, but when they don't, 
they'll cast you out, like a leper! See, their morals, their code... it's a bad joke. 
Dropped at the first sign of trouble. They're only as good as the world allows them 
to be. I'll show you. When the chips are down, these... these civilized people? 
They'll eat each other. See, I'm not a monster. I'm just ahead of the curve.'''

In [82]:
%%time

result = rag_chain.invoke(query_2)

print(result)

The Dark Knight
CPU times: user 20.7 ms, sys: 43.6 ms, total: 64.3 ms
Wall time: 894 ms


In [83]:
retriever.invoke(query_2)

[Document(id='f2e25d10-a626-4260-a34d-f097e2f49b3d', metadata={'season': 'movie', 'name': 'luther the fallen sun', 'episode': 'movie', 'year': '2023', 'movie_id': '9460292'}, page_content='that to themselves all the ... all the good people by telling themselves that we re the wicked one . yeah . all them monster are telling you all and me that we are monster . okay . good girl . okay . good girl . mum . mum . leave her alone you prick leave her alone ah . now then ... your turn mate'),
 Document(id='d61bd8c8-d3dd-4844-8da8-6df60ee2ea27', metadata={'season': 'movie', 'year': '2023', 'episode': 'movie', 'movie_id': '9460292', 'name': 'luther the fallen sun'}, page_content='that to themselves all the ... all the good people by telling themselves that we re the wicked one . yeah . all them monster are telling you all and me that we are monster . okay . good girl . okay . good girl . mum . mum . leave her alone you prick leave her alone ah . now then ... your turn mate'),
 Document(id='fe63

## Testing

In [70]:
ground_truth_data_long_dialogues = [
    {
        "query": """Big man in a suit of armor, take that off what are you?,
genius billionaire playboy philanthropist,
I know guys with none of that worth 10 of you, I’ve seen the footage the only thing you really fight for is yourself,
you’re not the guy to make the sacrifice play to lay down on a wire or let the other guy crawl over you!
I think I would just cut the wire.
Always a way out, you know you may not be a threat but you stop pretending to be a hero!
A hero like you, you’re a laboratory experiment rogers everything special about came out of a bottle""",
        "expected_title": "The Avengers" # Your example
    },
    {
        "query": """Don't talk like one of them, you're not! Even if you'd like to be. To them, you're just a freak, like me! They need you right now, but when they don't, they'll cast you out, like a leper! See, their morals, their code... it's a bad joke. Dropped at the first sign of trouble. They're only as good as the world allows them to be. I'll show you. When the chips are down, these... these civilized people? They'll eat each other. See, I'm not a monster. I'm just ahead of the curve.""",
        "expected_title": "The Dark Knight"
    },
    {
        "query": """The Matrix is everywhere. It is all around us. Even now, in this very room. You can see it when you look out your window or when you turn on your television. You can feel it when you go to work... when you go to church... when you pay your taxes. It is the world that has been pulled over your eyes to blind you from the truth. That you are a slave, Neo. Like everyone else, you were born into bondage. Born into a prison that you cannot smell or taste or touch. A prison for your mind.""",
        "expected_title": "The Matrix"
    },
    {
        "query": """There's a passage I got memorized. Ezekiel 25:17. 'The path of the righteous man is beset on all sides by the inequities of the selfish and the tyranny of evil men. Blessed is he who, in the name of charity and good will, shepherds the weak through the valley of darkness, for he is truly his brother's keeper and the finder of lost children. And I will strike down upon thee with great vengeance and furious anger those who attempt to poison and destroy My brothers. And you will know My name is the Lord when I lay My vengeance upon thee.'""",
        "expected_title": "Pulp Fiction"
    },
    {
        "query": """You're not your job. You're not how much money you have in the bank. You're not the car you drive. You're not the contents of your wallet. You're not your f*cking khakis. We're the middle children of history, man. No purpose or place. We have no Great War. No Great Depression. Our Great War's a spiritual war... our Great Depression is our lives.""",
        "expected_title": "Fight Club"
    },
    {
        "query": """It's an energy field created by all living things. It surrounds us and penetrates us; it binds the galaxy together. A Jedi can feel the Force flowing through him. It controls your actions, but it also obeys your commands. You will learn to use it, just as your father did.""", # Adapted from Obi-Wan's explanation
        "expected_title": "Star Wars: Episode IV - A New Hope"
    }
]

In [84]:
ground_truth_data_long_dialogues = [
    {
    "query": """I know what it’s like to lose. To feel so desperately that you’re right, yet to fail nonetheless. It’s frightening, turns the legs to jelly. I ask you, to what end? Dread it, run from it, destiny arrives all the same. And now it’s here. Or should I say, I am.""",
    "expected_title": "Avengers: Infinity War"
},
{
    "query": """You could not live with your own failure. Where did that bring you? Back to me. I thought by eliminating half of life, the other half would thrive, but you have shown me that’s impossible. As long as there are those that remember what was, there will always be those that are unable to accept what can be. They will resist.""",
    "expected_title": "Avengers: Endgame"
},
{
    "query": """You think you fight for us? You just fight for yourself. You weren’t there when Ultron nearly tore the world apart. You weren’t there when half the universe vanished. You weren’t there to pick up the pieces. You think your arrogance makes you strong, but it makes you dangerous. We need more than a soldier; we need someone who believes in more than just their own conviction.""",
    "expected_title": "Captain America: Civil War"
},
{
    "query": """The hardest choices require the strongest wills. You’re strong, but I could snap my fingers, and you’d all cease to exist. I’m not doing this because I hate life. I’m doing this because I love it. A grateful universe will remember what I’ve done. It’s mercy, not cruelty. It’s balance.""",
    "expected_title": "Avengers: Infinity War"
},
{
    "query": """I am Iron Man. You think you know what that means? I wasn’t born into greatness, I didn’t have it handed to me. I built it. Piece by piece, with blood, sweat, and fear. And in doing so, I learned the cost of power. Every triumph leaves a scar, every victory comes with a price. And yet I stand here because if I don’t, who will?""",
    "expected_title": "Iron Man"
},{
    "query": """People are afraid of what they don’t understand. They look up in the sky and see you, and all they feel is fear. Fear that one day you’ll decide they’re no longer worth protecting. You say you’re here for truth and justice, but whose truth, whose justice? You’re not a god, you’re not a man, you’re something else entirely, and the world will never stop questioning you.""",
    "expected_title": "Batman v Superman: Dawn of Justice"
},
{
    "query": """My father was a lighthouse keeper. My mother was a queen. They were never meant to meet, but their love saved the world. And now it’s my turn. The land and the sea are not enemies, they’re two halves of the same whole. If I can be the bridge between them, maybe the world has a chance.""",
    "expected_title": "Aquaman"
},
{
    "query": """You don’t owe this world a thing. You never did. But if you choose to stand with them, you will inspire them. You will give them an ideal to strive toward, something beyond themselves. And even if they can’t be you, they will try. They will rise. They will follow your example. And for that, they will honor you.""",
    "expected_title": "Man of Steel"
},
{
    "query": """I am the fastest man alive, but even I can’t outrun fate. They told me I could save people, and I believed it. But no one told me about the cost, about how saving one life could mean losing another. That’s the curse of speed — you see the end coming long before anyone else, and you still can’t stop it.""",
    "expected_title": "Justice League"
},
{
    "query": """I don’t believe in fate. I don’t believe in destiny. But I believe in people, in the choices we make when everything is on the line. You can call me a villain, you can call me a vigilante, but I’ll do what’s necessary to protect this city, even if it means standing alone in the dark.""",
    "expected_title": "The Dark Knight Rises"
},{
    "query": """Life will not be contained. Life breaks free. It expands to new territories and crashes through barriers, painfully, maybe even dangerously. But it always finds a way. You built cages, you thought you were in control, but control is an illusion. You brought back something you never should have touched.""",
    "expected_title": "Jurassic Park"
},
{
    "query": """You know what the problem is? You didn’t stop to think if you should. You were so preoccupied with whether you could bring dinosaurs back that you never asked if the world was ready for them. And now they’re here, and we’re the ones trapped in a cage.""",
    "expected_title": "Jurassic Park"
},
{
    "query": """These creatures don’t need our protection; they need to be left alone. They’re not pets, they’re not attractions. They’re living, breathing reminders of our arrogance. If we don’t stop trying to control them, they will remind us who the real predators are.""",
    "expected_title": "Jurassic World: Fallen Kingdom"
},
{
    "query": """We made them. We fed them. We put them on display. And when they outgrew their cages, we tried to chain them down. But they are not ours to command. This island is theirs, not ours. And sooner or later, nature will correct our mistake.""",
    "expected_title": "Jurassic World"
},
{
    "query": """When you look into the eyes of a raptor, you understand. They’re not mindless beasts, they’re calculating. They’re testing you. And if you think for one second you’re in control, you’re already dead.""",
    "expected_title": "Jurassic World"
},{
    "query": """I am not in danger, Skyler. I am the danger. A guy opens his door and gets shot, and you think that of me? No. I am the one who knocks. You think I built all this because I wanted to die? No. I did it because I wanted to live, and I wanted us to live better than anyone ever thought possible.""",
    "expected_title": "Breaking Bad"
},
{
    "query": """When you play the game of thrones, you win or you die. There is no middle ground. Alliances crumble, oaths are broken, and the innocent are the first to bleed. Honor will not keep you alive. Mercy will not save you. The throne consumes all who reach for it.""",
    "expected_title": "Game of Thrones"
},
{
    "query": """You know what the problem with being clever is? Everyone always assumes you’re being clever when you’re simply telling the truth. Deduction is not magic, it’s observation. The world hides its answers in plain sight, but most people never stop to look.""",
    "expected_title": "Sherlock"
},
{
    "query": """Democracy is so overrated. You think power lies in ballots and speeches, but it doesn’t. Power lies in secrets, in leverage, in making your opponent believe they’ve already lost before the game begins. The people don’t want truth; they want reassurance, and I give it to them.""",
    "expected_title": "House of Cards"
},
{
    "query": """You think this world belongs to you because you survived? No. It belongs to the dead. We’re just living in their shadow, pretending we still have control. Every step we take, every breath, we’re walking on borrowed time, waiting for the day it runs out.""",
    "expected_title": "The Walking Dead"
}

]

In [85]:
from tqdm import tqdm 

results = []

for item in tqdm(ground_truth_data_long_dialogues, desc="Evaluating RAG chain"):
    query = item['query']
    expected_title = item['expected_title']

    try:
        predicted_title = rag_chain.invoke(query)
    except Exception as e:
        predicted_title = f"Error: {str(e)}" # Handle potential errors during invocation

    results.append({
        'query': query,
        'expected_title': expected_title,
        'predicted_title': predicted_title
    })

# Convert results to a Pandas DataFrame for easier analysis
results_df = pd.DataFrame(results)

Evaluating RAG chain: 100%|██████████| 20/20 [00:14<00:00,  1.39it/s]


In [86]:
results_df.head()

Unnamed: 0,query,expected_title,predicted_title
0,I know what it’s like to lose. To feel so desp...,Avengers: Infinity War,Avengers: Infinity War
1,You could not live with your own failure. Wher...,Avengers: Endgame,Avengers: Endgame
2,You think you fight for us? You just fight for...,Captain America: Civil War,Avengers: Infinity War
3,The hardest choices require the strongest will...,Avengers: Infinity War,Avengers: Infinity War
4,I am Iron Man. You think you know what that me...,Iron Man,Iron Man


In [87]:
def normalize_title(title):
    if not isinstance(title, str):
        return "" 
    title = title.lower().strip()
    return title

results_df['normalized_expected_title'] = results_df['expected_title'].apply(normalize_title)
results_df['normalized_predicted_title'] = results_df['predicted_title'].apply(normalize_title)

results_df['is_correct'] = (results_df['normalized_predicted_title'] == results_df['normalized_expected_title'])

results_df[['normalized_expected_title', 'normalized_predicted_title', 'is_correct']].head()

Unnamed: 0,normalized_expected_title,normalized_predicted_title,is_correct
0,avengers: infinity war,avengers: infinity war,True
1,avengers: endgame,avengers: endgame,True
2,captain america: civil war,avengers: infinity war,False
3,avengers: infinity war,avengers: infinity war,True
4,iron man,iron man,True


In [88]:
# Calculate Exact Match Accuracy
correct_predictions = results_df['is_correct'].sum()
total_queries = len(results_df)
accuracy = correct_predictions / total_queries

print(f"\nTotal Queries: {total_queries}")
print(f"Correct Predictions: {correct_predictions}")
print(f"Exact Match Accuracy: {accuracy:.2%}")


Total Queries: 20
Correct Predictions: 8
Exact Match Accuracy: 40.00%
