In [1]:
import os
from uuid import uuid4
import pandas as pd
from datetime import datetime, timedelta
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import ContextualCompressionRetriever, ParentDocumentRetriever, MultiQueryRetriever, EnsembleRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter
from qdrant_client import QdrantClient, models
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_correctness, context_recall, context_precision
from datasets import Dataset
from langsmith import traceable, Client

In [2]:
# Set up environment variables
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"Movies RAG - {uuid4().hex[0:8]}"
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [3]:
# Load the Movies dataset
df = pd.read_csv("movies.csv")  # Assuming you have this CSV file
loader = DataFrameLoader(df, page_content_column="overview")
documents = loader.load()

In [4]:
# Add metadata to documents
for doc in documents:
    doc.metadata.update({
        "movie_id": doc.metadata["movie_id"],
        "title": doc.metadata["title"],
        "vote_average": doc.metadata["vote_average"],
        "vote_count": doc.metadata["vote_count"],
        "status": doc.metadata["status"],
        "release_date": doc.metadata["release_date"],
        "revenue": doc.metadata["revenue"],
        "adult": doc.metadata["adult"],
        "budget": doc.metadata["budget"],
        "imdb_id": doc.metadata["imdb_id"],
        "original_language": doc.metadata["original_language"],
        "genre": doc.metadata["genres"],
        "popularity": doc.metadata["popularity"],
        "production_companies": doc.metadata["production_companies"],
        "production_countries": doc.metadata["production_countries"],
        "spoken_languages": doc.metadata["spoken_languages"],
        "last_accessed_at": datetime.now() - timedelta(days=int(doc.metadata["movie_id"]) % 30)
    })

In [5]:
len(documents)

1048575

In [6]:
import random
# Set a seed for reproducibility
random.seed(42)

# Select a subset of documents
subset_size = 5000  # Adjust this number based on your needs
subset_docs = random.sample(documents, min(subset_size, len(documents)))

print(f"Using {len(subset_docs)} documents for loadiing into database")

Using 5000 documents for loadiing into database


In [7]:
from langchain_community.embeddings import HuggingFaceEmbeddings

hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

vectorstore = Qdrant.from_documents(
    subset_docs,
    hf_embeddings,
    location=":memory:",
    collection_name="Movies"
)

  hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [9]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Set up the generator
generator_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.5)
critic_llm = ChatOpenAI(model="gpt-4o", temperature=0.2)
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Define the distribution of question types
distributions = {
    simple: 0.6,
    multi_context: 0.3,
    reasoning: 0.1
}

# Generate the testset
testset = generator.generate_with_langchain_docs(subset_docs, 25, distributions, with_debugging_logs=False)

# Convert to pandas DataFrame and save
testset_df = testset.to_pandas()
testset_df.to_csv("movies_testset.csv", index=False)

# Extract questions and ground truths
test_questions = testset_df["question"].values.tolist()
test_groundtruths = testset_df["ground_truth"].values.tolist()

print(f"Generated {len(test_questions)} test questions.")
print("Sample questions:")
for i in range(min(5, len(test_questions))):
    print(f"Q: {test_questions[i]}")
    print(f"A: {test_groundtruths[i]}")
    print()

embedding nodes:   0%|          | 0/10000 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/25 [00:00<?, ?it/s]

Generated 25 test questions.
Sample questions:
Q: How is prejudice against Vietnamese people portrayed in the film's setting of Houston, Texas?
A: Prejudice against Vietnamese people is portrayed in the film's setting of Houston, Texas as being pervasive, particularly among the fishing community. The Vietnamese mother and child face discrimination and hostility as they try to integrate and work in the area. The priest, upon discovering this prejudice, takes it upon himself to address the injustices, often resorting to physical confrontation rather than peaceful resolution.

Q: How does the disappearance of her son push the matriarch of the dysfunctional family to her tipping point?
A: The answer to given question is not present in context

Q: How does Ranveer Ching help the man who is starving due to a shortage of food?
A: Ranveer Ching helps the man who is starving due to a shortage of food by providing Chinese Ching products that help fight hunger and alleviate the food shortage.

Q:

In [10]:
# Set up retrievers
naive_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
bm25_retriever = BM25Retriever.from_documents(subset_docs)

In [11]:
# Set up LLM and prompt
chat_model = ChatOpenAI(model="gpt-4o", temperature=0)
RAG_TEMPLATE = """
You are a helpful movie expert assistant. Use the context provided below to answer the question about movies.
If you do not know the answer, or are unsure, say you don't know.

Query: {question}

Context: {context}
"""
rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

In [12]:
# Set up retrieval chain
def setup_retrieval_chain(retriever):
    return (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
    )

naive_retrieval_chain = setup_retrieval_chain(naive_retriever)
bm25_retrieval_chain = setup_retrieval_chain(bm25_retriever)

In [13]:
# Set up other retrievers
compressor = LLMChainExtractor.from_llm(chat_model)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=naive_retriever
)
compression_retrieval_chain = setup_retrieval_chain(compression_retriever)

In [14]:
multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=naive_retriever, llm=chat_model
)
multi_query_retrieval_chain = setup_retrieval_chain(multi_query_retriever)

In [21]:
# Define splitters for parent document retriever
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# Set up parent document retriever
parent_store = InMemoryStore()
hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Get embedding dimensions
sample_text = "Sample text to get embedding dimensions"
sample_embedding = hf_embeddings.embed_query(sample_text)
embedding_dim = len(sample_embedding)

# Create Qdrant client and collection
qdrant_client = QdrantClient(":memory:")
qdrant_client.recreate_collection(
    collection_name="parent_documents",
    vectors_config=models.VectorParams(size=embedding_dim, distance=models.Distance.COSINE),
)

vectorstore = Qdrant(
    client=qdrant_client,
    collection_name="parent_documents",
    embeddings=hf_embeddings
)

parent_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=parent_store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

# Add documents to the parent retriever
parent_retriever.add_documents(subset_docs)

parent_retrieval_chain = setup_retrieval_chain(parent_retriever)

  qdrant_client.recreate_collection(


In [22]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, naive_retriever, parent_retriever, multi_query_retriever],
    weights=[0.25, 0.25, 0.25, 0.25]
)
ensemble_retrieval_chain = setup_retrieval_chain(ensemble_retriever)

In [24]:
# this code was taking over 4 hrs to run with the 5000 documents so I have reduced the number of documents to 1000 and it was taking 15 mins

semantic_subset_size = min(1000, len(subset_docs))  # Adjust this number based on your needs
semantic_subset_docs = random.sample(subset_docs, semantic_subset_size)

semantic_chunker = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")
semantic_documents = semantic_chunker.split_documents(subset_docs)
semantic_vectorstore = Qdrant.from_documents(
    semantic_subset_docs,  # Use semantic_subset_docs here
    embeddings,
    location=":memory:",
    collection_name="MoviesSemantic"
)
semantic_retriever = semantic_vectorstore.as_retriever(search_kwargs={"k": 10})
semantic_retrieval_chain = setup_retrieval_chain(semantic_retriever)

In [25]:
# Add this function after setting up all the retrievers
def test_retriever(retrieval_chain, question):
    response = retrieval_chain.invoke({"question": question})
    print(f"Question: {question}")
    print(f"Answer: {response['response'].content}")
    print("Context snippets:")
    for i, context in enumerate(response["context"], 1):
        print(f"{i}. {context.page_content[:100]}...")  # Print first 100 characters of each context
    print("\n" + "="*50 + "\n")

# Test question
test_question = "What are some of the most popular science fiction movies?"

# Test each retriever
print("Testing Naive Retriever:")
test_retriever(naive_retrieval_chain, test_question)

print("Testing BM25 Retriever:")
test_retriever(bm25_retrieval_chain, test_question)

print("Testing Compression Retriever:")
test_retriever(compression_retrieval_chain, test_question)

print("Testing Multi-Query Retriever:")
test_retriever(multi_query_retrieval_chain, test_question)

print("Testing Parent Document Retriever:")
test_retriever(parent_retrieval_chain, test_question)

print("Testing Ensemble Retriever:")
test_retriever(ensemble_retrieval_chain, test_question)

print("Testing Semantic Retriever:")
test_retriever(semantic_retrieval_chain, test_question)

Testing Naive Retriever:
Question: What are some of the most popular science fiction movies?
Answer: Based on the context provided, I don't have enough information to list the most popular science fiction movies. The documents mainly describe a variety of films, including documentaries, dramas, and horror, but they do not provide a comprehensive list of popular science fiction movies.

However, some well-known and popular science fiction movies include:

1. **Star Wars series**
2. **Blade Runner**
3. **The Matrix**
4. **Inception**
5. **Interstellar**
6. **2001: A Space Odyssey**
7. **E.T. the Extra-Terrestrial**
8. **The Terminator series**
9. **Jurassic Park**
10. **Avatar**

If you need more specific recommendations or information, feel free to ask!
Context snippets:
1. In interviews, various actors and directors discuss their careers and their involvement in the makin...
2. This short film is an interstellar road trip to the most amazing places we have discovered in our ga...
3. Ni

In [26]:
# Test question
test_question = "What are some of the highest grossing movies?"

# Test each retriever
print("Testing Naive Retriever:")
test_retriever(naive_retrieval_chain, test_question)

print("Testing BM25 Retriever:")
test_retriever(bm25_retrieval_chain, test_question)

print("Testing Compression Retriever:")
test_retriever(compression_retrieval_chain, test_question)

print("Testing Multi-Query Retriever:")
test_retriever(multi_query_retrieval_chain, test_question)

print("Testing Parent Document Retriever:")
test_retriever(parent_retrieval_chain, test_question)

print("Testing Ensemble Retriever:")
test_retriever(ensemble_retrieval_chain, test_question)

print("Testing Semantic Retriever:")
test_retriever(semantic_retrieval_chain, test_question)

Testing Naive Retriever:
Question: What are some of the highest grossing movies?
Answer: Based on the provided context, I don't have specific information about the highest-grossing movies in general. The documents mainly discuss various movies, including some highest-grossing Telugu movies, but do not provide a comprehensive list of the highest-grossing movies overall.

For a general list of some of the highest-grossing movies, you might consider titles like "Avatar," "Avengers: Endgame," "Titanic," "Star Wars: The Force Awakens," and "Avengers: Infinity War," which are known to be among the top-grossing films worldwide.
Context snippets:
1. highest grossing telugu movies...
2. Nine short experimental films of transgressive cinema....
3. A retrospective look at the five Dirty Harry films (1971-88), starring Clint Eastwood....
4. Bollywood 1945...
5. Bollywood 1981...
6. Silent movie drama......
7. 1947.  America.  The war is over, a pandemic has come and gone, and The American People b

In [27]:
# Evaluation function
def run_evaluation(retrieval_chain, test_questions, test_groundtruths):
    answers = []
    contexts = []

    for question in test_questions:
        response = retrieval_chain.invoke({"question": question})
        answers.append(response["response"].content)
        contexts.append([context.page_content for context in response["context"]])
    
    response_dataset = Dataset.from_dict({
        "question": test_questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": test_groundtruths
    })
    
    metrics = [faithfulness, answer_relevancy, context_recall, context_precision, answer_correctness]
    return evaluate(response_dataset, metrics)


In [28]:
# Run evaluations
@traceable(run_type="llm", name="Movies RAG Evaluation", project_name="Movies RAG Evaluation")
def run_all_evaluations():
    evaluations = {
        "Naive": run_evaluation(naive_retrieval_chain, test_questions, test_groundtruths),
        "BM25": run_evaluation(bm25_retrieval_chain, test_questions, test_groundtruths),
        "Compression": run_evaluation(compression_retrieval_chain, test_questions, test_groundtruths),
        "MultiQuery": run_evaluation(multi_query_retrieval_chain, test_questions, test_groundtruths),
        "Parent": run_evaluation(parent_retrieval_chain, test_questions, test_groundtruths),
        "Ensemble": run_evaluation(ensemble_retrieval_chain, test_questions, test_groundtruths),
        "Semantic": run_evaluation(semantic_retrieval_chain, test_questions, test_groundtruths)
    }
    return evaluations

results = run_all_evaluations()

Evaluating:   0%|          | 0/125 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/125 [00:00<?, ?it/s]

No statements were generated from the answer.


Evaluating:   0%|          | 0/125 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/125 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/125 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/125 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/125 [00:00<?, ?it/s]

In [29]:
import pandas as pd

def create_evaluation_results_table(results):
    # Create individual DataFrames for each method
    dataframes = []
    for method, metrics in results.items():
        df = pd.DataFrame(list(metrics.items()), columns=['Metric', method])
        dataframes.append(df)
    
    # Merge all DataFrames
    df_merged = dataframes[0]
    for df in dataframes[1:]:
        df_merged = df_merged.merge(df, on='Metric')
    
    return df_merged

# Assuming 'results' is the dictionary returned by run_all_evaluations()
df_merged = create_evaluation_results_table(results)

# Display the table
print(df_merged)

#If you want to save the table to a CSV file
df_merged.to_csv('movies_rag_results_larger.csv', index=False)

               Metric     Naive      BM25  Compression  MultiQuery    Parent  \
0        faithfulness  0.758350  0.781288     0.694421    0.689502  0.783243   
1    answer_relevancy  0.720100  0.684377     0.641533    0.710739  0.795175   
2      context_recall  0.940000  0.793333     0.630000    0.940000  0.980000   
3   context_precision  0.497825  0.430000     0.520000    0.513779  0.583333   
4  answer_correctness  0.558643  0.560107     0.538757    0.602730  0.598917   

   Ensemble  Semantic  
0  0.772263  0.683505  
1  0.834909  0.263710  
2  0.980000  0.480000  
3  0.478304  0.194667  
4  0.634759  0.332240  


This data provides a comparison of different retrieval methods for a question-answering system.

1. Faithfulness:
   - Measures how well the generated answers align with the provided context.
   - Parent method performs best (0.783), followed closely by BM25 (0.781) and Ensemble (0.772).
   - Semantic method performs worst (0.684).

2. Answer Relevancy:
   - Indicates how relevant the generated answers are to the questions.
   - Ensemble method performs best (0.835), followed by Parent (0.795).
   - Semantic method performs significantly worse than others (0.264).

3. Context Recall:
   - Measures how well the retriever captures all the relevant information.
   - Parent and Ensemble methods perform best (0.980), followed by Naive and MultiQuery (0.940).
   - Semantic method has the lowest recall (0.480).

4. Context Precision:
   - Indicates how precise or focused the retrieved context is.
   - Parent method performs best (0.583), followed by Compression (0.520).
   - Semantic method has the lowest precision (0.195).

5. Answer Correctness:
   - Measures the accuracy of the generated answers.
   - Ensemble method performs best (0.635), followed by MultiQuery (0.603) and Parent (0.599).
   - Semantic method performs worst (0.332).

Key takeaways:

1. The Parent and Ensemble methods consistently perform well across most metrics.
2. The Semantic method underperforms significantly compared to other methods. This could be due to the reduced dataset we used for semantic processing or potential issues with the semantic chunking approach.
3. The BM25 method, despite its simplicity, performs reasonably well, especially in faithfulness.
4. The Naive method, while simple, shows competitive performance in some metrics like context recall.
5. The Compression method shows mixed results, performing well in context precision but lagging in other metrics.
6. The MultiQuery method shows strong performance in answer correctness and context recall.

Overall, this data suggests that a combination of methods (as seen in the Ensemble approach) or the Parent document method might be the most effective for your movie database question-answering system. The poor performance of the Semantic method indicates that it might need further optimization or a larger subset of data to be effective.

I also ran the evaluation earlier with 50 documents and results are much better with the larger dataset of 5000 documents. The larger dataset generally led to improved performance across most retrieval methods, with the Parent and Ensemble methods showing the most consistent improvements.

1. Faithfulness:
   - Generally improved with larger dataset, especially for Parent (0.803 to 0.783) and MultiQuery (0.756 to 0.690).
   - BM25 saw a significant improvement (0.583 to 0.781).
   - Semantic method slightly decreased (0.730 to 0.684).

2. Answer Relevancy:
   - Improved across most methods with the larger dataset.
   - Notable improvements in Parent (0.606 to 0.795) and Ensemble (0.688 to 0.835).
   - Semantic method saw a significant decrease (0.607 to 0.264).

3. Context Recall:
   - Remained high for most methods in both datasets.
   - Parent method improved (0.917 to 0.980).
   - Semantic method decreased (0.975 to 0.480).

4. Context Precision:
   - Generally improved with the larger dataset.
   - Most notable improvement in Parent (0.400 to 0.583).
   - Semantic method decreased (0.377 to 0.195).

5. Answer Correctness:
   - Improved across all methods with the larger dataset.
   - Largest improvements in Ensemble (0.517 to 0.635) and Parent (0.501 to 0.599).
   - Semantic method decreased (0.532 to 0.332).

Key observations:

1. Overall Improvement: Most methods benefited from the larger dataset, showing improvements across multiple metrics. This suggests that more data generally leads to better performance in retrieval and question-answering tasks.

2. Parent Method: Showed consistent improvement across all metrics with the larger dataset, emerging as one of the top performers.

3. Ensemble Method: Significantly improved, especially in answer relevancy and correctness, indicating that it benefits from diverse information in a larger dataset.

4. BM25: Saw substantial improvements, particularly in faithfulness, showing that traditional keyword-based methods can still be very effective with more data.

5. Semantic Method: Interestingly, it's the only method that consistently performed worse with the larger dataset. This could be due to:
   a) The reduced subset we used for semantic processing in the larger dataset.
   b) Potential scalability issues with the semantic chunking approach.
   c) The semantic method might be more sensitive to noise in a larger, more diverse dataset.

6. Compression and MultiQuery: Showed mixed results, with improvements in some metrics and slight decreases in others.

The significant decrease in performance for the Semantic method with the larger dataset warrants further investigation and potential optimization of the semantic chunking and retrieval process for larger datasets.