# RAG application for game research 


In [1]:
# Import necessary libraries
import os
from typing import List, Dict, Any, Optional
import numpy as np
import re
from tqdm.auto import tqdm
import pandas as pd


# LangSmith imports for tracing
from langsmith import traceable
from langchain.callbacks.tracers import LangChainTracer
from langchain.callbacks.manager import CallbackManager

# LangChain imports
from langchain_qdrant import Qdrant
from langchain_openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

from qdrant_client import QdrantClient
from qdrant_client.http import models as rest

In [2]:
def extract_game_name(query: str, games_csv_path: str = "games.csv") -> Optional[str]:
    """
    Extract game name from a user query by checking against a CSV file of game names.
    
    Args:
        query: The user's query text
        games_csv_path: Path to CSV file containing game names
        
    Returns:
        The extracted game name or None if no game found
    """
    # Check if CSV file exists
    if not os.path.exists(games_csv_path):
        print(f"Warning: Games CSV file {games_csv_path} not found.")
        return None
    
    try:
        # Load game names from CSV
        games_df = pd.read_csv(games_csv_path, header=None)
        game_names = games_df[0].tolist()  # Assuming the game names are in the first column
        
        # Normalize query for matching
        normalized_query = query.lower()
        
        # Sort game names by length (descending) to prioritize longer matches
        for game in sorted(game_names, key=len, reverse=True):
            # Check for case-insensitive match
            if game.lower() in normalized_query:
                return game
        
        # Enhanced detection with patterns
        patterns = [
            r"for\s+(.+?)(?:\s+game|\s+reviews|\s+in|\?|$)",  # "for [game]"
            r"about\s+(.+?)(?:\s+game|\s+reviews|\?|$)",      # "about [game]"
            r"in\s+(.+?)(?:\s+game|\s+reviews|\?|$)",         # "in [game]"
        ]
        
        for pattern in patterns:
            matches = re.search(pattern, normalized_query)
            if matches:
                potential_game = matches.group(1).strip()
                # Find the closest match in our game list
                for game in game_names:
                    if potential_game in game.lower() or game.lower() in potential_game:
                        return game
        
        return None
        
    except Exception as e:
        print(f"Error extracting game name: {e}")
        return None

In [3]:
@traceable(name="setup_rag_retriever")
def setup_rag_retriever(
    collection_name: str = "steam_reviews",
    openai_api_key: str = None,
    embedding_model: str = "text-embedding-3-small",
    search_type: str = "similarity",
    k: int = 4,
    game_name: str = None
):
    """
    Set up a langchain retriever from an existing Qdrant collection with tracing.
    Adds ability to filter by game name.
    """
    if openai_api_key is None:
        openai_api_key = os.environ.get("OPENAI_API_KEY")
        
    # Setup embeddings
    embeddings = OpenAIEmbeddings(
        model=embedding_model,
        openai_api_key=openai_api_key
    )
    
    # Connect to existing Qdrant collection
    client = QdrantClient(host="localhost", port=6333)
    
    # Create a filter if game_name is provided using the exact dictionary format from Qdrant docs
    search_kwargs = {"k": k}
    if game_name:
        # This is the correct filter format according to Qdrant documentation
        search_kwargs["filter"] = {
            "must": [
                {
                    "key": "game_name",
                    "match": {
                        "value": game_name
                    }
                }
            ]
        }
        print(f"Filtering for game: {game_name}")
    
    # Create vector store with the filter
    vector_store = Qdrant(
        client=client,
        collection_name=collection_name,
        embeddings=embeddings,
    )
    
    # Create retriever with the specified search type and filter
    retriever = vector_store.as_retriever(
        search_type=search_type,
        search_kwargs=search_kwargs
    )
    
    return retriever

In [4]:
# Initialize callback manager for tracing
def get_tracer_callback_manager():
    tracer = LangChainTracer(project_name=os.environ.get("LANGCHAIN_PROJECT", "steam-reviews-rag"))
    return CallbackManager([tracer])


In [5]:
# Updated RAG system setup with game name extraction
@traceable(name="setup_rag_system")
def setup_rag_system(collection_name="steam_reviews", k=25, model="gpt-4o-mini", temperature=0):
    """
    Set up RAG system with the ability to filter by game name if specified in queries.
    """
    # Set up client for game name extraction
    client = QdrantClient(host="localhost", port=6333)
    
    # We'll set up the retriever in the query function after game extraction
    
    # Set up prompt template
    template = """You are an AI assistant helping game developers improve their games based on Steam reviews.
    Use the following pieces of context (review excerpts) to answer the question.
    If you don't know the answer, just say that you don't know. If the question is not related to the game, just say that you don't know. 
    If the question is about a game not explicitly present in the context, just say that you don't know. 
    Always verify that the game specified in the question is present in the context before answering. 
    
    Context:
    {context}
    
    Question: {question}
    Answer:"""
    
    PROMPT = PromptTemplate(
        template=template,
        input_variables=["context", "question"]
    )
    
    # Set up LLM with tracing
    llm = ChatOpenAI(
        temperature=temperature, 
        model=model
    )
    
    # Create a wrapper class that will extract game name on each query
    class GameFilteredRAG:
        def __init__(self, llm, prompt, client, collection_name, k):
            self.llm = llm
            self.prompt = prompt
            self.client = client
            self.collection_name = collection_name
            self.k = k
        
        def invoke(self, params):
            query = params.get("query", "")
            
            # Extract game name from query
            game_name = extract_game_name(query, games_csv_path="games.csv")
            
            # Set up retriever with game filter if applicable
            retriever = setup_rag_retriever(
                collection_name=self.collection_name, 
                k=self.k,
                game_name=game_name
            )
            
            # Create QA chain with the configured retriever
            qa = RetrievalQA.from_chain_type(
                llm=self.llm,
                retriever=retriever,
                chain_type_kwargs={"prompt": self.prompt},
                return_source_documents=True
            )
            
            # Log which game is being filtered (if any)
            if game_name:
                print(f"Filtering results for game: {game_name}")
            else:
                print("No specific game detected in query. Searching across all games.")
            
            # Execute query
            result = qa.invoke({"query": query})
            return result
    
    # Create and return our custom RAG implementation
    return GameFilteredRAG(llm, PROMPT, client, collection_name, k)

In [6]:
# Updated function to run RAG query with game filtering
@traceable(name="run_rag_query")
def run_rag_query(qa_system, query):
    """
    Run a RAG query with game name filtering and comprehensive tracing for performance analysis
    """
    # Execute query using invoke method instead of __call__
    result = qa_system.invoke({"query": query})
    
    # Calculate result metrics
    run_metrics = {
        "query": query,
        "result_length": len(result.get("result", "")),
        "num_source_docs": len(result.get("source_documents", [])),
    }
    
    return result, run_metrics

# Usage 


In [7]:
if __name__ == "__main__":
    # Check if LangSmith credentials are set
    if not os.environ.get("LANGCHAIN_API_KEY"):
        print("Warning: LANGCHAIN_API_KEY not set. LangSmith tracing will not work.")
        quit 
        
    # Set up the RAG system with tracing
    qa_chain = setup_rag_system(collection_name="steam_reviews", k=250)
    
    # Run a query with detailed performance tracing
    query = "What is the most commong complaint about Counter-Strike?"
    result, metrics = run_rag_query(qa_chain, query)
    
    # Print the result
    print(f"Result: {result['result']}")
    print(f"Metrics: {metrics}")

  llm = ChatOpenAI(
  vector_store = Qdrant(


Filtering for game: Counter-Strike
Filtering results for game: Counter-Strike
Result: I don't know.
Metrics: {'query': 'What is the most commong complaint about Counter-Strike?', 'result_length': 13, 'num_source_docs': 0}
