# RAG application for game research 


In [34]:
# Import necessary libraries
import os
from typing import List, Dict, Any, Optional
import numpy as np
import re
from tqdm.auto import tqdm
import pandas as pd

# LangSmith imports for tracing
from langsmith import traceable
from langchain.callbacks.tracers import LangChainTracer
from langchain.callbacks.manager import CallbackManager

# LangChain imports
from langchain_qdrant import Qdrant
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

from qdrant_client import QdrantClient
from qdrant_client.http import models as rest


In [35]:

# Function to extract game name from query - keeping for backward compatibility
def extract_game_name(query: str, games_csv_path: str = "games.csv") -> Optional[str]:
    """
    Extract game name from a user query by checking against a CSV file of game names.

    Matching is done in two passes:
      1. Direct: game names are tried longest-first and the first one that occurs
         as a case-insensitive substring of the query is returned.
      2. Pattern: the text captured after "for", "about" or "in" is compared with
         every game name (in CSV order); a name is returned when either string
         contains the other.

    Args:
        query: The user's query text
        games_csv_path: Path to a header-less CSV file whose first column holds game names

    Returns:
        The game name exactly as written in the CSV, or None if no game was found,
        the CSV file does not exist, or an error occurred while reading/matching.
    """
    # Check if CSV file exists
    if not os.path.exists(games_csv_path):
        print(f"Warning: Games CSV file {games_csv_path} not found.")
        return None

    try:
        # dtype=str keeps numeric-looking titles (e.g. "2048") as text; without it
        # pandas may parse them as numbers and len()/.lower() would fail.
        games_df = pd.read_csv(games_csv_path, header=None, dtype=str)

        # Drop missing cells and blank names: NaN has no .lower(), and a blank
        # name would be a substring of almost every query.
        game_names = [
            name for name in games_df[0].dropna().tolist() if name.strip()
        ]

        # Lower-case every name once instead of on each comparison
        lowered_games = [(game, game.lower()) for game in game_names]

        # Normalize query for matching
        normalized_query = query.lower()

        # Sort game names by length (descending) to prioritize longer matches
        for game, game_lower in sorted(
            lowered_games, key=lambda pair: len(pair[0]), reverse=True
        ):
            if game_lower in normalized_query:
                return game

        # Enhanced detection with patterns
        patterns = [
            r"for\s+(.+?)(?:\s+game|\s+reviews|\s+in|\?|$)",  # "for [game]"
            r"about\s+(.+?)(?:\s+game|\s+reviews|\?|$)",      # "about [game]"
            r"in\s+(.+?)(?:\s+game|\s+reviews|\?|$)",         # "in [game]"
        ]

        for pattern in patterns:
            match = re.search(pattern, normalized_query)
            if not match:
                continue

            potential_game = match.group(1).strip()
            # A whitespace-only capture strips to "", which is a substring of
            # every name and would wrongly select the first game in the list.
            if not potential_game:
                continue

            # Find the closest match in our game list
            for game, game_lower in lowered_games:
                if potential_game in game_lower or game_lower in potential_game:
                    return game

        return None

    except Exception as e:
        # Deliberate best-effort: any read/parse failure is reported and treated
        # as "no game found" so callers never have to handle an exception.
        print(f"Error extracting game name: {e}")
        return None


In [36]:

@traceable(name="setup_rag_retriever")
def setup_rag_retriever(
    collection_name: str = "steam_reviews",
    openai_api_key: Optional[str] = None,
    embedding_model: str = "text-embedding-3-small",
    search_type: str = "similarity",
    k: int = 4,
    qdrant_host: str = "localhost",
    qdrant_port: int = 6333,
):
    """
    Set up a langchain retriever from an existing Qdrant collection with tracing.
    No game filtering - will format game name in the documents during context preparation.

    Args:
        collection_name: Name of the existing Qdrant collection to query
        openai_api_key: OpenAI API key; when None, the OPENAI_API_KEY environment
            variable is used instead
        embedding_model: OpenAI embedding model name used to embed queries
        search_type: Search type forwarded to `as_retriever`
        k: Number of documents requested per query (passed as search_kwargs["k"])
        qdrant_host: Host of the Qdrant server
        qdrant_port: Port of the Qdrant server

    Returns:
        The retriever produced by `vector_store.as_retriever(...)`.
    """
    # NOTE(review): this function is wrapped by @traceable, which may record the
    # call arguments in the trace - prefer supplying the key through the
    # OPENAI_API_KEY environment variable rather than the parameter; confirm
    # against the LangSmith configuration in use.
    if openai_api_key is None:
        openai_api_key = os.environ.get("OPENAI_API_KEY")

    # Setup embeddings
    embeddings = OpenAIEmbeddings(
        model=embedding_model,
        openai_api_key=openai_api_key,
    )

    # Connect to existing Qdrant collection
    client = QdrantClient(host=qdrant_host, port=qdrant_port)

    # Create vector store without game filtering
    vector_store = Qdrant(
        client=client,
        collection_name=collection_name,
        embeddings=embeddings,
    )

    # Create retriever with specified search parameters
    return vector_store.as_retriever(
        search_type=search_type,
        search_kwargs={"k": k},
    )


In [37]:

# Initialize callback manager for tracing
def get_tracer_callback_manager():
    """
    Build a CallbackManager containing a single LangChainTracer.

    The tracer's project name is read from the LANGCHAIN_PROJECT environment
    variable and falls back to "steam-reviews-rag" when that variable is unset.
    """
    project = os.getenv("LANGCHAIN_PROJECT", "steam-reviews-rag")
    handlers = [LangChainTracer(project_name=project)]
    return CallbackManager(handlers)


In [38]:

# Custom function to format documents
def format_documents(docs):
    """
    Render retrieved documents as one context string.

    Each document becomes `Review for "<game_name>": <page_content>`, where the
    game name is read from the document's metadata under "game_name" (falling
    back to "Unknown Game"). Entries are separated by a blank line; an empty
    input yields an empty string.
    """
    def _render(doc):
        # Title comes from metadata so the model can tell which game a review belongs to
        title = doc.metadata.get("game_name", "Unknown Game")
        return f"Review for \"{title}\": {doc.page_content}"

    return "\n\n".join(map(_render, docs))


In [39]:

# Updated RAG system setup without game filtering
@traceable(name="setup_rag_system")
def setup_rag_system(collection_name="steam_reviews", k=25, model="gpt-4o-mini", temperature=0):
    """
    Set up RAG system that formats review context to include game name from metadata.

    Args:
        collection_name: Qdrant collection passed to `setup_rag_retriever`
        k: Number of documents the retriever is asked to return per query
        model: Chat model name passed to `ChatOpenAI`
        temperature: Sampling temperature passed to `ChatOpenAI`

    Returns:
        An object with an `invoke({"query": ...})` method that returns a dict with
        keys "result" (the LLM answer text) and "source_documents" (the retrieved
        documents).
    """
    # Set up retriever without game filtering (it creates its own Qdrant client,
    # so no separate client is needed here)
    retriever = setup_rag_retriever(
        collection_name=collection_name,
        k=k
    )

    # Set up prompt template
    template = """You are an AI assistant helping game developers improve their games based on Steam reviews.
    To answer the question, you will be given a list of reviews mentioning the game requested. The reviews are formatted as follows:
    "Review for [game_name]: [review_content]"
    Always check that the game name is correct for the reviews considered. If the game requested does not match the game passed in one 
    or more reviews then ignore the reviews that do not match the game name, unless the content of the review is relevant to the question.
    If you don't know the answer, just say that you don't know. If the question is not related to the game, just say that you don't know.
    If the question is about a game not explicitly present in the context, just say that you don't know.
    Always verify that the game specified in the question is present in the context before answering.
    
    Context:
    {context}
    
    Question: {question}
    Answer:"""

    PROMPT = PromptTemplate(
        template=template,
        input_variables=["context", "question"]
    )

    # Set up LLM
    llm = ChatOpenAI(
        temperature=temperature,
        model=model
    )

    # Class to customize document formatting for RAG
    class CustomFormatRAG:
        """Retrieve documents, format them with game names, and query the LLM."""

        def __init__(self, llm, retriever, prompt):
            self.llm = llm
            self.retriever = retriever
            self.prompt = prompt

        def invoke(self, params):
            """
            Answer params["query"] (defaults to "" when the key is missing).

            Returns a dict shaped like a RetrievalQA result:
            {"result": <answer text>, "source_documents": <retrieved docs>}.
            """
            query = params.get("query", "")

            # Execute retrieval to get documents; `invoke` is the Runnable entry
            # point that replaces the deprecated `get_relevant_documents`
            docs = self.retriever.invoke(query)

            # Format documents to include game name
            formatted_context = format_documents(docs)

            # Execute the query with the LLM
            llm_response = self.llm.invoke(
                self.prompt.format(
                    context=formatted_context,
                    question=query
                )
            )

            # Format the response to match the expected structure
            return {
                "result": llm_response.content,
                "source_documents": docs
            }

    # Create and return our custom RAG implementation
    return CustomFormatRAG(llm, retriever, PROMPT)


In [40]:

# Function to run RAG query
@traceable(name="run_rag_query")
def run_rag_query(qa_system, query):
    """
    Send `query` through `qa_system.invoke` and summarise the response.

    Returns a `(result, run_metrics)` tuple, where `result` is whatever the QA
    system returned and `run_metrics` records the query, the length of the
    answer text ("result" key) and the number of entries under
    "source_documents". Missing keys count as empty.
    """
    response = qa_system.invoke({"query": query})

    # Pull out the two fields being measured, tolerating their absence
    answer_text = response.get("result", "")
    sources = response.get("source_documents", [])

    stats = dict(
        query=query,
        result_length=len(answer_text),
        num_source_docs=len(sources),
    )
    return response, stats


# Usage 


In [41]:

# Usage example
if __name__ == "__main__":
    # LangSmith tracing needs an API key; warn (but keep going) when it is unset or empty
    if not os.getenv("LANGCHAIN_API_KEY"):
        print("Warning: LANGCHAIN_API_KEY not set. LangSmith tracing will not work.")

    # Build the traced RAG pipeline over the Steam reviews collection
    qa_chain = setup_rag_system(k=25, collection_name="steam_reviews")

    # Example question, sent through the traced query helper
    query = (
        "What is the most common complaint about Counter-Strike Global Offensive "
        "in comparison to older Counter-Strike games?"
    )
    result, metrics = run_rag_query(qa_chain, query)

    # Show the answer followed by the collected run metrics
    print(f"Result: {result['result']}")
    print(f"Metrics: {metrics}")

Result: The most common complaint about Counter-Strike: Global Offensive (CS:GO) in comparison to older Counter-Strike games is the lack of innovation and significant changes to gameplay. Many reviews express that CS:GO feels like a rehash of its predecessors with minimal improvements, leading to a stale experience for longtime fans. Additionally, issues such as unbalanced gameplay, inconsistent hit registration, and a toxic community are frequently mentioned, contributing to a sense of frustration among players. The matchmaking system is also criticized for being inefficient and unbalanced, further detracting from the overall experience.
Metrics: {'query': 'What is the most common complaint about Counter-Strike Global Offensive in comparison to older Counter-Strike games?', 'result_length': 638, 'num_source_docs': 25}
