# Imports

In [1]:
import os
import pandas as pd
import json
import chromadb

from chromadb.config import Settings

from dotenv import load_dotenv
from openai import OpenAI
from groq import Groq
from tqdm import tqdm

load_dotenv()

True

# Load Data

In [2]:
with open('../data/processed/reviews.json', encoding='utf-8') as f_in:
    reviews = json.loads(f_in.read())

In [3]:
reviews[0]

{'recommendationid': '172440169',
 'language': 'english',
 'review': "terrible anti cheat people are getting false bans for high sensitivity/amd drivers/console commands while real cheaters are not getting banned\n- no new content for months\n- no new operation (it's been 3+ years)\n- tons of competitive/wingman/hostage maps missing from csgo\n- deleted achievements from csgo with 1 pointless one just make new ones\n- no danger zone\n- no team deatmatch\n- sub tick is garbage all we wanted was 128tick servers\n- spaghetti net code\n- no short mm\n- no flying scoutsman\n- replays are bugged\n- no overwatch\n- performance issues\n- peekers advantage\n- bad ranking system\n- no economy changes despite MR12\n- movement and shooting is worse compared to csgo\n\nAll of this while they make hundreds of millions of dollars from cases/keys and yet are unable to make any fun new content unbelievable, and yet they waste their time with boring hero shooter deadlock that they will abandon within th

# Generate Questions

In [4]:
# prompt_template="""Based on the game's reviews, I'm creating a project to help video game players decide whether or not to buy a game based on game reviews made by other players. 
# From the review below, simulate 5 questions that a player can ask based on a provided review. The questions should be specific to this review.

# Review:

# {review}

# Output:

# {{"questions": ["question1", "question2", ..., "question5"]}}
# """.strip()

In [5]:
prompt_template="""You will receive a Review of a game from Steam. Based on the content of this review, your task is to simulate 5 questions that new or potential players may have about the game. 
Questions should cover different important aspects included in the review, such as technical issues, performance, available content, game modes, developer decisions, among others.

** Instructions **
    * Always Provide the output in parsable JSON
    * Return the output without any code blocks
    * Just return the JSON output without writing anything before or after

Review:

{review}

Output:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [6]:
def llm(prompt: str) -> str:
    client = Groq(
            api_key=os.environ.get("GROQ_API_KEY"),
        )
    
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt ,
            }
        ],
        model="llama3-8b-8192",
    )
    
    return chat_completion.choices[0].message.content

In [7]:
# client = OpenAI()
# def llm(prompt: str) -> str:
#     response = client.chat.completions.create(
#         model='gpt-4o-mini',
#         messages=[{"role": "user", "content": prompt}]
#     )
    
#     return response.choices[0].message.content

In [8]:
def generate_questions(review: dict[str, str], prompt_template: str) -> list[str]:
    prompt = prompt_template.format(**review)

    questions = llm(prompt)

    return questions
    

In [9]:
reviews[0]

{'recommendationid': '172440169',
 'language': 'english',
 'review': "terrible anti cheat people are getting false bans for high sensitivity/amd drivers/console commands while real cheaters are not getting banned\n- no new content for months\n- no new operation (it's been 3+ years)\n- tons of competitive/wingman/hostage maps missing from csgo\n- deleted achievements from csgo with 1 pointless one just make new ones\n- no danger zone\n- no team deatmatch\n- sub tick is garbage all we wanted was 128tick servers\n- spaghetti net code\n- no short mm\n- no flying scoutsman\n- replays are bugged\n- no overwatch\n- performance issues\n- peekers advantage\n- bad ranking system\n- no economy changes despite MR12\n- movement and shooting is worse compared to csgo\n\nAll of this while they make hundreds of millions of dollars from cases/keys and yet are unable to make any fun new content unbelievable, and yet they waste their time with boring hero shooter deadlock that they will abandon within th

In [10]:
questions = generate_questions(reviews[0], prompt_template)
questions

'{"questions": ["What are the common issues with the anti-cheat system in the game?", "Is there a regular flow of new content and updates to the game?", "What are the game modes available in the game, and are there any missing modes?", "What are the technical issues with the game, and are there any plans to address them?", "Why is there a perception that the game\'s focus is misguided, with too much emphasis on certain aspects and not enough on others?"]}'

In [11]:
questions = json.loads(questions)

## Generate all Questions

In [17]:
results = {}

In [96]:
for review in tqdm(reviews):
    review_id = review['recommendationid']
    if review_id in results:
        continue
    q = generate_questions(review, prompt_template)
    try:
        questions = json.loads(q)
    except Exception as error:
        print(review_id)
        print(q)
        raise

    results[review_id] = questions['questions']

100%|███████████████████| 300/300 [00:25<00:00, 11.66it/s]


# Save Questions

In [97]:
final_results = []

for review_id, questions in results.items():
    for q in questions:
        final_results.append((review_id, q))

In [98]:
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [56]:
df_reviews = pd.DataFrame(reviews)
df_results.id = df_results.id.astype(str)
df_reviews.recommendationid = df_reviews.recommendationid.astype(str)
df_results = pd.merge(df_results, df_reviews[['recommendationid', 'game']], left_on='id', right_on=['recommendationid'])
df_results.drop(['recommendationid'], axis=1)
df_results.head()

Unnamed: 0,id,question,recommendationid,game
0,172440169,What are the chances of getting a false ban in...,172440169,cs2
1,172440169,Is the game getting regular updates with new c...,172440169,cs2
2,172440169,"Are all game modes available, including compet...",172440169,cs2
3,172440169,How does the game's net code and server perfor...,172440169,cs2
4,172440169,Is there a ranking system in the game that tak...,172440169,cs2


In [57]:
df_results.to_csv('../data/processed/ground-truth-retrieval.csv', index=False)

# ChromaDB

In [12]:
settings = Settings(persist_directory='../chroma', is_persistent=True)
client = chromadb.Client(settings)

# Create collection. get_collection, get_or_create_collection, delete_collection also available!
collection = client.get_or_create_collection("reviews")



In [13]:
collection.count()

300

# Evaluate RAG

## Text Search

In [59]:
df_results = pd.read_csv('../data/processed/ground-truth-retrieval.csv')

In [60]:
ground_truth = df_results.to_dict(orient='records')
ground_truth[0]

{'id': 172440169,
 'question': 'What are the chances of getting a false ban in this game?',
 'recommendationid': 172440169,
 'game': 'cs2'}

In [64]:
def search(query:str, game:str = None) -> list[str]:
    results = collection.query(
    query_texts=[query],
    n_results=10,
    where={"game": game}
)
    return results

In [65]:
results = search(ground_truth[0]['question'], ground_truth[0]['game'])

In [66]:
results

{'ids': [['173478408',
   '171783807',
   '174023371',
   '172336119',
   '172336456',
   '172318115',
   '174045052',
   '171354474',
   '172440169',
   '174019744']],
 'distances': [[0.541038990020752,
   0.5709561109542847,
   0.5890097618103027,
   0.601252555847168,
   0.604134202003479,
   0.6054655313491821,
   0.6419882774353027,
   0.6502718925476074,
   0.6533358097076416,
   0.6576564311981201]],
 'metadatas': [[{'game': 'cs2', 'language': 'english'},
   {'game': 'cs2', 'language': 'english'},
   {'game': 'cs2', 'language': 'english'},
   {'game': 'cs2', 'language': 'english'},
   {'game': 'cs2', 'language': 'english'},
   {'game': 'cs2', 'language': 'english'},
   {'game': 'cs2', 'language': 'english'},
   {'game': 'cs2', 'language': 'english'},
   {'game': 'cs2', 'language': 'english'},
   {'game': 'cs2', 'language': 'english'}]],
 'embeddings': None,
 'documents': [['floded with cheaters i played wingman w/ my homie ( he was cheating only after they did) and we got banned

In [18]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [19]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [67]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['id']
    results = search(q['question'], game=q['game'])
    print
    relevance = [d == str(doc_id) for d in results['ids'][0]]
    relevance_total.append(relevance)

100%|█████████████████| 1500/1500 [01:17<00:00, 19.33it/s]


In [68]:
hit_rate(relevance_total), mrr(relevance_total)

(0.46, 0.2490748677248676)

## Embeddings

In [69]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [70]:
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')



In [71]:
chroma = Chroma(collection_name='reviews_embeddings',embedding_function=embeddings)
chroma.reset_collection()

In [72]:
def chunk_text(text: str, chunk_size: int=256) -> list[str]:
    """Split text into chunks of a specified size."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=50)
    
    return text_splitter.split_text(text)

def embed_and_store_reviews(reviews):
    """Generate embeddings and store them in ChromaDB with metadata."""
    for review_data in reviews:
        review = review_data['review']
        chunks = chunk_text(review)
    
        metadatas = []
        for chunck in chunks:
            metadatas.append( {
            'recommendationid': review_data['recommendationid'],
            'language': review_data['language'],
            'game': review_data['game']
        })
   
    
        # Store embedding in ChromaDB with metadata
        chroma.add_texts(texts=chunks, metadatas=metadatas)
        

In [73]:
embed_and_store_reviews(reviews)

In [74]:
ground_truth[0]

{'id': 172440169,
 'question': 'What are the chances of getting a false ban in this game?',
 'recommendationid': 172440169,
 'game': 'cs2'}

In [75]:
results = chroma.similarity_search(ground_truth[0]['question'], k=10, filter={'game':ground_truth[0]['game']})

In [76]:
results

[Document(metadata={'game': 'cs2', 'language': 'english', 'recommendationid': '174023371'}, page_content='- No real anti-cheat (which has been literally the most requested fix since the beginning of CS:GO [2012 btw!]). Because of this, not only are cheaters not getting banned, but completely innocent people are instead. Not to mention the fact that the higher'),
 Document(metadata={'game': 'cs2', 'language': 'english', 'recommendationid': '173478408'}, page_content='floded with cheaters i played wingman w/ my homie ( he was cheating only after they did) and we got banned and met them 2 games in a row + valve doing nothing bout it and they are still spinning in matches while we are banned and my ranks will prob get'),
 Document(metadata={'game': 'cs2', 'language': 'english', 'recommendationid': '172440169'}, page_content="terrible anti cheat people are getting false bans for high sensitivity/amd drivers/console commands while real cheaters are not getting banned\n- no new content for mo

In [77]:
for d in results:
   display(d.metadata['recommendationid'])

'174023371'

'173478408'

'172440169'

'174059061'

'171783807'

'171362623'

'172336456'

'174045052'

'172336119'

'172318115'

In [78]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['id']
    results = chroma.similarity_search(q['question'], filter={'game':q['game']},k=10)
    relevance = [d.metadata['recommendationid'] == str(doc_id) for d in results]
    relevance_total.append(relevance)

100%|█████████████████| 1500/1500 [00:24<00:00, 61.85it/s]


In [79]:
hit_rate(relevance_total), mrr(relevance_total)

(0.4573333333333333, 0.2842103174603172)

In [189]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

class LangChainChromaRAG:
    def __init__(self, collection_name, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2", persist_directory="./chroma_db_new"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
        
        self.vectorstore = Chroma(
            collection_name=collection_name,
            embedding_function=self.embeddings,
            persist_directory=persist_directory
        )
        
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=256,
            chunk_overlap=50,
            length_function=len
        )

    def add_game_reviews(self, reviews):
        new_reviews = []
        for review in reviews:
            if not self._review_exists(review['recommendationid']):
                new_reviews.append(review)
            else:
                print(f"Review with ID {review['recommendationid']} already exists. Skipping.")

        for review in new_reviews:
            chunks = self.text_splitter.split_text(review['review'])
            metadatas = [{
                'recommendationid': review['recommendationid'],
                'language': review['language'],
                'game': review['game'],
                'chunk_index': i  # Add chunk index to metadata
            } for i in range(len(chunks))]
            
            self.vectorstore.add_texts(texts=chunks, metadatas=metadatas)

    def _review_exists(self, recommendationid):
        # Check if any document exists with the given recommendationid
        results = self.vectorstore.similarity_search(
            "dummy query",  # The query doesn't matter here
            k=1,
            filter={"recommendationid": recommendationid}
        )
        return len(results) > 0

    def search(self, query, n_results=5, filter=None):
        return self.vectorstore.similarity_search(query, k=n_results, filter=filter)

    def get_review(self, recommendationid):
        chunks = self.vectorstore.similarity_search(
            "dummy query",  # The query doesn't matter here
            k=100,  # Set a high number to retrieve all chunks
            filter={"recommendationid": recommendationid}
        )
        # Sort chunks by chunk_index and concatenate
        sorted_chunks = sorted(chunks, key=lambda x: x.metadata['chunk_index'])
        full_review = " ".join(chunk.page_content for chunk in sorted_chunks)
        return full_review

    def update_review(self, review):
        # First, remove the existing review
        self.delete_review(review['recommendationid'])
        # Then add the updated review
        self.add_game_reviews([review])

    def delete_review(self, recommendationid:str):
        # Get the underlying Chroma collection
        collection = self.vectorstore._collection
        
        # Query for documents with the given recommendationid
        results = collection.get(
            where={"recommendationid": recommendationid},
            include=["metadatas"]
        )
        
        # Extract the ids from the metadatas
        ids_to_delete = [id for id in results['ids'] if id]
        
        # Delete the documents by their ids
        if ids_to_delete:
            collection.delete(ids=ids_to_delete)
        else:
            print(f"No documents found with recommendationid: {recommendationid}")


In [186]:
rag = LangChainChromaRAG("game_reviews")

In [None]:
rag.add_game_reviews(reviews)

In [177]:
ground_truth[0]

{'id': 172440169,
 'question': 'What are the chances of getting a false ban in this game?',
 'recommendationid': 172440169,
 'game': 'cs2'}

In [178]:
rag.search(ground_truth[0]['question'], filter={'game': ground_truth[0]['game']})

[Document(metadata={'game': 'cs2', 'language': 'english', 'recommendationid': '174023371'}, page_content='- No real anti-cheat (which has been literally the most requested fix since the beginning of CS:GO [2012 btw!]). Because of this, not only are cheaters not getting banned, but completely innocent people are instead. Not to mention the fact that the higher'),
 Document(metadata={'game': 'cs2', 'language': 'english', 'recommendationid': '173478408'}, page_content='floded with cheaters i played wingman w/ my homie ( he was cheating only after they did) and we got banned and met them 2 games in a row + valve doing nothing bout it and they are still spinning in matches while we are banned and my ranks will prob get'),
 Document(metadata={'chunk_index': 0, 'game': 'cs2', 'language': 'english', 'recommendationid': '172440169'}, page_content="terrible anti cheat people are getting false bans for high sensitivity/amd drivers/console commands while real cheaters are not getting banned\n- no 

In [94]:
chroma.similarity_search(ground_truth[0]['question'], k=5, filter={'game':ground_truth[0]['game']})

[Document(metadata={'game': 'cs2', 'language': 'english', 'recommendationid': '174023371'}, page_content='- No real anti-cheat (which has been literally the most requested fix since the beginning of CS:GO [2012 btw!]). Because of this, not only are cheaters not getting banned, but completely innocent people are instead. Not to mention the fact that the higher'),
 Document(metadata={'game': 'cs2', 'language': 'english', 'recommendationid': '173478408'}, page_content='floded with cheaters i played wingman w/ my homie ( he was cheating only after they did) and we got banned and met them 2 games in a row + valve doing nothing bout it and they are still spinning in matches while we are banned and my ranks will prob get'),
 Document(metadata={'game': 'cs2', 'language': 'english', 'recommendationid': '172440169'}, page_content="terrible anti cheat people are getting false bans for high sensitivity/amd drivers/console commands while real cheaters are not getting banned\n- no new content for mo

In [179]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['id']
    results = rag.search(q['question'], filter={'game': q['game']}, n_results=10)
    relevance = [d.metadata['recommendationid'] == str(doc_id) for d in results]
    relevance_total.append(relevance)

100%|█████████████████| 1500/1500 [00:25<00:00, 58.98it/s]


In [180]:
hit_rate(relevance_total), mrr(relevance_total)

(0.466, 0.28848999518999496)