# Imports

In [1]:
import pandas as pd
import json
import os
import chromadb

from chromadb.config import Settings

from dotenv import load_dotenv
from groq import Groq


load_dotenv()

True

# Load Data

In [2]:
df = pd.read_json('../data/processed/reviews.json', orient='records')

In [3]:
df['recommendationid'] = df['recommendationid'].astype(str)

In [4]:
df.head()

Unnamed: 0,recommendationid,language,review,game
0,172440169,english,terrible anti cheat people are getting false b...,cs2
1,171726642,english,this is the worst game I've ever played. It ma...,cs2
2,171674861,english,this game has caused me multiple brain injures...,cs2
3,172213464,english,Despite Counter Strike having a prestigious & ...,cs2
4,172485882,english,By adding sexual content to this game it would...,cs2


In [5]:
df.game.unique()

array(['cs2', 'black_myth', 'dota2'], dtype=object)

# ChromaDB

In [6]:
settings = Settings(persist_directory='../chroma', is_persistent=True)

In [7]:
client = chromadb.Client(settings)

# Create collection. get_collection, get_or_create_collection, delete_collection also available!
collection = client.create_collection("reviews", metadata={"hnsw:space": "cosine"})



In [56]:
documents = df['review'].to_list()
metadatas = df[['language', 'game']].to_dict(orient='records')
ids = df['recommendationid'].to_list()



In [9]:
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids,
)

In [10]:
collection.count()

300

In [11]:
results = collection.query(
    query_texts=["Does the game have any unique features or mechanics?"], # Chroma will embed this for you
    n_results=2 # how many results to return
)

In [12]:
results['documents']

[['A very difficult game, but it is also interesting because you will always have to learn in it, it is never the same!',
  'The game itself is nothing unique, but its done so well that it feels like it is. I was having fun with the game right away, but really when i hit chapter 2 i was freaking hooked. Something about the lore, the story telling, the relatively deep RPG elements, the combat - it just feels so damn fun to play. Im half way through the game at the time of writing this review, but its just so much fun i had to add to the good vibes online for the game. \n\nTHAT being said, does it drive anyone else crazy that the game doesn\'t have any terrain climbing mechanics? I know thats typical for a "souls like" game, but dude COME ON you are a Monkey king and you cant even climb a rock or a tree?!? - maybe in the sequel lol']]

In [22]:
results = collection.query(
    query_texts=["Will I experience false bans for playing the game?"],
    n_results=3,
    where={"game": "cs2"}
)

In [23]:
results['documents']

[['floded with cheaters i played wingman w/ my homie ( he was cheating only after they did) and we got banned and met them 2 games in a row + valve doing nothing bout it and they are still spinning in matches while we are banned and my ranks will prob get reseted GR8 JOB VALVE F-ING PROUD OF YDOU',
  "How bad can this game get, 25 years and I cannot even play a premier game without cheaters, how tf is this even allowed, valve you are a terrible company, get an anti cheat it's not hard you money chasing dogs",
  "too much cheaters and toxic players. they will find ways to make you either get banned. say abusing the greiving report or intentionally cross your fire while you engage enemy. even in premier matches there are cheaters! VAC doesnt seem to work well for some reason.. the weekly rewards also quite a joke... i don't think its worth it. i would rather pay for faceit subscription"]]

In [14]:
def search(query:str, game:str) -> list[str]:
    results = collection.query(
    query_texts=[query],
    n_results=2,
    where={"game": game}
)
    return results['documents']

In [15]:
def llm(prompt: str) -> str:
    client = Groq(
            api_key=os.environ.get("GROQ_API_KEY"),
        )
    
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt ,
            }
        ],
        model="llama3-8b-8192",
    )
    
    return chat_completion.choices[0].message.content

In [16]:
prompt_template = """You're a review assistant. Based on the CONTEXT provided, generate a relevant and informative response to the user's QUESTION. 
The answer must directly address the question asked, using the information available in the context of the review to provide a clear, useful and contextualized answer.

QUESTION: {question}

CONTEXT: 
{context}

""".strip()


In [17]:
def rag(query: str, game: str, prompt_template: str) -> str:
    search_result = search(query, game)

    prompt = prompt_template.format(question=query, context = search_result)

    answer = llm(prompt)

    return answer

In [24]:
query = "Will I experience false bans for playing the game?"
game = "cs2"

answer = rag(query, game, prompt_template)

In [25]:
print(answer)

Based on the context provided, it seems that the user is expressing frustration and disappointment with the game's lack of effective anti-cheating measures, which has resulted in false bans and unfair treatment of non-cheating players.

In response to your question, "Will I experience false bans for playing the game?", based on this context, it's likely that the user has had a negative experience with false bans, but it's difficult to generalize and predict whether you will experience the same issue. However, the user's testimonial suggests that the game has a significant problem with cheating, and Valve's inaction may contribute to a culture where cheating is tolerated or even rewarded.

It's possible that the game's ban system may be flawed or overaggressive, leading to false bans for players who are not cheating. This issue may be particularly problematic for players who are playing with friends, as the user in the context is claiming that they were banned alongside their cheating f

In [26]:
query = "Have Cs2\'s developers been addressing false bans due to high sensitivity and AMD drivers?"
game = "cs2"

answer = rag(query, game, prompt_template)
print(answer)

Based on the provided context, it appears that the developers of Cs2 have been inconsistent in addressing the issue of false bans caused by high sensitivity and AMD drivers. The reviewer mentions that "terrible anti-cheat people are getting false bans for high sensitivity/amd drivers/console commands while real cheaters are not getting banned" which implies that the anti-cheat system is not accurately detecting cheats. This is a recurring problem that has not been adequately addressed by the developers.


# Chroma with Chunks

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [12]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

In [15]:
texts = text_splitter.create_documents(df.review)
print(texts[0])
print(texts[1])

page_content='terrible anti cheat people are getting false bans for high sensitivity/amd drivers/console commands'
page_content='commands while real cheaters are not getting banned'


In [35]:
from langchain_community.document_loaders import DataFrameLoader

from langchain_core.documents import Document

from langchain_chroma import Chroma

In [None]:
document_1 = Document(
    page_content="I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)


In [17]:
df_loader = DataFrameLoader(df, page_content_column='review')

In [43]:
documents = df_loader.load()
documents[0]

Document(metadata={'recommendationid': '172440169', 'language': 'english', 'game': 'cs2'}, page_content="terrible anti cheat people are getting false bans for high sensitivity/amd drivers/console commands while real cheaters are not getting banned\n- no new content for months\n- no new operation (it's been 3+ years)\n- tons of competitive/wingman/hostage maps missing from csgo\n- deleted achievements from csgo with 1 pointless one just make new ones\n- no danger zone\n- no team deatmatch\n- sub tick is garbage all we wanted was 128tick servers\n- spaghetti net code\n- no short mm\n- no flying scoutsman\n- replays are bugged\n- no overwatch\n- performance issues\n- peekers advantage\n- bad ranking system\n- no economy changes despite MR12\n- movement and shooting is worse compared to csgo\n\nAll of this while they make hundreds of millions of dollars from cases/keys and yet are unable to make any fun new content unbelievable, and yet they waste their time with boring hero shooter deadlo

In [44]:
docs = text_splitter.split_documents(documents)

In [47]:
print(docs[0])
print(docs[1])

page_content='terrible anti cheat people are getting false bans for high sensitivity/amd drivers/console commands' metadata={'recommendationid': '172440169', 'language': 'english', 'game': 'cs2'}
page_content='commands while real cheaters are not getting banned' metadata={'recommendationid': '172440169', 'language': 'english', 'game': 'cs2'}


In [None]:
page_content

In [66]:
collection = client.get_or_create_collection("reviews_chunck", metadata={"hnsw:space": "cosine"})

In [67]:
from uuid import uuid4

uuids = [str(uuid4()) for _ in range(len(docs))]



In [68]:
texts = [doc.page_content for doc in docs]
metadatas = [doc.metadata for doc in docs]

In [69]:
print(len(texts))
print(len(metadatas))
print(len(uuids))

2972
2972
2972


In [70]:
print(texts[0])
print(metadatas[0])
print(uuids[0])

print(texts[1])
print(metadatas[1])
print(uuids[1])

terrible anti cheat people are getting false bans for high sensitivity/amd drivers/console commands
{'recommendationid': '172440169', 'language': 'english', 'game': 'cs2'}
2eeba8ab-3963-4c61-8ce5-52942842c96b
commands while real cheaters are not getting banned
{'recommendationid': '172440169', 'language': 'english', 'game': 'cs2'}
df8e6d80-0c63-4b6a-a7b7-b365a1e03c55


In [72]:
collection.add(
    documents=texts,
    metadatas=metadatas,
    ids=uuids,
)

In [73]:
collection.count()

2972

In [74]:
def search(query:str, game:str = None) -> list[str]:
    results = collection.query(
    query_texts=[query],
    n_results=10,
    # where={"game": game}
)
    return results

In [76]:
results = search('What are the chances of getting a false ban in this game?')

In [77]:
results

{'ids': [['3e88d6dc-b598-4808-89a6-8a22eec5400d',
   '48f330b0-a347-406d-88cd-50ac424f381a',
   '6602fd7a-d78f-44a3-9ba9-f644c8cd0b8b',
   'a41cb42e-9365-40d4-a0d1-6311767e7806',
   '3de3c433-3f80-4fd6-8786-4ee7d01adf8f',
   '6101474a-21f0-4075-b2a2-ffd28d54360c',
   'df8e6d80-0c63-4b6a-a7b7-b365a1e03c55',
   '9bb0f549-43a9-4cb1-9b61-aecb6a53d4fc',
   '2eeba8ab-3963-4c61-8ce5-52942842c96b',
   '117b0a97-e075-43a7-bdc2-70a114d7d99c']],
 'distances': [[0.44741660356521606,
   0.4779500365257263,
   0.4898596405982971,
   0.5022164583206177,
   0.5406852960586548,
   0.5500341653823853,
   0.5515846014022827,
   0.573614239692688,
   0.5768454670906067,
   0.5787902474403381]],
 'metadatas': [[{'game': 'dota2',
    'language': 'english',
    'recommendationid': '171924455'},
   {'game': 'cs2', 'language': 'english', 'recommendationid': '174059061'},
   {'game': 'dota2', 'language': 'english', 'recommendationid': '171924455'},
   {'game': 'dota2', 'language': 'english', 'recommendationid':