In [1]:
import pandas as pd
import nltk
import re
import collections

In [2]:
df = pd.read_csv("cp2077_reviews.csv.zip", compression="zip")
df["Review"] = df["Review"].astype("str")
df

Unnamed: 0,Review,Recommended or Not Recommended,Date Timestamp Created
0,"Been here since day 1, and I am someone who ca...",True,1645046263
1,Had to replay this immediately to demolish Ada...,True,1663224196
2,Patch 1.5 fixed everything for me. \nThe quest...,True,1645267750
3,Watches Edgerunners -> Downloads Cyberpunk 207...,True,1667117035
4,I remember hearing about Cyberpunk 2077 around...,True,1664423074
...,...,...,...
16594,boobs :D,True,1608792512
16595,"+ The great main story, cyberpunk atmosphere a...",True,1608792267
16596,If you got the hardware to run this game you d...,True,1608777643
16597,the glitches in my experience have not been en...,True,1608777582


In [3]:
# use regex to remove punctuation and emojis
df["Review"] = df["Review"].apply(lambda x: re.sub(r"[^-9A-Za-z ]", "", x))

# remove stopwords
stopwords = nltk.corpus.stopwords.words("english")
df["Review"] = df["Review"].apply(lambda x: " ".join([word for word in x.split() if word not in stopwords]))

# remove numbers
df["Review"] = df["Review"].apply(lambda x: re.sub(r"\d+", "", x))

# lower case all words
df["Review"] = df["Review"].apply(lambda x: x.lower())

# remove short words
df["Review"] = df["Review"].apply(lambda x: " ".join([word for word in x.split() if len(word) > 2]))

# remove extra spaces
df["Review"] = df["Review"].apply(lambda x: re.sub(r"\s+", " ", x))

# game name is in the review, remove it
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyber punk", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk ", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyber punk ", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk2077", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk 2077", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk2077 ", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk 2077 ", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk2077game", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk2077 game", ""))

# remove empty reviews
df = df.loc[df["Review"] != "",:]

df

Unnamed: 0,Review,Recommended or Not Recommended,Date Timestamp Created
0,been since day someone came version game this ...,True,1645046263
1,had replay immediately demolish adam smasher,True,1663224196
2,patch fixed everything the quests make sense t...,True,1645267750
3,watches edgerunners downloads time life,True,1667117035
4,remember hearing around announced interested ...,True,1664423074
...,...,...,...
16594,boobs,True,1608792512
16595,the great main story atmosphere good visuals-...,True,1608792267
16596,got hardware run game definitely,True,1608777643
16597,glitches experience enough hold awesomeness ga...,True,1608777582


In [4]:
# keep only 5250 reviews for each based on 'Recommended or Not Recommended" column
df_recom = df.loc[df["Recommended or Not Recommended"] == True,:]
df_recom = df_recom.sample(5250).reset_index(drop=True)

df_not_recom = df.loc[df["Recommended or Not Recommended"] == False,:]
df_not_recom = df_not_recom.sample(5250).reset_index(drop=True)

df_balance = pd.concat([df_recom, df_not_recom], axis=0).reset_index(drop=True)
df_balance

Unnamed: 0,Review,Recommended or Not Recommended,Date Timestamp Created
0,this game great sure get better like the witch...,True,1641252319
1,you dont play game game plays mind feelings pr...,True,1607909547
2,just fast write updont trust others try yourse...,True,1607618737
3,still one favorite games pre-ordered didnt wan...,True,1634305330
4,best porn game ive ever played,True,1608307110
...,...,...,...
10495,game lot like body broken,False,1611528416
10496,cdpr this game gift holidays mother feel guilt...,False,1611071463
10497,buy game want light rpg production value shoot...,False,1607689049
10498,horrendous optimization think second this game...,False,1607565700


In [6]:
bow_recom = collections.Counter([y for x in df_recom.Review for y in x.split()])
bow_not_recom = collections.Counter([y for x in df_not_recom.Review for y in x.split()])

# make a dataframe of the words and their counts
df_bow_recom = pd.DataFrame(bow_recom.most_common(), columns=["Word", "Count"])
df_bow_not_recom = pd.DataFrame(bow_not_recom.most_common(), columns=["Word", "Count"])
df_not_recom

Unnamed: 0,Review,Recommended or Not Recommended,Date Timestamp Created
0,early access game year will probably masterpie...,False,1607593666
1,got d its still early access,False,1607857652
2,let start thisignoring fanboys saying game rpg...,False,1607780777
3,best joke,False,1607665433
4,buy gta instead,False,1616499900
...,...,...,...
5245,game lot like body broken,False,1611528416
5246,cdpr this game gift holidays mother feel guilt...,False,1611071463
5247,buy game want light rpg production value shoot...,False,1607689049
5248,horrendous optimization think second this game...,False,1607565700
