In [1]:
import pandas as pd
import nltk
nltk.download('all')
import re
import collections

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/codespace/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/codespace/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/codespace/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/codespace/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /home/codespace/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to
[nltk_data]    |     /home/codespace/nltk_d

In [6]:
df = pd.read_csv("cp2077_reviews.csv.zip", compression="zip")
df["Review"] = df["Review"].astype("str")
df

Unnamed: 0,Review,Recommended or Not Recommended,Date Timestamp Created
0,"Been here since day 1, and I am someone who ca...",True,1645046263
1,Had to replay this immediately to demolish Ada...,True,1663224196
2,Patch 1.5 fixed everything for me. \nThe quest...,True,1645267750
3,Watches Edgerunners -> Downloads Cyberpunk 207...,True,1667117035
4,I remember hearing about Cyberpunk 2077 around...,True,1664423074
...,...,...,...
16594,boobs :D,True,1608792512
16595,"+ The great main story, cyberpunk atmosphere a...",True,1608792267
16596,If you got the hardware to run this game you d...,True,1608777643
16597,the glitches in my experience have not been en...,True,1608777582


In [7]:
# use regex to remove punctuation and emojis
df["Review"] = df["Review"].apply(lambda x: re.sub(r"[^-9A-Za-z ]", "", x))

# remove stopwords
stopwords = nltk.corpus.stopwords.words("english")
df["Review"] = df["Review"].apply(lambda x: " ".join([word for word in x.split() if word not in stopwords]))

# remove numbers
df["Review"] = df["Review"].apply(lambda x: re.sub(r"\d+", "", x))

# lower case all words
df["Review"] = df["Review"].apply(lambda x: x.lower())

# remove short words
df["Review"] = df["Review"].apply(lambda x: " ".join([word for word in x.split() if len(word) > 2]))

# remove extra spaces
df["Review"] = df["Review"].apply(lambda x: re.sub(r"\s+", " ", x))

# game name is in the review, remove it
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyber punk", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk ", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyber punk ", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk2077", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk 2077", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk2077 ", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk 2077 ", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk2077game", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk2077 game", ""))

# remove empty reviews
df = df.loc[df["Review"] != "",:]

df

Unnamed: 0,Review,Recommended or Not Recommended,Date Timestamp Created
0,been since day someone came version game this ...,True,1645046263
1,had replay immediately demolish adam smasher,True,1663224196
2,patch fixed everything the quests make sense t...,True,1645267750
3,watches edgerunners downloads time life,True,1667117035
4,remember hearing around announced interested ...,True,1664423074
...,...,...,...
16594,boobs,True,1608792512
16595,the great main story atmosphere good visuals-...,True,1608792267
16596,got hardware run game definitely,True,1608777643
16597,glitches experience enough hold awesomeness ga...,True,1608777582


In [8]:
# keep only 5250 reviews for each based on 'Recommended or Not Recommended" column
df_recom = df.loc[df["Recommended or Not Recommended"] == True,:]
df_recom = df_recom.sample(5250).reset_index(drop=True)

df_not_recom = df.loc[df["Recommended or Not Recommended"] == False,:]
df_not_recom = df_not_recom.sample(5250).reset_index(drop=True)

df_balance = pd.concat([df_recom, df_not_recom], axis=0).reset_index(drop=True)
df_balance

Unnamed: 0,Review,Recommended or Not Recommended,Date Timestamp Created
0,thing beauty know will never fade away first p...,True,1667187932
1,chipping hhrhrive chosen nomad path disappoint...,True,1608341515
2,amazing game really enjoying the gun play grea...,True,1607736974
3,excited pointing finger sport steam everyone c...,True,1607903534
4,the legendary finally released,True,1607562202
...,...,...,...
10495,beautifully designed game there weight world l...,False,1656856452
10496,looks pretty cool playing bit open world rpg s...,False,1610574912
10497,massively misleading marketing game thought would,False,1621478095
10498,safe say game prime example needs change gamin...,False,1618901165


In [None]:
bow_recom = collections.Counter([y for x in df_recom.Review for y in x.split()])
bow_not_recom = collections.Counter([y for x in df_not_recom.Review for y in x.split()])

# make a dataframe of the words and their counts
df_bow_recom = pd.DataFrame(bow_recom.most_common(), columns=["Word", "Count"])
df_bow_not_recom = pd.DataFrame(bow_not_recom.most_common(), columns=["Word", "Count"])

In [None]:
# export to csv
df_bow_recom.to_csv("bow_recom.csv", index=False)
df_bow_not_recom.to_csv("bow_not_recom.csv", index=False)