In [2]:
import pandas as pd
import nltk
# nltk.download('all')
import re
import collections

In [2]:
df = pd.read_csv("cp2077_reviews.csv.zip", compression="zip")
df["Review"] = df["Review"].astype("str")
df

Unnamed: 0,Review,Recommended or Not Recommended,Date Timestamp Created
0,"Been here since day 1, and I am someone who ca...",True,1645046263
1,Had to replay this immediately to demolish Ada...,True,1663224196
2,Patch 1.5 fixed everything for me. \nThe quest...,True,1645267750
3,Watches Edgerunners -> Downloads Cyberpunk 207...,True,1667117035
4,I remember hearing about Cyberpunk 2077 around...,True,1664423074
...,...,...,...
16594,boobs :D,True,1608792512
16595,"+ The great main story, cyberpunk atmosphere a...",True,1608792267
16596,If you got the hardware to run this game you d...,True,1608777643
16597,the glitches in my experience have not been en...,True,1608777582


In [3]:
# use regex to remove punctuation and emojis
df["Review"] = df["Review"].apply(lambda x: re.sub(r"[^-9A-Za-z ]", "", x))

# remove stopwords
stopwords = nltk.corpus.stopwords.words("english")
# df["Review"] = df["Review"].apply(lambda x: " ".join([word for word in x.split() if word not in stopwords]))

# remove numbers
df["Review"] = df["Review"].apply(lambda x: re.sub(r"\d+", "", x))

# lower case all words
df["Review"] = df["Review"].apply(lambda x: x.lower())

# remove short words
df["Review"] = df["Review"].apply(lambda x: " ".join([word for word in x.split() if len(word) > 2]))

# remove extra spaces
df["Review"] = df["Review"].apply(lambda x: re.sub(r"\s+", " ", x))

# game name is in the review, remove it
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyber punk", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk ", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyber punk ", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk2077", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk 2077", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk2077 ", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk 2077 ", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk2077game", ""))
df["Review"] = df["Review"].apply(lambda x: x.replace("cyberpunk2077 game", ""))

# remove empty reviews
df = df.loc[df["Review"] != "",:]

df

Unnamed: 0,Review,Recommended or Not Recommended,Date Timestamp Created
0,been here since day and someone who came from ...,True,1645046263
1,had replay this immediately demolish adam smas...,True,1663224196
2,patch fixed everything for the quests make mor...,True,1645267750
3,watches edgerunners downloads has the time hi...,True,1667117035
4,remember hearing about around when was announ...,True,1664423074
...,...,...,...
16594,boobs,True,1608792512
16595,the great main story atmosphere and good visu...,True,1608792267
16596,you got the hardware run this game you definit...,True,1608777643
16597,the glitches experience have not been enough h...,True,1608777582


In [4]:
# keep only 5250 reviews for each based on 'Recommended or Not Recommended" column
df_recom = df.loc[df["Recommended or Not Recommended"] == True,:]
df_recom = df_recom.sample(5250).reset_index(drop=True)

df_not_recom = df.loc[df["Recommended or Not Recommended"] == False,:]
df_not_recom = df_not_recom.sample(5250).reset_index(drop=True)

df_balance = pd.concat([df_recom, df_not_recom], axis=0).reset_index(drop=True)
df_balance

Unnamed: 0,Review,Recommended or Not Recommended,Date Timestamp Created
0,bugs amazing world gameplay,True,1607648436
1,buddy got game bit ago say chicks dicks think ...,True,1608091365
2,far perfect game understand talking people wh...,True,1658359768
3,just get good stop complaining,True,1608091574
4,those complain game glitchy havent played star...,True,1623440083
...,...,...,...
10495,okey game quite nice systems honestly really n...,False,1607665805
10496,this game makes sad,False,1608948910
10497,even psi fairly decent machine nothing spectac...,False,1608136933
10498,first experienced visual bugs characters guns ...,False,1607612500


In [5]:
bow_recom = collections.Counter([y for x in df_recom.Review for y in x.split()])
bow_not_recom = collections.Counter([y for x in df_not_recom.Review for y in x.split()])

# make a dataframe of the words and their counts
df_bow_recom = pd.DataFrame(bow_recom.most_common(), columns=["Word", "Count"])
df_bow_not_recom = pd.DataFrame(bow_not_recom.most_common(), columns=["Word", "Count"])

In [8]:
df_not_recom

Unnamed: 0,Review,Recommended or Not Recommended,Date Timestamp Created
0,want like game really thought would take day b...,False,1608419225
1,bought game sale hearing itd miraculously upda...,False,1660149112
2,while love game aesthetics honestly recommend ...,False,1607625506
3,ive pretty reserved making comments far since ...,False,1607890409
4,aaa early access game company spend much money...,False,1616677596
...,...,...,...
5245,okey game quite nice systems honestly really n...,False,1607665805
5246,this game makes sad,False,1608948910
5247,even psi fairly decent machine nothing spectac...,False,1608136933
5248,first experienced visual bugs characters guns ...,False,1607612500


In [6]:
# export to csv
df_bow_recom.to_csv("bow_recom.csv", index=False)
df_bow_not_recom.to_csv("bow_not_recom.csv", index=False)

In [9]:
# tfidf
# importing libraries
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer
#stopword removal and lemmatization
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [10]:
combined = pd.read_csv("cp2077_reviews_cleaned.csv.zip", compression="zip")

# drop column Date Timestamp Created
combined = combined.drop(columns=["Date Timestamp Created"])

# make sure reviews are strings
combined["Review"] = combined["Review"].astype("str")

#split combined into train and test
train = combined.sample(frac=0.8, random_state=200).reset_index(drop=True)
test = combined.drop(train.index).reset_index(drop=True)

In [11]:
train_X_non = train['Review']   # '0' refers to the review text
train_y = train['Recommended or Not Recommended']   # '1' corresponds to Label (1 - positive and 0 - negative)
test_X_non = test['Review']
test_y = test['Recommended or Not Recommended']
train_X=[]
test_X=[]

In [12]:
#text pre processing for train data
for i in range(0, len(train_X_non)):
    review = re.sub('[^a-zA-Z]', ' ', train_X_non[i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    train_X.append(review)

In [13]:
#text pre processing for test data
for i in range(0, len(test_X_non)):
    review = re.sub('[^a-zA-Z]', ' ', test_X_non[i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    test_X.append(review)

In [14]:
#tf idf
tf_idf = TfidfVectorizer()
#applying tf idf to training data
X_train_tf = tf_idf.fit_transform(train_X)
#applying tf idf to training data
X_train_tf = tf_idf.transform(train_X)

In [15]:
print("n_samples: %d, n_features: %d" % X_train_tf.shape)


n_samples: 8400, n_features: 36887


In [16]:
#transforming test data into tf-idf matrix
X_test_tf = tf_idf.transform(test_X)

In [17]:
print("n_samples: %d, n_features: %d" % X_test_tf.shape)

n_samples: 2100, n_features: 36887


In [18]:
#naive bayes classifier
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_y)

In [19]:
#predicted y
y_pred = naive_bayes_classifier.predict(X_test_tf)

In [20]:
print(metrics.classification_report(test_y, y_pred, target_names=['True', 'False']))


              precision    recall  f1-score   support

        True       1.00      0.94      0.97      2100
       False       0.00      0.00      0.00         0

    accuracy                           0.94      2100
   macro avg       0.50      0.47      0.49      2100
weighted avg       1.00      0.94      0.97      2100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
print("Confusion matrix:")
print(metrics.confusion_matrix(test_y, y_pred))

Confusion matrix:
[[1979  121]
 [   0    0]]
