In [6]:
import pandas as pd
import nltk
# nltk.download('all')
import re
import numpy as np
import collections
import sklearn as sk
import string
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)

In [7]:
df = pd.read_csv("../A_Source_Data/cp2077_reviews.csv.zip", compression="zip")
df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
df["Review"] = df["Review"].astype("str")
df

Unnamed: 0,Review,Recommended or Not Recommended,Date Timestamp Created
0,"Been here since day 1, and I am someone who ca...",True,1645046263
1,Had to replay this immediately to demolish Ada...,True,1663224196
2,Patch 1.5 fixed everything for me. \nThe quest...,True,1645267750
3,Watches Edgerunners -> Downloads Cyberpunk 207...,True,1667117035
4,I remember hearing about Cyberpunk 2077 around...,True,1664423074
...,...,...,...
16594,boobs :D,True,1608792512
16595,"+ The great main story, cyberpunk atmosphere a...",True,1608792267
16596,If you got the hardware to run this game you d...,True,1608777643
16597,the glitches in my experience have not been en...,True,1608777582


In [8]:
def data_cleaning(raw_data):
    raw_data = raw_data.translate(str.maketrans('', '', string.punctuation + string.digits)) 
    words = raw_data.lower().split() 
    # stops = set(stopwords.words( "english")) 
    stops = set(stopwords)
    # stops = stopwords.words('english')
    useful_words = [w for w in words if not w in stops]
    # use regex to change any word with cyberpunk in it to cyberpunk
    useful_words = [re.sub(r'\b.*cyberpunk.*\b', 'cyberpunk', w) for w in useful_words]
    useful_words = [re.sub(r'c.{6,8}k', 'cyberpunk', w) for w in useful_words]
    useful_words = [w.replace('cyber punk', 'cyberpunk') for w in useful_words]
    useful_words = [w.replace('cyberpunks', 'cyberpunk') for w in useful_words]
    useful_words = [w.replace('cyber punk ', 'cyberpunk') for w in useful_words]
    useful_words = [w.replace('cyberpunk2077', 'cyberpunk') for w in useful_words]
    useful_words = [w.replace('cyberpunk 2077', 'cyberpunk') for w in useful_words]
    useful_words = [w.replace('cyberpunk2077 ', 'cyberpunk') for w in useful_words]
    useful_words = [w.replace('cyberpunk 2077 ', 'cyberpunk') for w in useful_words]
    useful_words = [w.replace('cyberpunk2077game', 'cyberpunk') for w in useful_words]
    useful_words = [w.replace('cyberpunk2077 game', 'cyberpunk') for w in useful_words]
    useful_words = [re.sub(r'[^\w\s]','',w) for w in useful_words]
    return( " ".join(useful_words))

In [9]:
df['Review']=df['Review'].apply(data_cleaning) 

In [10]:
# make a set_column to count the number of words in each review
df["set_column"] = df["Review"].apply(lambda x: set(x.split()))
df

Unnamed: 0,Review,Recommended or Not Recommended,Date Timestamp Created,set_column
0,day someone came ps version game quite journey...,True,1645046263,"{distributed, styles, perfect, retrieval, make..."
1,replay immediately demolish adam smasher,True,1663224196,"{replay, demolish, immediately, adam, smasher}"
2,patch fixed everything quests make sense fixer...,True,1645267750,"{lot, katana, now, mini, voice, crash, alive, ..."
3,watches edgerunners downloads cyberpunk time life,True,1667117035,"{cyberpunk, life, edgerunners, downloads, watc..."
4,remember hearing cyberpunk around announced in...,True,1664423074,"{hunk, level, make, trigger, keanu, played, la..."
...,...,...,...,...
16594,boobs d,True,1608792512,"{boobs, d}"
16595,great main story cyberpunk atmosphere good vis...,True,1608792267,"{atmosphere, good, story, boring, main, arc, s..."
16596,got hardware run game definitely,True,1608777643,"{definitely, got, game, hardware, run}"
16597,glitches experience enough hold awesomeness ga...,True,1608777582,"{hold, enough, experience, patches, game, this..."


In [11]:
# keep only 5250 reviews for each based on 'Recommended or Not Recommended" column
temp_recom = df.loc[df["Recommended or Not Recommended"] == True,:]
temp_recom = temp_recom.sample(5250).reset_index(drop=True)

temp_not_recom = df.loc[df["Recommended or Not Recommended"] == False,:]
temp_not_recom = temp_not_recom.sample(5250).reset_index(drop=True)

# split df_recom into train and test
df_recom = temp_recom.sample(frac=0.33, random_state=150).reset_index(drop=True)
test_recom = temp_recom.drop(df_recom.index).reset_index(drop=True)
print(f"train size: {len(df_recom)}, test size: {len(test_recom)}")

# split df_not_recom into train and test
df_not_recom = temp_not_recom.sample(frac=0.33, random_state=150).reset_index(drop=True)
test_not_recom = temp_not_recom.drop(df_not_recom.index).reset_index(drop=True)
print(f"train size: {len(df_not_recom)}, test size: {len(test_not_recom)}")

# combine test_recom and test_not_recom
test = pd.concat([test_recom, test_not_recom], axis=0).reset_index(drop=True)

train size: 1732, test size: 3518
train size: 1732, test size: 3518


In [71]:
# This is to make the set dictionary
# count how many times each word appears in the reviews
bow_recom_temp = collections.Counter([y for x in df_recom.set_column for y in x])
bow_not_recom_temp = collections.Counter([y for x in df_not_recom.set_column for y in x])

bow_recom_set = dict(bow_recom_temp)
bow_not_recom_set = dict(bow_not_recom_temp)

for key in bow_recom_set:
    if key not in bow_not_recom_set:
        bow_not_recom_set[key] = 0
        pass
    else:
        pass

for key in bow_not_recom_set:
    if key not in bow_recom_set:
        bow_recom_set[key] = 0
        pass
    else:
        pass

for key in bow_recom_set:
    bow_recom_set[key] += 1

for key in bow_not_recom_set:
    bow_not_recom_set[key] += 1


In [72]:
# This is to make the list dictionary
# count how many times each word appears in the reviews
bow_recom = collections.Counter([y for x in df_recom.Review for y in x.split()])
bow_not_recom = collections.Counter([y for x in df_not_recom.Review for y in x.split()])

bow_recom_dict = dict(bow_recom)
bow_not_recom_dict = dict(bow_not_recom)

for key in bow_recom_dict:
    if key not in bow_not_recom_dict:
        bow_not_recom_dict[key] = 0
        pass
    else:
        pass

for key in bow_not_recom_dict:
    if key not in bow_recom_dict:
        bow_recom_dict[key] = 0
        pass
    else:
        pass

for key in bow_recom_dict:
    bow_recom_dict[key] += 1

for key in bow_not_recom_dict:
    bow_not_recom_dict[key] += 1
    

In [73]:
bow_recom_sum_vals = sum(bow_recom_dict.values())
bow_not_recom_sum_vals = sum(bow_not_recom_dict.values())

for i in bow_recom_dict:

    bow_recom_dict[i] /= bow_recom_sum_vals
    # bow_recom_dict[i] = float(format(bow_recom_dict[i], '.6f'))

for i in bow_not_recom_dict:

    bow_not_recom_dict[i] /= bow_not_recom_sum_vals
    # bow_not_recom_dict[i] = float(format(bow_not_recom_dict[i], '.6f'))

bow_recom_dict

{'game': 0.033217431617987944,
 'fun': 0.003129346314325452,
 'solid': 0.0007340441971874517,
 'experience': 0.0025343841755524647,
 'poorly': 0.00016998918250656778,
 'optimized': 0.00016998918250656778,
 'older': 0.00013908205841446454,
 'tech': 0.00020089630659867098,
 'ik': 0.00010044815329933549,
 'hz': 5.408746716118065e-05,
 'ti': 0.0002936176788749807,
 'average': 0.0008190387884407356,
 'fps': 0.0015839901097202905,
 'dense': 0.0001081749343223613,
 'city': 0.003932931540720136,
 'drops': 0.0001854427445526194,
 's': 0.00026271055478287744,
 'rig': 0.00017771596352959357,
 'run': 0.0014758151753979291,
 'right': 0.0011512903724308454,
 'now': 0.002797094730335342,
 'issues': 0.0018544274455261937,
 'except': 0.00024725699273682586,
 'careful': 3.090712409210323e-05,
 'going': 0.0014140009272137228,
 'know': 0.0014294544892597743,
 'requires': 0.00015453562046051616,
 'beef': 3.090712409210323e-05,
 'constant': 0.0001081749343223613,
 'best': 0.0022407664966774842,
 'decade': 8

In [74]:
assert len(bow_recom_dict) == len(bow_not_recom_dict)
assert len(bow_recom_set) == len(bow_not_recom_set)

In [75]:
def define_sentiment(element,test=False):
    """This function takes a review and returns the label for that review"""
    full_shape = df.shape[0]
    positive_review_probabiliy = len(df_recom)/full_shape
    negative_review_probabiliy = len(df_not_recom)/full_shape
    df_choices_positive = [positive_review_probabiliy, bow_recom_set, bow_recom_dict, df_recom]
    df_choices_negative = [negative_review_probabiliy ,bow_not_recom_set, bow_not_recom_dict, df_not_recom]
    def classifier(element, your_class = 'positive'):
        if your_class == 'positive':
            df_choices = df_choices_positive
        else:
            df_choices = df_choices_negative
        prob_of_class = df_choices[0]/full_shape
        score = 1 * prob_of_class
        # score = 0.1
        for i in element.split():
            if i not in df_choices[2].keys():
                pass
            else:
                prob_word_given_class = (df_choices[2])[i]
                # Almost the same value, given our spin on this application. 
                # Normally, this term frequency would be calculated differently across the positive and negative documents
                # but we are only looking at the reviews as the documents themselves to determine a word's relevance in the positive
                # or negative corpus. 
                tf = np.log(prob_word_given_class)
                # tf = abs(np.log(prob_word_given_class))
                tf = float(format(tf, '.12f'))
                # The IDF is the number of reviews / the number of reviews that contain the word in that given corpus
                # idf = abs(np.log(df_choices[3].shape[0]/(df_choices[1])[i]))
                idf = np.log(df_choices[3].shape[0]/(df_choices[1])[i])
                idf = float(format(idf, '.12f'))
                # score *= prob_word_given_class*tf*idf
                score *= prob_word_given_class*idf
        return score
    positive_score = classifier(element, 'positive')
    negative_score = classifier(element, 'negative')
    if test:
        # if positive_score
        return (positive_score, negative_score)
    if positive_score > negative_score:
        return True
    elif positive_score == negative_score:
        # Choosing an arbitrary value, because we assume that a review with one or few words of little substance
        # is implied to be negative, as is usual with netizens. 
        return False
    else:
        return False


In [76]:
test["Sentiment"] = test.Review.apply(define_sentiment)

In [77]:
test["Check"] = test.Review.apply(define_sentiment, test=True)

In [78]:
sum(test['Recommended or Not Recommended'] == test['Sentiment'])/test.shape[0]

0.7382035247299602

In [79]:
print(bow_recom_dict['great'], bow_not_recom_dict['great'])
print(bow_recom_set['great'], bow_not_recom_set['great'])

0.0036779477669602843 0.0020735492869088687
332 236


In [80]:
np.log(0.003678) 

-5.6053861528000075

In [81]:
test

Unnamed: 0,Review,Recommended or Not Recommended,Date Timestamp Created,set_column,Sentiment,Check
0,great,True,1607844181,{great},True,"(3.819201665993626e-08, 2.5980609835749516e-08)"
1,yknow game good core present issues fixed fuck...,True,1608236120,"{now, miracles, good, expecting, normal, fixed...",True,"(9.204345380915526e-116, 7.828795801261816e-117)"
2,almost year release started playing game first...,True,1637861521,"{now, gorgeous, stopped, quite, know, around, ...",True,"(5.892484484278359e-170, 1.1990712292255595e-172)"
3,best game game expected still good game extrem...,True,1609622134,"{lot, good, bugs, styles, technical, raytracin...",True,"(7.419419329629966e-166, 1.267011099850934e-171)"
4,dispite horrible state shipped fantastic game ...,True,1608494939,"{dispite, buy, fixed, sorted, shipped, state, ...",True,"(3.479209376685816e-58, 3.123882045815462e-59)"
...,...,...,...,...,...,...
7031,wanted game much visually amazing cool concept...,False,1609234934,"{concept, motivate, boring, given, soul, game,...",False,"(4.152061308216934e-61, 1.0646434635288367e-60)"
7032,hprimary concernh issue game way little conten...,False,1625734232,"{missions, make, keanu, rushed, literally, gra...",True,"(4.530928724216706e-306, 5.10642841758976e-309)"
7033,got promise wait buy sale assumption dlc will ...,False,1610497377,"{involved, good, restart, hopeful, will, every...",False,"(1.58937395319712e-204, 5.444845581435822e-203)"
7034,guess people really enjoyed depends expectatio...,False,1622574435,"{missions, pleasantly, short, make, mainly, br...",False,"(0.0, 0.0)"
