# Term Project

Your name: Ben Hoffman

You abc123: Peo042

Dataset: https://www.kaggle.com/dahlia25/metacritic-video-game-comments
combined with: https://www.kaggle.com/skateddu/metacritic-critic-games-reviews-20112019

## Submission Instructions

After completing the exercises below, generate a pdf of the code **with** outputs. After that create a zip file containing both the completed exercise and the generated PDF/HTML. You are **required** to check the PDF/HTML to make sure all the code **and** outputs are clearly visible and easy to read. If your code goes off the page, you should reduce the line size. I generally recommend not going over 80 characters.

Finally, name the zip file using a combination of your the assigment and your name, e.g., ps3_rios.zip

In [2]:
#Load in the dataset into a dataframe
import pandas as pd

df = pd.read_csv('metacritic_game_comments.csv')
df.columns = ["Title","Console","Rating","Review","User"]
df.head()

Unnamed: 0,Title,Console,Rating,Review,User
0,7554,PC,7,7554 Glorious Memories Revived is the first bi...,GamingXP
1,7554,PC,6,7554 deserves brownie points for trying to bre...,Absolute Games
2,7554,PC,4,Even $12 is too much to ask for what feels lik...,PC Gamer
3,7554,PC,4,"Nonetheless, keep your chin up, Emobi Games. T...",Eurogamer Germany
4,7554,PC,3,7554 is a modern curiosity that plays like a s...,GameSpot


In [3]:
#create new columns in dataframe

df['tokenized_text'] = ''
df['sentiment'] = 0

df.head()

Unnamed: 0,Title,Console,Rating,Review,User,tokenized_text,sentiment
0,7554,PC,7,7554 Glorious Memories Revived is the first bi...,GamingXP,,0
1,7554,PC,6,7554 deserves brownie points for trying to bre...,Absolute Games,,0
2,7554,PC,4,Even $12 is too much to ask for what feels lik...,PC Gamer,,0
3,7554,PC,4,"Nonetheless, keep your chin up, Emobi Games. T...",Eurogamer Germany,,0
4,7554,PC,3,7554 is a modern curiosity that plays like a s...,GameSpot,,0


In [4]:
#create a definitons that eliminates punctation
import string
import nltk
from nltk.corpus import stopwords

def nopun(text):
    no_pun = "".join(c for c in text if c not in string.punctuation)
    return no_pun

#remove stopwords and punctuation
features = []

for rev in df["Review"]:
    text = str(rev).lower()
    token = nltk.word_tokenize(text)
    
    stops = set(stopwords.words('english'))
    cleantok = [t for t in token if len(t.lower())>1 and 
                 (t.lower() not in stops)]
    
    clean = []
    
    for w in cleantok:
        cleant = nopun(w)
        if len(cleant.strip())>1:
            clean.append(cleant.strip())
    features.append(' '.join(clean))


In [5]:
#Add features into tokenized_text

df['tokenized_text'] = [line for line in features]

df.head()

Unnamed: 0,Title,Console,Rating,Review,User,tokenized_text,sentiment
0,7554,PC,7,7554 Glorious Memories Revived is the first bi...,GamingXP,7554 glorious memories revived first big game ...,0
1,7554,PC,6,7554 deserves brownie points for trying to bre...,Absolute Games,7554 deserves brownie points trying break mold...,0
2,7554,PC,4,Even $12 is too much to ask for what feels lik...,PC Gamer,even 12 much ask feels like halfhearted commun...,0
3,7554,PC,4,"Nonetheless, keep your chin up, Emobi Games. T...",Eurogamer Germany,nonetheless keep chin emobi games first step m...,0
4,7554,PC,3,7554 is a modern curiosity that plays like a s...,GameSpot,7554 modern curiosity plays like shabby relic,0


In [6]:
#Load sentiment word sets
pos = []
neg = []

posfile = list(open('positive-words.txt', encoding = 'utf-8'))
negfile = list(open('negative-words.txt', encoding='iso-8859-1'))

for p in posfile:
    pos.append(p.strip())

for n in negfile:
    neg.append(n.strip())


In [7]:
#Write definitions for use in sentiment analysis
def count_pos_words(sentence):
        num_pos_words = 0
        for word in sentence.lower().split():
            if word in pos:
                num_pos_words += 1
        return num_pos_words

def count_neg_words(sentence):
        num_neg_words = 0
        for word in sentence.lower().split():
            if word in neg:
                num_neg_words += 1
        return num_neg_words

In [8]:
#Determine sentiment score
score = 0
sentiment = []

for f in features:
    score = count_pos_words(f)-count_neg_words(f)
    sentiment.append(score)

In [9]:
#Merge sentiment into dataframe
df['sentiment'] = [line for line in sentiment]

df.head()

Unnamed: 0,Title,Console,Rating,Review,User,tokenized_text,sentiment
0,7554,PC,7,7554 Glorious Memories Revived is the first bi...,GamingXP,7554 glorious memories revived first big game ...,2
1,7554,PC,6,7554 deserves brownie points for trying to bre...,Absolute Games,7554 deserves brownie points trying break mold...,-1
2,7554,PC,4,Even $12 is too much to ask for what feels lik...,PC Gamer,even 12 much ask feels like halfhearted commun...,0
3,7554,PC,4,"Nonetheless, keep your chin up, Emobi Games. T...",Eurogamer Germany,nonetheless keep chin emobi games first step m...,1
4,7554,PC,3,7554 is a modern curiosity that plays like a s...,GameSpot,7554 modern curiosity plays like shabby relic,1


In [10]:
#Split train (80%) test (20%)
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(df[['sentiment']],
                                                df['Rating'], test_size = .2, random_state = 0)

In [10]:
#Train model using Random Forest
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(xtrain, ytrain)

RandomForestClassifier()

In [11]:
#Predict and Assess predictions
from sklearn.metrics import precision_score, recall_score, f1_score

rfcpred = rfc.predict(xtest)

rfc_precision = precision_score(ytest, rfcpred, average = 'micro')
rfc_recall = recall_score(ytest, rfcpred, average= 'micro')
rfc_f1 = f1_score(ytest, rfcpred, average= 'micro')

In [12]:
#Print results
print(rfc_precision)
print(rfc_recall)
print(rfc_f1)

0.2972234421509784
0.2972234421509784
0.2972234421509784


In [13]:
#Train using KNN
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.metrics import precision_score, recall_score, f1_score

knn = KNC(n_neighbors = 5)

knn.fit(xtrain, ytrain)

knnpred = knn.predict(xtest)

knn_precision = precision_score(ytest, knnpred, average = 'micro')
knn_recall = recall_score(ytest, knnpred, average= 'micro')
knn_f1 = f1_score(ytest, knnpred, average= 'micro')

In [14]:
#Print results
print(knn_precision)
print(knn_recall)
print(knn_f1)

0.24003318206216756
0.24003318206216756
0.24003318206216756


In [13]:
#Set up a Vectorizer on tokenized_text
from sklearn.feature_extraction.text import TfidfVectorizer as TV

vectorizer = TV(max_features = 2500, stop_words = stopwords.words('english'))

processedvec = vectorizer.fit_transform(features).toarray()

In [14]:
#Split the data set after the Vectorizer

xtrain2, xtest2, ytrain2, ytest2 = train_test_split(processedvec,
                                                df['Rating'], test_size = .2, random_state = 0)


In [15]:
#Rerun the RFC using the new train and test

rfcvec = RandomForestClassifier()

rfcvec.fit(xtrain2, ytrain2)

RandomForestClassifier()

In [16]:
#Predict and assess on the RFCvec

predvec = rfcvec.predict(xtest2)

vec_precision = precision_score(ytest2, predvec, average = 'micro')
vec_recall = recall_score(ytest2, predvec, average= 'micro')
vec_f1 = f1_score(ytest2, predvec, average= 'micro')

In [17]:
#Print results
print(vec_precision)
print(vec_recall)
print(vec_f1)

0.4107744107744108
0.4107744107744108
0.4107744107744108


In [22]:
#Set up a CountVectorizer on features
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

CV = CountVectorizer(ngram_range = (1,1))

cvfeat = CV.fit_transform(features)


In [27]:
#split the CV set
xtrain3, xtest3, ytrain3, ytest3 = train_test_split(cvfeat,
                                                df['Rating'], test_size = .2, random_state = 0)

#fit the model
linsvc = GridSearchCV(LinearSVC(), param_grid={'C': [1, 100]}, 
                      scoring = 'f1_micro', cv = 5)
linsvc.fit(xtrain3, ytrain3)



GridSearchCV(cv=5, estimator=LinearSVC(), param_grid={'C': [1, 100]},
             scoring='f1_micro')

In [28]:
#make predictions using the model
svm_test_predictions = linsvc.predict(xtest3)

#score the model
svmprecision = precision_score(ytest3, svm_test_predictions, average= 'micro')
svmrecall = recall_score(ytest3, svm_test_predictions, average= 'micro')
svmf1 = f1_score(ytest3, svm_test_predictions, average= 'micro')

In [29]:
#Print results
print(svmprecision)
print(svmrecall)
print(svmf1)

0.40901771336553944
0.40901771336553944
0.40901771336553944


In [46]:
xtrain4, xtest4, ytrain4, ytest4 = train_test_split(df[['tokenized_text', 'Review']],
                                                df['Rating'], test_size = .2, random_state = 0)

In [47]:
#Set up tokens for Word2Vec
from gensim.models import Word2Vec

vec_tok = pd.Series(df['tokenized_text']).values

w2vmodel = Word2Vec(vec_tok, window = 3, min_count = 1, workers = 3, sg = 1, vector_size = 1000)

In [48]:
#Create word2vec vectors
with open('trainw2v.csv', 'w+') as outfile:
    for i, r in xtrain4.iterrows():
        vector = (np.mean([w2vmodel.wv[t] for t in r['tokenized_text']], axis = 0)).tolist()
        if i == 0:
            header = ','.join(str(ele) for ele in range(1000))
            outfile.write(header)
            outfile.write('\n')
        if type(vector) is list:
            line1 = ','.join([str(v) for v in vector])
        else:
            line1 = ','.join([str(0) for i in range(1000)])
        outfile.write(line1)
        outfile.write('\n')

In [49]:
#Set up test features
testfeat = []

for i, r in xtest4.iterrows():
    vector = (np.mean([w2vmodel.wv[t] for t in r['tokenized_text']], axis = 0)).tolist()
    if type(vector) is list:
          testfeat.append(vector)
    else:
         testfeat.append(np.array([0 for i in range(1000)]))


In [51]:
#Train model using Random Forest
from sklearn.ensemble import RandomForestClassifier

rfcw2v = RandomForestClassifier()

rfcmodel = pd.read_csv('trainw2v.csv')
rfcw2v.fit(rfcmodel, ytrain4)
w2vpred = rfcw2v.predict(testfeat)

In [52]:
#score the model
w2vprec = precision_score(ytest4, w2vpred, average= 'micro')
w2vrec = recall_score(ytest4, w2vpred, average= 'micro')
w2vf1 = f1_score(ytest4, w2vpred, average= 'micro')

In [53]:
#Print results
print(w2vprec)
print(w2vrec)
print(w2vf1)

0.3195969355389645
0.3195969355389645
0.3195969355389645
