In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

from sys import path
from os.path import join



In [2]:
# Dataset location: <first sys.path entry>/boardgamegeek-reviews/bgg-13m-reviews.csv
filePath = join(path[0], 'boardgamegeek-reviews', 'bgg-13m-reviews.csv')

In [3]:
# Show the resolved dataset path so a wrong location is visible before loading.
print(filePath)

C:\Users\mtn22\boardgamegeek-reviews\bgg-13m-reviews.csv


In [4]:
def loadData(path):
    """Load the review CSV and keep only the ``comment`` and ``rating`` columns.

    Parameters
    ----------
    path : str or file-like
        Location of the CSV; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``comment`` and ``rating`` (in that order). Missing comments
        are replaced with ``''``; missing ratings are left as NaN so the
        column stays numeric.

    Raises
    ------
    KeyError
        If the file lacks a ``comment`` or ``rating`` column.
    """
    # Parse only the two columns that are used; the callable form skips the
    # other columns without changing the KeyError raised below when one of
    # the required columns is absent.
    d = pd.read_csv(path, usecols=lambda name: name in ('comment', 'rating'))
    # usecols keeps the file's column order, so select explicitly to pin it.
    d = d[['comment', 'rating']].copy()
    # Fill the text column only: putting '' into ``rating`` would turn it into
    # a mixed str/float column and make a later float('') conversion fail.
    d['comment'] = d['comment'].fillna('')

    return d

In [5]:
def formatRawData(rawData, random_state=None):
    """Clean and shuffle the raw reviews without modifying the input frame.

    Comments are stripped and lower-cased. A row is kept only when its
    comment contains at least one cased letter (``str.islower`` is true after
    lower-casing) and its rating is numeric and at least 1.

    Parameters
    ----------
    rawData : pandas.DataFrame
        Must contain ``comment`` and ``rating`` columns.
    random_state : int, numpy.random.RandomState or None, optional
        Seed forwarded to ``DataFrame.sample`` for a reproducible shuffle.
        The default ``None`` gives a different order on every call.

    Returns
    -------
    pandas.DataFrame
        A new, shuffled frame with a fresh 0..n-1 index, cleaned ``comment``
        strings and ``rating`` as float.
    """
    # Vectorised equivalent of c.strip().lower() for every row; missing
    # comments are treated as empty strings.
    comments = rawData['comment'].fillna('').astype(str).str.strip().str.lower()
    # ''.islower() is False, so this also drops empty comments.
    has_letters = comments.str.islower().astype(bool)

    # Unparseable or missing ratings become NaN and fail the >= 1 test
    # instead of raising from float('').
    ratings = pd.to_numeric(rawData['rating'], errors='coerce').astype(float)

    keep = has_letters & (ratings >= 1)

    # assign() builds a new frame, so the caller's rawData is left untouched.
    cleaned = rawData.assign(comment=comments, rating=ratings)[keep]

    return cleaned.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [6]:
# Load the raw reviews, then show the first comment and the row count
# before and after cleaning.
rawData = loadData(filePath)

print(rawData.loc[0, 'comment'])
print(rawData.shape[0])

data = formatRawData(rawData)

print(data.loc[0, 'comment'])
print(data.shape[0])


13170073
quickly becoming one of my favorite games. extremely simple to learn and play yet you are always left second guessing yourself and wondering how you could have played better. we always find ourselves discussing our strategies after every game trying to figure out what we might have done right or wrong. win or lose the game is just a pleasure to play and dissect while remaining extremely simple. the components are fun to trade and handle. will always be a part of the collection.
2592855


In [7]:
def divideData(data, train, random_state=None):
    """Shuffle ``data`` and split it into train, dev and test frames.

    The first ``train`` fraction of the shuffled rows becomes the training
    set; the remainder is cut at the midpoint ``(train + 1) / 2`` into dev
    and test.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split.
    train : float
        Fraction of rows for the training set, between 0 and 1 inclusive.
    random_state : int, numpy.random.RandomState or None, optional
        Seed forwarded to ``DataFrame.sample`` for a reproducible split.
        The default ``None`` gives a different split on every call.

    Returns
    -------
    list of pandas.DataFrame
        ``[trainData, devData, testData]``, each with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``train`` is outside ``[0, 1]``.
    """
    if not 0 <= train <= 1:
        raise ValueError('train must be a fraction between 0 and 1, got {!r}'.format(train))

    # A single shuffle already randomises every split, so the splits are not
    # shuffled again individually.
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    n_rows = shuffled.shape[0]

    train_end = int(train * n_rows)
    dev_end = int((train + 1) / 2 * n_rows)

    trainData = shuffled.iloc[:train_end].reset_index(drop=True)
    devData = shuffled.iloc[train_end:dev_end].reset_index(drop=True)
    testData = shuffled.iloc[dev_end:].reset_index(drop=True)

    return [trainData, devData, testData]

In [8]:
# 90% of the rows go to training; divideData cuts the rest in half for dev and test.
(trainData, devData, testData) = divideData(data, 0.9)

In [9]:
# Row counts of the train, dev and test splits, one per line.
print(trainData.shape[0], devData.shape[0], testData.shape[0], sep='\n')

2333569
129643
129643


In [10]:
def vectorizeData(data, vectorizer):
    """Turn a review frame into model inputs and integer class labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``comment`` and ``rating`` columns.
    vectorizer : object
        Any object with a ``transform`` method that accepts the ``comment``
        column (here, a fitted ``TfidfVectorizer``).

    Returns
    -------
    list
        ``[data_c, data_r]``: the transformed comments, and a NumPy integer
        array of ratings rounded to the nearest whole number with exact
        halves rounded down (7.5 -> 7, 7.6 -> 8).
    """
    data_c = vectorizer.transform(data['comment'])

    # ceil(r - 0.5) is round-half-down. The earlier per-row form used
    # r % int(r), which divides by zero for any rating below 1.
    ratings = data['rating'].to_numpy(dtype=float)
    data_r = np.ceil(ratings - 0.5).astype(int)

    return [data_c, data_r]

In [11]:
# Fit the unigram + bigram TF-IDF vectorizer on the training comments only;
# the dev and test comments are transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
vectorizer.fit(trainData['comment'])

# These three lines previously began with a stray space, which is an
# IndentationError at top level.
train_c, train_r = vectorizeData(trainData, vectorizer)
test_c, test_r = vectorizeData(testData, vectorizer)
dev_c, dev_r = vectorizeData(devData, vectorizer)

# Uncomment to persist the fitted vectorizer to disk.
#joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [None]:
# Hyper-parameter search: fit one LinearSVC per candidate C and report its
# accuracy on the dev set.
C = [0.1, 0.3, 0.5, 0.7, 0.9, 1]

for c in C:
    svm = LinearSVC(C=c)
    svm.fit(X=train_c, y=train_r)

    pred_r = svm.predict(dev_c)
    accuracy = 100 * accuracy_score(y_true=dev_r, y_pred=pred_r)

    print('SVM accuracy for C={}: {:.5f}%\n'.format(c, accuracy))

In [12]:
# Train the model used for the reported results.
# NOTE(review): C=0.1 is one of the values in the tuning grid, presumably the
# best on the dev set - confirm against the tuning output.
svm = LinearSVC(C=0.1)
svm.fit(X=train_c, y=train_r)

In [13]:
# Accuracy of the chosen model on the dev set.
pred_r = svm.predict(dev_c)
accuracy = accuracy_score(dev_r, pred_r)*100
# The format string has two placeholders; only the accuracy was passed before,
# which raises IndexError. svm.C supplies the regularisation value in use.
print('SVM accuracy for C={}: {:.5f}%\n'.format(svm.C, accuracy))

In [15]:
# Final accuracy: score the trained model on the held-out test split.

predT_r = svm.predict(test_c)
accuracy = 100 * accuracy_score(y_true=test_r, y_pred=predT_r)
print('SVM final accuracy: {:.5f}%\n'.format(accuracy))
    

SVM Final accuracy : 35.10318%
