In [1]:
import gzip 
from collections import defaultdict
import numpy as np
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy
import csv
from surprise.model_selection import cross_validate
import pandas as pd
from surprise.model_selection import GridSearchCV

In [259]:
def readJSON(path):
    for l in gzip.open(path, 'rt'):
        d = eval(l)
        u = d['userID']
        try:
            g = d['gameID']
        except Exception as e:
            g = None
        yield u,g,d

In [260]:
### Would-play baseline: just rank which games are popular and which are not, 
##and return '1' if a game is among the top-ranked

gameCount = defaultdict(int)
totalPlayed = 0 

for user, game, _ in readJSON("train.json.gz"):
    gameCount[game] += 1
    totalPlayed += 1 

In [261]:
allHours = []
for l in readJSON("train.json.gz"):
    allHours.append(l)

hoursTrain = allHours[:165000]
hoursValid = allHours[165000:]

In [262]:
mostPopular = [(gameCount[x], x ) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

In [263]:
gamesPerUser = defaultdict(set)
usersPerGame = defaultdict(set)

for user,game,d in readJSON("train.json.gz"):
    h = d['hours_transformed']
    gamesPerUser[user].add(game)
    usersPerGame[game].add(user)

In [264]:
users = set()
games = set() 

for user,game,d in readJSON("train.json.gz"): 
    users.add(user)
    games.add(game)

#### Preparing a validation of 50/50 

In [426]:
#making validation set with 50% played , 50% unplayed random
validation = []

for u, g , _ in hoursValid: 
    validation.append((u,g,1))
    unplayed = list(games - gamesPerUser[u])
    if unplayed :
        negative = random.choice(unplayed)
        validation.append((u , negative , 0))      

In [266]:
def get_acc(pred,validation):
    correct_pred = 0
    for (u, g, playedOrNot), predicted in zip(validation, predictions):
        if predicted == playedOrNot:
            correct_pred += 1
    accuracy = correct_pred / len(validation)

    return accuracy

In [267]:
def Jaccard(s1, s2):
    numer = len(set(s1).intersection(set(s2) ))
    denom =len(set(s1).union(set(s2)) )
    return numer/denom

##### Best-threshold for popularity

In [268]:
best_threshold = None
best_acc = 0
return_t = set()

for p in range(1, 100):  # Best percentage
    threshold = totalPlayed * (p / 100)
    count = 0
    
    for ic, i in mostPopular:
        count += ic
        return_t.add(i)
        if count > threshold:
            break

    correct_pred = 0
    for u, g, playedOrNot in validation:
        predicted = 1 if g in return_t else 0
        if predicted == playedOrNot:
            correct_pred += 1

    acc = correct_pred / len(validation)
    if acc > best_acc:
        best_acc = acc
        best_threshold = p

In [269]:
print(best_acc, best_threshold)

0.70355 66


In [270]:
popular_thresh = best_threshold

#### Testing on different threshold

In [271]:
def predict_sim(u,g,threshold):
    similarities = []
    for g1 in gamesPerUser[u]:
        if g1 == g: continue
        sim = Jaccard(usersPerGame[g1], usersPerGame[g])
        #sim = Pearson(i, i2) # Could use alternate similarity metrics straightforwardly
        similarities.append(sim)
    similarities.sort(reverse=True)
    if max(similarities,default = 0) > threshold:
        return True
    else:
        return False

In [272]:
def model(thresh):
    predictions = 0
    for u, g, _ in validation:
        if predict_sim(u,g, thresh)==True and _ != None:
            correct_pred += 1
        elif predict_sim(u,g, thresh)==False and _ == None:
            correct_pred += 1
        else:
            pass
    return correct_pred / len(validation)

In [273]:
def model_2thresh(thre1,thre2):
    returnthre = set()
    count = 0 
    
    for ic, i in mostPopular: 
        count += ic 
        returnthre.add(i)
        if count > totalPlayed* thre1 : break
            
    predictions = []
    for u, g, _ in validation:
        if g in returnthre or predict_sim(u,g, thre2) == True:
            predictions.append(True)
        else:
            predictions.append(False)
    return predictions

#### Finding the two best thresholds 

In [427]:
accuracies = []
thresholds = [] 
predictions = []
for i in range(62,68):
    for j in range(2,5):
        thre1 = i*0.01
        thre2 = j*0.01
        predictions = model_2thresh(thre1, thre2)
        acc = get_acc(predictions, validation)
        thresholds.append((thre1, thre2))
        accuracies.append(acc)

In [428]:
print(accuracies)
acc = max(accuracies)

[0.6428, 0.7567, 0.7559, 0.6425, 0.7553, 0.75605, 0.64205, 0.75375, 0.75495, 0.64175, 0.75235, 0.7546, 0.64135, 0.7508, 0.75375, 0.6408, 0.7475, 0.75215]


In [429]:
acc

0.7567

In [430]:
indx = accuracies.index(acc)
print(indx)

1


In [431]:
thre1, thre2 = thresholds[indx]
print(thre1, thre2)

0.62 0.03


In [472]:
thre1 = 0.63 #more stable
thre2 = 0.05

In [473]:
returnthre = set()
count = 0
for ic, i in mostPopular:
    count += ic
    returnthre.add(i)
    if count > totalPlayed*thre1: break

In [474]:
predictions = open("predictions_Played.csv", 'w')

for l in open("pairs_Played.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    
    u,g = l.strip().split(',')
    
    if g in returnthre or predict_sim(u, g, thre2)==True:
        predictions.write(u + ',' + g + ",1\n")
    else:
        predictions.write(u + ',' + g + ",0\n")

predictions.close()

In [471]:
### Time-played baseline: compute averages for each user, 
##or return the global average if we've never seen the user before

allHours = []
userHours = defaultdict(list)

for user,game,d in readJSON("train.json.gz"):
  h = d['hours_transformed']
  allHours.append(h)
  userHours[user].append(h)

globalAverage = sum(allHours) / len(allHours)
userAverage = {}
for u in userHours:
  userAverage[u] = sum(userHours[u]) / len(userHours[u])
    

In [44]:
allHours = []
for l in readJSON("train.json.gz"):
    allHours.append(l)

In [45]:
hoursPerUser = defaultdict(set)
hoursPerItem = defaultdict(set)

for user, game , d in allHours:
    hoursPerUser[user].add((game, d['hours_transformed']))
    hoursPerItem[game].add((user, d['hours_transformed']))

In [46]:
betaU = {}
betaI = {}
for u in hoursPerUser:
    betaU[u] = 0

for g in hoursPerItem:
    betaI[g] = 0

In [48]:
Ntrain = len(hoursTrain)
users = list(hoursPerUser.keys() )
items = list(hoursPerItem.keys() )

In [195]:
def calculate_loss(data, alpha, betaU, betaI):
    total_loss = 0
    for u, i, r in data:
        prediction = alpha + betaU[u] + betaI[i]
        total_loss += (r - prediction) ** 2
    return total_loss / len(data)

In [359]:
def iterate(lamb):
    
    for itr in range(1000):
        # Calculate alpha
        alpha_term = globalAverage
        newalpha_term = 0
        for u, i, r in hoursTrain:
            newalpha_term +=  r['hours_transformed'] - (betaU[u] + betaI[i])
        alpha = newalpha_term / Ntrain

        # Calculate beta_u
        for u in hoursPerUser:
            beta_u_term = 0.0
            for i, r in hoursPerUser[u]:
                beta_u_term += r - (alpha+ betaI[i])
            betaU[u] = beta_u_term / (lamb + len(hoursPerUser[u]) )

        # Calculate beta_i
        for i in hoursPerItem:
            beta_i_term = 0.0
            for u, r in hoursPerItem[i]:
                beta_i_term += r - (alpha+ betaU[u])
            betaI[i] = beta_i_term / (lamb + len(hoursPerItem[i]) )
        
        current_loss = 0.0
        for u, i, r in hoursTrain:
            current_loss += (r['hours_transformed'] - (alpha + betaU[u] + betaI[i]) )**2
        
        regularizer = 0
        for u in betaU :
            regularizer += betaU[u]**2
        for i in betaI :
            regularizer += betaI[i]**2

        current_loss /= len(hoursTrain)

    return current_loss, current_loss + lamb*regularizer

In [391]:
mse,objective = iterate(5)
newMSE,newObjective = iterate(5)
iterations = 2

#### Using best lambda which is 5

In [392]:
while iterations < 7 or objective - newObjective > 0.00001:
    mse, objective = newMSE, newObjective
    newMSE, newObjective = iterate(5)
    iterations += 1

#### SVD 

In [388]:
data = [(user, item, rating) for user, item, rating in validation]

reader = Reader(line_format='user item rating', sep='\t')
dataset = Dataset.load_from_df(pd.DataFrame(data, columns=['user', 'item', 'rating']), reader)

trainset, valset = train_test_split(dataset, test_size=0.2)

In [351]:
param_grid = {'n_epochs': [5, 10, 15, 20, 25], 'lr_all': [0.002, 0.005, 0.01, 0.1],
              'reg_all': [0.02, 0.1, 0.2, 0.5], 'n_factors': [1, 2, 3]}

svd = SVD()
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
grid_search.fit(dataset)  # Use the original dataset, not trainset
best_params = grid_search.best_params['rmse']
final_model = SVD(**best_params)
final_model.fit(trainset)

# Make predictions on the validation set
val_predictions = final_model.test(valset)
val_mse = accuracy.mse(val_predictions)


MSE: 0.5122


In [352]:
best_params

{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.02, 'n_factors': 1}

In [389]:
model=SVD(n_factors=2,n_epochs=5,lr_all=0.002,reg_pu=1e-3,reg_qi=1e-3)
model.fit(trainset)
predictions = model.test(testset)
sse = 0
for p in predictions:
    sse += (p.r_ui - p.est)**2

see=(sse / len(predictions))
see
final_model = model

In [393]:
predictions = open('predictions_Hours.csv', 'w')
for l in open("pairs_hours.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    bu = 0
    bi = 0
    if u in betaU:
        bu = betaU[u]
    if g in betaI:
        bi = betaI[g]
        
    try:
        inner_uid = trainset.to_inner_uid(u)
        latent_u = final_model.pu[inner_uid]
    except ValueError:  # user u not in trainset
        latent_u = np.zeros(final_model.n_factors)

    try:
        inner_iid = trainset.to_inner_iid(g)
        latent_i = final_model.qi[inner_iid]
    except ValueError:  # item g not in trainset
        latent_i = np.zeros(final_model.n_factors)
       
    _ = predictions.write(u + ',' + g + ',' + str(alpha + bu + bi+ np.dot(latent_u, latent_i)) + '\n')
  
predictions.close()