In [1]:
#matrix factorization
import pandas as pd
import numpy as np
import implicit as im
from scipy.sparse import csr_matrix
import random

In [2]:
#read data
proc_dir = "../data/processed"

customers = pd.read_csv(proc_dir+"/customers.csv")
customers = customers['user']
games = pd.read_csv(proc_dir+"/games.csv")
games = games['game']
hours = pd.read_csv(proc_dir+"/hours.csv")
hours = hours[['userIndex', 'gameID', 'hours']]

In [83]:
def getPopularSampleData(users, items, ratings, size, rseed=43):
    #create list of popular items
    ratings['userIndex'] = pd.to_numeric(ratings['userIndex'])
    itemsGrouped = ratings.groupby('gameID', as_index=False).agg({"hours": "count"})
    popularItems = itemsGrouped[itemsGrouped.hours > 50]
    popularItems = popularItems['gameID']
    random.seed(rseed)
    randomSampleItems = map(int, random.sample(popularItems,size))
    #get all users who rated this item
    userRatedItems = ratings[ratings.gameID.isin(randomSampleItems)]
    sampleUsers = map(int, userRatedItems['userIndex'].unique())
    #return sample users and items
    return sampleUsers, randomSampleItems, userRatedItems

def getTrainTestData(ratings, percent=.2, rseed=43):
    random.seed(rseed)
    testCount = int(len(ratings)*percent)
    testRows = random.sample(range(0,len(ratings)-1),testCount)
    trainRatings = pd.DataFrame(columns=['userIndex','gameID','hours'])
    testRatings = pd.DataFrame(columns=['userIndex','gameID','hours'])
    ind=0
    for index, row in ratings.iterrows():
        if(ind in testRows):
            testRatings = testRatings.append({'userIndex':row['userIndex'],
                                              'gameID':row['gameID'],
                                              'hours':row['hours']}, ignore_index=True)
        else:
            trainRatings = trainRatings.append({'userIndex':row['userIndex'],
                                                'gameID':row['gameID'],
                                                'hours':row['hours']}, ignore_index=True)
        ind=ind+1
    return trainRatings, testRatings

def squaredError(users, items, R, testRatings):
    error=0
    for index, row in testRatings.iterrows():
        error+=(R[users.index(row['userIndex'])][items.index(row['gameID'])]-row['hours'])**2
    return error

In [84]:
#get the random sample with designated size
users, items, ratings = getPopularSampleData(customers, games, hours, 75, 43)
#get train and test data
trainRatings, testRatings = getTrainTestData(ratings)
print 'train ratings:', len(trainRatings)
print 'test ratings:', len(testRatings)

train ratings: 7749
test ratings: 1937


In [85]:
#create sparse train matrix games by customers with ratings - hours
M=np.zeros((len(items),len(users)))

for index, row in trainRatings.iterrows():
    M[items.index(row['gameID'])][users.index(row['userIndex'])]=row['hours']

M = csr_matrix(M)

print M.shape

(75, 3270)


In [86]:
#matrix factorization model
model = im.als.AlternatingLeastSquares(factors=15)
model.fit(M)

#factored U users and V items matrices
U = model.user_factors
V = model.item_factors

#user_games = playM.T.tocsr()
#recommendations = model.recommend(3, user_games,10)

100%|██████████| 15.0/15 [00:00<00:00, 69.23it/s]


In [87]:
#learned recommendations
R = np.matmul(U,np.transpose(V))

print 'U', U.shape, ',', 'V', V.shape, ',', 'R', R.shape

U (3270, 15) , V (75, 15) , R (3270, 75)


In [88]:
#get accuracy from test data
print squaredError(users, items, R, testRatings)


71282146.1954
