In [56]:
import gzip
from datetime import datetime
from collections import defaultdict
import math
import scipy
import scipy.optimize
from sklearn import svm
from sklearn import metrics
import numpy as np
import string
import random
import string
import json
import time
from datetime import timedelta
import requests
import re
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
import xlearn as xl

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def readCSV(path):
    #user,id,start_date,end_date,score,num_episodes_watched,status
    #status encoding: watching = 0, completed = 1, on_hold = 2, dropped = 3, plan_to_watch = 4
    f = open(path, 'r')
    
    for l in f:
        user,anime_id,start_date,end_date,score,num_episodes_watched,status = l.strip().split(',')
        anime_id=int(anime_id)
        score=int(score)
        num_episodes_watched=int(num_episodes_watched)
        status = int(status)
        yield user,anime_id,start_date,end_date,score,num_episodes_watched,status

In [4]:
#Get API KEY in case we need it
file = open("../data/mal_client_id.txt", 'r')
clientID = file.read()
file.close()

In [5]:
#our user data set
completed_with_date = []

for i in readCSV('../data/user_animelist_details_16mil.txt'):
    if (i[6] == 1 and i[3] != "" and i[4] != 0):
        completed_with_date.append(i)
random.shuffle(completed_with_date)

#our item data set
file = open("../data/anime_list_details.json", 'r')
animelist = json.loads(file.read())
animelistdict = dict()
for record in animelist:
    animelistdict[record['id']] = {}
    for key in record:
        #we don't need to use this key again
        if key == 'id':
            continue
        animelistdict[record['id']][key] = record[key]

In [259]:
#for simpler reccommender
X = [(entry[0], int(entry[1]), entry[3]) for entry in completed_with_date]
Y = [int(entry[4]) for entry in completed_with_date]
#dividing up dataset
XTrain = X[:len(X)//2]
XValid = X[len(X)//2:len(X)*3//4]
XTest = X[len(X)*3//4:]
YTrain = Y[:len(Y)//2]
YValid = Y[len(Y)//2:len(Y)*3//4]
YTest = Y[len(Y)*3//4:]

In [260]:
#data we are getting from our scored subset
scoresPerAnime = defaultdict(list)
scoresPerUser = defaultdict(list)
usersPerAnime = defaultdict(set)
animePerUser = defaultdict(set)
ratingsDict = defaultdict(list)

#changing this to be Xtrain!! - used to be userData
for index in range(len(XTrain)):
    i = XTrain[index]
    ratingsDict[(i[0],i[1])] = int(YTrain[index])
    scoresPerAnime[i[1]].append(int(YTrain[index]))
    scoresPerUser[i[0]].append(int(YTrain[index]))
    usersPerAnime[i[1]].add(i[0])
    animePerUser[i[0]].add(i[1])

In [261]:
#average score per anime
averageScorePerAnime = defaultdict(list)
for anime in scoresPerAnime:
    averageScorePerAnime[anime] = sum(scoresPerAnime[anime])/len(scoresPerAnime[anime])

In [262]:
numTests = 50000

In [263]:
#cache
pearsondict = dict()

# Between two items
def Pearson (i1 , i2):
    #for cache! this should hugely increase performance
    if (i1,i2) in pearsondict:
        return pearsondict[(i1,i2)]
    elif (i2,i1) in pearsondict:
        return pearsondict[(i2,i1)]
    
    #first we check if we have info on these
    if i1 not in scoresPerAnime or i2 not in scoresPerAnime:
        return 0
    
    #averages
    if len(scoresPerAnime[i1]) > 0 and len(scoresPerAnime[i2]) > 0:
        i1avg = averageScorePerAnime[i1]
        i2avg = averageScorePerAnime[i2]
    else:
        return 0
    
    #at this point, we know we have valid keys and we know that we have averages
    #now we just need to compute
    num = []
    denom1 = []
    denom2 = []
    for commonUser in usersPerAnime[i1].intersection(usersPerAnime[i2]):
        #if the sets are disjoint, we can just remove them, lol!
        if usersPerAnime[i1].isdisjoint(usersPerAnime[i2]):
            continue
            
        #num term
        num.append((ratingsDict[(commonUser,i1)] - i1avg) * (ratingsDict[(commonUser,i2)] - i2avg))
        
        #denom term 1
        denom1.append((ratingsDict[(commonUser,i1)] - i1avg)**2)
        
        #denom term 2
        denom2.append((ratingsDict[(commonUser,i2)] - i2avg)**2)
        
    #let's put it together
    num = sum(num)
    denom1 = sum(denom1)
    denom2 = sum(denom2)
    if denom1 == 0 or denom2 == 0:
        return 0
    pearson = num / (denom1 * denom2)
    
    #cache answer, then return it
    pearsondict[(i1, i2)] = pearson
    pearsondict[(i2, i1)] = pearson
    
    return pearson

In [264]:
#we train a linear regressor to fit a cubic equation to the timestamps.
#we generate a sub dataset, LOL. There will be no evaluation on this, except how well it works in the model
#the features will be number of days/1000 since first review date = x -> c + x + x^2 + x^3
startDate = datetime.strptime('1988-01-01', '%Y-%m-%d')

timeX = []
timeY = []
for i in range(len(XTrain)):
    timestamp = Xtrain[i][2]
    
    timearr = []
    #some date data is in a different format, can tell by length
    if(len(timestamp) == 10):
        tempDate = datetime.strptime(timestamp,'%Y-%m-%d')
    elif (len(timestamp) == 4):
        tempDate = datetime.strptime(timestamp,'%Y')
    elif (len(timestamp) == 7):
        tempDate = datetime.strptime(timestamp,'%Y-%m')
    else:
        #we don't know the date so we toss it
        continue
    # let's throw it out if its before our startDate
    if  tempDate < startDate:
        continue
        
    #now, let's get the time delta
    timeDelta = tempDate - startDate
    days = timeDelta.total_seconds() /(60 * 60 * 24)
    #normalizing it a bit...
    days = days/1000
    
    #here is our feature representation
    timeX.append([1, days, days**2, days**3])
    timeY.append(YTrain[i])

In [265]:
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(timeX, timeY)

In [266]:
weights = model.coef_
weights

array([ 8.32061145e+00, -6.68369365e-02,  3.91325354e-04, -4.06757756e-07])

In [280]:
def timestampWeight(timestamp):
    #some date data is in a different format, can tell by length
    if(len(timestamp) == 10):
        tempDate = datetime.strptime(timestamp,'%Y-%m-%d')
    elif (len(timestamp) == 4):
        tempDate = datetime.strptime(timestamp,'%Y')
    elif (len(timestamp) == 7):
        tempDate = datetime.strptime(timestamp,'%Y-%m')
    else:
        #we don't know the date so we toss it
        return 0
    # let's throw it out if its before our startDate, cuz its usually malformed
    if  tempDate < startDate:
        return 0
        
    #now, let's get the time delta
    timeDelta = tempDate - startDate
    days = timeDelta.total_seconds() /(60 * 60 * 24)
    #normalizing it a bit...
    days = days/1000
    #return weights[0] + weights[1]*days + weights[0]*days**2 + weights[0]*days**3
    return math.e**(days)

In [281]:
def predict(username, animeid, timestamp):
    denom = []
    ratingsForNum = []
    for anime in animePerUser[username]:
        #we cannot consider this exact task if we have it stored somewhere
        if anime == animeid:
            next
        ratingsForNum.append(ratingsDict[username,anime])
        denom.append(Pearson(animeid, anime) * timestampWeight(timestamp))
    num = [ratingsForNum[i] * denom[i] for i in range(len(denom))]
    #sometimes this will happen, especially for niche anime. let's just return the average score...
    if sum(denom) == 0:
        if animeid in averageScorePerAnime:
            return averageScorePerAnime[animeid]
        elif animeid in animelistdict and 'mean' in animelistdict[animeid]:
            return animelistdict[animeid]['mean']
        else:
            return 5
    #let's just project this into our output zone too
    result = sum(num)/sum(denom)
    if result > 10:
        result = 10
    elif result < 0:
        result = 0
    return result

In [282]:
def predictMultiple(X):
    yPred = []
    for i in X:
        yPred.append(predict(i[0], i[1], i[2]))
    return yPred

In [283]:
%%time
yPred = predictMultiple(XValid[:numTests])
metrics.mean_squared_error(YValid[:numTests], yPred, squared= False)

CPU times: user 1min 22s, sys: 6.68 ms, total: 1min 22s
Wall time: 1min 22s


2.207905278370655

In [271]:
def predictNonTemporal(username, animeid, timestamp):
    denom = []
    ratingsForNum = []
    for anime in animePerUser[username]:
        #we cannot consider this exact task if we have it stored somewhere
        if anime == animeid:
            next
        ratingsForNum.append(ratingsDict[username,anime])
        denom.append(Pearson(animeid, anime))
    num = [ratingsForNum[i] * denom[i] for i in range(len(denom))]
    #sometimes this will happen, especially for niche anime. let's just return the average score...
    if sum(denom) == 0:
        if animeid in averageScorePerAnime:
            return averageScorePerAnime[animeid]
        elif animeid in animelistdict and 'mean' in animelistdict[animeid]:
            return animelistdict[animeid]['mean']
        else:
            return 5
    #let's just project this into our output zone too
    result = sum(num)/sum(denom)
    if result > 10:
        result = 10
    elif result < 0:
        result = 0
    return result

In [272]:
def predictMultipleNonTemporal(X):
    yPred = []
    for i in X:
        yPred.append(predictNonTemporal(i[0], i[1], i[2]))
    return yPred

In [273]:
%%time
yPred = predictMultipleNonTemporal(XValid[:numTests])
metrics.mean_squared_error(YValid[:numTests], yPred, squared= False)

CPU times: user 14.1 s, sys: 15 µs, total: 14.1 s
Wall time: 14.1 s


2.2100237749944793

In [274]:
def predictAverageScoreForAnime(username, animeid, timestamp):
    if animeid in averageScorePerAnime:
        return averageScorePerAnime[animeid]
    elif animeid in animelistdict and 'mean' in animelistdict[animeid]:
        return animelistdict[animeid]['mean']
    else:
        return 5

In [275]:
def predictMultipleAverageScoreForAnime(X):
    yPred = []
    for i in X:
        yPred.append(predictAverageScoreForAnime(i[0], i[1], i[2]))
    return yPred

In [276]:
%%time
yPred = predictMultipleAverageScoreForAnime(XValid[:numTests])
metrics.mean_squared_error(YValid[:numTests], yPred, squared= False)

CPU times: user 33.2 ms, sys: 0 ns, total: 33.2 ms
Wall time: 32.9 ms


1.4260072270892707

In [277]:
def predictAverageScoreForUser(username, animeid, timestamp):
    if username in scoresPerUser:
        return sum(scoresPerUser[username])/len(scoresPerUser[username])
    elif animeid in averageScorePerAnime:
        return averageScorePerAnime[animeid]
    elif animeid in animelistdict and 'mean' in animelistdict[animeid]:
        return animelistdict[animeid]['mean']
    else:
        return 5

In [278]:
def predictMultipleAverageScoreForUser(X):
    yPred = []
    for i in X:
        yPred.append(predictAverageScoreForUser(i[0], i[1], i[2]))
    return yPred

In [279]:
%%time
yPred = predictMultipleAverageScoreForUser(XValid[:numTests])
metrics.mean_squared_error(YValid[:numTests], yPred, squared= False)

CPU times: user 121 ms, sys: 3 µs, total: 121 ms
Wall time: 120 ms


1.3617474788832709

In [None]:
#######################
## OLD VER           ##
#######################
#data we are getting from our scored subset
scoresPerAnime = defaultdict(list)
scoresPerUser = defaultdict(list)
usersPerAnime = defaultdict(set)
animePerUser = defaultdict(set)
ratingsDict = defaultdict(list)

def mylist():
    return [[] for j in range(81)]

#note there are 81 genres, and we will put individual scores into each one
genreScoresPerUser = defaultdict(mylist) 
for i in userData:
    ratingsDict[(i[0],i[1])] = i[4]
    scoresPerAnime[i[1]].append(i[4])
    scoresPerUser[i[0]].append(i[4])
    usersPerAnime[i[1]].add(i[0])
    animePerUser[i[0]].add(i[1])
    
    if i[1] in animelistdict:
        if 'genres' in animelistdict[i[1]]:
            for genre in animelistdict[i[1]]['genres']:
                genreScoresPerUser[i[0]][int(genre['id'])-1].append(i[4])
#data we are getting from entire dataset
dropsPerAnime = defaultdict(int)
dropsPerUser = defaultdict(int)
for i in readCSV('../data/user_animelist_details_16mil.txt'):
    if i[6] == 3:
        dropsPerAnime[i[1]] += 1
        dropsPerUser[i[0]] += 1

In [None]:
#this uses userData = completed_with_date, while the above does not

In [10]:
#getting one hot encoding of user and anime
userIDs, itemIDs = {},{}
for i in userData:
    u,a = i[0],i[1]
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not a in itemIDs: itemIDs[a] = len(itemIDs)
nUsers,nItems = len(userIDs), len(itemIDs)
nUsers,nItems

(42176, 11249)

In [11]:
#let's start building our feature representation for the dataset
#We will have
#[one hot encoding of user, one hot encoding of item, user avgs of genre, item genres]
#length of feature will be 42176 + 11249 + 82 + 82

In [13]:
random.shuffle(userData)

In [26]:
%time
X = scipy.sparse.lil_array((10000,nUsers+nItems+2*82))
Y = np.array([])
for i in range(len(userData[:10000])):
    Y = np.append(Y, userData[i][4])
    user = userIDs[userData[i][0]]
    anime = itemIDs[userData[i][1]]
    #one hot encodings
    X[i,user] = 1 
    X[i,nUsers+anime]=1
    #user averages of genres
    genreScores = genreScoresPerUser[userData[i][0]]
    for j in range(len(genreScores)):
        if len(genreScores[j]) != 0:
            X[i,nUsers + nItems + j] = sum(genreScores[j])/len(genreScores[j])
        #else:
            # X[i,nUsers + nItems + j] = 5 # we don't know how the user feels about anime in these genres
    #item scores
    if userData[i][1] in userData and 'genres' in animelistdict[userData[i][1]]:
        animeGenres = animelistdict[userData[i][1]]['genres']
        for j in range(len(animeGenres)):
            X[i, nUsers + nItems + 82 + j] = animeGenres[j]['id']
    #else:
        #these entries will be zero, so do nothing
    if i % 1000 == 0:
        print(i)

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 12.4 µs
0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [31]:
#dividing up dataset
Xtrain = X[:np.shape(X)[0]//2]
XValid = X[np.shape(X)[0]//2:np.shape(X)[0]*3//4]
XTest = X[np.shape(X)[0]*3//4:]
Ytrain = Y[:np.shape(Y)[0]//2]
YValid = Y[np.shape(Y)[0]//2:np.shape(X)[0]*3//4]
YTest = Y[np.shape(Y)[0]*3//4:]

In [32]:
#getting model
# param:
#  0. regression
#  1. model scale: 0.1
#  2. number of latent factor: 4
#  3. learning rate: 0.1
#  4. regular lambda: 0.01
#  5. use sgd optimization method
#  6. use 16 cpu threads
#  6. early stop at epoch 10
#  6. evaluation metric: rmse
fm_model = xl.FMModel(task='reg', init=0.1, k=200, lr=0.1, 
                      reg_lambda=0.01, opt='sgd', nthread= 16, epoch=10,
                      metric='rmse')


In [33]:
# Start to train
fm_model.fit(Xtrain, 
             Ytrain, eval_set=[XValid, YValid])

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 16 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (/tmp/tmpeo7groz1.bin) NOT found. Convert text file to binary file.
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (/tmp/tmpru2l9lkg.bin) NOT found. Convert text file to binary file.
[32m[------------] [0mNumber of Feature: 53506
[32m

In [34]:
len(fm_model.weights[1])

53506

In [35]:
YPred = fm_model.predict(XTest)

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[39m[0m[32m[------------] [0mxLearn uses 12 threads for prediction task.
[32m[1m[ ACTION     ] Load model ...[0m
[32m[------------] [0mLoad model from /tmp/tmppwlnhrir
[32m[------------] [0mLoss function: squared
[32m[------------] [0mScore function: fm
[32m[------------] [0mNumber of Feature: 53506
[32m[------------] [0mNumber of K: 1000
[32m[------------] [0mTime cost for loading model: 0.09 (sec)
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[-------

In [36]:
metrics.mean_squared_error(YTest, YPred, squared= False)

1.5345710991809796

In [37]:
for i in range(len(YPred)):
    print('Predicted:', YPred[i], ' Actual:', YTest[i])

Predicted: 8.49395  Actual: 8.0
Predicted: 7.637  Actual: 10.0
Predicted: 7.14254  Actual: 9.0
Predicted: 7.11809  Actual: 10.0
Predicted: 7.61026  Actual: 5.0
Predicted: 7.51548  Actual: 10.0
Predicted: 8.03752  Actual: 10.0
Predicted: 7.18239  Actual: 7.0
Predicted: 7.94708  Actual: 8.0
Predicted: 7.95407  Actual: 9.0
Predicted: 7.57099  Actual: 9.0
Predicted: 7.49275  Actual: 9.0
Predicted: 7.53703  Actual: 7.0
Predicted: 7.10314  Actual: 5.0
Predicted: 7.59689  Actual: 9.0
Predicted: 8.54355  Actual: 8.0
Predicted: 8.58792  Actual: 10.0
Predicted: 7.10882  Actual: 6.0
Predicted: 7.90861  Actual: 6.0
Predicted: 7.44243  Actual: 6.0
Predicted: 7.83871  Actual: 3.0
Predicted: 7.69549  Actual: 8.0
Predicted: 7.58496  Actual: 8.0
Predicted: 7.54078  Actual: 7.0
Predicted: 7.50094  Actual: 8.0
Predicted: 7.96195  Actual: 7.0
Predicted: 8.37391  Actual: 9.0
Predicted: 8.33446  Actual: 7.0
Predicted: 7.86003  Actual: 8.0
Predicted: 7.76168  Actual: 6.0
Predicted: 8.29405  Actual: 10.0
Pred