In [1]:
import gzip
from collections import defaultdict
from sklearn import linear_model
import csv
import random

In [2]:
# Question 1
def readCSVq1(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],1

In [3]:
dataset = list(readCSVq1("trainInteractions.csv.gz"))

In [4]:
random.shuffle(dataset)
train = dataset[:400000]
valid = dataset[400000:]

In [58]:
usersPerRecipe = defaultdict(set)
recipesPerUser = defaultdict(set)

for user,recipe,d in valid:
    usersPerRecipe[recipe].add(user)
    recipesPerUser[user].add(recipe)
    
allrecipe = set(usersPerRecipe.keys())

In [None]:
# Add negative samples to validation set
import random
negative = []
for d in valid:
    r = allrecipe.difference(recipesPerUser[d[0]])
    negative.append((d[0],random.sample(r,1)[0],0))        

In [32]:
valid.extend(negative)

In [5]:
def Accuracy(predictions, y):

    TP = sum([(p and l) for (p,l) in zip(predictions, y)])
    FP = sum([(p and (1-l)) for (p,l) in zip(predictions, y)])
    TN = sum([((1-p) and (1-l)) for (p,l) in zip(predictions, y)])
    FN = sum([((1-p) and l) for (p,l) in zip(predictions, y)])
    
    acc = (TP + TN)/(TP + FP + TN + FN)
    return acc

In [103]:
def popular(threshold=0.5):
    recipeCount = defaultdict(int)
    totalCooked = 0

    for user,recipe,_ in train:
        recipeCount[recipe] += 1
        totalCooked += 1

    mostPopular = [(recipeCount[x], x) for x in recipeCount]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalCooked*threshold: break
            
    y_valid = [d[2] for d in valid]
    yPred = []
    for user,recipe,r in valid:
        if recipe in return1:
            yPred.append(1)
        else:
            yPred.append(0)
    
    return Accuracy(yPred,y_valid)

In [109]:
popular(threshold=0.71) #0.70040

0.598765

In [7]:
recipeCount = defaultdict(int)
totalCooked = 0

for user,recipe,_ in train:
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalCooked*0.65: break

predictions = open("predictions_Made_4.txt", 'w')
for l in open("stub_Made.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    if i in return1:
        predictions.write(u + '-' + i + ",1\n")
    else:
        predictions.write(u + '-' + i + ",0\n")

predictions.close()

In [7]:
usersPerRecipe_train = defaultdict(set)
recipesPerUser_train = defaultdict(set)

for user,recipe,d in train:
    usersPerRecipe_train[recipe].add(user)
    recipesPerUser_train[user].add(recipe)

In [8]:
usersPerRecipe = defaultdict(set)
recipesPerUser = defaultdict(set)

for user,recipe,d in dataset:
    usersPerRecipe[recipe].add(user)
    recipesPerUser[user].add(recipe)

In [9]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [10]:
def Similar(threshold):
    yPred = []
    for u,r,_ in valid:
        similarities = []
        users = usersPerRecipe_train[r]
        recipes = recipesPerUser_train[u]
        if recipes == set():
            yPred.append(0)
        else:
            for i2 in recipes:
                if i2 == r: continue
                sim = Jaccard(users, usersPerRecipe_train[i2])
                similarities.append(sim)
            if max(similarities)>threshold:
                yPred.append(1)
            else:
                yPred.append(0)

    return Accuracy(yPred,y_valid)

In [51]:
def integrate_1(threshold1=0.67):
    recipeCount = defaultdict(int)
    totalCooked = 0

    for user,recipe,_ in dataset:
        recipeCount[recipe] += 1
        totalCooked += 1

    mostPopular = [(recipeCount[x], x) for x in recipeCount]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0

    for ic, i in mostPopular:
            count += ic
            return1.add(i)
            if count > totalCooked*threshold1: break
    
    return return1

In [52]:
return1 = integrate_1()

In [53]:
def integrate_test(d,threshold2=0.57):
            
    #yPred_1 = []
    if d[1] in return1:
        yPred_1=1
    else:
        yPred_1=0
            
    #yPred_2 = []

    similarities = []
    users = usersPerRecipe_train[d[1]]
    recipes = recipesPerUser_train[d[0]]
    if recipes == set():
        yPred_2=0
    else:
        for i2 in recipes:
            if i2 == d[1]: continue
            sim = Jaccard(users, usersPerRecipe_train[i2])
            similarities.append(sim)
        if max(similarities)>threshold2:
            yPred_2=1
        else:
            yPred_2=0
                
    yPred = yPred_1 or yPred_2
    return(yPred)

In [34]:
def integrate_test_1(d,threshold2=0.55):
            
    #yPred_1 = []
    if d[1] in return1:
        yPred_1=1
    else:
        yPred_1=0
            
    #yPred_2 = []

    similarities = []
    users = usersPerRecipe[d[1]]
    recipes = recipesPerUser[d[0]]
    if recipes == set():
        yPred_2=0
    else:
        for i2 in recipes:
            if i2 == d[1]: continue
            sim = Jaccard(users, usersPerRecipe[i2])
            similarities.append(sim)
        if max(similarities)>threshold2:
            yPred_2=1
        else:
            yPred_2=0
                
    yPred = yPred_1 or yPred_2
    return(yPred)

In [54]:
predictions = open("predictions_Made_4.txt", 'w')
for l in open("stub_Made.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    predictions.write(u + '-' + i + "," + str(integrate_test(d=(u,i))) + "\n")

predictions.close()

In [50]:
def parse(f):
    for l in gzip.open(f):
        yield eval(l)

In [51]:
dataset1 = list(parse("trainRecipes.json.gz"))

In [52]:
import collections
import pandas as pd
ingredients = []
for d in dataset1: 
    for x in d['ingredients']:
        ingredients.append(x)
top50 = collections.Counter(ingredients).most_common(50)

In [53]:
ingsPerItem = defaultdict(set)
itemsPerIng = defaultdict(set)
for d in dataset1:
    r = d['recipe_id']
    for i in d['ingredients']:
        ingsPerItem[r].add(i)
        itemsPerIng[i].add(r)

In [54]:
def integrate_test_2(d,threshold2=0.55):
            
    #yPred_1 = []
    if d[1] in return1:
        yPred_1=1
    else:
        yPred_1=0
            
    #yPred_2 = []

    similarities = []
    ings = ingsPerItem[d[1]]

    for i2 in ingsPerItem:
        if i2 == d[1]: continue
        sim = Jaccard(ings, ingsPerItem[i2])
        similarities.append(sim)
    if max(similarities)>threshold2:
        yPred_2=1
    else:
        yPred_2=0
                
    yPred = yPred_1 or yPred_2
    return(yPred)

In [56]:
predictions = open("predictions_Made_3.txt", 'w')
for l in open("stub_Made.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    predictions.write(u + '-' + i + "," + str(integrate_test_2(d=(u,i))) + "\n")

predictions.close()

In [127]:
def mostSimilar8(i, N):
    similarities = []
    ings = ingsPerItem[i]
    for i2 in ingsPerItem:
        if i2 == i: continue
        sim = Jaccard(ings, ingsPerItem[i2])
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    similarities_dict = {list(i)[0]:[] for i in similarities}
    for i in similarities:
        similarities_dict[list(i)[0]].append(list(i)[1])
    for i in similarities_dict:
        similarities_dict[i].sort()    
    similarities_sorted = dict(sorted(similarities_dict.items(), key=lambda x: x[0], reverse=True))
    similarities_1 = []
    for k,v in similarities_sorted.items():
        for i in v:
            similarities_1.append((k,i))
    return [r[1] for r in similarities_1[:N]]

In [128]:
mostSimilar8('06432987', 5)

['68523854', '12679596', '56301588', '79675099', '87359281']

In [None]:
def similar(N=5):
    recipeCount = defaultdict(int)
    totalCooked = 0

    for user,recipe,_ in train:
        recipeCount[recipe] += 1
        totalCooked += 1

    mostPopular = [(recipeCount[x], x) for x in recipeCount]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalCooked*threshold: break
            
    y_valid = [d[2] for d in valid]
    yPred = []
    for user,recipe,r in valid:
        if recipe in return1:
            yPred.append(1)
        else:
            yPred.append(0)
    
    return Accuracy(yPred,y_valid)

In [None]:
## ---------Rating Prediction-------------

In [5]:
import scipy
import tensorflow as tf

def readCSVq2(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],int(d['rating'])

In [6]:
dataset2 = list(readCSVq2("trainInteractions.csv.gz"))

In [10]:
random.shuffle(dataset2)
train2 = dataset2[:400000]
valid2 = dataset2[400000:]

In [16]:
# Using training set for initialization
userIDs = {}
itemIDs = {}
itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)

for d in train2:
    u = d[0]
    i = d[1]
    r = d[2]
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
    itemsPerUser[u].append(i)
    usersPerItem[i].append(u)

In [154]:
mu = sum([r for _,_,r in train2]) / len(train2)

In [191]:
optimizer = tf.keras.optimizers.Adam(0.1)

In [169]:
class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        if u in userIDs:
            b_u = self.betaU[userIDs[u]]
            g_u = self.gammaU[userIDs[u]]
        else:
            b_u = 0
            g_u = 0
            
        if i in itemIDs:
            b_i = self.betaI[itemIDs[i]]
            g_i = self.gammaI[itemIDs[i]]
        else:
            b_i = 0
            g_i = 0
        
        if (u in userIDs)&(i in itemIDs):
            p = self.alpha + b_u + b_i + tf.tensordot(g_u, g_i, 1)
        else:
            p = self.alpha + b_u + b_i
        
        return p
    
    # Regularizer
    def reg(self):
        return self.lamb * (tf.reduce_sum(self.betaU**2) +\
                            tf.reduce_sum(self.betaI**2) +\
                            tf.reduce_sum(self.gammaU**2) +\
                            tf.reduce_sum(self.gammaI**2))
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        pred = self.alpha + beta_u + beta_i +\
               tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

In [205]:
modelLFM = LatentFactorModel(mu, 5, 0.001)

In [197]:
def trainingStep(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u,i,r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleR.append(r)

        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [206]:
for i in range(100):
    obj = trainingStep(modelLFM, train2)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

iteration 10, objective = 1.0506487
iteration 20, objective = 1.2803149
iteration 30, objective = 0.71163666
iteration 40, objective = 0.56824106
iteration 50, objective = 0.4817013
iteration 60, objective = 0.4676493
iteration 70, objective = 0.4711727
iteration 80, objective = 0.46949345
iteration 90, objective = 0.46731377
iteration 100, objective = 0.4663952


In [7]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [186]:
labels = [r for _,_,r in train2]
Predictions =\
    [modelLFM.predict(u,i).numpy() for u,i,_ in train2]

print("MSE on training set is:")
MSE(Predictions, labels)

MSE on training set is:


0.5690364080130236

In [207]:
labels = [r for _,_,r in valid2]
Predictions =\
    [modelLFM.predict(u,i).numpy() for u,i,_ in valid2]

print("MSE on validation set is:")
MSE(Predictions, labels)

MSE on validation set is:


0.8729675636671943

In [204]:
# Prediction on test data
allRatings = []
userRatings = defaultdict(list)

predictions = open("predictions_Rated_1.txt", 'w')
for l in open("stub_Rated.txt"):
    if l.startswith("user_id"):
    #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    predictions.write(u + '-' + i + ',' + str(modelLFM.predict(u,i).numpy()) + '\n')

predictions.close()

In [None]:
## Simple LFM model

In [8]:
import numpy

In [9]:
userIDs = {}
itemIDs = {}
itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)

for d in dataset2:
    u = d[0]
    i = d[1]
    r = d[2]
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
    itemsPerUser[u].append(i)
    usersPerItem[i].append(u)

In [10]:
N = len(dataset2)
nUsers = len(userIDs)
nItems = len(itemIDs)
users = list(userIDs.keys())
items = list(itemIDs.keys())

In [11]:
alpha = sum([r for _,_,r in dataset2]) / len(dataset2)
userBiases = defaultdict(float)
itemBiases = defaultdict(float)

In [12]:
def prediction(user, item):
    return alpha + userBiases[user] + itemBiases[item]

In [13]:
def unpack(theta):
    global alpha
    global userBiases
    global itemBiases
    alpha = theta[0]
    userBiases = dict(zip(users, theta[1:nUsers+1]))
    itemBiases = dict(zip(items, theta[1+nUsers:]))

In [14]:
def cost(theta, labels, lamb):
    unpack(theta)
    predictions = [prediction(d[0], d[1]) for d in dataset2]
    cost = MSE(predictions, labels)
    print("MSE = " + str(cost))
    for u in userBiases:
        cost += lamb*userBiases[u]**2
    for i in itemBiases:
        cost += lamb*itemBiases[i]**2
    return cost

In [17]:
def derivative(theta, labels, lamb):
    unpack(theta)
    N = len(dataset2)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    for d in dataset2:
        u,i,r = d[0], d[1], d[2]
        pred = prediction(u, i)
        diff = pred - r
        dalpha += 2/N*diff
        dUserBiases[u] += 2/N*diff
        dItemBiases[i] += 2/N*diff
    for u in userBiases:
        dUserBiases[u] += 2*lamb*userBiases[u]
    for i in itemBiases:
        dItemBiases[i] += 2*lamb*itemBiases[i]
    dtheta = [dalpha] + [dUserBiases[u] for u in users] + [dItemBiases[i] for i in items]
    return numpy.array(dtheta)

In [24]:
labels = [r for _,_,r in dataset2]
scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + [0.0]*(nUsers+nItems),
                             derivative, args = (labels, 0.000018))

MSE = 0.900892329560429
MSE = 0.912094139199284
MSE = 0.8986029223746386
MSE = 0.893968039632259
MSE = 0.8783901141972499
MSE = 0.8773597977351407
MSE = 0.8759599169505846
MSE = 0.8735764547410022
MSE = 0.868335939410115
MSE = 0.8619473966465476
MSE = 0.849562336368106
MSE = 0.8386945366257139
MSE = 0.8375236780071237
MSE = 0.8364364529251572
MSE = 0.8313225566399429
MSE = 0.8243140241313085
MSE = 0.8189316210240377
MSE = 0.8180430155798404
MSE = 0.8169685813046845
MSE = 0.8144613858096241
MSE = 0.8041674118028445
MSE = 0.8015156997439964
MSE = 0.7985761191419435
MSE = 0.7964273085078105
MSE = 0.7930529679799972
MSE = 0.7891658933504208
MSE = 0.7789856826543706
MSE = 0.7733208742969913
MSE = 0.7713988242121539
MSE = 0.7700356251466455
MSE = 0.7692798896699778
MSE = 0.767791692775971
MSE = 0.7640478049340007
MSE = 0.7569499342045326
MSE = 0.7561009234534161
MSE = 0.755834186408082
MSE = 0.7531796810728038
MSE = 1.0498660121222496
MSE = 0.7518246155128584
MSE = 0.7450130655523
MSE = 0.75

(array([ 4.45992407, -0.05264485,  0.29908444, ...,  0.        ,
         0.        ,  0.        ]),
 0.7508028966003583,
 {'grad': array([-2.33960649e-04,  2.13222023e-07,  5.95725680e-06, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00]),
  'task': 'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH',
  'funcalls': 210,
  'nit': 167,
  'warnflag': 0})

In [37]:
labels = [r for _,_,r in dataset2]
scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + [0.0]*(nUsers+nItems),
                             derivative, args = (labels, 0.0000177))

MSE = 0.900892329560429
MSE = 0.912094139199284
MSE = 0.8986029140348785
MSE = 0.8939680414190727
MSE = 0.8783903853581955
MSE = 0.8773603492250993
MSE = 0.875958568664379
MSE = 0.8735732618510075
MSE = 0.868329483898093
MSE = 0.8619417586923738
MSE = 0.8495570600376476
MSE = 0.8386894869435184
MSE = 0.8375191422895973
MSE = 0.8364310883615965
MSE = 0.8312962895793983
MSE = 0.824601347159106
MSE = 0.818979252733498
MSE = 0.8180921753971528
MSE = 0.8171254249466019
MSE = 0.8290933178323358
MSE = 0.8164277081500989
MSE = 0.8121110107979523
MSE = 0.8056262550499482
MSE = 0.8012948218687105
MSE = 0.7988894172135668
MSE = 0.798385745496589
MSE = 0.7958669478656714
MSE = 0.7882712036275048
MSE = 0.7860225925645399
MSE = 0.7853296964654692
MSE = 0.7843426868587458
MSE = 0.7838684232921839
MSE = 0.7833461159577079
MSE = 0.7817737268411941
MSE = 0.7784799273963352
MSE = 0.7740531023764013
MSE = 0.7689253104160394
MSE = 0.7670426216741915
MSE = 0.7659696130691561
MSE = 0.7636827642622666
MSE = 0

(array([ 4.45830346, -0.05398531,  0.30300362, ...,  0.        ,
         0.        ,  0.        ]),
 0.7500398451836868,
 {'grad': array([-9.65742874e-04,  3.33521307e-08,  1.79969503e-05, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00]),
  'task': 'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH',
  'funcalls': 185,
  'nit': 145,
  'warnflag': 0})

In [23]:
labels = [r for _,_,r in dataset2]
scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + [0.0]*(nUsers+nItems),
                             derivative, args = (labels, 0.000019))

MSE = 0.9177020321053291
MSE = 1.6544952869696699
MSE = 0.9007262123143708
MSE = 0.9005928841836659
MSE = 0.9000639254762641
MSE = 0.8980177516939493
MSE = 0.8909476333213814
MSE = 0.8642956843298382
MSE = 0.847208769496609
MSE = 0.8313734912948623
MSE = 0.8147823798318572
MSE = 0.8077895805035806
MSE = 0.7927502431126554
MSE = 0.7779238274165918
MSE = 0.7635799010412583
MSE = 0.7571727890163625
MSE = 0.7473265437014849
MSE = 0.7365949636724648
MSE = 0.8635218207043825
MSE = 0.7360629281840232
MSE = 0.7345712797437509
MSE = 0.7294860599557363
MSE = 0.726294397330295
MSE = 0.7214162047281146
MSE = 0.7151831146172776
MSE = 0.7110853233913704
MSE = 0.7115523540545712
MSE = 0.7065429167950928
MSE = 0.7040127734927725
MSE = 0.7011089538493986
MSE = 0.6962253647083416
MSE = 0.6981170189785076
MSE = 0.7128819024162811
MSE = 0.6980198973237216
MSE = 0.6968163756806419
MSE = 0.6947268302844494
MSE = 0.6934639234551158
MSE = 0.691848841569688
MSE = 0.6894191026073316
MSE = 0.6866394499807515
MSE

(array([ 4.45184331,  0.03789561, -0.11018859, ...,  0.0852448 ,
         0.03788154,  0.01467725]),
 0.7393261451925227,
 {'grad': array([ 4.96913404e-06,  8.82376342e-08,  1.63835054e-08, ...,
         -2.07550097e-08, -7.61874525e-09, -8.54431091e-09]),
  'task': 'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
  'funcalls': 174,
  'nit': 146,
  'warnflag': 0})

In [21]:
labels = [r for _,_,r in dataset2]
scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + [0.0]*(nUsers+nItems),
                             derivative, args = (labels, 0.0000185))#current best

MSE = 0.9180114411884044
MSE = 1.652503818524502
MSE = 0.9007259253622535
MSE = 0.900592609218318
MSE = 0.9000636976612182
MSE = 0.8980176997341913
MSE = 0.8909480808588485
MSE = 0.8642934323002804
MSE = 0.8472058203736941
MSE = 0.8313876969506002
MSE = 0.8147768661212363
MSE = 0.8077627000077132
MSE = 0.7927018396108559
MSE = 0.7778628602954507
MSE = 0.7634776346971955
MSE = 0.7570174918120194
MSE = 0.747084616268834
MSE = 0.7363289003594129
MSE = 0.8648149029607639
MSE = 0.7358002944542071
MSE = 0.7343081292518157
MSE = 0.7289711725213355
MSE = 0.7258068846974075
MSE = 0.7206148140366679
MSE = 0.7143289995138398
MSE = 0.7076976163688928
MSE = 0.702286177709134
MSE = 0.7020147267795441
MSE = 0.7003165621603097
MSE = 0.6948326770631181
MSE = 0.720517947778356
MSE = 0.6947128562602398
MSE = 0.6942524094115656
MSE = 0.6917909092277517
MSE = 0.6909141261339704
MSE = 0.6888211759645867
MSE = 0.6877026889643679
MSE = 0.6863346074259057
MSE = 0.6866928491798835
MSE = 0.6869882878047638
MSE =

(array([ 4.45114176,  0.03821684, -0.11134527, ...,  0.08725998,
         0.03874497,  0.01505306]),
 0.738106351374945,
 {'grad': array([-6.48261993e-06, -7.47459480e-08,  2.36366345e-09, ...,
         -1.02124836e-08, -1.99873043e-09, -1.65557699e-09]),
  'task': 'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
  'funcalls': 125,
  'nit': 106,
  'warnflag': 0})

In [19]:
def LFM_pred(u,i):
    if u in userIDs:
        b_u = userBiases[u]
    else:
        b_u = 0
            
    if i in itemIDs:
        b_i = itemBiases[i]
    else:
        b_i = 0

    p = alpha + b_u + b_i        
    return p

In [163]:
label_valid = [r for _,_,r in valid2]
pred_valid = [LFM_pred(u,i) for u,i,_ in valid2]

print("MSE on validation set is:")
MSE(pred_valid, label_valid)

MSE on validation set is:


0.8124466463070734

In [22]:
predictions = open("predictions_Rated_7.txt", 'w')
for l in open("stub_Rated.txt"):
    if l.startswith("user_id"):
    #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    predictions.write(u + '-' + i + ',' + str(LFM_pred(u,i)) + '\n')

predictions.close()

In [None]:
## with Gamma

In [116]:
alpha = sum([r for _,_,r in dataset2]) / len(dataset2)
userBiases = defaultdict(float)
itemBiases = defaultdict(float)
userGamma = {}
itemGamma = {}
K = 2

In [117]:
for u in userIDs:
    userGamma[u] = [random.random() * 0.1 - 0.05 for k in range(K)]

In [118]:
for i in itemIDs:
    itemGamma[i] = [random.random() * 0.1 - 0.05 for k in range(K)]

In [119]:
def unpack(theta):
    global alpha
    global userBiases
    global itemBiases
    global userGamma
    global itemGamma
    index = 0
    alpha = theta[index]
    index += 1
    userBiases = dict(zip(users, theta[index:index+nUsers]))
    index += nUsers
    itemBiases = dict(zip(items, theta[index:index+nItems]))
    index += nItems
    for u in users:
        userGamma[u] = theta[index:index+K]
        index += K
    for i in items:
        itemGamma[i] = theta[index:index+K]
        index += K

In [120]:
def inner(x, y):
    return sum([a*b for a,b in zip(x,y)])

In [121]:
def prediction(user, item):
    return alpha + userBiases[user] + itemBiases[item] + inner(userGamma[user], itemGamma[item])

In [122]:
def cost(theta, labels, lamb):
    unpack(theta)
    predictions = [prediction(d[0], d[1]) for d in dataset2]
    cost = MSE(predictions, labels)
    print("MSE = " + str(cost))
    for u in users:
        cost += lamb*userBiases[u]**2
        for k in range(K):
            cost += lamb*userGamma[u][k]**2
    for i in items:
        cost += lamb*itemBiases[i]**2
        for k in range(K):
            cost += lamb*itemGamma[i][k]**2
    return cost

In [125]:
def derivative(theta, labels, lamb):
    unpack(theta)
    N = len(dataset)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    dUserGamma = {}
    dItemGamma = {}
    for u in userIDs:
        dUserGamma[u] = [0.0 for k in range(K)]
    for i in itemIDs:
        dItemGamma[i] = [0.0 for k in range(K)]
    for d in dataset2:
        u,i,r = d[0], d[1], d[2]
        pred = prediction(u, i)
        diff = pred - r
        dalpha += 2/N*diff
        dUserBiases[u] += 2/N*diff
        dItemBiases[i] += 2/N*diff
        for k in range(K):
            dUserGamma[u][k] += 2/N*itemGamma[i][k]*diff
            dItemGamma[i][k] += 2/N*userGamma[u][k]*diff
    for u in userBiases:
        dUserBiases[u] += 2*lamb*userBiases[u]
        for k in range(K):
            dUserGamma[u][k] += 2*lamb*userGamma[u][k]
    for i in itemBiases:
        dItemBiases[i] += 2*lamb*itemBiases[i]
        for k in range(K):
            dItemGamma[i][k] += 2*lamb*itemGamma[i][k]
    dtheta = [dalpha] + [dUserBiases[u] for u in users] + [dItemBiases[i] for i in items]
    for u in users:
        dtheta += dUserGamma[u]
    for i in items:
        dtheta += dItemGamma[i]
    return numpy.array(dtheta)

In [127]:
scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + # Initialize alpha
                                   [0.0001]*(nUsers+nItems) + # Initialize beta
                                   [random.random() * 0.1 - 0.05 for k in range(K*(nUsers+nItems))], # Gamma
                             derivative, args = (labels, 0.00005))

MSE = 0.9080434122995151
MSE = 1.7294306228010499
MSE = 0.9007388227126125
MSE = 0.9006046665398597
MSE = 0.9000724498355834
MSE = 0.8980141108064839
MSE = 0.890909199278654
MSE = 0.8643288543476435
MSE = 0.847303433607903
MSE = 0.8310509550674388
MSE = 0.8150932070561523
MSE = 0.8091408928072984
MSE = 0.7956434915046718
MSE = 0.782785648160797
MSE = 0.771192266664044
MSE = 0.7674168654692667
MSE = 0.7619838260746284
MSE = 0.7582179856822338
MSE = 0.7535894645114701
MSE = 0.7532815207391773
MSE = 0.7525114019496402
MSE = 0.7781583652910753
MSE = 0.753160584226129
MSE = 0.7517301370160928
MSE = 0.7503922299217
MSE = 0.7498888451805482
MSE = 0.7494045086973435
MSE = 0.7493344521392993
MSE = 0.7495761506500146
MSE = 0.7495205057019675
MSE = 0.7527091828771548
MSE = 0.749671540021093
MSE = 0.7492733683259961
MSE = 0.7487982663545615
MSE = 0.7479521294705402
MSE = 0.7451552162141353
MSE = 0.7470985757057704
MSE = 0.7471192727537698
MSE = 0.7465573155406758
MSE = 0.7474544217175062
MSE = 0.7

KeyboardInterrupt: 

In [138]:
def LFM_gamma_pred(u,i):
    if u in userIDs:
        b_u = userBiases[u]
        g_u = userGamma[u]
    else:
        b_u = 0
        g_u = 0
            
    if i in itemIDs:
        b_i = itemBiases[i]
        g_i = itemGamma[i]
    else:
        b_i = 0
        g_i = 0
        
    if (u in userIDs)&(i in itemIDs):
        p = alpha + b_u + b_i + inner(g_u, g_i)
    else:
        p = alpha + b_u + b_i
    return(p)

In [139]:
predictions = open("predictions_Rated_4.txt", 'w')
for l in open("stub_Rated.txt"):
    if l.startswith("user_id"):
    #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    predictions.write(u + '-' + i + ',' + str(LFM_gamma_pred(u,i)) + '\n')

predictions.close()