In [1]:
import pandas as pd
import numpy as np
from IPython.display import Image
np.set_printoptions(precision = 3)

In [18]:
orig_data = pd.read_csv("jester-data-1.csv", header=None)

In [19]:
orig_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


In [20]:
#The first column gives the number of jokes rated by the user - we don't need that
orig_data.drop(0, axis=1, inplace=True)
orig_data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,-4.76,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,9.22,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,1.84,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,5.73,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


In [29]:
data = orig_data.copy().values

In [30]:
valid_count = (int)(data.shape[0] * 0.1)

In [31]:
data[-valid_count:] = 99

In [33]:
print(data)

[[-7.82  8.79 -9.66 ... 99.   99.   99.  ]
 [ 4.08 -0.29  6.36 ...  0.34 -4.32  1.07]
 [99.   99.   99.   ... 99.   99.   99.  ]
 ...
 [99.   99.   99.   ... 99.   99.   99.  ]
 [99.   99.   99.   ... 99.   99.   99.  ]
 [99.   99.   99.   ... 99.   99.   99.  ]]


In [34]:
n_features = 10

latent_user_preferences = np.random.random((data.shape[0], n_features))
latent_jokes_features = np.random.random((data.shape[1], n_features))

In [35]:
print(latent_user_preferences.shape)
print(latent_jokes_features.shape)

(24983, 10)
(100, 10)


In [36]:
def predict_rating(user_id, joke_id):
    """ Predict a rating given a user_id and a joke_id.
    """
    user_preference = latent_user_preferences[user_id]
    joke_feature = latent_jokes_features[joke_id]
    
    return user_preference.dot(joke_feature)

def train(user_id, joke_id, rating, alpha = 0.0001):
    
    prediction_rating = predict_rating(user_id, joke_id)
    err =  (prediction_rating- rating);
    
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err * latent_jokes_features[joke_id]
    latent_jokes_features[joke_id] -= alpha * err * user_pref_values
    
    return err
    

def sgd(iterations = 300000):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    for iteration in range(0, iterations):
        error = []
        for user_id in range(0, latent_user_preferences.shape[0]):
            for joke_id in range(0, latent_jokes_features.shape[0]):
                rating = data[user_id][joke_id]
                if(rating != 99):
                    err = train(user_id, joke_id, rating)
                    error.append(err)
        mse = (np.array(error) ** 2).mean()   
        print(mse)

In [37]:
sgd(5)

25.39680898252591
24.733676205206116
24.670753451255507
24.611634041473067
24.55428730223652


In [38]:
sgd(5)

24.49769469238875
24.440981579454203
24.383262182942474
24.323478126661772
24.260202759040713


In [39]:
predictions = latent_user_preferences.dot(latent_jokes_features.T)
predictions

array([[1.023, 0.279, 0.638, ..., 1.121, 0.549, 1.871],
       [0.805, 0.355, 0.285, ..., 0.59 , 0.022, 1.3  ],
       [1.389, 0.356, 0.843, ..., 1.403, 0.222, 2.05 ],
       ...,
       [1.375, 0.454, 0.803, ..., 1.485, 0.096, 2.077],
       [1.095, 0.177, 0.49 , ..., 1.166, 0.154, 2.114],
       [0.901, 0.424, 0.303, ..., 0.785, 0.19 , 1.533]])

In [40]:
values = [zip(data[i], predictions[i]) for i in range(0,predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns = orig_data.columns
# Fixed the lambda to explicitly access tuple values
comparison_data.applymap(lambda x: "(%2.3f|%2.3f)"%(x[0],x[1]))

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,(-7.820|1.023),(8.790|0.279),(-9.660|0.638),(-8.160|-1.034),(-7.520|0.448),(-8.500|1.908),(-9.850|-0.258),(4.170|-0.530),(-8.980|-0.572),(-4.760|1.379),...,(2.820|2.202),(99.000|1.493),(99.000|2.557),(99.000|1.223),(99.000|1.145),(99.000|1.592),(-5.630|2.019),(99.000|1.121),(99.000|0.549),(99.000|1.871)
1,(4.080|0.805),(-0.290|0.355),(6.360|0.285),(4.370|-1.105),(-2.380|0.421),(-9.660|1.233),(-0.730|-0.237),(-5.340|-0.416),(8.880|-0.412),(9.220|0.911),...,(2.820|1.567),(-4.950|0.915),(-0.290|1.797),(7.860|1.061),(-0.190|0.969),(-2.140|1.316),(3.060|1.407),(0.340|0.590),(-4.320|0.022),(1.070|1.300)
2,(99.000|1.389),(99.000|0.356),(99.000|0.843),(99.000|-1.560),(9.030|0.736),(9.270|2.614),(9.030|-0.168),(9.270|-0.786),(99.000|-0.698),(99.000|2.042),...,(99.000|2.645),(99.000|2.263),(99.000|3.391),(9.080|1.364),(99.000|1.430),(99.000|2.023),(99.000|2.681),(99.000|1.403),(99.000|0.222),(99.000|2.050)
3,(99.000|1.006),(8.350|0.422),(99.000|0.407),(99.000|-1.486),(1.800|0.543),(8.160|1.805),(-2.820|-0.275),(6.210|-0.609),(99.000|-0.604),(1.840|1.327),...,(99.000|1.965),(99.000|1.297),(99.000|2.397),(0.530|1.221),(99.000|1.218),(99.000|1.633),(99.000|1.966),(99.000|0.927),(99.000|0.124),(99.000|1.796)
4,(8.500|0.918),(4.610|0.650),(-4.170|0.474),(-5.390|-1.127),(1.360|0.462),(1.600|1.836),(7.040|-0.275),(4.610|-0.504),(-0.440|-0.283),(5.730|1.537),...,(5.190|2.367),(5.580|1.483),(4.270|2.986),(5.190|1.335),(5.730|0.917),(1.550|1.765),(3.110|1.756),(6.550|0.763),(1.800|-0.073),(1.600|1.349)
5,(-6.170|0.765),(-3.540|0.149),(0.440|0.294),(-8.500|-0.868),(-7.090|0.417),(-4.320|1.281),(-8.690|-0.183),(-0.870|-0.275),(-6.650|-0.231),(-1.800|1.205),...,(-3.540|1.935),(-6.890|1.074),(-0.680|2.316),(-2.960|1.174),(-2.180|0.725),(-3.350|1.240),(0.050|1.520),(-9.080|0.670),(-5.050|0.034),(-3.450|1.275)
6,(99.000|1.146),(99.000|0.200),(99.000|0.475),(99.000|-1.572),(8.590|0.640),(-9.850|1.893),(7.720|-0.418),(8.790|-0.447),(99.000|-0.567),(99.000|1.728),...,(99.000|2.784),(99.000|1.416),(99.000|3.239),(99.000|1.621),(99.000|1.493),(2.330|1.905),(99.000|2.323),(99.000|0.926),(99.000|0.376),(99.000|2.016)
7,(6.840|0.946),(3.160|0.270),(9.170|0.427),(-6.210|-1.138),(-8.160|0.449),(-1.700|1.619),(9.270|-0.316),(1.410|-0.340),(-5.190|-0.358),(-4.420|1.473),...,(7.230|2.396),(-1.120|1.213),(-0.100|2.794),(-5.680|1.371),(-3.160|1.170),(-3.350|1.679),(2.140|1.952),(-0.050|0.781),(1.310|0.447),(0.000|1.808)
8,(-3.790|1.312),(-3.540|0.421),(-9.420|0.484),(-6.890|-1.862),(-8.740|0.842),(-0.290|2.028),(-5.290|-0.350),(-8.930|-0.661),(-7.860|-0.716),(-1.600|1.658),...,(4.370|2.576),(-0.290|1.394),(4.170|3.161),(-0.290|1.773),(-0.290|1.575),(-0.290|2.295),(-0.290|2.136),(-0.290|1.094),(-3.400|-0.225),(-4.950|2.102)
9,(3.010|1.019),(5.150|0.392),(5.150|0.575),(3.010|-1.218),(6.410|0.596),(5.150|1.756),(8.930|-0.165),(2.520|-0.502),(3.010|-0.428),(8.160|1.497),...,(99.000|2.097),(4.470|1.402),(99.000|2.619),(99.000|1.176),(99.000|1.068),(99.000|1.570),(99.000|1.918),(99.000|0.877),(99.000|0.077),(99.000|1.518)


In [44]:
test_data = orig_data.copy().values
valid_index = data.shape[0] - valid_count
valid_error = []
print(valid_index)
print(test_data.shape)

22485
(24983, 100)


In [48]:
for user_id in range(valid_index, latent_user_preferences.shape[0]):
    for joke_id in range(0, latent_jokes_features.shape[0]):
        rating = test_data[user_id][joke_id]
        if(rating != 99):
            pred = predictions[user_id][joke_id]
            valid_error.append(pred - rating)
valid_mse = (np.array(valid_error) ** 2).mean()

In [49]:
print(valid_mse)

26.74079549952756
