# Hybrid Models for Recommendation Systems

Load Pandas, we are going to need it for manipulating data

In [1]:
import pandas as pd
import numpy as np
from IPython.display import Image
np.set_printoptions(precision = 3)
print("done")

done


Now load the data

In [2]:
user_ratings_df = pd.read_csv("user_ratings.csv")
user_features_df = pd.read_csv("user_features.csv")
item_features_df = pd.read_csv("item_features.csv")


user_features_df["key"] = 0
user_features_df["user_id"] = range(0,user_features_df.shape[0])
item_features_df["key"] = 0
item_features_df["item_id"] = range(0,item_features_df.shape[0])

merged_df = pd.merge(user_features_df, item_features_df,left_index=True,on="key")
merged_df[["item_id", "user_id"]]




merged_df["rating"] = map(lambda ids: user_ratings_df.values[ids[1]][ids[2]], 
                          merged_df[["user_id", "item_id"]].itertuples())

train = merged_df.dropna()

test = merged_df[merged_df.isnull().any(axis=1)]

print(test.to_latex())



\begin{tabular}{lrrrrrrrr}
\toprule
{} &  Sex &   Over60 &  key &  user\_id &  Critic0 &   Critic1 &  item\_id &  rating \\
\midrule
2 &  1.0 &      0.0 &    0 &        0 &      0.6 &       0.4 &        2 &     NaN \\
2 &  0.0 &      1.0 &    0 &        1 &      0.6 &       0.4 &        2 &     NaN \\
1 &  0.0 &      0.0 &    0 &        2 &      0.9 &       0.3 &        1 &     NaN \\
0 &  1.0 &      0.0 &    0 &        3 &      0.3 &       0.9 &        0 &     NaN \\
1 &  1.0 &      0.0 &    0 &        3 &      0.9 &       0.3 &        1 &     NaN \\
0 &  0.0 &      1.0 &    0 &        4 &      0.3 &       0.9 &        0 &     NaN \\
3 &  0.0 &      0.0 &    0 &        5 &      0.2 &       0.1 &        3 &     NaN \\
4 &  0.0 &      0.0 &    0 &        5 &      0.7 &       0.8 &        4 &     NaN \\
2 &  0.0 &      0.0 &    0 &        6 &      0.6 &       0.4 &        2 &     NaN \\
2 &  0.0 &      1.0 &    0 &        8 &      0.6 &       0.4 &        2 &     NaN \\
1 &  1.0 &      0

In [8]:
n_latent_features = 2

data = train

user_ratings = user_ratings_df.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_features))
latent_item_features = np.random.random((user_ratings.shape[1],n_latent_features))

user_features = user_features_df.values
item_features = item_features_df.values

print(item_features_df.to_latex())


user_features = np.concatenate([np.ones(shape = (user_features.shape[0],1)), user_features], axis = 1)
item_features = np.concatenate([np.ones(shape = (item_features.shape[0],1)), item_features], axis = 1)



user_features_weights = np.random.random((user_ratings.shape[0], user_features.shape[1] ))
item_features_weights = np.random.random((user_ratings.shape[1],item_features.shape[1] ))



# print user_features

\begin{tabular}{lrrrr}
\toprule
{} &  Critic0 &   Critic1 &  key &  item\_id \\
\midrule
0 &      0.3 &       0.9 &    0 &        0 \\
1 &      0.9 &       0.3 &    0 &        1 \\
2 &      0.6 &       0.4 &    0 &        2 \\
3 &      0.2 &       0.1 &    0 &        3 \\
4 &      0.7 &       0.8 &    0 &        4 \\
\bottomrule
\end{tabular}



In [9]:

def predict_rating(user_id,item_id):
    """ Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    
    user_score = user_features_weights[user_id].dot(user_features[user_id])
    item_score = item_features_weights[item_id].dot(item_features[item_id])
    #print user_preference.dot(item_preference), user_score, item_score
    return user_preference.dot(item_preference) + user_score + item_score

def train(user_id, item_id, rating,alpha = 0.001, 
                                   latent_feature_weight_decay = 0.1, 
                                   user_weight_decay = 0.01,
                                   item_weight_decay = 0.0001):
    
    #print item_id
    prediction_rating = predict_rating(user_id, item_id)
    err =  ( prediction_rating - rating );
    #print err
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err *  ( latent_item_features[item_id] + latent_feature_weight_decay*latent_user_preferences[user_id])
    latent_item_features[item_id] -= alpha * err * ( user_pref_values + latent_feature_weight_decay*latent_item_features[item_id])
    
    user_features_weights[user_id] -=alpha * err *(  user_features[user_id] + user_weight_decay* user_features_weights[user_id])
    item_features_weights[item_id] -=alpha * err * ( item_features_weights[item_id] + item_weight_decay* item_features_weights[item_id])
    
    
    return err
    


def sgd(iterations = 30000):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    for iteration in range(0,iterations):
        error = []
        for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if(not np.isnan(rating)):
                    err = train(user_id,item_id,rating)
                    error.append(err)
    mse = (np.array(error) ** 2).mean()          
    print(mse)
    


In [10]:
for _ in range(0,10): 
    sgd()

0.372786947702
0.367134397707
0.364901833098
0.363665428508
0.362863800259
0.362297064833
0.361874890661
0.361549959342
0.361294885501
0.361092658657


In [11]:
predictions = np.zeros(shape = (latent_user_preferences.shape[0], latent_item_features.shape[0]) )
#print latent_user_preferences
print(user_features_weights)
print(item_features_weights)
for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                predictions[user_id,item_id] =  predict_rating(user_id,item_id)

  

[[ 0.905  1.301  0.841  1.003  0.885]
 [ 0.497  0.851  1.189  0.728  0.394]
 [ 0.507  0.513  0.008  0.64   1.613]
 [ 1.147  0.364  0.771  0.18   1.231]
 [ 0.925  0.601  1.402  0.062  2.252]
 [ 0.67   0.387  0.379  0.665  0.316]
 [ 0.648  0.889  0.472  0.144 -0.296]
 [ 0.891  0.469  0.109  0.623  0.024]
 [ 0.834  0.524  0.429  0.544  0.304]
 [ 0.347  0.226  0.506  0.996 -0.005]]
[[  1.006e+00   3.594e+00   1.145e+00   4.315e+00   3.862e-01]
 [  2.875e-02   1.802e-02   3.050e-03   2.849e-02   4.540e-03]
 [  4.042e-01   1.207e+00   1.151e+00   6.334e-01   7.679e-01]
 [  1.390e+00   2.135e-02   1.746e+00   5.916e-01   7.351e-01]
 [  7.726e-01   9.199e-01   4.736e-01   1.052e+00   4.029e-01]]


In [17]:
values = [zip(user_ratings[i], predictions[i]) for i in range(0,predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns = user_ratings_df.columns
comparison_data.applymap(lambda (x,y): "(%2.3f|%2.3f)"%(x,y))

Unnamed: 0,The Call of Cthulhu,Frankenstein,Dracula,Neuromancer,Space Odyssey
0,(8.000|8.005),(2.000|1.543),(nan|5.923),(5.000|5.379),(4.000|4.070)
1,(3.000|2.992),(2.000|2.757),(nan|4.641),(7.000|6.373),(7.000|6.885)
2,(9.000|8.947),(nan|3.097),(7.000|7.564),(8.000|6.959),(5.000|5.522)
3,(nan|17.454),(nan|4.588),(7.000|7.001),(8.000|7.996),(9.000|9.001)
4,(nan|106.509),(1.000|1.017),(8.000|7.992),(3.000|2.999),(7.000|6.990)
5,(2.000|2.000),(3.000|3.000),(5.000|5.001),(nan|6.661),(nan|6.968)
6,(4.000|4.017),(2.000|0.300),(nan|-1.195),(2.000|3.349),(7.000|7.252)
7,(7.000|6.983),(1.000|2.575),(2.000|2.087),(7.000|5.754),(9.000|8.780)
8,(3.000|3.002),(3.000|2.805),(nan|9.697),(7.000|7.145),(3.000|3.030)
9,(4.000|4.052),(nan|0.104),(5.000|4.444),(3.000|3.991),(3.000|2.524)


In [19]:
comparison_data

Unnamed: 0,The Call of Cthulhu,Frankenstein,Dracula,Neuromancer,Space Odyssey
0,"(8.0, 8.00468253203)","(2.0, 1.54296247414)","(nan, 5.92337869919)","(5.0, 5.37914487122)","(4.0, 4.06985139265)"
1,"(3.0, 2.99223688844)","(2.0, 2.75732656327)","(nan, 4.64078251381)","(7.0, 6.37275972013)","(7.0, 6.88515720566)"
2,"(9.0, 8.94720496999)","(nan, 3.09658841869)","(7.0, 7.56441415522)","(8.0, 6.95915689535)","(5.0, 5.52165639945)"
3,"(nan, 17.4535509778)","(nan, 4.588403074)","(7.0, 7.00111347477)","(8.0, 7.99608583397)","(9.0, 9.00140361916)"
4,"(nan, 106.508719931)","(1.0, 1.01678824114)","(8.0, 7.99244286965)","(3.0, 2.9993865991)","(7.0, 6.9903366565)"
5,"(2.0, 1.99985034152)","(3.0, 3.00038077751)","(5.0, 5.00146328587)","(nan, 6.66061794409)","(nan, 6.96838314483)"
6,"(4.0, 4.01733792564)","(2.0, 0.300167980818)","(nan, -1.1950583576)","(2.0, 3.34891515653)","(7.0, 7.2517555943)"
7,"(7.0, 6.98297290127)","(1.0, 2.57516021528)","(2.0, 2.0872468561)","(7.0, 5.75395052037)","(9.0, 8.78033685445)"
8,"(3.0, 3.00184100002)","(3.0, 2.80475252204)","(nan, 9.69679245087)","(7.0, 7.14492290293)","(3.0, 3.02956348242)"
9,"(4.0, 4.05241874356)","(nan, 0.104240395222)","(5.0, 4.44402796863)","(3.0, 3.99111793919)","(3.0, 2.52398755739)"


In [25]:
d = comparison_data.to_latex()
text_file = open("comparison.tex", "w")
text_file.write("\\documentclass{article}\n\\usepackage{booktabs}\n\\begin{document}")
text_file.write(d)
text_file.write("\\end{document}")
text_file.close()