In [1]:
import pandas as pd
import numpy as np

In [2]:
users = list(range(1, 7))  # user indexes 1..6
items = list(range(1, 7))  # item indexes 1..6

Rows: users, columns: items. A rating of 0 means the user has not rated that item.

In [3]:
# Raw ratings: one inner list per user, one position per item.
# A 0 marks an item the user has not rated (later cells test `> 0`).
ratings_mat = [
    [1, 3, 1, 0, 5, 4],
    [5, 4, 4, 0, 1, 0],
    [2, 0, 5, 4, 5, 0],
    [0, 3, 0, 0, 0, 5],
    [0, 2, 0, 5, 4, 0],
    [0, 0, 4, 4, 0, 5],
]

In [4]:
# Wrap the raw matrix in a labelled frame: index = user ids, columns = item ids.
rating_df = pd.DataFrame(data=ratings_mat, index=users, columns=items)
rating_df

Unnamed: 0,1,2,3,4,5,6
1,1,3,1,0,5,4
2,5,4,4,0,1,0
3,2,0,5,4,5,0
4,0,3,0,0,0,5
5,0,2,0,5,4,0
6,0,0,4,4,0,5


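Similarity metric: cosine similarity, computed only over the items that both users have rated: $\mathrm{sim}(x, y) = \frac{\sum_{i \in I_{xy}} x_i y_i}{\sqrt{\sum_{i \in I_{xy}} x_i^2}\,\sqrt{\sum_{i \in I_{xy}} y_i^2}}$, where $I_{xy}$ is the set of items rated by both $x$ and $y$.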

In [5]:
def cos_similarity_common_ratings(x, y):
    """Cosine similarity of two rating vectors, restricted to co-rated items.

    Only positions where the product ``x[i] * y[i]`` is strictly positive
    are used; for the non-negative ratings in this notebook that means
    "both users rated the item". Numerator and both norms are computed on
    that same restricted set of positions, so the result always lies in
    ``[-1, 1]``.

    Parameters
    ----------
    x, y : array-like of numbers, same length
        Rating vectors. Entries that are 0 (or NaN) count as "not rated".

    Returns
    -------
    float
        The cosine similarity over the common items, or ``0.0`` when the
        two vectors share no rated item.
    """
    # Accept lists / Series as well as ndarrays.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # NaN * anything is NaN and NaN > 0 is False, so missing values simply
    # drop out of the mask; silence the comparison warning older NumPy emits.
    with np.errstate(invalid="ignore"):
        common = x * y > 0.0

    x_common = x[common]
    y_common = y[common]

    len_x_common = np.sqrt(np.dot(x_common, x_common))
    len_y_common = np.sqrt(np.dot(y_common, y_common))
    if len_x_common == 0.0 or len_y_common == 0.0:
        # No co-rated items: define the similarity as 0.
        return 0.0
    return np.dot(x_common, y_common) / (len_x_common * len_y_common)

Example: similarity between users $x$ and $y$

In [6]:
# Pick two users and compare their rating rows.
user_x, user_y = 3, 5

x = rating_df.loc[user_x].values
y = rating_df.loc[user_y].values
cos_similarity_common_ratings(x, y)

0.975609756097561

Construct the user-user similarity table (the diagonal is set to 0, so a user is never counted as their own neighbour):

In [7]:
# Pairwise user-user similarity table, built in a single DataFrame
# construction instead of enlarging an empty frame one cell at a time.
# Row and column labels are the user ids; the diagonal is fixed at 0 so a
# user never acts as their own neighbour. The comprehension variables do not
# leak into the notebook namespace, so the example cell's x / y / user_x /
# user_y stay untouched.
user_similarity_df = pd.DataFrame(
    [
        [
            0.0
            if row_user == col_user
            else cos_similarity_common_ratings(
                rating_df.loc[row_user, :].values,
                rating_df.loc[col_user, :].values,
            )
            for col_user in users
        ]
        for row_user in users
    ],
    index=users,
    columns=users,
)

user_similarity_df

Unnamed: 0,1,2,3,4,5,6
1,0.0,0.568995,0.838052,0.994692,0.997054,0.909065
2,0.568995,0.0,0.734931,1.0,0.650791,1.0
3,0.838052,0.734931,0.0,0.0,0.97561,0.993884
4,0.994692,1.0,0.0,0.0,1.0,1.0
5,0.997054,0.650791,0.97561,1.0,0.0,1.0
6,0.909065,1.0,0.993884,1.0,1.0,0.0


Construct recommendations with user-based filtering

In [8]:
def weighted_av(x, w):
    """Weighted average of the ratings ``x`` using weights ``w``.

    Only weights that belong to an actual rating (``x > 0``) enter the
    normalisation, so unrated entries do not dilute the average.

    Parameters
    ----------
    x : array-like of numbers
        Ratings; entries ``<= 0`` are treated as "no rating" for the
        normalisation.
    w : array-like of numbers, same length as ``x``
        Weights (here: user similarities).

    Returns
    -------
    float
        ``sum(x * w) / sum(w[x > 0])``, or ``np.nan`` when that normaliser
        is zero (empty input, no rated entry, or all relevant weights zero).
    """
    x = np.asarray(x, dtype=float)
    w = np.asarray(w, dtype=float)

    x_onehot = np.where(x > 0.0, 1, 0)
    # Guard on the value we actually divide by; checking sum(w) alone lets a
    # 0/0 division through when weights are non-zero but nothing is rated.
    weight_total = np.sum(w * x_onehot)
    if weight_total == 0:
        return np.nan
    return np.sum(x * w) / weight_total

In [9]:
# Similarity cut-off for neighbours: only users whose similarity is strictly
# greater than this value contribute to a predicted rating.
user_based_threshold = 0.8

In [10]:
# Predict a rating for every (user, item) pair the user has not rated yet.
# The prediction is the similarity-weighted average of the ratings given to
# that item by sufficiently similar users (similarity > user_based_threshold).
user_based_recs = []
for user in users:
    # Similarities of all users to `user`; does not depend on the item.
    user_similarities = user_similarity_df[user].values
    for item in items:
        if rating_df.loc[user, item] > 0:
            # user rated this item already, i.e. it is not new content
            continue
        item_ratings = rating_df[item].values
        # Neighbours that rated the item AND are similar enough.
        sel = (item_ratings > 0) & (user_similarities > user_based_threshold)
        user_based_rating = weighted_av(item_ratings[sel], user_similarities[sel])
        # weighted_av yields NaN when no neighbour qualifies; skip those pairs.
        if np.isfinite(user_based_rating):
            user_based_recs.append({'user':user, 'item':item, 'rating': user_based_rating})
# Explicit columns keep the frame usable (user_based_df['user'] etc.) even
# when no recommendation could be produced at all.
user_based_df = pd.DataFrame(user_based_recs, columns=['user', 'item', 'rating'])

In [11]:
# Show every predicted rating: one row per (user, item) pair that the user
# has not rated and for which a finite prediction could be computed.
user_based_df

Unnamed: 0,user,item,rating
0,1,4,4.363335
1,2,4,4.0
2,2,6,5.0
3,3,2,2.462077
4,3,6,4.542532
5,4,1,3.005322
6,4,3,3.003545
7,4,4,4.5
8,4,5,3.330379
9,5,1,1.494565


Print the top $n$ recommended items for a given user

In [12]:
# Show the highest predicted ratings for one user.
user = 4
top_n = 3  # how many recommendations to show

rec_df = (
    user_based_df[user_based_df['user'] == user]
    .sort_values(by='rating', ascending=False)  # best prediction first
    .head(top_n)
)
print(rec_df,'\n')

   user  item    rating
7     4     4  4.500000
8     4     5  3.330379
5     4     1  3.005322 

