# Collaborative Filtering

In [1]:
import pandas as pd
import seaborn as sns
import random

In [2]:
# Load the ratings table. Prefer the local 'movies.csv' snapshot (written by the
# to_csv cell further down) so the notebook survives Restart & Run All without
# anything on the clipboard; fall back to the clipboard only when no snapshot
# exists yet. Delete 'movies.csv' to force a fresh import from the clipboard.
try:
    # index_col=0: the snapshot stores the DataFrame index as its first column.
    df = pd.read_csv('movies.csv', index_col=0)
except FileNotFoundError:
    df = pd.read_clipboard()
df.head()

KeyboardInterrupt: 

In [None]:
# Save the ratings table to 'movies.csv' in the current working directory.
df.to_csv('movies.csv')

### Movie Recommender 1: random

In [None]:
# Uniform over ROWS, which is not uniform over movies: a title that occurs in
# several rows is proportionally more likely to be drawn.
last_position = len(df.index) - 1
i = random.randint(0, last_position)  # randint includes both end points
df['movie'].iloc[i]

In [None]:
# Draw three rows at random and show only their movie titles.
movie_column = df['movie']
movie_column.sample(n=3).values

In [None]:
# Uniform over movies: deduplicate the titles first, then pick one of them.
movie_column = df['movie']
names = movie_column.unique()
random.choice(names)

### Movie Recommender 2: find the *best* movies

In [None]:
# Rows with the highest individual ratings come first; show the top five.
by_rating = df.sort_values(by='rating (1-worst; 5-best)', ascending=False)
by_rating.head(n=5)

In [None]:
# Three random picks among the rows whose rating equals 5.
is_top_score = df['rating (1-worst; 5-best)'] == 5
df[is_top_score].sample(n=3)

In [None]:
# Average rating per movie, best average first; show the top five.
mean_by_movie = df.groupby('movie')['rating (1-worst; 5-best)'].mean()
mean_by_movie.sort_values(ascending=False).head(n=5)

### Movie Recommender 3: also consider the number of votes

In [None]:
# Per movie: the mean rating and how many ratings it is based on.
ratings_by_movie = df.groupby('movie')['rating (1-worst; 5-best)']
meancount = ratings_by_movie.agg(['mean', 'count'])  # also try .describe()
meancount.head(n=3)

In [None]:
# Best mean first; ties on the mean are broken by the larger vote count.
mc = meancount.sort_values(by=['mean', 'count'], ascending=False)
# tail() therefore shows the five movies at the bottom of this ordering.
mc.tail(n=5)

We are using a **ranking algorithm** to combine mean and count into a single number.

* R - average rating for one movie
* v - number of votes for one movie
* C - average rating across all movies (computed below as the mean of all individual ratings)
* m - threshold hyperparameter (set this manually)

In [None]:
def weighted_rank(R, v, C, m):
    """Blend a movie's own average rating with the global average.

    The result is a weighted mean of ``R`` and ``C`` with weights ``v`` and
    ``m``: the more votes a movie has relative to ``m``, the closer the
    result is to ``R``; with few votes it stays close to ``C``.

    Parameters
    ----------
    R : average rating of the movie
    v : number of votes for the movie
    C : average rating over the whole data set
    m : threshold hyperparameter acting as the weight of ``C``

    Returns
    -------
    ``R * v / (v + m) + C * m / (v + m)``; the arithmetic is applied as
    written, so it also works element-wise on array-like ``R`` and ``v``.
    """
    total_weight = v + m
    own_part = R * v / total_weight
    prior_part = C * m / total_weight
    return own_part + prior_part

In [None]:
C = df['rating (1-worst; 5-best)'].mean()
m = 100.0  # experiment with this

# (label, average rating R, number of votes v) for a few hand-picked examples
examples = [
    ("Praxis Dr. Hasenbein", 7.0, 1),
    ("Lord of the Rings   ", 5.0, 3),
    ("Lord of the Rings II", 5.0, 1000),
    ("John Wick 3         ", 2.0, 1),
]
for label, example_R, example_v in examples:
    print(label, round(weighted_rank(example_R, example_v, C, m), 3))

In [None]:
# Apply the ranking to every movie at once: the arithmetic inside
# weighted_rank() works element-wise on the Series, so no explicit loop
# over the rows is needed.
R = mc['mean']
v = mc['count']
m = 1000000.0

ranking = weighted_rank(R, v, C, m)
ranking.head()

### Neighbourhood-based Search

In [None]:
# Index the ratings by (name, movie) so the pair can be pivoted into a table.
multi = df.set_index(['name', 'movie'])['rating (1-worst; 5-best)']
name_by_movie = multi.unstack()       # rows: name, columns: movie
matrix = name_by_movie.T.fillna(0)    # rows: movie, columns: name; missing pairs become 0
matrix

In [None]:
from matplotlib import pyplot as plt

# Explicit figure/axes pair instead of relying on pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(matrix, cbar=None, ax=ax)

### how to create recommendations based on *user similarity*
 
    u1 = "Hakan"
    go through all users u2 (!= u1):
        calculate the similarity between u1 and u2 -> sim
        (0.0 dissimilar, 1.0 identical)
    find the user with the highest similarity -> u_best
       
    sort the movies rated by u_best by rating
    recommend a few of them
    
variation: consider the K most similar users instead (K Nearest Neighbors)

to calculate the similarity of two users: try **cosine similarity**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# cosine_similarity() only accepts 2-D input of shape (n_samples, n_features);
# passing the two 1-D columns directly is what made the original call fail.
# Selecting each user with a list keeps a one-column DataFrame, and .T turns it
# into a single sample (row) whose features are the movies.
hakan_ratings = matrix[['Hakan']].T
inbal_ratings = matrix[['Inbal']].T
cosine_similarity(hakan_ratings, inbal_ratings)[0, 0]  # 1x1 result -> scalar

In [None]:
# Read the four MovieLens tables in one pass; the order of the paths matches
# the order of the names on the left-hand side.
df_movies, df_ratings, df_tags, df_links = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ml-latest-small/movies.csv',
        'ml-latest-small/ratings.csv',
        'ml-latest-small/tags.csv',
        'ml-latest-small/links.csv',
    )
)

In [None]:
# Print a summary of the ratings frame (columns, dtypes, non-null counts).
df_ratings.info()

In [None]:
# Group once by movie; transform() returns a value for every original row, so
# each rating row receives the statistic of the movie it belongs to.
rating_by_movie = df_ratings.groupby(['movieId'])['rating']

# new column with mean rating per movie
df_ratings['av_rating'] = rating_by_movie.transform('mean')

# new column with number of ratings per movie
df_ratings['no_of_ratings'] = rating_by_movie.transform('count')

In [None]:
# new column with ranking of movie according to weighted_rank()
R = df_ratings['av_rating']
v = df_ratings['no_of_ratings']
C = df_ratings['rating'].mean()  # mean over all individual ratings
m = 100.0

# weighted_rank() is applied element-wise, giving one score per rating row
df_ratings['weighted_rank'] = weighted_rank(R=R, v=v, C=C, m=m)

# equivalent inline form, gives the same result:
# df_ratings['weighted_rank'] = R * v / (v+m) + C * m / (v+m)

In [None]:
# Attach the movie metadata to the rating rows via the shared 'movieId' key.
df_ls = pd.merge(left=df_ratings, right=df_movies, on='movieId')

In [None]:
# Compare the (rows, columns) of the merged frame with those of its two inputs.
df_ls.shape, df_ratings.shape, df_movies.shape

In [None]:
# number of different movie Ids
distinct_movie_ids = df_ls['movieId'].unique()
distinct_movie_ids.shape

In [None]:
# Order the rows by weighted rank (best first), then keep one row per movieId.
ranked_rows = df_ls.sort_values('weighted_rank', ascending=False)
ranked_rows.drop_duplicates('movieId')