<img src="../Pics/MLSb-T.png" width="160">
<br><br>
<center><u><H1>Collaborative Filtering: Memory Based</H1></u></center>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Column names for the ratings file; names are supplied here because
# read_csv is told to use them instead of reading a header row.
header = ['user_id', 'item_id', 'rating', 'timestamp']

# Tab-separated ratings file, one (user, item, rating, timestamp) record per line.
df_movies = pd.read_csv(
    '../Data/ml-100k/u.data',
    sep='\t',
    names=header,
)
df_movies.head()

## Data exploration:

In [None]:
# Number of distinct users that appear in the ratings table.
n_users = len(df_movies['user_id'].unique())
print(n_users)

In [None]:
# Number of distinct items (movies) that appear in the ratings table.
n_items = len(df_movies['item_id'].unique())
print(n_items)

In [None]:
# Histogram of the raw rating values. Built on an explicit Axes so the figure
# carries a title and axis labels and the cell does not echo the
# (counts, bins, patches) tuple returned by hist().
fig, ax = plt.subplots()
ax.hist(df_movies['rating'])
ax.set(title='Distribution of ratings', xlabel='Rating', ylabel='Number of ratings')
plt.show()

In [None]:
# How many ratings were given for each distinct rating value.
df_movies.groupby('rating').user_id.count()

In [None]:
# Build a dense (n_users x n_items) ratings matrix by hand; cells with no
# rating stay at 0. Ids are used as 1-based positions (id - 1), which assumes
# user and item ids are contiguous starting at 1 -- TODO confirm for other datasets.
_ratings = np.zeros((n_users, n_items))
for record in df_movies.itertuples(index=False):
    _ratings[record.user_id - 1, record.item_id - 1] = record.rating
_ratings

In [None]:
# Same user x item matrix, built with pandas: one row per user_id, one column
# per item_id, and 0 wherever a user has not rated an item.
ratings_ = (
    df_movies
    .pivot(index='user_id', columns='item_id', values='rating')
    .fillna(0)
)
ratings_.head(10)

In [None]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values returns the same underlying ndarray and works on every pandas version.
ratings = ratings_.values
ratings

In [None]:
# Sanity check: the pivot has one row per distinct user and one column per
# distinct item, so its shape must match the counts computed during exploration.
assert ratings.shape == (n_users, n_items), ratings.shape
ratings.shape

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# pairwise_distances(metric='cosine') returns cosine *distance*
# (1 - cosine similarity). The matrices below are used as similarities
# (ranked in descending order and used as prediction weights), so convert
# distance to similarity: 1 = identical direction, 0 = no overlap.
user_based_similarity = 1 - pairwise_distances(ratings, metric='cosine')
item_based_similarity = 1 - pairwise_distances(ratings.T, metric='cosine')

## Top 10 most similar users for user id 5

In [None]:
# Preview the first five rows of the user-user matrix.
pd.DataFrame(user_based_similarity).head()

In [None]:
# Row 4 is used for user id 5 (row position = user id - 1; assumes user ids
# are contiguous starting at 1).
user_5_scores = pd.Series(user_based_similarity[4], name=4)
(
    user_5_scores[user_5_scores > 0]
    .drop(4, errors='ignore')  # a user must never be listed as their own neighbour
    .sort_values(ascending=False)
    .head(10)
)

## Top 10 most similar movies for movie id 6

In [None]:
# Preview the first five rows of the item-item matrix.
pd.DataFrame(item_based_similarity).head()

In [None]:
# Row 5 is used for movie id 6 (row position = item id - 1; assumes item ids
# are contiguous starting at 1).
movie_6_scores = pd.Series(item_based_similarity[5], name=5)
(
    movie_6_scores[movie_6_scores > 0]
    .drop(5, errors='ignore')  # a movie must never be listed as its own neighbour
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
def item_based_pred(rating_mat, sim_mat):
    """Item-based prediction: similarity-weighted sum of each user's ratings.

    Parameters
    ----------
    rating_mat : array of shape (n_users, n_items)
        Ratings matrix.
    sim_mat : array of shape (n_items, n_items)
        Item-item weight matrix.

    Returns
    -------
    array of shape (n_users, n_items)
        ``rating_mat . sim_mat`` with column ``i`` divided by the sum of
        absolute values of row ``i`` of ``sim_mat``.
    """
    weighted_ratings = rating_mat.dot(sim_mat)
    # Row sums laid out as a (1, n_items) row vector so they broadcast across users.
    normaliser = np.abs(sim_mat).sum(axis=1)[np.newaxis, :]
    return weighted_ratings / normaliser

In [None]:
def user_based_pred(rating_mat, sim_mat):
    """User-based prediction: user mean plus weighted mean-centred ratings.

    Parameters
    ----------
    rating_mat : array of shape (n_users, n_items)
        Ratings matrix.
    sim_mat : array of shape (n_users, n_users)
        User-user weight matrix.

    Returns
    -------
    array of shape (n_users, n_items)
        For each user, their row mean plus the ``sim_mat``-weighted sum of
        the other rows' mean-centred ratings, divided by the sum of absolute
        weights in that user's row of ``sim_mat``.
    """
    # The mean is taken over every column of the row, so zero entries count too.
    user_means = rating_mat.mean(axis=1, keepdims=True)
    centred = rating_mat - user_means
    # (n_users, 1) column of normalisers so the division broadcasts across items.
    normaliser = np.abs(sim_mat).sum(axis=1, keepdims=True)
    return user_means + sim_mat.dot(centred) / normaliser

In [None]:
# Full (n_users x n_items) prediction matrices from both memory-based approaches.
item_based_prediction = item_based_pred(
    rating_mat=ratings,
    sim_mat=item_based_similarity,
)
user_based_prediction = user_based_pred(
    rating_mat=ratings,
    sim_mat=user_based_similarity,
)

In [None]:
# Wrap the user-based predictions in a DataFrame for label-based selection below.
user_based = pd.DataFrame(data=user_based_prediction)
user_based.head()

## User-based predictions for movies that user 6 hasn't rated yet

In [None]:
# Row 5 is user id 6 under the notebook's row = id - 1 convention (assumes
# user ids are contiguous starting at 1); row 6 would be user id 7.
# A rating of 0 marks an item the user has not rated.
unrated_by_user = ratings[5] == 0
predictions = user_based.loc[5, unrated_by_user]

In [None]:
# Keep the three highest predicted ratings.
ranked_predictions = predictions.sort_values(ascending=False)
top_values = ranked_predictions.iloc[:3]

In [None]:
# One-column table of the top user-based predictions, indexed by item column.
recommendations = top_values.to_frame(name='Predicted Rating')
print(recommendations)

In [None]:
# Wrap the item-based predictions in a DataFrame for label-based selection below.
item_based = pd.DataFrame(data=item_based_prediction)
item_based.head()

## Item-based predictions for movies that user 6 hasn't rated yet

In [None]:
# Row 5 is user id 6 under the notebook's row = id - 1 convention (assumes
# user ids are contiguous starting at 1); row 6 would be user id 7.
# A rating of 0 marks an item the user has not rated.
unrated_by_user_ = ratings[5] == 0
predictions_ = item_based.loc[5, unrated_by_user_]

In [None]:
# Keep the three highest predicted ratings.
ranked_predictions_ = predictions_.sort_values(ascending=False)
top_values_ = ranked_predictions_.iloc[:3]

In [None]:
# One-column table of the top item-based predictions, indexed by item column.
recommendations_ = top_values_.to_frame(name='Predicted Rating')
# Print the item-based table built here; the original printed `recommendations`,
# i.e. the user-based results from the previous section.
print(recommendations_)

## Calculating Metrics (RMSE):

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
def rmse(pred, actual):
    """Root-mean-squared error over the non-zero entries of ``actual``.

    Only positions where ``actual`` is non-zero are compared; the matching
    entries of ``pred`` and ``actual`` are flattened and passed to
    ``mean_squared_error``, and the square root of that value is returned.
    """
    rated_positions = actual.nonzero()
    predicted_values = pred[rated_positions].flatten()
    observed_values = actual[rated_positions].flatten()
    return sqrt(mean_squared_error(predicted_values, observed_values))

In [None]:
# RMSE of the user-based predictions against the ratings matrix they were built from.
user_based_rmse = rmse(user_based_prediction, ratings)
print('User-based CF RMSE:' + str(user_based_rmse))

In [None]:
# RMSE of the item-based predictions against the ratings matrix they were built from.
item_based_rmse = rmse(item_based_prediction, ratings)
print('Item-based CF RMSE:' + str(item_based_rmse))

## Reference:

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html