<img src="../Pics/MLSb-T.png" width="160">
<br><br>
<center><u><H1>Collaborative Filtering: Model Based</H1></u></center>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Column names for the tab-separated ratings file; they are supplied
# explicitly through `names=` when the file is read.
header = ['user_id', 'movie_id', 'rating', 'timestamp']
df_ratings = pd.read_csv(
    '../Data/ml-100k/u.data',
    sep='\t',
    names=header,
)
df_ratings.head()

In [None]:
# Keep only the rating rows whose user_id equals 2.
user_data = df_ratings.loc[df_ratings['user_id'] == 2]
user_data.head()

In [None]:
# One row of user_data per rating, so the row count is the number of rated movies.
print('User has already rated {0} movies.'.format(len(user_data)))

In [None]:
# Column names for the pipe-separated u.item file: five descriptive fields
# followed by the genre columns.
header = [
    'movie_id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation',
    'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western',
]
# NOTE(review): `names` is not referenced by any cell visible in this notebook.
names = ['movie id','movie title']
# This notebook reads the same file elsewhere with encoding='latin-1'; without
# it pandas decodes as UTF-8 by default, which fails on this file under Python 3.
df_movies = pd.read_csv(
    '../Data/ml-100k/u.item',
    sep='|',
    names=header,
    encoding='latin-1',
)
df_movies.head()

In [None]:
# Read only the first two fields of the movie file (id and title).
cols = ['movie_id', 'title']
movies = pd.read_csv(
    '../Data/ml-100k/u.item',
    sep='|',
    names=cols,
    usecols=range(2),
    encoding='latin-1',
)
# movie_id is the only column the two frames share, so it is the join key.
# The merged rows are ordered from the highest rating to the lowest.
ratings = movies.merge(df_ratings, on='movie_id').sort_values('rating', ascending=False)
ratings.head()

In [None]:
# Reshape the long ratings table into a grid with one row per user_id and one
# column per movie_id; (user, movie) pairs without a rating are filled with 0.
matrix_ = (
    df_ratings
    .pivot(index='user_id', columns='movie_id', values='rating')
    .fillna(0)
)
matrix_.head()

## Matrix Factorization:

In [None]:
# Example: a random 5x5 array with entries drawn uniformly from [0, 5).
# A plain ndarray is used rather than np.matrix, which NumPy no longer
# recommends; svds and np.dot accept ndarrays directly.
myMatrix = np.random.random((5, 5)) * 5
myMatrix

In [None]:
from scipy.sparse.linalg import svds

# Truncated SVD of the example matrix, keeping 4 singular values.
u1, s1, vt1 = svds(myMatrix, k=4)
# svds returns the singular values as a 1-D array; place them on a diagonal
# so the three factors can be multiplied back together.
s_diag = np.diag(s1)
pred = u1.dot(s_diag).dot(vt1)
df_pred = pd.DataFrame(pred)
df_pred

In [None]:
# DataFrame.as_matrix() was removed in pandas 1.0. `.values` yields the same
# NumPy array and is available in both old and current pandas releases.
matrix = matrix_.values

In [None]:
# Get the SVD components of the ratings matrix. k is the number of singular
# values/vectors kept; it must be smaller than both dimensions of the matrix.
u, s, vt = svds(matrix, k=500)
s_diag_matrix = np.diag(s)
# Multiply the factors back together to get a predicted rating for every cell.
all_user_pred_ratings = np.dot(np.dot(u, s_diag_matrix), vt)
# Carry over both axes of the pivot table so rows are labelled with the real
# user_id instead of a 0-based position. Positional access (.iloc) is unaffected.
df_predictions = pd.DataFrame(
    all_user_pred_ratings,
    index=matrix_.index,
    columns=matrix_.columns,
)
df_predictions.head()

## Top 20 movie predictions for a user

In [None]:
# Take the row at position 1 of the prediction table and order its predicted
# ratings from highest to lowest.
# NOTE(review): position 1 is presumably user_id 2 -- this assumes user ids
# are contiguous and start at 1; confirm against the data.
sorted_user_predictions = (
    df_predictions
    .iloc[1]
    .sort_values(ascending=False)
)
sorted_user_predictions.head(20)

## Calculating Metrics (RMSE):

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
def rmse(pred, actual):
    """Return the root-mean-squared error over the observed entries of *actual*.

    Only positions where ``actual`` is non-zero are compared; zeros are
    treated as "no rating" and ignored.

    Args:
        pred: Array-like of predicted values, same shape as ``actual``.
        actual: Array-like of true values, with 0 marking missing entries.

    Returns:
        The RMSE as a Python float.

    Raises:
        ValueError: If the shapes differ or ``actual`` has no non-zero entry.
    """
    pred = np.asarray(pred, dtype=float)
    actual = np.asarray(actual, dtype=float)
    if pred.shape != actual.shape:
        raise ValueError(
            'pred and actual must have the same shape, got {0} and {1}'.format(
                pred.shape, actual.shape
            )
        )

    # Build the mask once and apply it to both arrays so they stay aligned.
    observed = actual != 0
    if not observed.any():
        raise ValueError('actual has no non-zero entries to evaluate against')

    errors = pred[observed] - actual[observed]
    return float(np.sqrt(np.mean(errors ** 2)))

In [None]:
# Rebuild the full predicted-ratings array from the three SVD factors.
pred_ratings = u.dot(s_diag_matrix).dot(vt)

In [None]:
# Use the print() call form: the statement form is a SyntaxError on Python 3.
# The label names what is actually computed here: the RMSE of the SVD
# (model-based) predictions, not a user-based MSE.
print('Model-based CF RMSE: ' + str(rmse(pred_ratings, matrix)))

## Sparsity level:

### Sparsity is the percentage of cells in the user-item rating matrix that are not populated, i.e. (user, movie) pairs that have no rating.

In [None]:
# Number of distinct users and distinct movies appearing in the ratings table.
n_users = len(df_ratings['user_id'].unique())
n_items = len(df_ratings['movie_id'].unique())

In [None]:
# Fraction of the n_users x n_items grid that holds no rating, rounded to
# three decimals.
sparsity = round(1.0 - len(df_ratings) / float(n_users * n_items), 3)
# Format with one fixed decimal: multiplying the rounded fraction by 100 and
# passing it through str() can expose floating-point noise in the output.
print('The sparsity level is:{:.1f}%'.format(sparsity * 100))

## References:

https://docs.scipy.org/doc/scipy-0.19.1/reference/generated/scipy.sparse.linalg.svds.html

https://grouplens.org/datasets/movielens/latest/