#### EDA

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the ratings file, keeping both ID columns as strings so they are
# treated as labels rather than numbers.
ratings = pd.read_csv(
    'ml-latest-small/ratings.csv',
    encoding="latin-1",
    dtype={'userId': str, 'movieId': str},
)
# Give the time column its final name first, then convert it from Unix epoch
# seconds to pandas datetimes.
ratings = ratings.rename(columns={'timestamp': 'rating_timestamp'})
ratings['rating_timestamp'] = pd.to_datetime(ratings['rating_timestamp'], unit='s')
ratings.head()

Unnamed: 0,userId,movieId,rating,rating_timestamp
0,1,1,4.0,2000-07-30 18:45:03
1,1,3,4.0,2000-07-30 18:20:47
2,1,6,4.0,2000-07-30 18:37:04
3,1,47,5.0,2000-07-30 19:03:35
4,1,50,5.0,2000-07-30 18:48:51


In [3]:
# Disabled alternative to the load cell above: the same steps, but reading
# 'ratings.csv' from the working directory instead of 'ml-latest-small/'.
# ratings = pd.read_csv('ratings.csv', encoding="latin-1",
#                      dtype={'userId': str, 'movieId': str})
# ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
# ratings.rename(columns={'timestamp': 'rating_timestamp'}, inplace=True)

In [4]:
# Summarise the ratings frame: column dtypes, non-null counts and memory use
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
userId              100836 non-null object
movieId             100836 non-null object
rating              100836 non-null float64
rating_timestamp    100836 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 3.1+ MB


In [5]:
# Check for any missing values: yields one boolean per column, True when that
# column contains at least one null entry
ratings.isnull().any()

userId              False
movieId             False
rating              False
rating_timestamp    False
dtype: bool

In [6]:
# Count the distinct users that appear in the ratings
num_users = ratings['userId'].nunique()
print(num_users)

610


In [7]:
# Count the distinct movies that received at least one rating
num_movies = ratings['movieId'].nunique()
print(num_movies)

9724


In [8]:
# Total number of rating rows
ratings.shape[0]

100836

In [9]:
# Number of most-active users and most-rated movies to keep when building the
# user-by-movie matrix displayed further down
num_top_entries = 15

In [10]:
# Count ratings per user, then keep the num_top_entries most active raters
ratings_per_user = ratings.groupby('userId')['rating'].count()
top_users = (
    ratings_per_user
    .to_frame('num_movies_rated_by_user')
    .sort_values(by='num_movies_rated_by_user', ascending=False)
    .iloc[:num_top_entries]
)
top_users

Unnamed: 0_level_0,num_movies_rated_by_user
userId,Unnamed: 1_level_1
414,2698
599,2478
474,2108
448,1864
274,1346
610,1302
68,1260
380,1218
606,1115
288,1055


In [11]:
# Count ratings per movie, then keep the num_top_entries most-rated titles
ratings_per_movie = ratings.groupby('movieId')['rating'].count()
top_movies = (
    ratings_per_movie
    .to_frame('num_users_rated_for_movie')
    .sort_values(by='num_users_rated_for_movie', ascending=False)
    .iloc[:num_top_entries]
)
top_movies

Unnamed: 0_level_0,num_users_rated_for_movie
movieId,Unnamed: 1_level_1
356,329
318,317
296,307
593,279
2571,278
260,251
480,238
110,237
589,224
527,220


**TODO: this paragraph closely follows an external source and still needs to be paraphrased.**  
Most recommendation models are built on a user-by-item matrix in which each cell holds some kind of “interaction” value. When the cells contain the numerical ratings that users give to items, the model is called an _explicit feedback_ model. Alternatively, the cells may hold _implicit feedback_: user actions that signal a positive or negative preference for a given item (such as viewing the item online). These two scenarios often must be treated differently.

In [12]:
# Build a user-by-movie rating matrix restricted to the most active users and
# the most frequently rated movies.
# The two inner joins act as filters: only ratings whose userId is in
# top_users AND whose movieId is in top_movies survive.
top_users_movies = (
    ratings
    .join(top_users, on='userId', how='inner')
    .join(top_movies, on='movieId', how='inner')
)
# Name the aggregation as a string: pandas 2.1+ emits a FutureWarning when it
# is handed the NumPy callable np.sum and recommends the 'sum' alias, which
# gives the same result.
# NOTE(review): presumably each (userId, movieId) pair occurs only once, so the
# "sum" is simply that single rating -- confirm there are no duplicate pairs.
pd.crosstab(index=top_users_movies.userId,
            columns=top_users_movies.movieId,
            values=top_users_movies.rating,
            aggfunc='sum')

movieId,1,110,1196,2571,260,2858,2959,296,318,356,480,50,527,589,593
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
182,4.0,3.5,3.0,5.0,3.5,5.0,5.0,5.0,4.5,5.0,3.5,4.5,4.0,2.0,4.5
249,4.0,5.0,5.0,5.0,5.0,4.5,5.0,4.0,4.5,4.5,4.0,4.0,4.5,4.0,4.0
274,4.0,4.5,4.5,4.0,3.0,5.0,5.0,5.0,4.5,4.5,3.5,4.0,4.0,4.5,4.0
288,4.5,5.0,4.5,3.0,5.0,,3.5,5.0,5.0,5.0,2.0,,5.0,4.0,5.0
307,4.0,3.5,3.0,3.5,3.5,4.0,4.0,4.5,4.5,4.0,3.5,4.5,4.5,2.5,4.5
380,5.0,4.0,5.0,4.5,5.0,,4.0,5.0,3.0,5.0,5.0,4.0,,5.0,5.0
387,,3.5,4.5,4.0,4.5,4.5,4.5,5.0,3.5,4.0,3.0,4.5,,3.5,4.0
414,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,4.0,5.0,4.0
448,5.0,,5.0,2.0,5.0,4.0,4.0,5.0,,3.0,3.0,4.0,,3.0,5.0
474,4.0,3.0,5.0,4.5,4.0,3.5,4.0,4.0,5.0,3.0,4.5,4.0,5.0,4.0,4.5


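The matrix above is nearly full only because it is restricted to the most active users and the most frequently rated movies. Over the whole data set (610 users × 9,724 movies with 100,836 ratings, i.e. roughly 1.7% of the cells filled) the user-by-movie matrix is very sparse, especially for users who have rated few movies and for movies that have received few ratings.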

In [13]:
# 10 movies that were rated the most often

In [14]:
# 10 movies with highest average ratings

In [15]:
# 10 users who give the lowest average ratings

In [16]:
# 10 users who give the highest average ratings

In [17]:
# count ratings by year, show distribution

In [18]:
# Load the movie catalogue; movieId is read as a string, the same type used
# for movieId in the ratings frame.
movies = pd.read_csv(
    'ml-latest-small/movies.csv',
    dtype={'movieId': str},
    encoding="latin-1",
)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [19]:
# Summarise the movies frame: column dtypes, non-null counts and memory use
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
movieId    9742 non-null object
title      9742 non-null object
genres     9742 non-null object
dtypes: object(3)
memory usage: 228.5+ KB


In [20]:
# encode the genres column

In [21]:
# Read the free-text tags users attached to movies, with IDs kept as strings.
tags = pd.read_csv(
    'ml-latest-small/tags.csv',
    encoding="latin-1",
    dtype={'userId': str, 'movieId': str},
)
# Rename the time column first, then convert it from Unix epoch seconds to
# pandas datetimes.
tags = tags.rename(columns={'timestamp': 'tags_timestamp'})
tags['tags_timestamp'] = pd.to_datetime(tags['tags_timestamp'], unit='s')
tags.head()

Unnamed: 0,userId,movieId,tag,tags_timestamp
0,2,60756,funny,2015-10-24 19:29:54
1,2,60756,Highly quotable,2015-10-24 19:29:56
2,2,60756,will ferrell,2015-10-24 19:29:52
3,2,89774,Boxing story,2015-10-24 19:33:27
4,2,89774,MMA,2015-10-24 19:33:20


In [22]:
# Summarise the tags frame: column dtypes, non-null counts and memory use
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
userId            3683 non-null object
movieId           3683 non-null object
tag               3683 non-null object
tags_timestamp    3683 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 115.2+ KB


In [23]:
# Load the table that maps each movieId to its IMDb and TMDb identifiers;
# all three ID columns are read as strings.
links = pd.read_csv(
    'ml-latest-small/links.csv',
    dtype={'movieId': str, 'imdbId': str, 'tmdbId': str},
    encoding="latin-1",
)
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [24]:
# Summarise the links frame. In the recorded output tmdbId is the only column
# with missing values (9734 non-null out of 9742 rows).
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
movieId    9742 non-null object
imdbId     9742 non-null object
tmdbId     9734 non-null object
dtypes: object(3)
memory usage: 228.5+ KB


### Collaborative Filtering

#### Overview

![overview map](Collaborative_Filtering_in_Recommender_Systems.jpg)

**Note:** The "neighborhood-based" approach is also called the "memory-based" approach as in the following diagram. 

![memory vs model based diagram](memory_vs_model_based_diagram.png)

#### Memory-based approach consists of user-based and item-based CF.

* This approach is non-parametric (i.e., no parameters to be learned). 
* Algorithms to be used: cosine similarity, Pearson correlation coefficients, KNN

#### Install the `surprise` package: run `pip install surprise` (or `pip install scikit-surprise`, the project's official PyPI name) in a terminal such as the Anaconda PowerShell Prompt

In [25]:
from surprise import Dataset, Reader

# Tell Surprise the rating range, taken from the smallest and largest ratings
# observed in the frame.
lowest_rating = ratings['rating'].min()
highest_rating = ratings['rating'].max()
reader = Reader(rating_scale=(lowest_rating, highest_rating))

# load_from_df expects exactly three columns, in the order user, item, rating.
data = Dataset.load_from_df(ratings.loc[:, ['userId', 'movieId', 'rating']], reader)

In [26]:
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore
from surprise.model_selection import KFold, cross_validate

# Baseline neighbourhood model with Surprise's default settings.
algo = KNNBasic()

# Use an explicitly seeded splitter instead of cv=5: with a bare integer the
# folds are reshuffled differently on every run, so the reported RMSE/MAE can
# neither be reproduced nor compared on equal splits with other models.
folds = KFold(n_splits=5, random_state=42, shuffle=True)

# Run 5-fold cross-validation; verbose=True prints the per-fold summary table
# and the call returns a dict of per-fold scores and timings.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=folds, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9545  0.9381  0.9573  0.9436  0.9505  0.9488  0.0071  
MAE (testset)     0.7310  0.7196  0.7332  0.7218  0.7268  0.7265  0.0052  
Fit time          0.22    0.24    0.26    0.63    0.33    0.34    0.15    
Test time         1.88    1.76    1.87    4.04    3.80    2.67    1.02    


{'test_rmse': array([0.95450433, 0.93809886, 0.95730252, 0.94360524, 0.95049599]),
 'test_mae': array([0.73100664, 0.71957303, 0.73317775, 0.72180869, 0.72683728]),
 'fit_time': (0.22396254539489746,
  0.24393701553344727,
  0.2599318027496338,
  0.6281595230102539,
  0.3282637596130371),
 'test_time': (1.8798508644104004,
  1.756166696548462,
  1.8721559047698975,
  4.036506414413452,
  3.795743465423584)}

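* `RMSE` (Root Mean Squared Error): the square root of the average squared difference between predicted and actual ratings. It tells how far off our predictions typically are from the real ratings, penalising large errors more heavily. The lower the better.
* `MAE` (Mean Absolute Error): the average absolute difference between predicted and actual ratings. The lower the better.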

In [38]:
# Disabled hyper-parameter search for KNNBasic: a 3-fold grid search over the
# similarity measure (msd / cosine), min_support (3, 4, 5) and user- vs
# item-based neighbourhoods, printing the best RMSE and its parameters.
# from surprise import KNNBasic, KNNWithMeans, KNNWithZScore
# from surprise.model_selection import GridSearchCV

# sim_options = {
#     "name": ["msd", "cosine"],
#     "min_support": [3, 4, 5],
#     "user_based": [False, True],
# }

# param_grid = {"sim_options": sim_options}

# gs = GridSearchCV(KNNBasic, param_grid, measures=["rmse", "mae"], cv=3)
# gs.fit(data)

# print(gs.best_score["rmse"])
# print(gs.best_params["rmse"])

#### Model-based approach

* One commonly used technique is called matrix factorization, which basically means decomposing the ratings matrix into two matrices with lower dimensions. These two matrices are called user and item embedding matrices. 
* A couple of ways to implement matrix factorization: singular value decomposition (SVD), non-negative matrix factorization (NMF)

![matrix factorization diagram](matrix_factorization_diagram.png)

In [27]:
from surprise import Dataset, Reader

# Rebuilds the same reader and dataset objects as the memory-based section
# earlier in this notebook.
# The rating scale is the (min, max) of the ratings observed in the frame.
rating_scale = (ratings.rating.min(), ratings.rating.max())
reader = Reader(rating_scale=rating_scale)

# Surprise needs the columns in the order user id, item id, rating.
surprise_columns = ['userId', 'movieId', 'rating']
data = Dataset.load_from_df(ratings[surprise_columns], reader)

In [33]:
from surprise import SVD, SVDpp, NMF
from surprise.model_selection import KFold, cross_validate

# Seed the factor initialisation: SVD starts from random factors, so without
# random_state the scores change from run to run.
algo = SVD(random_state=42)

# Seed the fold assignment as well (cv=5 alone reshuffles on every run), so
# the reported numbers are reproducible.
folds = KFold(n_splits=5, random_state=42, shuffle=True)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=folds, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8745  0.8696  0.8735  0.8759  0.8746  0.8736  0.0022  
MAE (testset)     0.6724  0.6695  0.6706  0.6751  0.6705  0.6716  0.0020  
Fit time          8.60    8.33    7.85    8.88    8.49    8.43    0.34    
Test time         0.31    0.18    0.19    0.16    0.19    0.21    0.05    


{'test_rmse': array([0.87451302, 0.86955334, 0.87349499, 0.8759069 , 0.87461115]),
 'test_mae': array([0.67241151, 0.66946217, 0.67062037, 0.67509422, 0.67049785]),
 'fit_time': (8.598172187805176,
  8.334353923797607,
  7.853757381439209,
  8.882915019989014,
  8.494570970535278),
 'test_time': (0.3128211498260498,
  0.18091535568237305,
  0.1868915557861328,
  0.15892934799194336,
  0.19189023971557617)}

In [None]:
from surprise.model_selection import KFold

# Seed the factor initialisation so repeated runs give the same scores.
algo = SVDpp(random_state=42)

# Seed the fold assignment too; a bare cv=5 reshuffles the folds on every run.
folds = KFold(n_splits=5, random_state=42, shuffle=True)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=folds, verbose=True)

In [34]:
from surprise.model_selection import KFold

# Seed the factor initialisation so repeated runs give the same scores.
algo = NMF(random_state=42)

# Seed the fold assignment too; a bare cv=5 reshuffles the folds on every run.
folds = KFold(n_splits=5, random_state=42, shuffle=True)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=folds, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9268  0.9200  0.9174  0.9248  0.9272  0.9232  0.0039  
MAE (testset)     0.7108  0.7030  0.7036  0.7089  0.7108  0.7074  0.0034  
Fit time          10.64   10.49   9.42    9.49    8.62    9.73    0.75    
Test time         0.22    0.26    0.16    0.14    0.18    0.19    0.04    


{'test_rmse': array([0.92683124, 0.91999016, 0.91737801, 0.92475923, 0.92724557]),
 'test_mae': array([0.710831  , 0.70297678, 0.70361805, 0.70885404, 0.71077695]),
 'fit_time': (10.638836145401001,
  10.486591339111328,
  9.415008068084717,
  9.49497938156128,
  8.616158962249756),
 'test_time': (0.21887469291687012,
  0.25686168670654297,
  0.1628892421722412,
  0.13991928100585938,
  0.17589926719665527)}

In [None]:
# Disabled imports listing the Surprise algorithm families (KNN variants,
# matrix factorisation, SlopeOne, CoClustering) and GridSearchCV.
# NOTE(review): presumably intended for the comparison section that follows -- confirm.
# from surprise.model_selection import GridSearchCV
# from surprise import KNNBasic, KNNWithMeans, KNNWithZScore
# from surprise import SVD, SVDpp, NMF
# from surprise import SlopeOne, CoClustering

#### Comparisons