# Movie Lens Collaborative Filtering Movie-Recommendation System

## 1. Getting Data

### Data Source
[Movie Lens 100K](https://grouplens.org/datasets/movielens/100k/): [description](http://files.grouplens.org/datasets/movielens/ml-100k-README.txt)

### Data Preliminary

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np

In [None]:
# Load the ratings file: tab-separated, one rating per line, no header row
# (see the dataset README linked above).
fname = 'data/ml-100k/u.data'
data = pd.read_csv(
    fname,
    sep='\t',
    # header=0 would treat the first rating as a header line and discard it.
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

In [None]:
# Preview the first rows of the ratings table.
data.head()

In [None]:
# Show the dtype pandas assigned to each ratings column.
data.dtypes

#### Checking for Missing and Duplicate Data

In [None]:
# Draw the null mask of the ratings table as a heatmap so that any missing
# cell shows up as a contrasting stripe.
plt.figure(figsize=(20, 10))
null_mask = data.isnull()
sns.heatmap(
    null_mask,
    yticklabels=False,
    cbar=False,
    cmap='viridis',
)
plt.ylabel('missing data')

In [None]:
# Columns whose combination should be unique; edit this list as needed.
l_dup_check = ['user_id', 'item_id', 'rating']

# keep=False flags every member of a duplicate group, not only the repeats.
data_dup = data[data.duplicated(subset=l_dup_check, keep=False)]
print( f' Found { len ( data_dup )} duplicated records.')

# A bare expression nested inside an `if` is not echoed as cell output, so
# the preview has to be printed explicitly to be visible.
if not data_dup.empty:
    print(data_dup.head(10))

In [None]:
# Distinct user ids and movie ids that appear in the ratings table.
u_users = data['user_id'].unique()
u_movies = data['item_id'].unique()

print(f'There are {u_users.size} users.')
print(f'There are {u_movies.size} movies.')

#### Grouping and Summarizing
* some [references here](https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/)
* get **User Rating Count Distribution**
* get **Movie Rating Count Distribution**

In [None]:
# Nested-dict renaming inside .agg() ({'col': {'new_name': 'func'}}) was
# removed in pandas 1.0 and raises SpecificationError there. Aggregate with
# lists instead and rename the inner column level, which yields the same
# two-level labels, e.g. ('item_id', 'movie_count') and ('rating', 'avg_rating').

# Per user: number of movies rated and mean rating given.
agg_user = (
    data.groupby('user_id')
        .agg({'item_id': ['count'], 'rating': ['mean']})
        .rename(columns={'count': 'movie_count', 'mean': 'avg_rating'}, level=1)
)

# Per movie: number of users who rated it and mean rating received.
agg_movie = (
    data.groupby('item_id')
        .agg({'user_id': ['count'], 'rating': ['mean']})
        .rename(columns={'count': 'user_count', 'mean': 'avg_rating'}, level=1)
)

In [None]:
# Histogram of the number of movies each user has rated.
plt.figure(figsize=(15, 8))
sns.set(style='darkgrid', context='talk')
user_review_counts = agg_user[('item_id', 'movie_count')]
g = sns.distplot(user_review_counts, kde=False, bins=50)
g.set_title("User's Review Count Distribution")

In [None]:
# Histogram of the number of ratings each movie has received.
plt.figure(figsize=(15, 8))
sns.set(style='darkgrid', context='talk')
movie_review_counts = agg_movie[('user_id', 'user_count')]
g = sns.distplot(movie_review_counts, kde=False, bins=100)
g.set_title("Movie's Reviews Count Distribution")

In [None]:
# Histogram of each movie's mean rating.
plt.figure(figsize=(15, 8))
sns.set(style='darkgrid', context='talk')
movie_avg_ratings = agg_movie[('rating', 'avg_rating')]
g = sns.distplot(movie_avg_ratings, kde=False, bins=50)
g.set_title("Movie's Rating Distribution")

#### Seems like we should cut out movies with fewer than 50 reviews
Note that, according to the dataset description, every user has rated at least 20 movies.

In [None]:
# Same review-count histogram, restricted to movies with strictly more than
# n_count ratings.
n_count = 10
plt.figure(figsize=(15, 8))
sns.set(style='darkgrid', context='talk')
movie_review_counts = agg_movie[('user_id', 'user_count')]
g = sns.distplot(
    movie_review_counts[movie_review_counts > n_count],
    kde=False,
    bins=100,
)
g.set_title(f"Movie's Reviews Count Distribution with cutoff at {n_count} reviews")


## 2. Transpose Data into Matrix

For how to select a column such as `('user_id', 'user_count')` from a DataFrame with MultiIndex columns, see [here](https://jakevdp.github.io/PythonDataScienceHandbook/03.05-hierarchical-indexing.html).

In [None]:
# Ids of the movies that have strictly more than n_count ratings.
n_count = 50
has_enough_reviews = agg_movie[('user_id', 'user_count')] > n_count
filtered_item_id = agg_movie.loc[has_enough_reviews].index

In [None]:
# Flag the ratings whose movie passed the review-count filter. `isin` tests
# membership for the whole column at once; the previous per-row lambda
# rebuilt a list of the kept ids and scanned it for every single rating.
data['keep_item'] = data['item_id'].isin(filtered_item_id)

# Keep only the flagged ratings and the three columns the pivot needs.
raw = data.loc[data['keep_item'], ['user_id', 'item_id', 'rating']]

In [None]:
# Reshape the long ratings table into a user x item matrix: one row per
# user_id, one column per item_id, cells hold the rating (NaN where the
# user has not rated the movie).
ui_matrix = raw.pivot_table(
    values='rating',
    index='user_id',
    columns=['item_id'],
)
print(f'Filtered Dataset is of the shape (# of User, # of Movies): {ui_matrix.shape}')

## 3. Finding Distance between Users

#### Let's get two random users

In [None]:
# Draw two user ids from the matrix; the fixed seed keeps the pick reproducible.
random = ui_matrix.sample(n=2, random_state=420).index.values
user1, user2 = random
print(f'User1 id: {user1} and User2 id:{user2}')

#### We will use hamming distance
* [reference from scipy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.hamming.html#scipy.spatial.distance.hamming); the mathematical definition is [here](https://en.wikipedia.org/wiki/Hamming_distance)

In [None]:
from scipy.spatial.distance import hamming

def ham_distance( userid_1, userid_2, user_item_matrix):
    """Return the Hamming distance between two users' rating rows.

    The distance is the fraction of item columns in which the two rows
    differ. A missing rating (NaN) never compares equal to anything, so an
    item counts as "different" unless both users rated it and gave the same
    rating.

    Parameters
    ----------
    userid_1, userid_2 : index labels of the two users in `user_item_matrix`.
    user_item_matrix : DataFrame with one row per user and one column per item.

    Returns
    -------
    float in [0, 1], or NaN when either user id is not in the matrix or the
    rows cannot be compared as 1-D vectors.
    """
    try:
        # Look the rows up in the matrix that was passed in (not a global),
        # and as 1-D rows: `hamming` rejects 2-D input in current SciPy.
        u1 = user_item_matrix.loc[userid_1]
        u2 = user_item_matrix.loc[userid_2]
        return hamming(u1.values, u2.values)
    except (KeyError, ValueError):
        # KeyError: unknown user id. ValueError: rows are not comparable
        # 1-D vectors (e.g. a duplicated index label returned several rows).
        return np.nan

In [None]:
# Hamming distance between the two sampled users.
ham_distance(user1, user2, ui_matrix)

## 4. Get Recommendations
* find closest neighbors
* get average movies' rating of neighbors
* recommend top `N` movies by ranking

In [None]:
def GetNeighbors(ui_mat, user_id, knearest = 10):
    """Return the `knearest` users closest to `user_id` by Hamming distance.

    Parameters
    ----------
    ui_mat : user x item rating DataFrame, one row per user.
    user_id : index label of the user to find neighbours for.
    knearest : maximum number of neighbours to return.

    Returns
    -------
    DataFrame with columns 'user_id' and 'distance', sorted by ascending
    distance. It has fewer than `knearest` rows when the matrix holds
    fewer other users.
    """
    # Name the column explicitly so this works whether or not the matrix
    # index is itself named 'user_id'.
    candidates = pd.DataFrame({'user_id': ui_mat.index})
    # .copy() so the column assignment below writes to an independent frame
    # instead of a filtered slice (avoids SettingWithCopyWarning).
    candidates = candidates[candidates['user_id'] != user_id].copy()
    candidates['distance'] = candidates['user_id'].apply(
        lambda other_id: ham_distance(user_id, other_id, ui_mat)
    )

    # Hamming distance is the fraction of positions that differ, so smaller
    # means more similar: sort ascending. A stable sort keeps tied users in
    # matrix order, which makes the selected neighbours reproducible.
    neighbors = candidates.sort_values('distance', ascending=True, kind='mergesort')
    return neighbors.head(knearest)

In [None]:
# Nearest neighbours of user1, using the default knearest.
GetNeighbors( ui_matrix, user1)

In [None]:
def GetItemSuggest(ui_mat, user_id, N = 5, knearest = 10):
    """Suggest up to `N` items the user has not rated yet.

    Items are ranked by the mean rating given by the user's nearest
    neighbours. Items that none of the neighbours rated have no mean and
    are never suggested.

    Parameters
    ----------
    ui_mat : user x item rating DataFrame, one row per user.
    user_id : index label of the user to make suggestions for.
    N : maximum number of item ids to return.
    knearest : number of neighbours whose ratings are averaged.

    Returns
    -------
    list of item ids (column labels of `ui_mat`), best first.

    Raises
    ------
    KeyError if `user_id` is not in `ui_mat`.
    """
    neighbor_ids = GetNeighbors(ui_mat, user_id, knearest)['user_id']
    neighbor_ratings = ui_mat.loc[ui_mat.index.isin(neighbor_ids)]

    # Column-wise mean that skips NaN. Dropping the all-NaN columns keeps
    # unrated items out of the ranking (they would otherwise pad the tail).
    avg_rating = neighbor_ratings.mean(axis=0).dropna()

    # Items the user already rated: read the row directly rather than
    # transposing the whole matrix.
    seen_items = ui_mat.loc[user_id].dropna().index
    unseen = avg_rating[~avg_rating.index.isin(seen_items)]

    # Stable sort keeps tied items in column order, so results are reproducible.
    ranked = unseen.sort_values(ascending=False, kind='mergesort')
    return list(ranked.index[:N])
    

In [None]:
# Up to 10 suggested item ids for user1.
GetItemSuggest( ui_matrix, user1, 10)

## 5. Evaluation
#### Get Data about Items and Users to see if the suggestions make sense

In [None]:
# Load the movie catalogue: pipe-separated, Latin-1 encoded, no header row.
# Columns are the movie metadata followed by one 0/1 flag per genre
# (see the dataset README linked at the top of the notebook).
fname = 'data/ml-100k/u.item'
idata = pd.read_csv(
    fname,
    sep='|',
    # header=0 would treat the first movie as a header line and discard it.
    header=None,
    names=[
        'item_id', 'movie_title', 'release_date', 'video_release_date',
        'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ],
    encoding='iso-8859-1',
)

#### What do we suggest?

In [None]:
# Suggest up to ten movies for user2 and look up their titles.
# The isin filter lists rows in idata's own order, not in suggestion rank.
Movies = GetItemSuggest(ui_matrix, user2, N=10)
is_suggested = idata['item_id'].isin(Movies)
idata.loc[is_suggested, ['item_id', 'movie_title']]

#### What does this user currently like?

In [None]:
def GetUserMovieList(ui_mat, user_id, df_item):
    """Return the id and title of every movie `user_id` has rated.

    `ui_mat` is the user x item rating matrix. `df_item` is the movie
    catalogue and must provide 'item_id' and 'movie_title' columns.
    """
    user_row = ui_mat.loc[user_id]
    rated_ids = user_row.dropna().index
    was_rated = df_item['item_id'].isin(rated_ids)
    return df_item.loc[was_rated, ['item_id', 'movie_title']]

In [None]:
# Movies user2 has already rated, with their titles.
GetUserMovieList(ui_matrix, user2, idata)

#### Some Basic Info about this User

In [None]:
# Load the user table: pipe-separated, no header row
# (see the dataset README linked at the top of the notebook).
fname = 'data/ml-100k/u.user'
udata = pd.read_csv(
    fname,
    sep='|',
    # header=0 would treat the first user as a header line and discard it.
    header=None,
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
    encoding='iso-8859-1',
)

In [None]:
# Row of the user table (age, gender, occupation, zip code) for user2.
udata[udata.user_id == user2]