In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

In [2]:
movie_df = pd.read_csv('movieLens_dataset\movies.csv', encoding='latin-1')
user_df = pd.read_csv('movieLens_dataset\users.csv', encoding='latin-1')
rating_df = pd.read_csv('movieLens_dataset\ratings.csv',  encoding='latin-1')


In [3]:
rating_df = rating_df[['user_id', 'movie_id', 'rating']]

In [4]:
user_df = user_df [['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc']]

In [5]:
movie_df = movie_df.drop(columns = 'column2')

In [6]:
rating_df.shape, user_df.shape, movie_df.shape

((1000209, 3), (6040, 5), (3883, 3))

### Method 1. Content based 
The Content-Based Recommender relies on the similarity of the items being recommended. The basic idea is that if you like an item, then you will also like a “similar” item. It generally works well when it's easy to determine the context/properties of each item.

A content based recommender works with data that the user provides, either explicitly movie ratings for the MovieLens dataset. Based on that data, a user profile is generated, which is then used to make suggestions to the user. As the user provides more inputs or takes actions on the recommendations, the engine becomes more and more accurate.

In [7]:
movie_df.dtypes

movie_id     int64
title       object
genres      object
dtype: object

In [8]:
# Break up the big genre string into a string array
movie_df['genres'] = movie_df['genres'].str.split('|')

In [9]:
# Convert genres to string value
movie_df['genres'] = movie_df['genres'].fillna("").astype('str')

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movie_df['genres'])
tfidf_matrix.shape

(3883, 127)

In [11]:
movie_df.tail(10)

Unnamed: 0,movie_id,title,genres
3873,3943,Bamboozled (2000),['Comedy']
3874,3944,Bootmen (2000),"['Comedy', 'Drama']"
3875,3945,Digimon: The Movie (2000),"['Adventure', 'Animation', ""Children's""]"
3876,3946,Get Carter (2000),"['Action', 'Drama', 'Thriller']"
3877,3947,Get Carter (1971),['Thriller']
3878,3948,Meet the Parents (2000),['Comedy']
3879,3949,Requiem for a Dream (2000),['Drama']
3880,3950,Tigerland (2000),['Drama']
3881,3951,Two Family House (2000),['Drama']
3882,3952,"Contender, The (2000)","['Drama', 'Thriller']"


In [12]:
print(tfidf_matrix[0])

  (0, 29)	0.49792764065786643
  (0, 37)	0.40421250853461244
  (0, 45)	0.23523321955508567
  (0, 30)	0.521820712785304
  (0, 38)	0.5109293383032849


In [13]:
from sklearn.metrics.pairwise import cosine_similarity
cos = cosine_similarity(tfidf_matrix)

In [14]:
cos[:4, :4]

array([[1.        , 0.14193614, 0.09010857, 0.1056164 ],
       [0.14193614, 1.        , 0.        , 0.        ],
       [0.09010857, 0.        , 1.        , 0.1719888 ],
       [0.1056164 , 0.        , 0.1719888 , 1.        ]])

In [15]:
# Build a 1-dimensional array with movie titles
titles = movie_df['title']
indices = pd.Series(movie_df.index, index=movie_df['title'])

In [16]:
def genre_recommendations(title, top_n):
    idx = indices[title]
    sim_scores = list(enumerate(cos[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores_top = sim_scores[1:top_n]
    movie_indices = [i[0] for i in sim_scores_top]
    return titles.iloc[movie_indices]

In [17]:
genre_recommendations('Bamboozled (2000)', 10)

18                Ace Ventura: When Nature Calls (1995)
37                                  It Takes Two (1995)
51                              Mighty Aphrodite (1995)
62    Don't Be a Menace to South Central While Drink...
64                                      Bio-Dome (1996)
68                                        Friday (1995)
87                                   Black Sheep (1996)
94                        In the Bleak Midwinter (1995)
99                                 Bottle Rocket (1996)
Name: title, dtype: object

#### Conclusion:
* Use 2 ward gram TF-IDF to vactorize each movie type(genres), shape -> (number of movie, number of total types)
* Use cosine similarity to calculate the distance between each movie based on its genres, shape -> (number of movie, number of movie) -> each row/column represents a movie, the intersection of row and column is the similar score for two movies.
* Use a function to rank all similarity scores and pick the top n score, which are the most similar movies to the given one.

### Model 2. Collaborative Filtering - Memory & User Based

In [18]:
rating_df.count()

user_id     1000209
movie_id    1000209
rating      1000209
dtype: int64

In [23]:
rating_df.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [24]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(rating_df[:2000], test_size=0.2)

In [25]:
# Create two user-item matrices, one for training and another for testing
train_data_matrix = train_data.as_matrix(columns = ['user_id', 'movie_id', 'rating'])
test_data_matrix = test_data.as_matrix(columns = ['user_id', 'movie_id', 'rating'])

In [26]:
from sklearn.metrics import pairwise_distances

# User Similarity Matrix
user_correlation = 1 - pairwise_distances(train_data, metric='correlation')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation[:4, :4])

[[1.         0.99998629 0.99999732 0.99999774]
 [0.99998629 1.         0.99997147 0.99997288]
 [0.99999732 0.99997147 1.         0.99999998]
 [0.99999774 0.99997288 0.99999998 1.        ]]


In [28]:
user_correlation.shape

(1600, 1600)

#### Conclusion:
Pros:

* Easy to implement
* Produce reasonable prediction quality

Cons:

* Doesn't address well-known cold-start problem, when new users or new item enters the system.
* It can't deal with sparse data, meaning it's hard to find users that have reted the same items. 
* It tend to recommend popular items.
* Large computation power needed.
* Easy to overfit.

### Method 3. Model Based Collaborative Filtering - Matrix Factorization
* Unsupervised learning method for latent variable decomposition and dimensionality reducion.
* Model learns to factorize rating matrix into user and movie representations (Rating matrix = User Matrix/(n_of_user * latent) * Item Matrix(latent * n_of_item)

In [30]:
#Now I want the format of my ratings matrix to be one row per user and one column per movie. 
#To do so, I'll pivot ratings to get that and call the new variable Ratings (with a capital *R).

rating_pivot = rating_df.pivot(index = 'user_id', columns = 'movie_id', values = 'rating').fillna(0)

In [33]:
rating_pivot.shape

(6040, 3706)

In [42]:
#Last but not least, I need to de-normalize the data (normalize by each users mean) 
#and convert it from a dataframe to a numpy array.

rating_pivot_matrix = rating_pivot.as_matrix()
user_rating_mean = np.mean(rating_pivot_matrix, axis = 1)
rating_normal = rating_pivot_matrix - user_rating_mean.reshape(-1,1)

#### SVD

In [44]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(rating_normal, k = 50)

In [47]:
U.shape, sigma.shape, Vt.shape

((6040, 50), (50,), (50, 3706))

In [50]:
sigma = np.diag(sigma)

#### Making Predictions from the Decomposed Matrices

In [54]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_rating_mean.reshape(-1, 1)
#preds = pd.DataFrame(all_user_predicted_ratings, columns = rating_pivot_matrix.columns)

MemoryError: 

In [61]:
prediction = pd.DataFrame(all_user_predicted_ratings, columns = rating_pivot.columns)

In [62]:
prediction.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.288861,0.143055,-0.19508,-0.018843,0.012232,-0.176604,-0.07412,0.141358,-0.059553,-0.19595,...,0.027807,0.00164,0.026395,-0.022024,-0.085415,0.403529,0.105579,0.031912,0.05045,0.08891
1,0.744716,0.169659,0.335418,0.000758,0.022475,1.35305,0.051426,0.071258,0.161601,1.567246,...,-0.056502,-0.013733,-0.01058,0.062576,-0.016248,0.15579,-0.418737,-0.101102,-0.054098,-0.140188
2,1.818824,0.456136,0.090978,-0.043037,-0.025694,-0.158617,-0.131778,0.098977,0.030551,0.73547,...,0.040481,-0.005301,0.012832,0.029349,0.020866,0.121532,0.076205,0.012345,0.015148,-0.109956
3,0.408057,-0.07296,0.039642,0.089363,0.04195,0.237753,-0.049426,0.009467,0.045469,-0.11137,...,0.008571,-0.005425,-0.0085,-0.003417,-0.083982,0.094512,0.057557,-0.02605,0.014841,-0.034224
4,1.574272,0.021239,-0.0513,0.246884,-0.032406,1.552281,-0.19963,-0.01492,-0.060498,0.450512,...,0.110151,0.04601,0.006934,-0.01594,-0.05008,-0.052539,0.507189,0.03383,0.125706,0.199244


In [72]:
def recommend_movies(movie_df, rating_df, prediction, userID, num_recommend):
    user_row_num = userID - 1
    sorted_user_prediction = prediction.iloc[user_row_num].sort_values(ascending = False)
    
    # Get the user's data and merge in the movie information.
    user_data = rating_df[rating_df.user_id == (userID)]
    user_full = (user_data.merge(movie_df, how = 'left', left_on = 'movie_id', right_on = 'movie_id').
                     sort_values(['rating'], ascending=False))
    
    recommendations = (movie_df[~movie_df['movie_id'].isin(user_full['movie_id'])].merge(pd.DataFrame(sorted_user_prediction).reset_index(), how = 'left',
               left_on = 'movie_id', right_on = 'movie_id').rename(columns = {user_row_num: 'Predictions'}).sort_values('Predictions', ascending = False).iloc[:num_recommend, :-1])
    
    return user_full, recommendations
    
    

In [74]:
watched, recommend = recommend_movies(movie_df, rating_df, prediction, 20, 10)

In [75]:
watched

Unnamed: 0,user_id,movie_id,rating,title,genres
14,20,3578,5,Gladiator (2000),"['Action', 'Drama']"
21,20,110,5,Braveheart (1995),"['Action', 'Drama', 'War']"
20,20,3753,5,"Patriot, The (2000)","['Action', 'Drama', 'War']"
5,20,1912,5,Out of Sight (1998),"['Action', 'Crime', 'Romance']"
6,20,2571,5,"Matrix, The (1999)","['Action', 'Sci-Fi', 'Thriller']"
16,20,1617,5,L.A. Confidential (1997),"['Crime', 'Film-Noir', 'Mystery', 'Thriller']"
11,20,1527,5,"Fifth Element, The (1997)","['Action', 'Sci-Fi']"
15,20,47,5,Seven (Se7en) (1995),"['Crime', 'Thriller']"
0,20,648,4,Mission: Impossible (1996),"['Action', 'Adventure', 'Mystery']"
22,20,2028,4,Saving Private Ryan (1998),"['Action', 'Drama', 'War']"


In [76]:
recommend

Unnamed: 0,movie_id,title,genres
1557,1610,"Hunt for Red October, The (1990)","['Action', 'Thriller']"
3701,3793,X-Men (2000),"['Action', 'Sci-Fi']"
472,480,Jurassic Park (1993),"['Action', 'Adventure', 'Sci-Fi']"
48,50,"Usual Suspects, The (1995)","['Crime', 'Thriller']"
2828,2916,Total Recall (1990),"['Action', 'Adventure', 'Sci-Fi', 'Thriller']"
3466,3555,U-571 (2000),"['Action', 'Thriller']"
1190,1214,Alien (1979),"['Action', 'Horror', 'Sci-Fi', 'Thriller']"
1528,1580,Men in Black (1997),"['Action', 'Adventure', 'Comedy', 'Sci-Fi']"
1176,1200,Aliens (1986),"['Action', 'Sci-Fi', 'Thriller', 'War']"
254,260,Star Wars: Episode IV - A New Hope (1977),"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']"


#### Evaluation

In [80]:
# Import libraries from Surprise package
from surprise import Reader, Dataset, SVD, evaluate

ModuleNotFoundError: No module named 'surprise'

##### Conclusion:
Pros:
* Better deal with scalability and sparsity than memory-based CF.

Cons:
* Lose meaningful signals by using low-rank approximation. 
* There's an interpretability problem as a singular vector specifies a linear combination of all input columns or rows.
* There's also a lack of sparsity when the singular vectors are quite dense.
* Thus, SVD approach is limited to linear projections.


### Model 4. Deep Learning

#### Data Preparation
* change encode start from 0
* train_test_split
* array
* number of factor

In [84]:
#Change data into sequential for future use
from sklearn.preprocessing import LabelEncoder
user_encode = LabelEncoder()
rating_df['user'] = user_encode.fit_transform(rating_df['user_id'].values)
n_users = rating_df['user'].nunique()

In [87]:
item_encode = LabelEncoder()
rating_df['movie'] = item_encode.fit_transform(rating_df['movie_id'].values)
n_movies = rating_df['movie'].nunique()

In [89]:
rating_df['rating'] = rating_df['rating'].values.astype(np.float32)
min_rating = min(rating_df['rating'])
max_rating = max(rating_df['rating'])

In [90]:
n_users, n_movies, min_rating, max_rating

(6040, 3706, 1.0, 5.0)

In [91]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(rating_df[['user', 'movie']].values, rating_df['rating'].values, \
                                                    test_size = 0.3, random_state = 9)

In [92]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((700146, 2), (300063, 2), (700146,), (300063,))

In [98]:
X_train_array = [X_train[:, 0], X_train[:, 1]]
X_test_array = [X_test[:, 0], X_test[:, 1]]

In [103]:
n_factor = 50

#### Keras model