In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import matplotlib.pyplot as plt
from datetime import datetime
from scipy.sparse import csr_matrix
import sklearn
import itertools

In [2]:
data = pd.read_csv("movie_recommendation.csv")
movies = pd.read_csv("movies2.csv")

## Naiev collaborative filltering

In [3]:
movie_user_mat = data.pivot(index='movieId', columns='userId', values='rating').fillna(0)

In [4]:
movie_user_mat

userId,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,601.0,602.0,603.0,604.0,605.0,606.0,607.0,608.0,609.0,610.0
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3.0,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

In [6]:
movie_user_mat_sparse

<9472x610 sparse matrix of type '<class 'numpy.float64'>'
	with 96649 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.neighbors import NearestNeighbors
# define model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
# fit
model_knn.fit(movie_user_mat_sparse)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [8]:
df_movies = movies[["movieId","title"]]

In [9]:
movie_to_idx = {
    movie: i for i, movie in 
    enumerate(list(df_movies.set_index('movieId').loc[movie_user_mat.index].title))
}

In [10]:
from fuzzywuzzy import fuzz

def fuzzy_matching(mapper, fav_movie, verbose=True):
    """
    return the closest match via fuzzy ratio. 
    
    Parameters
    ----------    
    mapper: dict, map movie title name to index of the movie in data
    fav_movie: str, name of user input movie
    
    verbose: bool, print log if True
    Return
    ------
    index of the closest match
    """
    match_tuple = []
    # get match
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.lower(), fav_movie.lower())
        if ratio >= 60:
            match_tuple.append((title, idx, ratio))
    # sort
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
        return
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
    return match_tuple[0][1]



In [11]:
def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    """
    return top n similar movie recommendations based on user's input movie
    Parameters
    ----------
    model_knn: sklearn model, knn model
    data: movie-user matrix
    mapper: dict, map movie title name to index of the movie in data
    fav_movie: str, name of user input movie
    n_recommendations: int, top n recommendations
    Return
    ------
    list of top n similar movie recommendations
    """
    # fit
    model_knn.fit(data)
    
    # get input movie index
    idx = fuzzy_matching(mapper, fav_movie, verbose=True)
    distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)
    
    raw_recommends = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    test_array = []
    for i, (idx, dist) in enumerate(raw_recommends):
        test_array.append(reverse_mapper[idx])
    
    return test_array

In [12]:
test_array = []

def test(my_favorite):
        test_array.append(make_recommendation(
        model_knn=model_knn,
        data=movie_user_mat_sparse,
        fav_movie=my_favorite,
        mapper=movie_to_idx,
        n_recommendations=10))
    
my_favorite = ["jumanji"]
for movie in my_favorite:
    test(movie)

Found possible matches in our database: ['Jumanji (1995)']

Recommendations for jumanji:


In [13]:
test_array

[['Ace Ventura: When Nature Calls (1995)',
  'Santa Clause, The (1994)',
  'Beauty and the Beast (1991)',
  'Nightmare Before Christmas, The (1993)',
  'Aladdin (1992)',
  'Home Alone (1990)',
  'Mask, The (1994)',
  'Jurassic Park (1993)',
  'Mrs. Doubtfire (1993)',
  'Lion King, The (1994)']]

## -----------------------------------------------------------------------------------------------

In [14]:
data2 = data.head(10000)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(data2["genres"])

In [16]:
tfidf_matrix.shape

(10000, 80)

In [17]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [18]:
smd = df_movies.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [19]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [20]:
get_recommendations("Toy Story (1995)").head(10)

1                         Jumanji (1995)
2                Grumpier Old Men (1995)
3               Waiting to Exhale (1995)
4     Father of the Bride Part II (1995)
5                            Heat (1995)
6                         Sabrina (1995)
7                    Tom and Huck (1995)
8                    Sudden Death (1995)
9                       GoldenEye (1995)
10        American President, The (1995)
Name: title, dtype: object

## ---------------------------------------------------------------------------------

In [21]:
piv_norm = movie_user_mat
piv_norm

userId,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,601.0,602.0,603.0,604.0,605.0,606.0,607.0,608.0,609.0,610.0
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3.0,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]
piv_norm = piv_norm.T

In [23]:
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
piv_sparse = sp.sparse.csr_matrix(piv_norm.values)

In [24]:
user_similarity = cosine_similarity(piv_sparse)

In [25]:
user_sim_df = pd.DataFrame(user_similarity, index = piv_norm.index, columns = piv_norm.index)

In [26]:
user_sim_df

userId,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,601.0,602.0,603.0,604.0,605.0,606.0,607.0,608.0,609.0,610.0
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1.000000,0.027286,0.054506,0.188743,0.129198,0.127114,0.159343,0.137090,0.064616,0.012239,...,0.080563,0.164718,0.215701,0.070704,0.150887,0.163771,0.269560,0.290949,0.093582,0.145390
2.0,0.027286,1.000000,0.000000,0.000000,0.016628,0.025355,0.023385,0.027279,0.000000,0.060316,...,0.202671,0.016891,0.012037,0.000000,0.000000,0.028435,0.012955,0.046306,0.027565,0.102464
3.0,0.054506,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.020098,0.000000,0.010780,0.008009,0.014180,0.015748,0.000000,0.030487
4.0,0.188743,0.000000,0.000000,1.000000,0.125632,0.084651,0.113871,0.058278,0.011464,0.027190,...,0.080414,0.124982,0.299304,0.050670,0.083024,0.193875,0.127919,0.140897,0.028727,0.104958
5.0,0.129198,0.016628,0.000000,0.125632,1.000000,0.296444,0.108826,0.429753,0.000000,0.025836,...,0.068103,0.411881,0.108121,0.259083,0.149124,0.105579,0.153069,0.134871,0.261442,0.060863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606.0,0.163771,0.028435,0.008009,0.193875,0.105579,0.101351,0.198096,0.099488,0.075471,0.085583,...,0.178124,0.115949,0.295231,0.065622,0.147848,1.000000,0.152877,0.260416,0.069637,0.200900
607.0,0.269560,0.012955,0.014180,0.127919,0.153069,0.162407,0.186894,0.185383,0.011914,0.005469,...,0.092574,0.198941,0.197198,0.137961,0.119039,0.152877,1.000000,0.283024,0.149269,0.139239
608.0,0.290949,0.046306,0.015748,0.140897,0.134871,0.174140,0.321356,0.185901,0.098736,0.072221,...,0.158151,0.196293,0.224557,0.152884,0.176295,0.260416,0.283024,1.000000,0.121459,0.322199
609.0,0.093582,0.027565,0.000000,0.028727,0.261442,0.214419,0.091173,0.424322,0.000000,0.016970,...,0.035653,0.335730,0.059502,0.236695,0.097772,0.069637,0.149269,0.121459,1.000000,0.053245


In [32]:
def sim_movies(users, user_movie):
    movies = []
    for user in users:
        movies.append(data[data["userId"] == user[0]]["title"])
    movies = list(itertools.chain.from_iterable(movies))
    movies = list(set(movies))
    movies = set(movies) - set(user_movie)
    return movies

In [33]:
def top_users(user):
    users = []
    if user not in user_sim_df.columns:
        return('No data available on user {}'.format(user))
    
    sim_values = user_sim_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:11]
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:11]
    zipped = zip(sim_users, sim_values,)
    for user, sim in zipped:
        user_sim = (user, sim)
        users.append(user_sim)
    user_movie = data[data["userId"] == user]["title"]
    movies = sim_movies(users, user_movie)
    return movies

In [76]:
movies = top_users(610)

In [77]:
len(movies)

4530

In [78]:
movies

{'House of 1000 Corpses (2003)',
 'Poltergeist (1982)',
 'Cape Fear (1962)',
 'Major League (1989)',
 'Manhattan Project, The (1986)',
 'Limey, The (1999)',
 'Three Men and a Little Lady (1990)',
 'Dead Again (1991)',
 'Guns of Navarone, The (1961)',
 'Heavyweights (Heavy Weights) (1995)',
 'Misérables, Les (2000)',
 'Be Kind Rewind (2008)',
 'Splash (1984)',
 "On Her Majesty's Secret Service (1969)",
 'Iron Monkey (Siu nin Wong Fei-hung ji: Tit Ma Lau) (1993)',
 "She's So Lovely (1997)",
 'Marine, The (2006)',
 'Twin Dragons (Shuang long hui) (1992)',
 'Cool Runnings (1993)',
 'Metallica: Some Kind of Monster (2004)',
 'Broken Arrow (1996)',
 'Banger Sisters, The (2002)',
 'Sunshine (2007)',
 'Prince of Persia: The Sands of Time (2010)',
 'Candy (2006)',
 'Gonzo: The Life and Work of Dr. Hunter S. Thompson (2008)',
 'Event Horizon (1997)',
 'Chocolat (2000)',
 'Fast Five (Fast and the Furious 5, The) (2011)',
 'Tuskegee Airmen, The (1995)',
 'Legend of Bagger Vance, The (2000)',
 'And