# Dataset

In [496]:
import numpy as np
import pandas as pd
from IPython.display import display

pd.reset_option("max_columns")
pd.reset_option("max_rows")

# load data
ratings = pd.read_csv("./data/ratings.csv")
items = pd.read_csv("./data/movies.csv")

# drop timestamp column
ratings = ratings.drop(['timestamp'], axis=1)

In [497]:
print("Rating Dataframe")
display(ratings.head())
print('Numbers of ratings: {}'.format(len(ratings.index)))

print("\nItem Dataframe")
display(items.head())
print('Numbers of items: {}'.format(len(items.index)))

Rating Dataframe


Unnamed: 0,user_id,item_id,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


Numbers of ratings: 100836

Item Dataframe


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Numbers of items: 9742


# Popular-based


In [498]:
def weighted_rating(v, m, R, C):
    '''
    v -> average rating for each item (float)
    m -> minimum votes required to be classified as popular (float)
    R -> average rating for the item (pd.Series)
    C -> average rating for the whole dataset (pd.Series)

    Source: IMDB
    '''
    return ((v / (v + m)) * R) + ((m / (v + m)) * C)

def calculate_popular_based_score(rating_df, item_df, user_col, item_col, rating_col):
    # group ratings by item_id
    vote_count = (
        rating_df
        .groupby(item_col, as_index=False)
        .agg({user_col: 'count', rating_col: 'mean'}))
    
    vote_count.columns = [item_col, 'vote_count', 'avg_rating']
    
    # calculate input parameters
    C = np.mean(vote_count['avg_rating'])
    m = np.percentile(vote_count['vote_count'], 70)
    vote_count = vote_count[vote_count['vote_count'] >= m]
    R = vote_count['avg_rating']
    v = vote_count['vote_count']
    vote_count['weighted_rating'] = weighted_rating(v, m, R, C)
    
    # merge DataFrame (left join)
    vote_count = vote_count.merge(item_df, on = [item_col], how = 'left')
    popular_items = vote_count.loc[:,[item_col, 'genres', 'vote_count', 'avg_rating', 'weighted_rating']]

    return popular_items

# init constant
USER_COL = 'user_id'
ITEM_COL = 'item_id'
RATING_COL = 'rating'

# calculate popular_based score and sort descending
popular_items = calculate_popular_based_score(ratings, items, USER_COL, ITEM_COL, RATING_COL)
popular_items = pd.merge(items, popular_items)
popular_items = popular_items.sort_values('weighted_rating', ascending = False)

display(popular_items.head(10))

Unnamed: 0,item_id,title,genres,vote_count,avg_rating,weighted_rating
183,318,"Shawshank Redemption, The (1994)",Crime|Drama,317,4.429022,4.403818
408,858,"Godfather, The (1972)",Crime|Drama,192,4.289062,4.25295
1276,2959,Fight Club (1999),Action|Crime|Drama|Thriller,218,4.272936,4.241498
569,1221,"Godfather: Part II, The (1974)",Crime|Drama,129,4.25969,4.208361
42,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,204,4.237745,4.205389
152,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251,4.231076,4.204795
374,750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,97,4.268041,4.200357
561,1213,Goodfellas (1990),Crime|Drama,126,4.25,4.198024
294,527,Schindler's List (1993),Drama|War,220,4.225,4.195318
2480,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,149,4.238255,4.194469


# Content-based

In [499]:
from sklearn.metrics.pairwise import cosine_similarity

# get items that have been rated
rated_items = items.loc[items[ITEM_COL].isin(ratings[ITEM_COL])].copy()

# split and extract genres
genre = rated_items['genres'].str.split("|", expand=True)

# get all distinct genre
all_genre = set()
for column in genre.columns:
    distinct_genre = genre[column].str.lower().str.strip().unique()
    all_genre.update(distinct_genre)
all_genre.remove('(no genres listed)')
all_genre.remove(None)

# create item-genre matrix
item_genre_mat = rated_items[[ITEM_COL, 'genres']].copy()
item_genre_mat['genres'] = item_genre_mat['genres'].str.lower().str.strip()

# create genres column 
for genre in all_genre:
    item_genre_mat[genre] = np.where(item_genre_mat['genres'].str.contains(genre), 1, 0)

item_genre_mat = item_genre_mat.drop(['genres'], axis=1)
item_genre_mat = item_genre_mat.set_index(ITEM_COL)

# compute similarity matix
corr_mat = cosine_similarity(item_genre_mat)

# get dictionary with key is name and value is index
ind2name = { index: name for index, name in enumerate(item_genre_mat.index)} 
name2ind = { name: index for index, name in ind2name.items() }

# get top-k similar items
def top_k_items(item_id, top_k, corr_mat, dict_name):
    # sort and get top k correlation value
    top_items = corr_mat[item_id, :].argsort()[-top_k:][::-1]
    top_items = [dict_name[index] for index in top_items]
    return top_items

RECOMMEND_ITEM_ID = 1

similar_items = top_k_items(name2ind[RECOMMEND_ITEM_ID],
                            top_k = 25,
                            corr_mat = corr_mat,
                            dict_name = ind2name)

# display result
print('Item ID {}'.format(RECOMMEND_ITEM_ID))
display(items.loc[items['item_id'] == RECOMMEND_ITEM_ID])
print("Top 25 similar movie to item ID {}".format(RECOMMEND_ITEM_ID))
display(items.loc[items[ITEM_COL].isin(similar_items)])

del corr_mat

Item ID 1


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


Top 25 similar movie to item ID 1


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
559,673,Space Jam (1996),Adventure|Animation|Children|Comedy|Fantasy|Sc...
1706,2294,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy
2355,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2809,3754,"Adventures of Rocky and Bullwinkle, The (2000)",Adventure|Animation|Children|Comedy|Fantasy
3000,4016,"Emperor's New Groove, The (2000)",Adventure|Animation|Children|Comedy|Fantasy
3194,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...
3568,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
5490,26340,"Twelve Tasks of Asterix, The (Les douze travau...",Action|Adventure|Animation|Children|Comedy|Fan...
5977,36397,Valiant (2005),Adventure|Animation|Children|Comedy|Fantasy|War


# Collaborative Filtering

## User-based

In [500]:
user_based_dataset = ratings.pivot(index='item_id',columns='user_id',values='rating')
user_based_dataset = user_based_dataset - user_based_dataset.mean()

user_based_dataset.fillna(0,inplace=True)

user_based_dataset

user_id,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,0.0,0.0,0.0,0.363636,0.000000,1.269737,0.000000,0.0,0.0,...,-0.425743,0.000000,0.492047,-0.48,0.789593,-1.157399,0.213904,-0.634176,-0.27027,1.311444
2,0.000000,0.0,0.0,0.0,0.000000,0.506369,0.000000,0.425532,0.0,0.0,...,0.000000,0.607407,0.000000,1.52,0.289593,0.000000,0.000000,-1.134176,0.00000,0.000000
3,-0.366379,0.0,0.0,0.0,0.000000,1.506369,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,-1.134176,0.00000,0.000000
4,0.000000,0.0,0.0,0.0,0.000000,-0.493631,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
5,0.000000,0.0,0.0,0.0,0.000000,1.506369,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,-0.48,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
193583,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
193585,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000
193587,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000


In [501]:
no_user_voted = ratings.groupby('item_id')['rating'].agg('count')
no_movies_voted = ratings.groupby('user_id')['rating'].agg('count')

user_based_dataset = user_based_dataset.loc[no_user_voted[no_user_voted > 10].index,:]
user_based_dataset = user_based_dataset.loc[:,no_movies_voted[no_movies_voted > 50].index]

user_based_dataset

user_id,1,4,6,7,10,11,15,16,17,18,...,600,601,602,603,604,605,606,607,608,610
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,0.0,0.000000,1.269737,0.0,0.00000,-0.948148,0.0,0.290476,-0.232072,...,-0.491481,-0.425743,0.000000,0.492047,-0.48,0.789593,-1.157399,0.213904,-0.634176,1.311444
2,0.000000,0.0,0.506369,0.000000,0.0,0.00000,0.000000,0.0,0.000000,-0.732072,...,1.008519,0.000000,0.607407,0.000000,1.52,0.289593,0.000000,0.000000,-1.134176,0.000000
3,-0.366379,0.0,1.506369,0.000000,0.0,0.00000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,-1.134176,0.000000
5,0.000000,0.0,1.506369,0.000000,0.0,0.00000,0.000000,0.0,0.000000,0.000000,...,-0.491481,0.000000,0.000000,0.000000,-0.48,0.000000,0.000000,0.000000,0.000000,0.000000
6,-0.366379,0.0,0.506369,0.000000,0.0,1.21875,0.000000,0.0,0.000000,0.267928,...,0.000000,0.000000,-0.392593,0.492047,-0.48,0.000000,0.000000,0.000000,0.000000,1.311444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174055,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,0.000000,...,0.000000,-0.425743,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
176371,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,0.000000,...,0.000000,-0.425743,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
177765,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.074257,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
179819,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000


In [502]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

csr_data = csr_matrix(user_based_dataset.values)
user_based_dataset.reset_index(inplace=True)

knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(csr_data)
display(user_based_dataset)

user_id,item_id,1,4,6,7,10,11,15,16,17,...,600,601,602,603,604,605,606,607,608,610
0,1,-0.366379,0.0,0.000000,1.269737,0.0,0.00000,-0.948148,0.0,0.290476,...,-0.491481,-0.425743,0.000000,0.492047,-0.48,0.789593,-1.157399,0.213904,-0.634176,1.311444
1,2,0.000000,0.0,0.506369,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,1.008519,0.000000,0.607407,0.000000,1.52,0.289593,0.000000,0.000000,-1.134176,0.000000
2,3,-0.366379,0.0,1.506369,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,-1.134176,0.000000
3,5,0.000000,0.0,1.506369,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,-0.491481,0.000000,0.000000,0.000000,-0.48,0.000000,0.000000,0.000000,0.000000,0.000000
4,6,-0.366379,0.0,0.506369,0.000000,0.0,1.21875,0.000000,0.0,0.000000,...,0.000000,0.000000,-0.392593,0.492047,-0.48,0.000000,0.000000,0.000000,0.000000,1.311444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2116,174055,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,-0.425743,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
2117,176371,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,-0.425743,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
2118,177765,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.074257,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000
2119,179819,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000


In [503]:
def get_movie_recommendation(movie_name):
    n_movies_to_reccomend = 10
    movie_list = items[items['title'].str.contains(movie_name)]  
    if len(movie_list):        
        movie_idx= movie_list.iloc[0]['item_id']
        movie_idx = user_based_dataset[user_based_dataset['item_id'] == movie_idx].index[0]
        
        distances , indices = knn.kneighbors(csr_data[movie_idx],n_neighbors=n_movies_to_reccomend+1)    
        rec_movie_indices = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
        
        recommend_frame = []
        
        for val in rec_movie_indices:
            movie_idx = user_based_dataset.iloc[val[0]]['item_id']
            idx = items[items['item_id'] == movie_idx].index
            recommend_frame.append({'Title':items.iloc[idx]['title'].values[0],'Distance':val[1]})
        df = pd.DataFrame(recommend_frame,index=range(1,n_movies_to_reccomend+1))
        return df
    
    else:
        
        return "No movies found. Please check your input"

In [504]:
get_movie_recommendation('Iron Man')

Unnamed: 0,Title,Distance
1,Up (2009),0.653929
2,In Bruges (2008),0.649277
3,Star Trek (2009),0.634218
4,"Dark Knight, The (2008)",0.631207
5,WALL·E (2008),0.625429
6,X-Men: First Class (2011),0.601108
7,"Avengers, The (2012)",0.597642
8,Kiss Kiss Bang Bang (2005),0.596239
9,Iron Man 2 (2010),0.576827
10,3:10 to Yuma (2007),0.566474


## Item-based

In [505]:
item_based_dataset = ratings.pivot(index='user_id',columns='item_id',values='rating')
item_based_dataset = item_based_dataset - item_based_dataset.mean()

item_based_dataset.fillna(0,inplace=True)

item_based_dataset.head(10)

item_id,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.07907,0.0,0.740385,0.0,0.0,0.053922,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.07907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.568182,1.740385,0.642857,1.928571,0.053922,0.814815,0.125,0.0,-0.496212,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.57907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.568182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.496212,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [506]:
no_user_voted = ratings.groupby('item_id')['rating'].agg('count')
no_movies_voted = ratings.groupby('user_id')['rating'].agg('count')

item_based_dataset = item_based_dataset.loc[:, no_user_voted[no_user_voted > 10].index]
item_based_dataset = item_based_dataset.loc[no_movies_voted[no_movies_voted > 50].index, :]
item_based_dataset.head(10)

item_id,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.07907,0.0,0.740385,0.0,0.053922,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.568182,1.740385,1.928571,0.053922,0.814815,0.0,-0.496212,0.328571,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.57907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,1.053922,0.0,0.0,-0.496212,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,-1.42093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.074074,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.57907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,-0.42093,-0.431818,0.0,0.0,0.053922,0.0,0.0,0.0,0.0,0.0,...,-0.136364,-0.980769,0.0,0.0,-0.28,0.0,0.0,0.0,0.0,0.0


In [507]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

csr_data = csr_matrix(item_based_dataset.T.values)
item_based_dataset.reset_index(inplace=True)

knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(csr_data)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [508]:
get_movie_recommendation('Iron Man')

Unnamed: 0,Title,Distance
1,WALL·E (2008),0.669245
2,Zootopia (2016),0.663697
3,"Avengers, The (2012)",0.656484
4,"Serious Man, A (2009)",0.655253
5,Up (2009),0.630405
6,Grindhouse (2007),0.590169
7,Ratatouille (2007),0.571993
8,Iron Man 2 (2010),0.567622
9,District 9 (2009),0.555558
10,Star Trek (2009),0.539605
