# Nearest-Neighbor Item-Based Collaborative Filtering

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Read datasets:   
https://grouplens.org/datasets/movielens/latest/

In [13]:
# Load only the identifier and title columns of the movies file.
movie_columns = ['movieId', 'title']
movies = pd.read_csv('movies.csv', usecols=movie_columns)

In [14]:
# Preview the first two rows of the movies frame.
movies.head(2)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)


In [16]:
# Load only the three columns used below from the ratings file.
rating_columns = ['userId', 'movieId', 'rating']
ratings = pd.read_csv('ratings.csv', usecols=rating_columns)

In [17]:
# Preview the first two rows of the ratings frame.
ratings.head(2)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0


In [18]:
# Column dtypes of the movies frame.
movies.dtypes

movieId     int64
title      object
dtype: object

In [19]:
# Column dtypes of the ratings frame.
ratings.dtypes

userId       int64
movieId      int64
rating     float64
dtype: object

In [20]:
# (rows, columns) of each table, ratings first and movies second.
tuple(frame.shape for frame in (ratings, movies))

((100836, 3), (9742, 2))

In [21]:
# Attach each movie's title to its ratings through the shared movieId key
# (merge defaults to an inner join).
df = ratings.merge(movies, on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [22]:
# (rows, columns) of the merged ratings + titles frame.
df.shape

(100836, 4)

In [23]:
# Count of missing values in each column of the merged frame.
df.isna().sum()

userId     0
movieId    0
rating     0
title      0
dtype: int64

### Data preparation

In [28]:
# Number of ratings recorded for each title, as a two-column frame
# (title, rating_count).
movie_ratings_counts = (
    df.groupby('title')['rating']
      .count()
      .rename('rating_count')
      .reset_index()
)
movie_ratings_counts.head()

Unnamed: 0,title,rating_count
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [29]:
# Add the per-title rating count to every individual rating row.
rating_with_total_count = df.merge(movie_ratings_counts, on='title')
rating_with_total_count.head()

Unnamed: 0,userId,movieId,rating,title,rating_count
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [32]:
# Summary statistics of ratings-per-title, rounded to three decimals.
movie_ratings_counts['rating_count'].describe().round(3)

count    9719.000
mean       10.375
std        22.406
min         1.000
25%         1.000
50%         3.000
75%         9.000
max       329.000
Name: rating_count, dtype: float64

In [33]:
# Keep only ratings of titles that received strictly more than
# `popularity_threshold` ratings.
popularity_threshold = 50
is_popular = rating_with_total_count['rating_count'] > popularity_threshold
popularity_movie = rating_with_total_count[is_popular]
popularity_movie.head()

Unnamed: 0,userId,movieId,rating,title,rating_count
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [34]:
# (rows, columns) of the frame left after the popularity filter.
popularity_movie.shape

(40712, 5)

In [35]:
# Build the title x userId rating matrix: one row per title, one column per
# user, each cell holding that user's rating (pivot_table averages duplicates).
ratings_by_title = popularity_movie.pivot_table(
    values='rating',
    index='title',
    columns='userId',
)
# A user who did not rate a title gets 0 in that cell.
movie_features_df = ratings_by_title.fillna(0)
movie_features_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X-Men: The Last Stand (2006),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0
X2: X-Men United (2003),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0
Young Frankenstein (1974),5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0
Zombieland (2009),0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5


### Model

In [38]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [37]:
# Store the rating matrix in compressed-sparse-row form for the model.
dense_ratings = movie_features_df.to_numpy()
movie_features_matrix = csr_matrix(dense_ratings)

In [39]:
# Brute-force nearest-neighbour search using cosine distance between the
# per-title rating vectors.
knn_params = dict(metric='cosine', algorithm='brute')
model = NearestNeighbors(**knn_params)
model.fit(movie_features_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [40]:
# Pick the query movie (a row position in movie_features_df) at random, but
# from a seeded generator so that Restart & Run All selects the same row and
# therefore reproduces the same neighbours.
QUERY_SEED = 42
rng = np.random.default_rng(QUERY_SEED)
query_index = int(rng.integers(movie_features_df.shape[0]))
query_index

417

In [42]:
# kneighbors expects a 2-D input, so the selected title's rating row is
# reshaped to (1, n_users) before asking for its 6 nearest rows.
query_vector = movie_features_df.iloc[query_index].to_numpy().reshape(1, -1)
distance, indeces = model.kneighbors(query_vector, n_neighbors=6)

In [43]:
# Row positions (into movie_features_df) of the neighbours returned by kneighbors.
indeces

array([[417, 205, 375, 349, 235, 258]], dtype=int64)

In [44]:
# Cosine distances returned by kneighbors, aligned with `indeces`.
distance

array([[1.11022302e-16, 3.62727475e-01, 3.82979357e-01, 4.00132567e-01,
        4.27773575e-01, 4.37342276e-01]])

In [45]:
# kneighbors returns (1, k) arrays; flatten them once into parallel 1-D arrays.
neighbor_rows = indeces.flatten()
neighbor_distances = distance.flatten()

# Name the query movie so the ranked list below has context (previously the
# first iteration printed only the loop index 0).
print('Recommendations for {0}:'.format(movie_features_df.index[query_index]))

# The first returned neighbour is skipped, as before: in the recorded run it is
# the query movie itself (distance ~0). Ranks therefore start at 1.
for rank, (row, dist) in enumerate(zip(neighbor_rows[1:], neighbor_distances[1:]), start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, movie_features_df.index[row], dist))

0
1: I, Robot (2004), with distance of 0.36272747514166936:
2: Star Wars: Episode III - Revenge of the Sith (2005), with distance of 0.382979357418356:
3: Signs (2002), with distance of 0.40013256670477193:
4: Last Samurai, The (2003), with distance of 0.4277735753466575:
5: Matrix Reloaded, The (2003), with distance of 0.4373422764212169:
