# Nearest Neighbor in Recommender System

### Import Packages and Dataframe
Let's begin by importing the libraries we need and loading our data.

In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
# Movie metadata (movieId, title, genres), read straight from the tutorial repository.
movie = 'https://raw.githubusercontent.com/krishnaik06/Recommendation_complete_tutorial/master/KNN%20Movie%20Recommendation/movies.csv'
data_movies = pd.read_csv(movie)

# User ratings (userId, movieId, rating, timestamp) from the same repository.
rating = 'https://raw.githubusercontent.com/krishnaik06/Recommendation_complete_tutorial/master/KNN%20Movie%20Recommendation/ratings.csv'
data_ratings = pd.read_csv(rating)

Make sure there is no null data in either dataframe.

In [3]:
# DataFrame.info() prints its report and returns None, so call it once per
# frame as separate statements instead of building a (None, None) tuple that
# the notebook would echo as the cell result.
data_movies.info()
data_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


(None, None)

In [4]:
# Preview the first rows of the movies table (movieId, title, genres).
data_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Preview the first rows of the ratings table (userId, movieId, rating, timestamp).
data_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Merge our Dataframe
After looking at our dataframes, let's merge them into a single dataframe on their shared `movieId` column and keep only the columns we need.

In [6]:
# Join every rating with its movie's metadata on movieId, then keep only the
# columns used by the rest of the notebook.
data = data_ratings.merge(data_movies, on='movieId')[
    ['userId', 'movieId', 'rating', 'title']
]
data.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [7]:
# (rows, columns) of the merged ratings/movies frame.
data.shape

(100836, 4)

In [8]:
# Count how many ratings each title received.
# Rows without a title are dropped first; the counts come back as a two-column
# frame: title, rating_count.
movie_rating_count = (
    data.dropna(axis=0, subset=['title'])
    .groupby('title')['rating']
    .count()
    .rename('rating_count')
    .reset_index()
)
movie_rating_count.head()

Unnamed: 0,title,rating_count
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [9]:
# Attach each title's rating_count to the "data" dataframe.
# Any rating_count column left over from a previous run of this cell is
# dropped first, so re-running the cell does not create
# rating_count_x / rating_count_y duplicates that would break the cells below.
data = (
    data
    .drop(columns='rating_count', errors='ignore')
    .merge(movie_rating_count, on='title', how='left')
)
data.head()

Unnamed: 0,userId,movieId,rating,title,rating_count
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [10]:
# Summary statistics of the rating_count column, used to pick a filtering threshold.
data['rating_count'].describe()

count    100836.000000
mean         58.758777
std          61.965384
min           1.000000
25%          13.000000
50%          39.000000
75%          84.000000
max         329.000000
Name: rating_count, dtype: float64

### Filter Movie based on How Many People Rate It
We do this so that movies with only a handful of ratings, even high ones, do not distort the recommendations.

In [11]:
# Minimum popularity threshold: a title must have MORE than this many ratings to be kept.
rating_count = 50

# Keep only the rows whose title's rating_count exceeds the threshold.
data_filtered = data.loc[data['rating_count'].gt(rating_count)]
data_filtered.head()

Unnamed: 0,userId,movieId,rating,title,rating_count
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [12]:
# Compare the size of the filtered frame with the unfiltered one.
data_filtered.shape, data.shape

((40712, 5), (100836, 5))

### Pivot Matrix
We create this matrix, with one row per movie title and one column per `userId`, to see which users share similar movie interests.

In [13]:
# Title x user rating matrix: one row per title, one column per userId.
# Missing combinations (user did not rate the title) are filled with 0.
pivot_matrix_data = pd.pivot_table(
    data_filtered,
    values='rating',
    index='title',
    columns='userId',
).fillna(0)
pivot_matrix_data.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


### K Nearest Neighbors Model with Cosine Similarity
We use KNN to create recommendations based on how similarly users rated the movies, with cosine similarity as the measure.

In [14]:
# Brute-force nearest-neighbour search using the cosine metric.
model = NearestNeighbors(algorithm='brute', metric='cosine')

# The pivot matrix is mostly zeros, so hand it to the model in sparse (CSR) form.
data_matrix = csr_matrix(pivot_matrix_data.values)
model.fit(data_matrix)

In [15]:
# (number of movie titles, number of users) in the pivot matrix.
pivot_matrix_data.shape

(437, 606)

In [16]:
# Show the first ten titles next to their positional index; that position is
# the "movie index number" asked for in the recommendation cell.
pd.DataFrame(pivot_matrix_data.index[:10])

Unnamed: 0,title
0,10 Things I Hate About You (1999)
1,12 Angry Men (1957)
2,2001: A Space Odyssey (1968)
3,28 Days Later (2002)
4,300 (2007)
5,"40-Year-Old Virgin, The (2005)"
6,A.I. Artificial Intelligence (2001)
7,"Abyss, The (1989)"
8,Ace Ventura: Pet Detective (1994)
9,Ace Ventura: When Nature Calls (1995)


### Recommendation Result
We want to see which movies are related to a chosen movie, based on users' interests, by entering that movie's index number.

In [17]:
# Number of recommendations to show. One extra neighbour is requested below
# because the queried movie normally comes back as its own nearest neighbour.
N_RECOMMENDATIONS = 5

movie_order_number = int(input("Input movie index number: "))

# Fail early with a clear message: a too-large position raises an opaque
# IndexError inside iloc, and a negative one would silently select a movie
# counted from the end of the table.
if not 0 <= movie_order_number < len(pivot_matrix_data):
    raise IndexError(
        f'movie index must be between 0 and {len(pivot_matrix_data) - 1}, '
        f'got {movie_order_number}'
    )

distances, indices = model.kneighbors(
    pivot_matrix_data.iloc[movie_order_number, :].values.reshape(1, -1),
    n_neighbors=N_RECOMMENDATIONS + 1,
)

# Remove the queried movie by its position instead of assuming it is always
# the first neighbour: when another movie has an identical rating vector the
# two tie at distance 0 and may be returned in either order.
neighbor_positions = indices.flatten()
neighbor_distances = distances.flatten()
is_other_movie = neighbor_positions != movie_order_number
neighbor_positions = neighbor_positions[is_other_movie][:N_RECOMMENDATIONS]
neighbor_distances = neighbor_distances[is_other_movie][:N_RECOMMENDATIONS]

print(f'\nRecommendation for \"{pivot_matrix_data.index[movie_order_number]}\"\n')
for rank, position in enumerate(neighbor_positions, start=1):
    print(f'{rank}: {pivot_matrix_data.index[position]}')

# The smaller the cosine distance, the more similar the movie is to the one we chose
print(f'\nCosine Similarity Distance: {neighbor_distances}')

Input movie number: 0

Recommendation for "10 Things I Hate About You (1999)"

1: Wedding Singer, The (1998)
2: Grease (1978)
3: Bridget Jones's Diary (2001)
4: Legally Blonde (2001)
5: Clueless (1995)

Cosine Similarity Distance: [0.55065744 0.57180068 0.57618562 0.59509439 0.60361817]


### Learning sources:
Krish Naik Tutorial 2- Creating Recommendation Systems using Nearest Neighbors
<br>https://www.youtube.com/watch?v=kccT0FVK6OY&list=PLZoTAELRMXVN7QGpcuN-Vg35Hgjp3htvi&index=7

---

Krish Naik - Cosine Similarity and Cosine Distance
<br>https://www.youtube.com/watch?v=ieMjGVYw9ag