![Background](https://hackernoon.com/hn-images/1*PsI17WdbeL1OUyhD5H6JMQ.png)

Based on previous(past) behaviours, it predicts the likelihood that a user would prefer an item.
For example, Netflix uses recommendation system. It suggest people new movies according to their past activities that are like watching and voting movies.
The purpose of recommender systems is recommending new things that are not seen before from people.

# Movie Recommendation System
Tugas 1 Big Data Analysis

In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error

## Load Dataset

In [2]:
links_df = pd.read_csv("dataset/links.csv")
movies_df = pd.read_csv("dataset/movies.csv")
ratings_df = pd.read_csv("dataset/ratings.csv")
tags_df = pd.read_csv("dataset/tags.csv")

# Identify Dataset

In [3]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies_df.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [5]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


# Collaborative Filtering: Item Based

![collab-item](https://image.ibb.co/maEQdd/resim_b.jpg)

In [8]:
main_df = links_df.merge(movies_df, on='movieId')
main_df.head()

Unnamed: 0,movieId,imdbId,tmdbId,title,genres
0,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,113497,8844.0,Jumanji (1995),Adventure|Children|Fantasy
2,3,113228,15602.0,Grumpier Old Men (1995),Comedy|Romance
3,4,114885,31357.0,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,113041,11862.0,Father of the Bride Part II (1995),Comedy


In [9]:
main_df = ratings_df.merge(movies_df, on="movieId")
main_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [10]:
pt = main_df.pivot_table(index='userId', columns='title', values='rating')
pt.head(10)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,1.0,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [11]:
watched_movies = pt['Toy Story (1995)']
similar_movies = pt.corrwith(watched_movies)
similar_movies = similar_movies.sort_values(ascending=False)
similar_movies.head()

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


title
Land Before Time III: The Time of the Great Giving (1995)    1.0
Orlando (1992)                                               1.0
Goosebumps (2015)                                            1.0
Encounters at the End of the World (2008)                    1.0
Suburban Commando (1991)                                     1.0
dtype: float64

# Content Based Filtering

In [12]:
main_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [13]:
movies_df = main_df.groupby('movieId').rating.mean().reset_index().merge(movies_df, on="movieId")
movies_df.head()

Unnamed: 0,movieId,rating,title,genres
0,1,3.92093,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,3.431818,Jumanji (1995),Adventure|Children|Fantasy
2,3,3.259615,Grumpier Old Men (1995),Comedy|Romance
3,4,2.357143,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,3.071429,Father of the Bride Part II (1995),Comedy


## Extract Genre

In [14]:
genre_list = []
for genres in movies_df.loc[:, 'genres']:
    split_genres = genres.split('|')
    for genre in split_genres:
        if not genre in genre_list:
            genre_list.append(genre)

print(genre_list, len(genre_list))

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary', 'IMAX', 'Western', 'Film-Noir', '(no genres listed)'] 20


## Feature Selection

In [15]:
temp_df = movies_df.copy()
for genre in genre_list:
    true_values = movies_df.loc[movies_df.genres.str.contains(genre)].movieId.tolist()
    new_values = [ 1 if movie_id in true_values else 0 for movie_id in movies_df.loc[:, 'movieId'] ]
    temp_df[genre] = new_values
temp_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,movieId,rating,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,3.92093,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,3.431818,Jumanji (1995),Adventure|Children|Fantasy,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,3.259615,Grumpier Old Men (1995),Comedy|Romance,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,4,2.357143,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,3.071429,Father of the Bride Part II (1995),Comedy,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
X_train = temp_df.drop(['movieId','genres', 'title'], axis=1)
X_train.head()

Unnamed: 0,rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,3.92093,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3.431818,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3.259615,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2.357143,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.071429,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
y_train = temp_df.loc[:, ['movieId']]
y_train.head()

Unnamed: 0,movieId
0,1
1,2
2,3
3,4
4,5


## Build model with default distance metric: Minkowski Distance P=2 

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html#sklearn.neighbors.NearestNeighbors

In [18]:
model = NearestNeighbors(n_neighbors=7).fit(X_train)

In [19]:
distances, indices = model.kneighbors(X_train.loc[0].values.reshape(1, -1))

In [20]:
distances, indices

(array([[0.        , 0.04971811, 0.06010549, 0.20471402, 0.47093023,
         0.67648579, 0.89712071]]),
 array([[   0, 3563, 2353, 2996, 9412, 1705, 6469]], dtype=int64))

## Recommendation For Toy Story (1995)

In [21]:
movies_df.loc[indices[0]]

Unnamed: 0,movieId,rating,title,genres
0,1,3.92093,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3563,4886,3.871212,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
2353,3114,3.860825,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2996,4016,3.716216,"Emperor's New Groove, The (2000)",Adventure|Animation|Children|Comedy|Fantasy
9412,166461,3.45,Moana (2016),Adventure|Animation|Children|Comedy|Fantasy
1705,2294,3.244444,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy
6469,53121,3.02381,Shrek the Third (2007),Adventure|Animation|Children|Comedy|Fantasy


## More complex model with vote count

In [22]:
main_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [23]:
movies_df['voteCount'] = main_df.groupby('movieId').userId.count().reset_index().userId
movies_df.head()

Unnamed: 0,movieId,rating,title,genres,voteCount
0,1,3.92093,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
1,2,3.431818,Jumanji (1995),Adventure|Children|Fantasy,110
2,3,3.259615,Grumpier Old Men (1995),Comedy|Romance,52
3,4,2.357143,Waiting to Exhale (1995),Comedy|Drama|Romance,7
4,5,3.071429,Father of the Bride Part II (1995),Comedy,49


In [24]:
temp_df['voteCount'] = movies_df.voteCount

In [25]:
temp_df.head()

Unnamed: 0,movieId,rating,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),voteCount
0,1,3.92093,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,215
1,2,3.431818,Jumanji (1995),Adventure|Children|Fantasy,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,110
2,3,3.259615,Grumpier Old Men (1995),Comedy|Romance,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,52
3,4,2.357143,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,7
4,5,3.071429,Father of the Bride Part II (1995),Comedy,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,49


In [26]:
X_train = temp_df.drop(['movieId','genres', 'title'], axis=1)
X_train.head()

Unnamed: 0,rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),voteCount
0,3.92093,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,215
1,3.431818,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,110
2,3.259615,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,52
3,2.357143,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,7
4,3.071429,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,49


## Normalize Values

In [27]:
X_train['voteCount'] = MinMaxScaler().fit_transform(X_train['voteCount'].values.reshape(-1, 1))
X_train['rating'] = MinMaxScaler().fit_transform(X_train['rating'].values.reshape(-1, 1))
X_train.head()



Unnamed: 0,rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),voteCount
0,0.760207,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.652439
1,0.651515,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.332317
2,0.613248,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.155488
3,0.412698,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0.018293
4,0.571429,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.146341


In [28]:
X_train.shape

(9724, 22)

# Build Model

In [29]:
model = NearestNeighbors(n_neighbors=7).fit(X_train)

In [30]:
distances, indices = model.kneighbors(X_train.loc[0].values.reshape(1, -1))

In [31]:
movies_df.loc[indices[0]]

Unnamed: 0,movieId,rating,title,genres,voteCount
0,1,3.92093,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
3563,4886,3.871212,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,132
2353,3114,3.860825,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,97
1705,2294,3.244444,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy,45
2996,4016,3.716216,"Emperor's New Groove, The (2000)",Adventure|Animation|Children|Comedy|Fantasy,37
6469,53121,3.02381,Shrek the Third (2007),Adventure|Animation|Children|Comedy|Fantasy,21
9412,166461,3.45,Moana (2016),Adventure|Animation|Children|Comedy|Fantasy,10


In [32]:
distances, indices = model.kneighbors(X_train.loc[9715].values.reshape(1, -1))

In [33]:
movies_df.loc[indices[0]]

Unnamed: 0,movieId,rating,title,genres,voteCount
9715,193567,3.0,anohana: The Flower We Saw That Day - The Movi...,Animation|Drama,1
5553,26743,2.5,Only Yesterday (Omohide poro poro) (1991),Animation|Drama,1
8362,109596,4.0,Wrinkles (Arrugas) (2011),Animation|Drama,1
6581,55442,4.181818,Persepolis (2007),Animation|Drama,11
5842,32657,4.25,"Man Who Planted Trees, The (Homme qui plantait...",Animation|Drama,4
8859,134095,5.0,My Love (2006),Animation|Drama,1
5953,35347,1.0,Animal Farm (1954),Animation|Drama,1


# More complex model with `year`

## Extracts the year from the title
Search for title that doesn't have the year

In [34]:
invalid_title = {}
for title in movies_df.title:
    # removes extra spaces from title
    title = ' '.join(title.split())
    try:
        # is a valid year if doesn't throws error
        int(title[-5:-1])
    except:
        print(title)
        invalid_title[title] = 0

Babylon 5
Ready Player One
Hyena Road
The Adventures of Sherlock Holmes and Doctor Watson
Nocturnal Animals
Paterson
Moonlight
The OA
Cosmos
Maria Bamford: Old Baby
Generation Iron 2
Black Mirror


In [35]:
invalid_title

{'Babylon 5': 0,
 'Ready Player One': 0,
 'Hyena Road': 0,
 'The Adventures of Sherlock Holmes and Doctor Watson': 0,
 'Nocturnal Animals': 0,
 'Paterson': 0,
 'Moonlight': 0,
 'The OA': 0,
 'Cosmos': 0,
 'Maria Bamford: Old Baby': 0,
 'Generation Iron 2': 0,
 'Black Mirror': 0}

# Manually input year

In [36]:
invalid_title = {
    'Babylon 5': 1994,
    'Ready Player One': 2018,
    'Hyena Road': 2015,
    'The Adventures of Sherlock Holmes and Doctor Watson': 1980,
    'Nocturnal Animals': 2016,
    'Paterson': 2016,
    'Moonlight': 2016,
    'The OA': 2016,
    'Cosmos': 2015,
    'Maria Bamford: Old Baby': 2017,
    'Generation Iron 2': 2017,
    'Black Mirror': 2011
}

In [37]:
movies_df['title'] = movies_df.title.str.strip()

In [38]:
movies_df['year'] = movies_df.title.str[-5:-1]
movies_df.year.head()

0    1995
1    1995
2    1995
3    1995
4    1995
Name: year, dtype: object

In [39]:
movies_df.tail()

Unnamed: 0,movieId,rating,title,genres,voteCount,year
9719,193581,4.0,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,1,2017
9720,193583,3.5,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,1,2017
9721,193585,3.5,Flint (2017),Drama,1,2017
9722,193587,3.5,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,1,2018
9723,193609,4.0,Andrew Dice Clay: Dice Rules (1991),Comedy,1,1991


In [40]:
for title in invalid_title.keys():
    movies_df.loc[movies_df.title == title, 'year'] = invalid_title[title]
movies_df.year.head()

0    1995
1    1995
2    1995
3    1995
4    1995
Name: year, dtype: object

In [41]:
movies_df.tail()

Unnamed: 0,movieId,rating,title,genres,voteCount,year
9719,193581,4.0,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,1,2017
9720,193583,3.5,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,1,2017
9721,193585,3.5,Flint (2017),Drama,1,2017
9722,193587,3.5,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,1,2018
9723,193609,4.0,Andrew Dice Clay: Dice Rules (1991),Comedy,1,1991


In [42]:
movies_df['year'] = movies_df.year.astype(float)

In [43]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9724 entries, 0 to 9723
Data columns (total 6 columns):
movieId      9724 non-null int64
rating       9724 non-null float64
title        9724 non-null object
genres       9724 non-null object
voteCount    9724 non-null int64
year         9724 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 851.8+ KB


In [44]:
X_train['year'] = MinMaxScaler().fit_transform(movies_df.year.values.reshape(-1, 1))

In [45]:
X_train.tail()

Unnamed: 0,rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),voteCount,year
9719,0.777778,0,1,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0.0,0.991379
9720,0.666667,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.991379
9721,0.666667,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0.0,0.991379
9722,0.666667,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0.0,1.0
9723,0.777778,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.767241


# Build Model

In [46]:
model = NearestNeighbors(n_neighbors=7).fit(X_train)
_, indices = model.kneighbors(X_train.loc[0].values.reshape(1, -1))

In [47]:
movies_df.loc[indices[0]]

Unnamed: 0,movieId,rating,title,genres,voteCount,year
0,1,3.92093,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,1995.0
3563,4886,3.871212,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,132,2001.0
2353,3114,3.860825,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,97,1999.0
1705,2294,3.244444,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy,45,1998.0
2996,4016,3.716216,"Emperor's New Groove, The (2000)",Adventure|Animation|Children|Comedy|Fantasy,37,2000.0
6469,53121,3.02381,Shrek the Third (2007),Adventure|Animation|Children|Comedy|Fantasy,21,2007.0
9412,166461,3.45,Moana (2016),Adventure|Animation|Children|Comedy|Fantasy,10,2016.0


# Root Mean Squared Error
the smaller the RMSE, the better.


![rmse_formula](https://www.includehelp.com/ml-ai/Images/rmse-1.jpg)

In [48]:
predicted_value = X_train.loc[indices[0]].mean()
predicted_value

rating                0.685315
Adventure             1.000000
Animation             1.000000
Children              1.000000
Comedy                1.000000
Fantasy               1.000000
Romance               0.000000
Drama                 0.000000
Action                0.000000
Crime                 0.000000
Thriller              0.000000
Horror                0.000000
Mystery               0.000000
Sci-Fi                0.000000
War                   0.000000
Musical               0.000000
Documentary           0.000000
IMAX                  0.000000
Western               0.000000
Film-Noir             0.000000
(no genres listed)    0.000000
voteCount             0.239547
year                  0.864532
dtype: float64

In [49]:
actual_value = X_train.loc[0]
actual_value

rating                0.760207
Adventure             1.000000
Animation             1.000000
Children              1.000000
Comedy                1.000000
Fantasy               1.000000
Romance               0.000000
Drama                 0.000000
Action                0.000000
Crime                 0.000000
Thriller              0.000000
Horror                0.000000
Mystery               0.000000
Sci-Fi                0.000000
War                   0.000000
Musical               0.000000
Documentary           0.000000
IMAX                  0.000000
Western               0.000000
Film-Noir             0.000000
(no genres listed)    0.000000
voteCount             0.652439
year                  0.801724
Name: 0, dtype: float64

In [50]:
math.sqrt(mean_squared_error(predicted_value, actual_value))

0.08847336159225858

## Build Model without voteCount Feature

In [51]:
X_train = X_train.drop('voteCount', axis=1)
X_train.head()

Unnamed: 0,rating,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),year
0,0.760207,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.801724
1,0.651515,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.801724
2,0.613248,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.801724
3,0.412698,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0.801724
4,0.571429,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.801724


In [52]:
model = NearestNeighbors(n_neighbors=7).fit(X_train)
_, indices = model.kneighbors(X_train.loc[0].values.reshape(1, -1))

In [53]:
indices

array([[   0, 2353, 3563, 2996, 1705, 9412, 6469]], dtype=int64)

In [54]:
movies_df.loc[indices[0]]

Unnamed: 0,movieId,rating,title,genres,voteCount,year
0,1,3.92093,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,1995.0
2353,3114,3.860825,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,97,1999.0
3563,4886,3.871212,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,132,2001.0
2996,4016,3.716216,"Emperor's New Groove, The (2000)",Adventure|Animation|Children|Comedy|Fantasy,37,2000.0
1705,2294,3.244444,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy,45,1998.0
9412,166461,3.45,Moana (2016),Adventure|Animation|Children|Comedy|Fantasy,10,2016.0
6469,53121,3.02381,Shrek the Third (2007),Adventure|Animation|Children|Comedy|Fantasy,21,2007.0


## RMSE 

In [55]:
predicted_value = X_train.loc[indices[0]].mean()
actual_value = X_train.loc[0]

math.sqrt(mean_squared_error(actual_value, predicted_value))

0.020838697839464175

In [56]:
X_train.to_csv('model.csv')