## Movie Recommendation System

In [8]:
# Import all the neceesary Packages 
import numpy as np
import pandas as pd
import sklearn
from sklearn.decomposition import TruncatedSVD
from fuzzywuzzy import process
#from sklearn.externals import joblib

## Read Movie and Rating CSV Files using Pandas

In [18]:
import pandas as pd
from io import StringIO

movie_dataFrame = pd.read_csv(r'C:\Users\PAVAN\Desktop\python project\python_project\movie_recommendation-master\MyApp\movies.csv')
print(movie_dataFrame.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [21]:
rating_dataFrame = pd.read_csv(r'C:\Users\PAVAN\Desktop\python project\python_project\movie_recommendation-master\MyApp\ratings.csv')
print(rating_dataFrame.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


## Data Analysis

In [20]:
### we will first check the null values present in our data
print(movie_dataFrame.isnull().sum())
print()
print(rating_dataFrame.isnull().sum())

movieId    0
title      0
genres     0
dtype: int64

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


## Merge both the data frame and delete those columns which are not required

In [22]:
overall_movie_rating = pd.merge(rating_dataFrame, movie_dataFrame, on = 'movieId')
overall_movie_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [23]:
columns = ['timestamp', 'genres']
overall_movie_rating = overall_movie_rating.drop(columns, axis = 1)
overall_movie_rating.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [24]:
overall_movie_rating['title'].isnull().sum()
overall_ratingCount = (overall_movie_rating.groupby(by = ['title'])['rating'].count().reset_index().rename(columns = {'rating': 'totalRatingCount'}))
print(overall_ratingCount.shape)
overall_ratingCount.head()

(9719, 2)


Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [25]:
rating_with_totalRatingCount = overall_movie_rating.merge(overall_ratingCount, left_on = 'title', right_on = 'title', how = 'left')
print(rating_with_totalRatingCount.shape)
rating_with_totalRatingCount.head(20)

(100836, 5)


Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215
5,18,1,3.5,Toy Story (1995),215
6,19,1,4.0,Toy Story (1995),215
7,21,1,3.5,Toy Story (1995),215
8,27,1,3.0,Toy Story (1995),215
9,31,1,5.0,Toy Story (1995),215


## Remove Duplicate Records

In [26]:
user_rating = rating_with_totalRatingCount.drop_duplicates(['userId','title'])
user_rating.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


## Matrix Factorization using SVD

In [27]:
#### Create matrix of the user_rating data frame 
movie_user_rating_pivot = user_rating.pivot(index = 'userId', columns = 'title', values = 'rating')
movie_user_rating_pivot = movie_user_rating_pivot.fillna(0)
movie_user_rating_pivot.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
## Transpose the above  matrix so that the column (movies) becomes rows(userId) and the userId comes to the column
X = movie_user_rating_pivot.T

## Fit the Model

In [29]:
## Fit the model on using X data
## so we will import sckit learn


SVD = TruncatedSVD(n_components=17, random_state=17)
matrix = SVD.fit_transform(X)
print(SVD.explained_variance_ratio_.sum()*100)
matrix.shape

40.45883238627482


(9719, 17)

## Pearson’s R correlation

In [30]:
import warnings
warnings.filterwarnings("ignore",category =RuntimeWarning)
corr = np.corrcoef(matrix)
corr.shape

(9719, 9719)

## Testing

In [31]:
movie_name = input()
all_movies_name = movie_user_rating_pivot.columns
movieList = list(all_movies_name)
idx = process.extractOne(movie_name, movie_dataFrame['title'])[0]
movie_index = movieList.index(idx)
print(movie_index)

hulk
4112


In [32]:
myPrediction = corr[movie_index]
finalPrediction = list(all_movies_name[(myPrediction >= 0.9)])
print('Users who watched ' + idx + ' also watched')
for movies in finalPrediction:
    print(movies)

Users who watched Hulk (2003) also watched
Hulk (2003)
Serenity (2005)
Star Trek: Nemesis (2002)
