## In this project we build a content-based recommendation system on movie data from the IMDB site, to suggest new movies for users to watch

#### First we import required libraries:

In [81]:
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### reading movies and ratings datasets

In [82]:
# Load the two CSV files from the notebook's working directory.
# movies_df holds one row per movie; ratings_df holds one row per (user, movie) rating.
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')

### Show the size of our datasets:

In [83]:
# (rows, columns) of each table, as a quick sanity check on the load.
print(movies_df.shape)
print(ratings_df.shape)

(9742, 3)
(100836, 4)


### Show the head of datasets:

In [84]:
# Preview the first rows of the movies table.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [85]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Modify the movies_df data set and split the year of movie:

In [86]:
# Split the release year out of the title into its own column.
# Raw strings keep the regex escapes (\( \d) from being read as string escapes.
# The capture group keeps only the four digits inside the parentheses,
# e.g. "Toy Story (1995)" -> "1995"; titles without such a year get NaN.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)
# regex=True must be explicit: pandas >= 2.0 treats the pattern as literal text
# by default, which would leave the "(1995)" suffix in the title.
movies_df['title'] = (
    movies_df['title']
    .str.replace(r'\(\d{4}\)', '', regex=True)
    .str.strip()  # remove the whitespace left behind where the year was
)
movies_df.head()

  movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


### Modify the movies_df dataset and split the genres of movies and make a copy of dataset:

In [87]:
# Turn the pipe-delimited genre string ("Comedy|Romance") into a Python list
# (["Comedy", "Romance"]) so the individual genres can be iterated later.
genre_lists = movies_df['genres'].str.split('|')
movies_df['genres'] = genre_lists
# Work on a copy from here on so movies_df keeps only the original columns.
moviesG_df = movies_df.copy()
moviesG_df


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,"[Action, Animation, Comedy, Fantasy]",2017
9738,193583,No Game No Life: Zero,"[Animation, Comedy, Fantasy]",2017
9739,193585,Flint,[Drama],2017
9740,193587,Bungo Stray Dogs: Dead Apple,"[Action, Animation]",2018


### in the copy of dataset, we create new columns based on genres of the movie. it shows category of the movie by 1 and 0s

In [88]:
# One-hot encode the genres: for every genre in a movie's genre list, set the
# column named after that genre to 1 on that movie's row. Columns are created
# on first use, so every other row of a new column starts out as NaN.
for index, row in movies_df.iterrows():
    for genre in row['genres']:
        moviesG_df.at[index, genre] = 1

# Replace NaN with 0 in the genre columns only, meaning "movie does not have
# this genre". A frame-wide fillna(0) would also overwrite a missing value in
# the original columns (e.g. a movie whose title had no year) with 0.
genre_columns = [col for col in moviesG_df.columns if col not in movies_df.columns]
moviesG_df[genre_columns] = moviesG_df[genre_columns].fillna(0)
moviesG_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
# Look at the ratings table again before cleaning it.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### we need to clean our rating dataset by columns we don't need like time

In [90]:
# The timestamp is not used by either recommender, so drop it.
# `columns=` is used because passing the axis positionally (drop('timestamp', 1))
# is rejected by pandas >= 2.0.
ratings_df = ratings_df.drop(columns='timestamp')
ratings_df.head()

  ratings_df = ratings_df.drop('timestamp', 1)


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


### Now we have our movie and rating dataset clean!
Now it's time to get new_user input:

In [91]:
# Movies the new user has already seen, with the rating they gave each one.
# Titles must match the cleaned `title` column (year already stripped).
userInput = [
    {'title': title, 'rating': rating}
    for title, rating in [
        ('Breakfast Club, The', 5),
        ('Toy Story', 3.5),
        ('Jumanji', 2),
        ('Pulp Fiction', 5),
        ('Akira', 4.5),
    ]
]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [92]:
# userInput = [
#             {'title':'Grumpier Old Men', 'rating':5},
#             {'title':'Flint', 'rating':2.5},
#             {'title':'Jumanji', 'rating':2},
#              {'title':"Andrew Dice Clay: Dice Rules", 'rating':5},
#             {'title':'Father of the Bride Part II', 'rating':4.5}
#          ] 
# inputMovies = pd.DataFrame(userInput)
# inputMovies

## To create a recommendation system based on user content we have three steps:
1. Find the genres of the movies that the new user rated. To answer this question we need to build the User_Movie_matrix.

2. Find the favorite genres of the user and create the Weighted_movies_rating_matrix.

3. Compare the Weighted_movies_rating_matrix of the user with genreTable and return the movies with the highest similarity.

## First step: Create User Movie Matrix:

In [93]:
# we get the shape of moviesG_df dataframe
# (rows, columns) of the one-hot encoded movie table.
moviesG_df.shape

(9742, 24)

In [94]:
#Add inputid to inputMovies dataframe by finding similar title in both dataframse by <.isin()> code
inputId = moviesG_df[moviesG_df['title'].isin(inputMovies['title'])]
inputId


Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
257,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
973,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1445,1968,"Breakfast Club, The","[Comedy, Drama]",1985,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
# Attach the user's rating to each matched catalogue row. The join key is named
# explicitly instead of relying on "all columns the two frames share".
inputMovies = pd.merge(inputId, inputMovies, on='title')
# The genre list and year are not needed for the profile computation.
# `columns=` replaces the positional axis argument that pandas >= 2.0 rejects.
inputMovies = inputMovies.drop(columns=['genres', 'year'])

inputMovies



  inputMovies = inputMovies.drop('genres', 1).drop('year', 1)
  inputMovies = inputMovies.drop('genres', 1).drop('year', 1)


Unnamed: 0,movieId,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),rating
0,1,Toy Story,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
1,2,Jumanji,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,296,Pulp Fiction,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,1274,Akira,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
4,1968,"Breakfast Club, The",0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


### Create User Genre Matrix:

In [96]:
# User genre matrix: keep only the 0/1 genre indicator columns of the movies the
# user rated by dropping the identifier and rating columns.
# `columns=` replaces the positional axis argument that pandas >= 2.0 rejects.
userGenreTable = inputMovies.drop(columns=['movieId', 'title', 'rating'])
userGenreTable

  userGenreTable = inputMovies.drop('movieId', 1).drop('title', 1).drop('rating' , 1)
  userGenreTable = inputMovies.drop('movieId', 1).drop('title', 1).drop('rating' , 1)
  userGenreTable = inputMovies.drop('movieId', 1).drop('title', 1).drop('rating' , 1)


Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
# The user's ratings, in the same row order as userGenreTable.
inputMovies['rating']

0    3.5
1    2.0
2    5.0
3    4.5
4    5.0
Name: rating, dtype: float64

## Second Step: Create Weighted User Rating Matrix 
by multiplying userGenreTable to userRating

In [98]:
# Genre indicator matrix of the rated movies (one row per movie, one column per genre).
userGenreTable

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
# The rating vector and the genre matrix must have the same number of rows
# for the weighted sum in the next step.
print(inputMovies['rating'].shape)
print(userGenreTable.shape)

(5,)
(5, 20)


In [100]:
# Ratings that will weight each movie's genre row.
inputMovies['rating']

0    3.5
1    2.0
2    5.0
3    4.5
4    5.0
Name: rating, dtype: float64

In [101]:
# Weighted genre vector (the raw user profile): for each genre, add up the
# ratings of the rated movies that carry that genre. Transposing puts genres on
# the rows so the dot product with the rating vector yields one value per genre.

userProfile = userGenreTable.T.dot(inputMovies['rating'])
userProfile


Adventure             10.0
Animation              8.0
Children               5.5
Comedy                13.5
Fantasy                5.5
Romance                0.0
Drama                 10.0
Action                 4.5
Crime                  5.0
Thriller               5.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 4.5
War                    0.0
Musical                0.0
Documentary            0.0
IMAX                   0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [102]:
# Normalise the profile so the genre weights add up to 1.
# Series.sum() is the vectorised equivalent of iterating with the built-in sum().
userProfile = userProfile / userProfile.sum()
userProfile

Adventure             0.139860
Animation             0.111888
Children              0.076923
Comedy                0.188811
Fantasy               0.076923
Romance               0.000000
Drama                 0.139860
Action                0.062937
Crime                 0.069930
Thriller              0.069930
Horror                0.000000
Mystery               0.000000
Sci-Fi                0.062937
War                   0.000000
Musical               0.000000
Documentary           0.000000
IMAX                  0.000000
Western               0.000000
Film-Noir             0.000000
(no genres listed)    0.000000
dtype: float64

Now we have a weight for each of the user's genre preferences. This is known as the User Profile. Using it, we can recommend movies that match the user's preferences.


## Third step: Comparing the User Profile to GenreTable of the whole movies!

In [103]:
### Whole Movies GenreTable:
# Build the genre indicator matrix for every movie in the catalogue.
# movieId becomes the index so it survives the arithmetic with userProfile while
# the remaining non-genre columns are dropped.
# `columns=` replaces the positional axis argument that pandas >= 2.0 rejects.
genreTable = (
    moviesG_df
    .set_index('movieId')
    .drop(columns=['title', 'genres', 'year'])
)
genreTable.tail()

  genreTable = genreTable.drop('movieId' , 1).drop('title', 1).drop('genres', 1).drop('year', 1)
  genreTable = genreTable.drop('movieId' , 1).drop('title', 1).drop('genres', 1).drop('year', 1)
  genreTable = genreTable.drop('movieId' , 1).drop('title', 1).drop('genres', 1).drop('year', 1)
  genreTable = genreTable.drop('movieId' , 1).drop('title', 1).drop('genres', 1).drop('year', 1)


Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
193581,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193609,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Now Compare this to UserProfile and return the most similarities:

In [104]:
# Weight every movie's genre indicators by the user profile. This is an
# element-wise product aligned on the genre column names (not a matrix product),
# so each cell holds the profile weight if the movie has that genre, else 0.
# The index is still the movieId.
recommendationTable_df = genreTable.mul(userProfile, axis='columns')

# Similarity score of a movie = sum of its weighted genre cells.
recommendationTable_df['similarity'] = recommendationTable_df.sum(axis='columns')
recommendationTable_df

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,...,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),similarity
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.13986,0.111888,0.076923,0.188811,0.076923,0.0,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.594406
2,0.13986,0.000000,0.076923,0.000000,0.076923,0.0,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.293706
3,0.00000,0.000000,0.000000,0.188811,0.000000,0.0,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188811
4,0.00000,0.000000,0.000000,0.188811,0.000000,0.0,0.13986,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.328671
5,0.00000,0.000000,0.000000,0.188811,0.000000,0.0,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.00000,0.111888,0.000000,0.188811,0.076923,0.0,0.00000,0.062937,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.440559
193583,0.00000,0.111888,0.000000,0.188811,0.076923,0.0,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377622
193585,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.13986,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.139860
193587,0.00000,0.111888,0.000000,0.000000,0.000000,0.0,0.00000,0.062937,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174825


In [105]:
# Copy the index values into a regular 'movieId' column so they can be used as a
# merge key later. (The index of this frame holds the movieIds.)

recommendationTable_df['movieId' ] = recommendationTable_df.index
recommendationTable_df

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,...,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),similarity,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.13986,0.111888,0.076923,0.188811,0.076923,0.0,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.594406,1
2,0.13986,0.000000,0.076923,0.000000,0.076923,0.0,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.293706,2
3,0.00000,0.000000,0.000000,0.188811,0.000000,0.0,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188811,3
4,0.00000,0.000000,0.000000,0.188811,0.000000,0.0,0.13986,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.328671,4
5,0.00000,0.000000,0.000000,0.188811,0.000000,0.0,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188811,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.00000,0.111888,0.000000,0.188811,0.076923,0.0,0.00000,0.062937,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.440559,193581
193583,0.00000,0.111888,0.000000,0.188811,0.076923,0.0,0.00000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377622,193583
193585,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.13986,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.139860,193585
193587,0.00000,0.111888,0.000000,0.000000,0.000000,0.0,0.00000,0.062937,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174825,193587


In [106]:
# Move 'movieId' to the front: pop() removes the column and returns it,
# insert() puts it back at position 0.
movie_id_column = recommendationTable_df.pop('movieId')
recommendationTable_df.insert(loc=0, column='movieId', value=movie_id_column)
recommendationTable_df

Unnamed: 0_level_0,movieId,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),similarity
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0.13986,0.111888,0.076923,0.188811,0.076923,0.0,0.00000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.594406
2,2,0.13986,0.000000,0.076923,0.000000,0.076923,0.0,0.00000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.293706
3,3,0.00000,0.000000,0.000000,0.188811,0.000000,0.0,0.00000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188811
4,4,0.00000,0.000000,0.000000,0.188811,0.000000,0.0,0.13986,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.328671
5,5,0.00000,0.000000,0.000000,0.188811,0.000000,0.0,0.00000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,193581,0.00000,0.111888,0.000000,0.188811,0.076923,0.0,0.00000,0.062937,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.440559
193583,193583,0.00000,0.111888,0.000000,0.188811,0.076923,0.0,0.00000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377622
193585,193585,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.13986,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.139860
193587,193587,0.00000,0.111888,0.000000,0.000000,0.000000,0.0,0.00000,0.062937,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174825


In [107]:
# Replace the movieId index with a plain 0..n-1 index. drop=True discards the
# old index instead of adding it as a column (movieId is already a column).

recommendationTable_df = recommendationTable_df.reset_index(drop=True)
recommendationTable_df

Unnamed: 0,movieId,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),similarity
0,1,0.13986,0.111888,0.076923,0.188811,0.076923,0.0,0.00000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.594406
1,2,0.13986,0.000000,0.076923,0.000000,0.076923,0.0,0.00000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.293706
2,3,0.00000,0.000000,0.000000,0.188811,0.000000,0.0,0.00000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188811
3,4,0.00000,0.000000,0.000000,0.188811,0.000000,0.0,0.13986,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.328671
4,5,0.00000,0.000000,0.000000,0.188811,0.000000,0.0,0.00000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,0.00000,0.111888,0.000000,0.188811,0.076923,0.0,0.00000,0.062937,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.440559
9738,193583,0.00000,0.111888,0.000000,0.188811,0.076923,0.0,0.00000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377622
9739,193585,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.13986,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.139860
9740,193587,0.00000,0.111888,0.000000,0.000000,0.000000,0.0,0.00000,0.062937,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174825


### Now that we have the movieIds, we can return the recommendation Table by Movie's names:

In [108]:
# Attach the title, genre list and year from movies_df to every scored movie.
# The join key is named explicitly rather than inferred from shared columns.
Recom = pd.merge(movies_df, recommendationTable_df, on='movieId')

# Keep only the descriptive columns and the score. Selecting by name (instead of
# dropping column positions 4..23) keeps working if the number of genre columns
# ever changes.
Recom = Recom[['movieId', 'title', 'genres', 'year', 'similarity']]

# Rank the catalogue by similarity, best match first, with a fresh 0..n-1 index.
RecommendationTable_df = Recom.sort_values(by='similarity', ascending=False)
RecommendationTable_df = RecommendationTable_df.reset_index(drop=True)
RecommendationTable_df

Unnamed: 0,movieId,title,genres,year,similarity
0,134853,Inside Out,"[Adventure, Animation, Children, Comedy, Drama...",2015,0.734266
1,148775,Wizards of Waverly Place: The Movie,"[Adventure, Children, Comedy, Drama, Fantasy, ...",2009,0.685315
2,117646,Dragonheart 2: A New Beginning,"[Action, Adventure, Comedy, Drama, Fantasy, Th...",2000,0.678322
3,6902,Interstate 60,"[Adventure, Comedy, Drama, Fantasy, Mystery, S...",2002,0.678322
4,81132,Rubber,"[Action, Adventure, Comedy, Crime, Drama, Film...",2010,0.671329
...,...,...,...,...,...
9737,6299,"Winged Migration (Peuple migrateur, Le)",[Documentary],2001,0.000000
9738,1797,Everest,"[Documentary, IMAX]",1998,0.000000
9739,6290,House of 1000 Corpses,[Horror],2003,0.000000
9740,6289,Ghosts of the Abyss,"[Documentary, IMAX]",2003,0.000000


## متوجه شدم که در کد اصلی که در درس تدریس شده قسمت آخر کد مشکل دارد و جدول به دست آمده بر اساس شباهت نیست.
## پس من جدولی ساختم که در آن درصد شباهت را در ستون آخر آن نشان میدهد 

# Collaborative Filtering!

### Here we don't care about genres. We want to compute a weighted average score for the input user based on how similar other users are to them

### <img src="https://editor.analyticsvidhya.com/uploads/460031_9XZYM6B5Ly-ENYTkEtr9dA.png" width=800px>


In [109]:
# The input user's rated movies, still carrying the genre indicator columns.
inputMovies

Unnamed: 0,movieId,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),rating
0,1,Toy Story,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
1,2,Jumanji,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,296,Pulp Fiction,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,1274,Akira,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
4,1968,"Breakfast Club, The",0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [110]:
# Collaborative filtering ignores genres, so keep only the identifier, title and
# rating. Selecting by name (instead of dropping column positions 2..21) does not
# depend on how many genre columns exist and is safe to re-run.
inputMovies = inputMovies[['movieId', 'title', 'rating']]
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [116]:
# Ratings table after the timestamp column was removed.
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


### There are 610 users in the ratings dataframe. We have to measure the similarity between each of these users and our input user based on the movies they have both rated. So first we create a subset of ratings_df containing only the ratings given to the movies in our inputMovies dataset!

In [119]:
# Keep only the ratings given to movies that the input user has also rated,
# e.g. movieIds 1, 2, 296, 1274 and 1968 for the sample input above.
rated_an_input_movie = ratings_df['movieId'].isin(inputMovies['movieId'])
userSubset = ratings_df[rated_an_input_movie]
userSubset


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
16,1,296,3.0
320,4,296,1.0
422,4,1968,4.0
516,5,1,4.0
...,...,...,...
99510,609,296,4.0
99534,610,1,5.0
99552,610,296,5.0
99636,610,1274,5.0


In [137]:
# Group the shared-movie ratings by user. The key is passed as a plain string:
# with a one-element list (['userId']) pandas >= 2.0 yields 1-tuples such as
# (91,) as group keys when iterating, which would later end up as the userId.
userSubsetGroup = userSubset.groupby('userId')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fcaf3292ef0>

In [131]:
# Ratings that user 1 gave to the movies the input user also rated
# (the overlap between user 1 and the input user, not a similarity score).
userSubsetGroup.get_group(1)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
16,1,296,3.0


In [139]:
# Order the users by how many movies they share with the input user, most first.
# Iterating the groupby yields (userId, ratings-frame) pairs, so the result is a
# list of such pairs and the length of the frame is the number of shared movies.
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=lambda user_and_ratings: len(user_and_ratings[1]), reverse=True)
userSubsetGroup


[(91,
         userId  movieId  rating
  14121      91        1     4.0
  14122      91        2     3.0
  14173      91      296     4.5
  14316      91     1274     5.0
  14383      91     1968     3.0),
 (177,
         userId  movieId  rating
  24900     177        1     5.0
  24901     177        2     3.5
  24930     177      296     5.0
  25069     177     1274     2.0
  25129     177     1968     3.5),
 (219,
         userId  movieId  rating
  31524     219        1     3.5
  31525     219        2     2.5
  31554     219      296     4.0
  31628     219     1274     2.5
  31680     219     1968     3.0),
 (274,
         userId  movieId  rating
  39229     274        1     4.0
  39230     274        2     3.5
  39288     274      296     5.0
  39448     274     1274     4.0
  39549     274     1968     4.0),
 (298,
         userId  movieId  rating
  44535     298        1     2.0
  44536     298        2     0.5
  44555     298      296     4.5
  44620     298     1274     4.0
 

### For instance, we can see that user 91 has 5 movies in common with the input user, while user 601 has only 1

In [143]:
# Number of users who rated at least one of the input user's movies.
print('number of users with similar movies rated: ' , len(userSubsetGroup))

number of users with similar movies rated:  419


In [144]:
# Limit the comparison to the 50 users with the largest overlap with the input user.
userSubsetGroup = userSubsetGroup[:50]

### we have to find the similarity coefficient between users and input_user. for that we work with Pearson Correlation

### Now, we calculate the Pearson Correlation between input user and subset group, and store it in a dictionary, where the key is the user Id and the value is the coefficient.


In [197]:
# Pearson correlation between the input user and every candidate user,
# stored as {userId: coefficient rounded to 3 decimals}.
# `sqrt` comes from the import cell at the top of the notebook.
pearsonCorrelationDict = {}

# Sorting the input ratings by movieId does not depend on the candidate, so it is
# done once here instead of on every loop iteration.
inputMovies = inputMovies.sort_values(by='movieId')

for name, group in userSubsetGroup:
    # Sort the candidate's ratings the same way so the two rating lists line up
    # movie-by-movie when zipped below.
    group = group.sort_values(by='movieId')
    # N in the formula: number of movies the two users have in common.
    nRatings = len(group)
    # Input-user ratings restricted to the movies this candidate also rated.
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'])]
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()

    sum_x = sum(tempRatingList)
    sum_y = sum(tempGroupList)
    # Sums of squares / cross-products about the mean (x = input user, y = candidate).
    Sxx = sum(x * x for x in tempRatingList) - sum_x ** 2 / nRatings
    Syy = sum(y * y for y in tempGroupList) - sum_y ** 2 / nRatings
    Sxy = sum(x * y for x, y in zip(tempRatingList, tempGroupList)) - sum_x * sum_y / nRatings

    # A zero variance on either side makes the correlation undefined; store 0.0
    # (a float, like every other value) to mean "no measurable correlation".
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = round(Sxy / sqrt(Sxx * Syy), 3)
    else:
        pearsonCorrelationDict[name] = 0.0


In [198]:
# Order the users by their correlation coefficient (the dict value), highest first.
ranked_pairs = sorted(
    pearsonCorrelationDict.items(),
    key=lambda user_and_score: user_and_score[1],
    reverse=True,
)
sorted_pearsonCorrelationDict = dict(ranked_pairs)
sorted_pearsonCorrelationDict

{18: 1.0,
 132: 1.0,
 144: 1.0,
 434: 0.986,
 307: 0.966,
 298: 0.959,
 226: 0.944,
 182: 0.943,
 414: 0.938,
 608: 0.921,
 606: 0.915,
 330: 0.904,
 135: 0.87,
 122: 0.866,
 153: 0.866,
 160: 0.866,
 469: 0.816,
 480: 0.784,
 599: 0.767,
 274: 0.716,
 288: 0.601,
 357: 0.561,
 103: 0.522,
 202: 0.522,
 561: 0.522,
 322: 0.506,
 45: 0.5,
 66: 0.5,
 140: 0.5,
 219: 0.451,
 318: 0.445,
 91: 0.439,
 477: 0.439,
 217: 0.302,
 448: 0.302,
 156: 0.189,
 600: 0.184,
 50: 0.157,
 474: 0.117,
 483: 0.08,
 177: 0.0,
 68: 0.0,
 21: 0,
 64: 0.0,
 141: 0,
 610: -0.471,
 19: -0.5,
 63: -0.5,
 57: -0.739,
 107: -1.0}

In [199]:
# Turn the {userId: coefficient} dict into a DataFrame. orient='index' makes each
# dict key a row label, so the coefficients land in a single column named 0.
pearsonDF = pd.DataFrame.from_dict(sorted_pearsonCorrelationDict, orient='index')
pearsonDF

Unnamed: 0,0
18,1.0
132,1.0
144,1.0
434,0.986
307,0.966
298,0.959
226,0.944
182,0.943
414,0.938
608,0.921


In [200]:
# Give the single coefficient column a meaningful name.
pearsonDF.columns = ['similarity']
# The row labels are the user ids; copy them into a regular 'userId' column.
pearsonDF['userId'] = pearsonDF.index
# Replace the user-id row labels with a plain 0..n-1 index.
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()


Unnamed: 0,similarity,userId
0,1.0,18
1,1.0,132
2,1.0,144
3,0.986,434
4,0.966,307


In [202]:
# Keep the 30 most similar users (pearsonDF is already ordered by similarity, highest first).
topUsers = pearsonDF.head(30)


In [205]:
# The selected most-similar users and their similarity to the input user.
topUsers

Unnamed: 0,similarity,userId
0,1.0,18
1,1.0,132
2,1.0,144
3,0.986,434
4,0.966,307
5,0.959,298
6,0.944,226
7,0.943,182
8,0.938,414
9,0.921,608


### we found the users that have the most similarity to our input_user by their rated movies. Now we have to find the movies that these users rated higher to recommend them to our input_user:

In [207]:
# Attach every rating given by the selected neighbours. The inner join on
# userId keeps only users that appear in both topUsers and ratings_df.
topUsersRating = pd.merge(topUsers, ratings_df, on='userId', how='inner')
topUsersRating.head(5)

Unnamed: 0,similarity,userId,movieId,rating
0,1.0,18,1,3.5
1,1.0,18,2,3.0
2,1.0,18,6,4.0
3,1.0,18,16,4.5
4,1.0,18,32,4.0


### Now all we need to do is multiply each movie rating by its weight (the similarity), then sum up the weighted ratings and divide that sum by the sum of the weights.

We can do this easily by multiplying two columns, grouping the dataframe by movieId, and then dividing one summed column by the other:

The result reflects the combined opinion of all similar users about each candidate movie for the input user:


In [214]:
# Weight each neighbour's rating by how similar that neighbour is to the
# input user, and store the product in a new weightedRating column.
topUsersRating['weightedRating'] = topUsersRating['similarity'].mul(
    topUsersRating['rating']
)
topUsersRating.tail()

Unnamed: 0,similarity,userId,movieId,rating,weightedRating
20516,0.451,219,62394,2.5,1.1275
20517,0.451,219,63082,5.0,2.255
20518,0.451,219,64575,5.0,2.255
20519,0.451,219,64620,4.0,1.804
20520,0.451,219,66297,4.0,1.804


In [225]:
# Per movie, add up the neighbours' similarities (the weights) and their
# similarity-weighted ratings. Only the two columns that are needed get
# aggregated, instead of summing every column (userId, rating, ...) and
# discarding most of the result afterwards.
tempTopUsersRating = topUsersRating.groupby('movieId').agg(
    sum_similarity=('similarity', 'sum'),
    sum_weightedRating=('weightedRating', 'sum'),
)

# Weighted average score = sum(similarity * rating) / sum(similarity).
# NOTE: a movie whose similarities sum to exactly 0 would give inf/NaN here.
tempTopUsersRating['weighted average recommendation score'] = (
    tempTopUsersRating['sum_weightedRating'] / tempTopUsersRating['sum_similarity']
)

tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarity,sum_weightedRating,weighted average recommendation score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,22.346,77.84,3.483397
2,18.538,51.7935,2.79391
3,6.825,20.5055,3.004469
5,3.583,10.232,2.855708
6,10.311,39.4095,3.822083


In [230]:
# Copy the movieId index into a regular column and renumber the rows 0..n-1.
# assign() returns a NEW frame, so tempTopUsersRating is left untouched; a
# plain `recommendation_df = tempTopUsersRating` would only alias the same
# object and the added column would silently appear in tempTopUsersRating too.
recommendation_df = (
    tempTopUsersRating
    .assign(movieId=tempTopUsersRating.index)
    .reset_index(drop=True)
)
recommendation_df

Unnamed: 0,sum_similarity,sum_weightedRating,weighted average recommendation score,movieId
0,22.346,77.8400,3.483397,1
1,18.538,51.7935,2.793910,2
2,6.825,20.5055,3.004469,3
3,3.583,10.2320,2.855708,5
4,10.311,39.4095,3.822083,6
...,...,...,...,...
5654,0.767,2.3010,3.000000,183301
5655,0.866,0.4330,0.500000,184471
5656,0.938,2.3450,2.500000,184791
5657,1.000,4.5000,4.500000,185135


In [231]:
# Drop the two helper sums; only the final score and movieId are needed.
# The columns are named via the `columns=` keyword: passing the axis
# positionally (drop('col', 1)) triggers a FutureWarning and is rejected by
# newer pandas releases where `axis` is keyword-only.
recommendation_df = recommendation_df.drop(
    columns=['sum_similarity', 'sum_weightedRating']
)
recommendation_df

  recommendation_df = recommendation_df.drop('sum_similarity' , 1).drop('sum_weightedRating' , 1)
  recommendation_df = recommendation_df.drop('sum_similarity' , 1).drop('sum_weightedRating' , 1)


Unnamed: 0,weighted average recommendation score,movieId
0,3.483397,1
1,2.793910,2
2,3.004469,3
3,2.855708,5
4,3.822083,6
...,...,...
5654,3.000000,183301
5655,0.500000,184471
5656,2.500000,184791
5657,4.500000,185135


In [232]:
# Display the recommendation table (weighted average score and movieId).
recommendation_df

Unnamed: 0,weighted average recommendation score,movieId
0,3.483397,1
1,2.793910,2
2,3.004469,3
3,2.855708,5
4,3.822083,6
...,...,...
5654,3.000000,183301
5655,0.500000,184471
5656,2.500000,184791
5657,4.500000,185135


In [237]:
# Order the candidate movies from the highest to the lowest
# weighted average recommendation score.
recommendation_df = recommendation_df.sort_values(
    'weighted average recommendation score',
    ascending=False,
)
recommendation_df.head()

Unnamed: 0,weighted average recommendation score,movieId
302,5.0,456
1354,5.0,2314
5483,5.0,134881
2229,5.0,3925
2583,5.0,4634


In [248]:
# Join the scores with movies_df to get the title and the other movie
# details. The join key is stated explicitly: without `on=`, pandas joins on
# every column name the two frames share, so an extra shared column would
# silently change which rows match.
Recom = pd.merge(movies_df, recommendation_df, on='movieId', how='inner')
Recom

Unnamed: 0,movieId,title,genres,year,weighted average recommendation score
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,3.483397
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,2.793910
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,3.004469
3,5,Father of the Bride Part II,[Comedy],1995,2.855708
4,6,Heat,"[Action, Crime, Thriller]",1995,3.822083
...,...,...,...,...,...
5654,183301,The Tale of the Bunny Picnic,[Children],1986,3.000000
5655,184471,Tomb Raider,"[Action, Adventure, Fantasy]",2018,0.500000
5656,184791,Fred Armisen: Standup for Drummers,[Comedy],2018,2.500000
5657,185135,Sherlock - A Study in Pink,[Crime],2010,4.500000


In [249]:
# Final ranking: best weighted score first, then renumber the rows 0..n-1
# so the index matches the ranking position.
Recom = (
    Recom
    .sort_values(by='weighted average recommendation score', ascending=False)
    .reset_index(drop=True)
)

Recom


Unnamed: 0,movieId,title,genres,year,weighted average recommendation score
0,456,Fresh,"[Crime, Drama, Thriller]",1994,5.0
1,2314,Beloved,[Drama],1998,5.0
2,85,Angels and Insects,"[Drama, Romance]",1995,5.0
3,38304,No Direction Home: Bob Dylan,[Documentary],2005,5.0
4,5867,Thief,"[Crime, Drama, Thriller]",1981,5.0
...,...,...,...,...,...
5654,86320,Melancholia,"[Drama, Sci-Fi]",2011,0.5
5655,87485,Bad Teacher,[Comedy],2011,0.5
5656,149354,Sisters,"[Children, Comedy]",2015,0.5
5657,149352,Daddy's Home,[Comedy],2015,0.5


## متوجه شدم که در کد اصلی که در درس تدریس شده قسمت آخر کد مشکل دارد و جدول به دست آمده بر اساس میانگین امتیاز وزن دار شده نیست.
## پس من جدولی ساختم که در آن میانگین امتیاز وزن دار شده  را در ستون آخر آن نشان میدهد 

# Thanks for your wonderful course
# Cheers!