In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the movie catalogue and the user ratings table from CSV.
# NOTE(review): these are absolute, machine-specific paths -- adjust them before running elsewhere.
movies_csv = r'C:\Users\HP PC\Downloads\ml-latest\movies.csv'
ratings_csv = r'C:\Users\HP PC\Downloads\ml-latest\ratings.csv'
movies_df = pd.read_csv(movies_csv)
rating_df = pd.read_csv(ratings_csv)


In [None]:
# Preview the first rows of the movie catalogue.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Preview the first rows of the ratings table.
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [None]:
# (rows, columns) of the movie catalogue.
movies_df.shape

(34208, 3)

In [None]:
# (rows, columns) of the ratings table.
rating_df.shape

(22884377, 4)

In [None]:
# The timestamp column plays no part in this analysis, so drop it.
# Done in place to avoid copying the very large ratings frame.
rating_df.drop(columns=['timestamp'], inplace=True)

To recommend movies to a user, a recommendation system needs input about the user's past experiences.
In our system, we'll ask the user which movies they have already watched and how they rated each one.

In [None]:
# Interactively collect five (title, rating) pairs from the user.
print('Enter latest 5 movies that you have watched along with rating:')
print('Remember: First letter of each movie must be capital. If movie has \'The\' keyword write like \'Toy Story, The\'')
movie=[]
rating=[]
for _ in range(5):
    print('movie:')
    # Titles are matched exactly against the catalogue later, so trim stray whitespace.
    movie.append(input().strip())
    # Re-prompt until the rating parses as a number; otherwise a typo would only
    # surface later, when the rating column is cast to float.
    while True:
        print('rating:')
        raw_rating=input().strip()
        try:
            float(raw_rating)
        except ValueError:
            print('Rating must be a number, e.g. 3.5')
            continue
        break
    # Stored as the entered text (as before); a later cell converts the column to float.
    rating.append(raw_rating)

# Column-oriented dict, ready to be turned into a DataFrame.
user_input={'title':movie,'rating':rating}

Enter latest 5 movies that you have watched along with rating:
Remember: First letter of each movie must be capital. If movie has 'The' keyword write like 'Toy Story, The'
movie:
Junior
rating:
3
movie:
Kiss of Death
rating:
4
movie:
Little Women
rating:
2.5
movie:
Fluke
rating:
4.5
movie:
My Family
rating:
5


In [None]:
# Turn the collected answers into a two-column frame (title, rating) and show it.
user_input_df = pd.DataFrame.from_dict(user_input)
user_input_df

Unnamed: 0,title,rating
0,Junior,3.0
1,Kiss of Death,4.0
2,Little Women,2.5
3,Fluke,4.5
4,My Family,5.0


In [None]:
# In movies_df the release year is embedded in the title as "(YYYY)"; it's better to separate them.
# Raw strings avoid invalid-escape warnings for \( and \d.
# Capture only the four digits of the first "(YYYY)" group.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)
# regex=True is explicit because newer pandas treats the pattern as a literal by default,
# which would leave the year in the title and break the title matching further down.
# .str.strip() (unlike a Python-level lambda) also tolerates missing titles.
movies_df['title'] = (
    movies_df['title']
    .str.replace(r'\(\d{4}\)', ' ', regex=True)
    .str.strip()
)

In [None]:
# Column dtypes of the ratings table.
rating_df.dtypes

userId       int64
movieId      int64
rating     float64
dtype: object

In [None]:
# Column dtypes of the user's input (ratings were typed in, so check how they were stored).
user_input_df.dtypes

title     object
rating    object
dtype: object

In [None]:
# Convert the typed-in ratings to floats so they can be used in arithmetic.
user_input_df = user_input_df.astype({'rating': float})

### Collaborative Filtering

In this type of recommendation system, a movie is recommended to the user based on the preferences of other, similar users.
That's why it is also called a user-user recommendation system.

The process for creating a user-based recommendation system is as follows:
- Select a user together with the movies that user has watched
- Based on the user's movie ratings, find the top X neighbours
- Get the watched-movie records of each neighbour
- Calculate a similarity score for each neighbour using some formula
- Recommend the items with the highest score


In [None]:
# Resolve the titles the user typed to catalogue movieIds.
wanted_titles = user_input_df['title'].tolist()
title_matches = movies_df.loc[movies_df['title'].isin(wanted_titles), ['title', 'movieId']]
# Several films can share a title once the year has been stripped (e.g. remakes).
# Keep exactly one row per title -- the lowest movieId, which is what the previous
# "sort then take the first 5 rows" did when titles were unique -- so that a duplicated
# title can no longer push another of the user's movies out of the selection.
input_movies = (
    title_matches
    .sort_values(by='movieId')
    .drop_duplicates(subset='title', keep='first')
)
# Attach the user's rating to each resolved movie; 'title' is the shared key.
input_movies = pd.merge(input_movies, user_input_df, on='title')
input_movies

Unnamed: 0,title,movieId,rating
0,Fluke,241,4.5
1,Junior,256,3.0
2,Kiss of Death,259,4.0
3,Little Women,261,2.5
4,My Family,279,5.0


In [None]:
# Keep only the ratings that other users gave to the movies the input user has rated.
watched_ids = input_movies['movieId'].tolist()
rated_a_watched_movie = rating_df['movieId'].isin(watched_ids)
user_subset = rating_df[rated_a_watched_movie]
user_subset.head()

Unnamed: 0,userId,movieId,rating
1315,17,241,1.0
1322,17,261,3.0
4735,40,261,5.0
7900,77,261,4.0
8397,92,261,4.0


In [None]:
# Group the overlapping ratings per user.
# Group by the column name itself rather than a one-element list: with a list key,
# newer pandas yields 1-tuples such as (3937,) as group names when iterating, which
# would end up as the userId keys of the similarity dictionary built further down.
user_subset_group=user_subset.groupby('userId')

In [None]:
# Order the (userId, ratings) pairs so that users sharing the most movies with the
# input user come first; ties keep their original relative order.
user_subset_group = list(user_subset_group)
user_subset_group.sort(key=lambda pair: len(pair[1]), reverse=True)

In [None]:
# Inspect the three users with the largest overlap: each entry is (userId, that user's ratings).
user_subset_group[0:3]

[(3937,         userId  movieId  rating
  360522    3937      241     4.0
  360527    3937      256     4.0
  360529    3937      259     3.0
  360531    3937      261     2.0
  360537    3937      279     4.0), (6407,         userId  movieId  rating
  598292    6407      241     3.0
  598301    6407      256     3.0
  598303    6407      259     3.0
  598305    6407      261     3.0
  598316    6407      279     3.0), (6530,         userId  movieId  rating
  611581    6530      241     2.5
  611592    6530      256     3.0
  611595    6530      259     3.0
  611597    6530      261     3.0
  611607    6530      279     3.5)]

In [None]:
# To save time, we'll compute the similarity score only for the first 100 users

In [None]:
# Cap the number of candidate neighbours to keep the similarity computation quick.
MAX_NEIGHBOURS = 100
user_subset_group = user_subset_group[:MAX_NEIGHBOURS]

In [None]:
# For each user in user_subset_group above, we'll compute a similarity score against the input user
# The similarity scores will be stored in a dictionary keyed by userId

We'll calculate the similarity score using the Pearson correlation coefficient.
![2b9c2079a3ffc1aacd36201ea0a3fb2460dc226f.svg](attachment:2b9c2079a3ffc1aacd36201ea0a3fb2460dc226f.svg)


$n$ is the sample size  
$x_i$, $y_i$ are the individual sample points indexed with $i$  
$\bar{x}$, $\bar{y}$ are the sample means of $x$ and $y$

In [None]:
# NOTE(review): `pow` is unused in this cell; it stays in the import only so the
# notebook namespace is unchanged for any other cell that may rely on it.
from math import pow,sqrt

# Pearson correlation between the input user and every candidate neighbour,
# stored as {userId: similarity}.
pearson_corr_dict={}
input_ratings=input_movies[['movieId','rating']]
for name,group in user_subset_group:
    # Pair the two users' ratings explicitly on movieId. Pairing two separately sorted
    # arrays by position would silently truncate or mis-pair them if their row counts differed.
    paired=group[['movieId','rating']].merge(
        input_ratings,on='movieId',suffixes=('_neighbour','_input'))
    n=len(paired)
    X=paired['rating_neighbour'].values
    Y=paired['rating_input'].values
    # Computational form of Pearson's r:
    #   r = (n*sum(xy) - sum(x)*sum(y)) / sqrt((n*sum(x^2) - sum(x)^2) * (n*sum(y^2) - sum(y)^2))
    cov=n*(X*Y).sum()-X.sum()*Y.sum()
    varX=n*(X**2).sum()-X.sum()**2
    varY=n*(Y**2).sum()-Y.sum()**2
    if varX==0 or varY==0:
        # One side rated every shared movie identically (or there is no overlap):
        # the correlation is undefined, so treat it as "no similarity".
        pearson_corr_dict[name]=0
    else:
        pearson_corr_dict[name]=cov/sqrt(varX*varY)

In [None]:
# Tabulate the similarity dictionary: one row per neighbour, in dictionary order,
# with a fresh 0..n-1 index.
pearson_df = pd.DataFrame(
    {
        'Similarity_Score': list(pearson_corr_dict.values()),
        'userId': list(pearson_corr_dict.keys()),
    }
)
pearson_df.head()

Unnamed: 0,Similarity_Score,userId
0,0.646997,3937
1,0.0,6407
2,0.170499,6530
3,-0.940064,27164
4,-0.646997,33400


In [None]:
# Rank the neighbours from most to least similar.
pearson_df = pearson_df.sort_values(by='Similarity_Score', ascending=False)
pearson_df.head()

Unnamed: 0,Similarity_Score,userId
84,0.981336,113871
48,0.948683,31144
46,0.948683,28205
44,0.911322,23696
71,0.845154,81732


In [None]:
# Renumber the rows 0..n-1 so the index reflects the similarity ranking.
pearson_df = pearson_df.reset_index(drop=True)

In [None]:
# Most similar neighbours after re-indexing.
pearson_df.head()

Unnamed: 0,Similarity_Score,userId
0,0.981336,113871
1,0.948683,31144
2,0.948683,28205
3,0.911322,23696
4,0.845154,81732


In [None]:
# Number of neighbours scored (rows) and columns.
pearson_df.shape

(100, 2)

In [None]:
# Now we need the ratings that the selected users gave to the movies they have watched

In [None]:
# Attach each neighbour's similarity score to every rating that neighbour has given.
# userId is the only column the two frames share, so it is the join key.
top_user_ratings = pd.merge(pearson_df, rating_df, on='userId', how='inner')

In [None]:
# Display the neighbours' ratings together with their similarity scores.
top_user_ratings

Unnamed: 0,Similarity_Score,userId,movieId,rating
0,0.981336,113871,2,4.0
1,0.981336,113871,6,4.0
2,0.981336,113871,8,3.0
3,0.981336,113871,10,3.0
4,0.981336,113871,11,5.0
...,...,...,...,...
82216,-0.948683,95296,4277,4.0
82217,-0.948683,95296,4310,4.0
82218,-0.948683,95296,4369,4.0
82219,-0.948683,95296,4370,5.0


In [None]:
# Weight every rating by how similar its author is to the input user.
neighbour_similarity = top_user_ratings['Similarity_Score']
top_user_ratings['weighted rating'] = neighbour_similarity.mul(top_user_ratings['rating'])

In [None]:
# Total similarity-weighted rating per movie.
# Movies the input user has already rated are excluded first; otherwise the
# recommender hands the user's own input movies straight back.
already_rated=top_user_ratings['movieId'].isin(input_movies['movieId'])
top_movies=top_user_ratings.loc[~already_rated,['movieId','weighted rating']].groupby(by='movieId').sum()
# NOTE(review): a plain sum favours movies rated by many neighbours; consider whether
# dividing by the summed similarity (a weighted average) is the intended score.

In [None]:
# Highest aggregated score first.
top_movies = top_movies.sort_values('weighted rating', ascending=False)

In [None]:
# Suppose we want to recommend the top 20 movies to the user:
# keep the first 20 rows and turn the movieId index back into a regular column.
top_movies = top_movies.iloc[:20].reset_index()

In [None]:
# Look up the title of each recommended movie; movieId is the shared key.
movie_titles = movies_df[['movieId', 'title']]
top_movies = pd.merge(top_movies, movie_titles, on='movieId')

In [None]:
# The score was only needed for ranking; leave just movieId and title.
top_movies = top_movies.drop(columns=['weighted rating'])

In [None]:
# Collaborative-filtering recommendations.
top_movies

Unnamed: 0,movieId,title
0,279,My Family
1,477,What's Love Got to Do with It?
2,257,Just Cause
3,317,"Santa Clause, The"
4,354,Cobb
5,32,Twelve Monkeys (a.k.a. 12 Monkeys)
6,222,Circle of Friends
7,380,True Lies
8,344,Ace Ventura: Pet Detective
9,338,Virtuosity


### Content Based Filtering

This type of filtering is also called item-item filtering. In this type of filtering, an item is recommended based on its attributes.
The attribute in this case is the movie's genres.

In [None]:
# Turn the pipe-delimited genre string of every movie into a list of genre names.
genre_strings = movies_df['genres']
movies_df['genres'] = genre_strings.str.split(pat='|')

In [None]:
# One-hot encode the genre lists: one float column per genre, 1.0 where the movie
# has that genre and 0.0 otherwise.
# Done vectorised instead of writing into movies_df cell by cell while iterating over
# its own rows, which pandas documents as unsafe and which is slow on the full catalogue.
genre_lists=movies_df['genres']
# Genre names in order of first appearance, so the new columns keep the order the
# row-by-row loop used to produce.
genre_order=list(dict.fromkeys(g for genres in genre_lists for g in genres))
genre_flags=(
    genre_lists.str.join('|')
    .str.get_dummies(sep='|')
    .reindex(columns=genre_order,fill_value=0)
    .astype(float)
)
# Drop any genre columns left from a previous run of this cell so re-running does not
# create duplicate columns.
movies_df=pd.concat([movies_df.drop(columns=genre_order,errors='ignore'),genre_flags],axis=1)
# NOTE(review): kept for backward compatibility -- this also replaces a missing `year` with 0.
movies_df=movies_df.fillna(0)
movies_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# The list-valued genres column is redundant now that each genre has its own column.
movies_df = movies_df.drop(columns=['genres'])

In [None]:
# Reminder of the input user's movies and ratings.
input_movies

Unnamed: 0,title,movieId,rating
0,Fluke,241,4.5
1,Junior,256,3.0
2,Kiss of Death,259,4.0
3,Little Women,261,2.5
4,My Family,279,5.0


In [None]:
# Catalogue rows (with genre indicator columns) for the movies the user rated.
is_user_movie = movies_df['movieId'].isin(input_movies['movieId'].tolist())
user_movies = movies_df[is_user_movie]
user_movies

Unnamed: 0,movieId,title,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
238,241,Fluke,1995,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
253,256,Junior,1994,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
256,259,Kiss of Death,1995,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
258,261,Little Women,1994,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276,279,My Family,1995,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Genre indicator matrix for the user's movies, one row per rated movie.
# Rows are put in the same order as input_movies (matched on movieId) so that row i
# lines up with rating i in the dot product that builds the user profile, instead of
# relying on both frames happening to be sorted the same way.
# The descriptive columns are dropped by name rather than by position, so the
# selection does not depend on the column layout.
user_genres=(
    user_movies.set_index('movieId')
    .loc[input_movies['movieId']]
    .drop(columns=['title','year'])
    .reset_index(drop=True)
)

In [None]:
# User profile: for every genre, the sum of the user's ratings over the movies in that genre.
genres_by_movie = user_genres.T
user_profile = genres_by_movie.dot(input_movies['rating'])

In [None]:
# Per-genre weights derived from the user's ratings.
user_profile

Adventure              0.0
Animation              0.0
Children               4.5
Comedy                 3.0
Fantasy                0.0
Romance                0.0
Drama                 16.0
Action                 0.0
Crime                  4.0
Thriller               4.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 3.0
IMAX                   0.0
Documentary            0.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

Now we'll calculate a score for each movie by multiplying the profile values above by the movie's genre indicators and adding them up.

In [None]:
# Genre indicator matrix for the whole catalogue, indexed by movieId.
# drop() returns a new frame, so movies_df itself is left untouched.
genre_table = movies_df.drop(columns=['title', 'year']).set_index('movieId')



In [None]:
# Score every movie: weight its genre indicators by the user profile, add them up,
# and normalise by the total profile weight.
weighted_genres = genre_table.mul(user_profile, axis='columns')
recommendation_table = weighted_genres.sum(axis='columns').div(user_profile.sum())
# Best-matching movies first.
recommendation_table = recommendation_table.sort_values(ascending=False)
recommendation_table.shape

(34208,)

In [None]:
# Titles of the 20 best-scoring movies, in ranking order.
# A left merge from the ranked ids keeps the score order; filtering movies_df with
# isin() would return the same movies in catalogue order and lose the ranking.
top_ids=pd.DataFrame({'movieId':recommendation_table.index[0:20]})
movies=top_ids.merge(movies_df[['movieId','title']],on='movieId',how='left')

In [None]:
# Content-based recommendations.
movies

Unnamed: 0,movieId,title
4625,4719,Osmosis Jones
4923,5018,Motorama
9226,27171,Freeway II: Confessions of a Trickbaby
9383,27549,Dead or Alive: Final
9428,27674,11:14
9494,27790,Millions
10406,36804,Bookies
15001,75408,Lupin III: Sweet Lost Night (Rupan Sansei: Swe...
16055,81132,Rubber
18300,91286,"Little Colonel, The"


### Highest rated movies

In [None]:
# Mean rating per movie with its title attached, best-rated first.
# (The earlier intermediate sort was dropped: its result was discarded, so it had no effect.)
# NOTE(review): there is no minimum number of ratings, so a movie with a single 5.0
# rating ranks at the very top -- consider requiring a minimum rating count.
highest_rated=(
    rating_df[['movieId','rating']]
    .groupby(by='movieId')
    .mean()
    .reset_index()
    .merge(movies_df[['movieId','title']],on='movieId')
    .sort_values(by='rating',ascending=False)
)

In [None]:
# The 20 movies with the highest mean rating.
highest_rated.head(20)

Unnamed: 0,movieId,rating,title
30653,140443,5.0,Return to Mayberry
29783,137078,5.0,Stranger in My House
32405,146433,5.0,Big Love
29774,137052,5.0,A Job to Kill For
29773,137050,5.0,The Rival
29772,137048,5.0,Perfect Child
29771,137046,5.0,The Perfect Assistant
22159,106517,5.0,De la servitude moderne
29768,137040,5.0,Hunger Point
32443,146650,5.0,The C-Word
