# Building a Movie Recommendation System Using ML

In [1]:
#importing Libraries
import numpy as np
import pandas as pd
import warnings

In [2]:
# Suppress every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): with no category given this filter is a blanket one, so it also
# hides deprecation and parser warnings raised by later cells - consider
# narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

In [3]:
# Load the raw ratings file into df. The file is tab-separated and is read
# without a header row, so the four column names are supplied explicitly.
columns_name = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(
    'u.data',
    sep="\t",
    header=None,
    names=columns_name,
)

In [5]:
#prints the first 5 values of the dataset
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [7]:
#gives row and column size of the dataframe
df.shape

(100000, 4)

In [8]:
#displays all the user_ids
df['user_id']


0        196
1        186
2         22
3        244
4        166
        ... 
99995    880
99996    716
99997    276
99998     13
99999     12
Name: user_id, Length: 100000, dtype: int64

In [9]:
#shows unique numbers of user id
df['user_id'].nunique()

943

In [10]:
#shows unique numbers of movies
df['item_id'].nunique()

1682

In [4]:
# Load the movie metadata file into movies_title. Fields are separated by a
# literal pipe character and there is no header row, so the columns get integer
# labels (0, 1, 2, ...). The file is decoded as ISO-8859-1 (presumably because
# some titles contain non-ASCII characters).
# The separator is a plain "|": the backslash-escaped form is an invalid string
# escape in Python and, being longer than one character, is interpreted by
# pandas as a regular expression, which forces the slower Python parser engine.
movies_title = pd.read_csv('u.item', sep="|", header=None, encoding="ISO-8859-1")

In [13]:
#gives row and column size of movie_title
movies_title.shape

(1682, 24)

In [14]:
#prints the first 5 values of the dataset
movies_title.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# Keep only the first two columns of movies_title (labelled 0 and 1), name them
# item_id and title, and preview the first five rows of the result.
movies_titles = movies_title[[0, 1]].rename(columns={0: "item_id", 1: "title"})
movies_titles.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [6]:
# Attach each movie's title to its rating rows by joining the two frames on
# their shared item_id column (pandas' default inner join).
df = df.merge(movies_titles, on="item_id")

In [17]:
#displaying the dataset df which is now merged with movies_title
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [18]:
#tail can be used to display the last 5 enteries
df.tail()

Unnamed: 0,user_id,item_id,rating,timestamp,title
99995,840,1674,4,891211682,Mamma Roma (1962)
99996,655,1640,3,888474646,"Eighth Day, The (1996)"
99997,655,1637,3,888984255,Girls Town (1996)
99998,655,1630,3,887428735,"Silence of the Palace, The (Saimt el Qusur) (1..."
99999,655,1641,3,887427810,Dadetown (1995)


In [7]:
# Average rating per movie title, stored as a one-column DataFrame (column
# 'rating', indexed by title). The rating column is selected *before*
# aggregating so only it is averaged; taking the mean of the whole grouped
# frame needlessly averages user_id, item_id and timestamp, and raises on
# recent pandas versions as soon as a non-numeric column is present.
ratings = df.groupby('title')['rating'].mean().to_frame()

In [20]:
ratings.head()

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
'Til There Was You (1997),2.333333
1-900 (1994),2.6
101 Dalmatians (1996),2.908257
12 Angry Men (1957),4.344
187 (1997),3.02439


In [8]:
# Add a column holding how many ratings each title received. Only the rating
# column is counted (count() tallies non-null values), and the resulting Series
# is assigned directly so pandas aligns it with ratings on the title index.
ratings['num of ratings'] = df.groupby('title')['rating'].count()

In [22]:
ratings.head()

Unnamed: 0_level_0,rating,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),2.333333,9
1-900 (1994),2.6,5
101 Dalmatians (1996),2.908257,109
12 Angry Men (1957),4.344,125
187 (1997),3.02439,41



# Creating the Recommender System

In [9]:
# Reshape the ratings into a user-by-movie matrix: one row per user_id, one
# column per title, each cell holding that user's rating of that title.
# pivot_table aggregates with the mean by default, so repeated (user, title)
# pairs are averaged; combinations with no rating come out as NaN.
moviemat = pd.pivot_table(df, index="user_id", columns="title", values="rating")

In [26]:
moviemat.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


# Building a Custom Recommendation for Star Wars

#create a star wars object

In [10]:
starwars_user_ratings=moviemat['Star Wars (1977)']

In [11]:
starwars_user_ratings.head(20)

user_id
1     5.0
2     5.0
3     NaN
4     5.0
5     4.0
6     4.0
7     5.0
8     5.0
9     5.0
10    5.0
11    NaN
12    4.0
13    5.0
14    5.0
15    5.0
16    NaN
17    NaN
18    4.0
19    NaN
20    3.0
Name: Star Wars (1977), dtype: float64

In [12]:
#correlating every movie in the dataset with Star Wars. This tells us which movies' ratings are correlated with the ratings of Star Wars

In [13]:
similar_to_starwars=moviemat.corrwith(starwars_user_ratings)

In [14]:
similar_to_starwars

title
'Til There Was You (1997)                0.872872
1-900 (1994)                            -0.645497
101 Dalmatians (1996)                    0.211132
12 Angry Men (1957)                      0.184289
187 (1997)                               0.027398
                                           ...   
Young Guns II (1990)                     0.228615
Young Poisoner's Handbook, The (1995)   -0.007374
Zeus and Roxanne (1997)                  0.818182
unknown                                  0.723123
Á köldum klaka (Cold Fever) (1994)            NaN
Length: 1664, dtype: float64

In [15]:
# Turn the correlation Series into a DataFrame with a single 'correlation'
# column, still indexed by movie title.
corr_starwars = similar_to_starwars.to_frame(name='correlation')



In [16]:
# Discard movies whose correlation with Star Wars is undefined (NaN).
corr_starwars = corr_starwars.dropna()

In [17]:
corr_starwars

Unnamed: 0_level_0,correlation
title,Unnamed: 1_level_1
'Til There Was You (1997),0.872872
1-900 (1994),-0.645497
101 Dalmatians (1996),0.211132
12 Angry Men (1957),0.184289
187 (1997),0.027398
...,...
Young Guns (1988),0.186377
Young Guns II (1990),0.228615
"Young Poisoner's Handbook, The (1995)",-0.007374
Zeus and Roxanne (1997),0.818182


In [20]:
#sorting based on descending value of correlation
corr_starwars.sort_values('correlation',ascending=False).head(10)

Unnamed: 0_level_0,correlation
title,Unnamed: 1_level_1
Man of the Year (1995),1.0
Hollow Reed (1996),1.0
Stripes (1981),1.0
"Beans of Egypt, Maine, The (1994)",1.0
"Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991)",1.0
"Outlaw, The (1943)",1.0
"Line King: Al Hirschfeld, The (1996)",1.0
Hurricane Streets (1998),1.0
"Good Man in Africa, A (1994)",1.0
Safe Passage (1994),1.0


In [21]:
# Bring the per-title rating counts from ratings into corr_starwars; the join
# matches rows on the shared title index.
corr_starwars = corr_starwars.join(ratings[['num of ratings']])

In [22]:
corr_starwars.head()

Unnamed: 0_level_0,correlation,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),0.872872,9
1-900 (1994),-0.645497,5
101 Dalmatians (1996),0.211132,109
12 Angry Men (1957),0.184289,125
187 (1997),0.027398,41


In [23]:
# Keep only movies that have more than 100 ratings (a movie with exactly 100 is
# excluded), then rank them from highest to lowest correlation with Star Wars.
corr_starwars.loc[corr_starwars['num of ratings'] > 100].sort_values(by='correlation', ascending=False)

Unnamed: 0_level_0,correlation,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Star Wars (1977),1.000000,583
"Empire Strikes Back, The (1980)",0.747981,367
Return of the Jedi (1983),0.672556,507
Raiders of the Lost Ark (1981),0.536117,420
Austin Powers: International Man of Mystery (1997),0.377433,130
...,...,...
"Edge, The (1997)",-0.127167,113
As Good As It Gets (1997),-0.130466,112
Crash (1996),-0.148507,128
G.I. Jane (1997),-0.176734,175


# Creating a generalized recommendation system

In [24]:
def predict_movies(movie_name, min_ratings=100, *, rating_matrix=None, rating_counts=None):
    """Rank movies by how closely their user ratings track those of ``movie_name``.

    Every movie column of the user-by-movie rating matrix is correlated with
    the column for ``movie_name``. Movies whose correlation is undefined are
    dropped, movies without enough ratings are filtered out, and the remainder
    is returned sorted from most to least correlated.

    Parameters
    ----------
    movie_name : str
        Column label of the reference movie in the rating matrix.
    min_ratings : int, default 100
        Only movies with strictly more than this many ratings are kept.
    rating_matrix : pandas.DataFrame, optional
        User-by-movie rating matrix. When omitted, the notebook-level
        ``moviemat`` is used.
    rating_counts : pandas.Series, optional
        Number of ratings per movie, indexed by the same labels as the columns
        of the rating matrix. When omitted, ``ratings['num of ratings']`` from
        the notebook namespace is used.

    Returns
    -------
    pandas.DataFrame
        Indexed by movie, with columns ``correlation`` and ``num of ratings``,
        sorted by ``correlation`` in descending order. The reference movie
        itself appears in the result when it passes the ``min_ratings`` filter.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of the rating matrix.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if rating_matrix is None:
        rating_matrix = moviemat
    if rating_counts is None:
        rating_counts = ratings['num of ratings']

    # Fail with a readable message instead of a bare label lookup error.
    if movie_name not in rating_matrix.columns:
        raise KeyError(f"{movie_name!r} is not a movie title in the rating matrix")

    movie_user_ratings = rating_matrix[movie_name]
    corr_movie = (
        rating_matrix.corrwith(movie_user_ratings)
        .to_frame(name='correlation')
        .dropna()  # undefined correlations cannot be ranked
        .join(rating_counts.rename('num of ratings'))
    )

    # Correlations built on a handful of shared raters are unreliable, so keep
    # only movies with more than min_ratings ratings before ranking.
    predictions = corr_movie[corr_movie['num of ratings'] > min_ratings]
    return predictions.sort_values('correlation', ascending=False)

In [26]:
predict_my_movie=predict_movies("Titanic (1997)")

In [27]:
predict_my_movie.head()

Unnamed: 0_level_0,correlation,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Titanic (1997),1.0,350
"River Wild, The (1994)",0.4976,146
"Abyss, The (1989)",0.472103,151
Bram Stoker's Dracula (1992),0.44356,120
True Lies (1994),0.435104,208
