## Content Based Recommender System

In [123]:
import numpy as np
import pandas as pd

Loading u.data -> A dataset comprising user id, movie id, rating and timestamp

In [124]:
column_names= ['user id', 'movie id', 'rating', 'timestamp']
u_data = pd.read_csv('u.data', sep='\t',header=None,names=column_names)
print(len(u_data))
u_data.head()

100000


Unnamed: 0,user id,movie id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


Loading u.item -> Dataset comprising movie id, movie title, release date, IMDb URL and 19 fields of genre (1 indicates the movie is of that genre, a 0 indicates it is not)

In [125]:
c = 'movie id | movie title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation | Children | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western'
column_names2 = c.split(' | ')
column_names2

['movie id',
 'movie title',
 'release date',
 'video release date',
 'IMDb URL',
 'unknown',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [126]:
data_items = pd.read_csv('u.item', sep='|',header=None,names=column_names2,encoding='latin-1')
data_items

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Merging u.data and u.items

In [127]:
merged_data = pd.merge(u_data, data_items[['movie id', 'movie title']], how='left', left_on='movie id', right_on='movie id')
print(len(merged_data))
print(merged_data)

100000
       user id  movie id  rating  timestamp                   movie title
0          196       242       3  881250949                  Kolya (1996)
1          186       302       3  891717742      L.A. Confidential (1997)
2           22       377       1  878887116           Heavyweights (1994)
3          244        51       2  880606923    Legends of the Fall (1994)
4          166       346       1  886397596           Jackie Brown (1997)
...        ...       ...     ...        ...                           ...
99995      880       476       3  880175444  First Wives Club, The (1996)
99996      716       204       5  879795543     Back to the Future (1985)
99997      276      1090       1  874795795                 Sliver (1993)
99998       13       225       2  882399156         101 Dalmatians (1996)
99999       12       203       3  879959583             Unforgiven (1992)

[100000 rows x 5 columns]


There is an issue with this dataset that for the same set of user id and movie id, ratings can be different at different timestamps. Example of such duplicates are shown below:-

In [128]:
duplicates = merged_data[merged_data.duplicated(['user id', 'movie title', 'rating'], keep=False)]
duplicates

Unnamed: 0,user id,movie id,rating,timestamp,movie title
157,99,268,3,885678247,Chasing Amy (1997)
493,269,246,5,891457067,Chasing Amy (1997)
501,299,303,3,877618584,Ulee's Gold (1997)
553,230,680,4,880484286,Kull the Conqueror (1997)
776,49,1003,2,888068651,That Darn Cat! (1997)
...,...,...,...,...,...
99179,880,268,5,892958128,Chasing Amy (1997)
99292,919,297,4,875288749,Ulee's Gold (1997)
99418,655,305,4,887523909,"Ice Storm, The (1997)"
99721,451,876,4,879012431,Money Talks (1997)


Therefore a dataset is created from the existing merged dataset by grouping the unique user id and movie title combination and the ratings by a user to the same movie in different instances (timestamps) are averaged and stored in the new dataset.

In [129]:
dataset = merged_data.groupby(by=['user id','movie title'], as_index=False).agg({"rating":"mean"})
print(len(dataset))
dataset.head()

99693


Unnamed: 0,user id,movie title,rating
0,1,101 Dalmatians (1996),2.0
1,1,12 Angry Men (1957),5.0
2,1,"20,000 Leagues Under the Sea (1954)",3.0
3,1,2001: A Space Odyssey (1968),4.0
4,1,"Abyss, The (1989)",3.0


In [130]:
avg_ratings = pd.DataFrame(dataset.groupby('movie title')['rating'].mean())
print(avg_ratings.shape)
avg_ratings.head()

(1664, 1)


Unnamed: 0_level_0,rating
movie title,Unnamed: 1_level_1
'Til There Was You (1997),2.333333
1-900 (1994),2.6
101 Dalmatians (1996),2.908257
12 Angry Men (1957),4.344
187 (1997),3.02439


In [131]:
avg_ratings['num_of_ratings'] = pd.DataFrame(dataset.groupby('movie title')['rating'].count())
avg_ratings.head()

Unnamed: 0_level_0,rating,num_of_ratings
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),2.333333,9
1-900 (1994),2.6,5
101 Dalmatians (1996),2.908257,109
12 Angry Men (1957),4.344,125
187 (1997),3.02439,41


In [132]:
user_to_movie_dataset = dataset.pivot(
    index='user id',
     columns='movie title',
      values='rating').fillna(0)

user_to_movie_dataset

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [133]:
avg_ratings.sort_values(by='num_of_ratings', ascending=False).head()

Unnamed: 0_level_0,rating,num_of_ratings
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1
Star Wars (1977),4.358491,583
Contact (1997),3.803536,509
Fargo (1996),4.155512,508
Return of the Jedi (1983),4.00789,507
Liar Liar (1997),3.156701,485


In [134]:
starwars_ratings = user_to_movie_dataset['Star Wars (1977)']
contact_ratings = user_to_movie_dataset['Contact (1997)']

In [135]:
starwars_ratings

user id
1      5.0
2      5.0
3      0.0
4      5.0
5      4.0
      ... 
939    0.0
940    4.0
941    0.0
942    5.0
943    4.0
Name: Star Wars (1977), Length: 943, dtype: float64

In [136]:
contact_ratings

user id
1      5.0
2      3.0
3      2.0
4      5.0
5      0.0
      ... 
939    4.0
940    5.0
941    4.0
942    4.0
943    0.0
Name: Contact (1997), Length: 943, dtype: float64

In [137]:
similar_to_starwars = user_to_movie_dataset.corrwith(starwars_ratings)
print(similar_to_starwars.shape)
similar_to_starwars.head()

(1664,)


movie title
'Til There Was You (1997)   -0.008917
1-900 (1994)                -0.009002
101 Dalmatians (1996)        0.155780
12 Angry Men (1957)          0.213142
187 (1997)                  -0.034590
dtype: float64

In [138]:
similar_to_contact = user_to_movie_dataset.corrwith(contact_ratings)
similar_to_contact.head()

movie title
'Til There Was You (1997)   -0.015166
1-900 (1994)                -0.058033
101 Dalmatians (1996)        0.042442
12 Angry Men (1957)         -0.051922
187 (1997)                   0.097278
dtype: float64

In [139]:
correlation_starwars = pd.DataFrame(similar_to_starwars, columns=['Correlation'])

In [140]:
correlation_starwars.sort_values(by='Correlation', ascending=False).head(10)

Unnamed: 0_level_0,Correlation
movie title,Unnamed: 1_level_1
Star Wars (1977),1.0
Return of the Jedi (1983),0.748753
"Empire Strikes Back, The (1980)",0.554549
Raiders of the Lost Ark (1981),0.54192
Indiana Jones and the Last Crusade (1989),0.46869
Toy Story (1995),0.457677
"Terminator, The (1984)",0.438933
Back to the Future (1985),0.420658
Alien (1979),0.413547
"Princess Bride, The (1987)",0.412604


In [141]:
correlation_starwars = correlation_starwars.join(avg_ratings['num_of_ratings'])
correlation_starwars.head()

Unnamed: 0_level_0,Correlation,num_of_ratings
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),-0.008917,9
1-900 (1994),-0.009002,5
101 Dalmatians (1996),0.15578,109
12 Angry Men (1957),0.213142,125
187 (1997),-0.03459,41


In [142]:
correlation_starwars.sort_values('Correlation',ascending=False).head(20)

Unnamed: 0_level_0,Correlation,num_of_ratings
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1
Star Wars (1977),1.0,583
Return of the Jedi (1983),0.748753,507
"Empire Strikes Back, The (1980)",0.554549,367
Raiders of the Lost Ark (1981),0.54192,420
Indiana Jones and the Last Crusade (1989),0.46869,331
Toy Story (1995),0.457677,452
"Terminator, The (1984)",0.438933,301
Back to the Future (1985),0.420658,350
Alien (1979),0.413547,291
"Princess Bride, The (1987)",0.412604,324


In [143]:
new_corr_starwars = correlation_starwars[correlation_starwars['num_of_ratings'] > 100]
new_corr_starwars.sort_values('Correlation',ascending=False).head(20)

Unnamed: 0_level_0,Correlation,num_of_ratings
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1
Star Wars (1977),1.0,583
Return of the Jedi (1983),0.748753,507
"Empire Strikes Back, The (1980)",0.554549,367
Raiders of the Lost Ark (1981),0.54192,420
Indiana Jones and the Last Crusade (1989),0.46869,331
Toy Story (1995),0.457677,452
"Terminator, The (1984)",0.438933,301
Back to the Future (1985),0.420658,350
Alien (1979),0.413547,291
"Princess Bride, The (1987)",0.412604,324


In [144]:
def recommender(movie, min_num_reviews):
    
    movie_ratings = user_to_movie_dataset[movie]
    similar_to_movie = user_to_movie_dataset.corrwith(movie_ratings)
    correlation_movie = pd.DataFrame(similar_to_movie, columns=['Correlation'])
    correlation_movie.dropna(inplace=True)
    correlation_movie = correlation_movie.join(avg_ratings['num_of_ratings'])
    correlation_movie = correlation_movie[correlation_movie['num_of_ratings'] >= min_num_reviews]
    return correlation_movie.sort_values('Correlation',ascending=False).head(20)


In [89]:
recommender('Star Wars (1977)', 100)

Unnamed: 0_level_0,Correlation,num_of_ratings
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1
Star Wars (1977),1.0,583
Return of the Jedi (1983),0.748753,507
"Empire Strikes Back, The (1980)",0.554549,367
Raiders of the Lost Ark (1981),0.54192,420
Indiana Jones and the Last Crusade (1989),0.46869,331
Toy Story (1995),0.457677,452
"Terminator, The (1984)",0.438933,301
Back to the Future (1985),0.420658,350
Alien (1979),0.413547,291
"Princess Bride, The (1987)",0.412604,324


In [145]:
recommender('Terminator 2: Judgment Day (1991)',100)

Unnamed: 0_level_0,Correlation,num_of_ratings
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1
Terminator 2: Judgment Day (1991),1.0,295
"Terminator, The (1984)",0.723569,301
True Lies (1994),0.598845,208
Raiders of the Lost Ark (1981),0.597118,420
Speed (1994),0.591296,230
"Empire Strikes Back, The (1980)",0.590962,367
"Fugitive, The (1993)",0.587891,336
Aliens (1986),0.586749,284
Die Hard (1988),0.578825,243
Alien (1979),0.573028,291


In [146]:
recommender('Die Hard 2 (1990)',100)

Unnamed: 0_level_0,Correlation,num_of_ratings
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1
Die Hard 2 (1990),1.0,166
Die Hard: With a Vengeance (1995),0.761821,151
Under Siege (1992),0.652486,124
True Lies (1994),0.632153,208
Clear and Present Danger (1994),0.607591,179
Batman (1989),0.598589,201
Top Gun (1986),0.583404,220
Stargate (1994),0.571364,127
Star Trek III: The Search for Spock (1984),0.559006,171
Star Trek IV: The Voyage Home (1986),0.544955,199
