In [2]:


import pandas as pd
from scipy import sparse

from sklearn.metrics.pairwise import cosine_similarity


# Reading Data and Preprocessing

In [35]:
# Load the movie metadata and the user ratings from CSV files
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# Join every rating to its movie record (on the columns both tables share)
# and discard the two columns that are not used afterwards
ratings = movies.merge(ratings).drop(columns=['genres', 'timestamp'])

# Lookup table: movieId -> title
movie_titles = {movie_id: title for movie_id, title in zip(movies['movieId'], movies['title'])}

ratings.head()



Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [34]:
# One row per user, one column per movie title, ratings as cell values.
# User/title pairs without a rating are stored as 0 instead of NaN.
userRatings = (
    ratings
    .pivot_table(values='rating', index=['userId'], columns=['title'])
    .fillna(0, axis=1)
)


# Applying Cosine Similarity

In [32]:
#standardizing values of rating
def standardize(row):
    """Mean-centre a Series and scale it by its value range (max - min).

    Note: the notebook calls this through ``userRatings.apply(standardize)``,
    and ``DataFrame.apply`` passes *columns* by default, so ``row`` is one
    column of the ratings matrix despite its name.

    Parameters
    ----------
    row : pandas.Series
        Numeric values to standardize.

    Returns
    -------
    pandas.Series
        ``(row - mean) / (max - min)`` with the original index. If every
        value is identical (range of 0) an all-zero Series is returned
        instead of the NaNs that a 0/0 division would produce.
    """
    centred = row - row.mean()
    spread = row.max() - row.min()
    if spread == 0:
        # Constant input: dividing would give NaN, which downstream
        # similarity computations cannot handle. Force exact zeros.
        return centred * 0.0
    return centred / spread

# Standardize every column of the user-by-title ratings matrix
df_std = userRatings.apply(standardize, axis=0)
# Cosine similarity between titles: transpose so that each title is a row
cosMatrix = cosine_similarity(df_std.transpose())


In [15]:


# Wrap the similarity array in a DataFrame labelled by movie title on both axes
item_similarity_df = pd.DataFrame(
    data=cosMatrix,
    index=userRatings.columns,
    columns=userRatings.columns,
)
item_similarity_df



title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.000000,-0.001642,-0.002324,-0.001642,-0.002254,-0.001642,-0.006407,-0.001642,0.135943,-0.004325,...,-0.001642,0.339935,0.542247,0.706526,-0.001642,-0.007675,0.134327,0.325287,-0.008185,-0.001642
'Hellboy': The Seeds of Creation (2004),-0.001642,1.000000,0.706526,-0.001642,-0.002254,-0.001642,-0.006407,-0.001642,-0.010568,-0.004325,...,-0.001642,-0.004589,-0.002808,-0.002324,-0.001642,-0.007675,-0.007744,-0.003594,-0.008185,-0.001642
'Round Midnight (1986),-0.002324,0.706526,1.000000,-0.002324,-0.003191,-0.002324,0.170199,-0.002324,-0.014958,-0.006121,...,-0.002324,-0.006495,-0.003975,-0.003289,-0.002324,-0.010863,-0.010961,-0.005087,-0.011585,-0.002324
'Salem's Lot (2004),-0.001642,-0.001642,-0.002324,1.000000,0.857269,-0.001642,-0.006407,-0.001642,-0.010568,-0.004325,...,-0.001642,-0.004589,-0.002808,-0.002324,-0.001642,-0.007675,-0.007744,-0.003594,-0.008185,-0.001642
'Til There Was You (1997),-0.002254,-0.002254,-0.003191,0.857269,1.000000,-0.002254,-0.008797,-0.002254,-0.014510,-0.005938,...,-0.002254,-0.006301,-0.003856,-0.003191,-0.002254,-0.010538,-0.010632,-0.004935,-0.011238,-0.002254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),-0.007675,-0.007675,-0.010863,-0.007675,-0.010538,-0.007675,0.187953,0.212646,0.053614,0.115396,...,-0.007675,-0.021449,-0.013126,-0.010863,-0.007675,1.000000,0.163022,-0.016800,0.138611,-0.007675
xXx (2002),0.134327,-0.007744,-0.010961,-0.007744,-0.010632,-0.007744,0.062174,-0.007744,0.241092,-0.000060,...,0.063291,0.291410,0.163464,0.240394,-0.007744,0.163022,1.000000,0.259049,0.065673,-0.007744
xXx: State of the Union (2005),0.325287,-0.003594,-0.005087,-0.003594,-0.004935,-0.003594,-0.014025,-0.003594,0.139511,-0.009467,...,-0.003594,0.376455,0.172818,0.227658,-0.003594,-0.016800,0.259049,1.000000,-0.017917,-0.003594
¡Three Amigos! (1986),-0.008185,-0.008185,-0.011585,-0.008185,-0.011238,-0.008185,0.353194,0.175610,0.125905,0.234514,...,0.175610,-0.022876,-0.013999,-0.011585,-0.008185,0.138611,0.065673,-0.017917,1.000000,-0.008185


# The `get_similar` Function

In [24]:
#getting similar movies through item_similarity_Df which was created above using cosine similarity
def get_similar(movie_name, rating, similarity_df=None):
    """Score every movie by its similarity to ``movie_name``, weighted by ``rating``.

    Parameters
    ----------
    movie_name : str
        Title to look up; must be a column label of the similarity matrix.
    rating : float
        Multiplier applied to every similarity score.
    similarity_df : pandas.DataFrame, optional
        Item-item similarity matrix with titles as column labels. Defaults
        to the notebook-level ``item_similarity_df``, so existing
        two-argument calls behave exactly as before.

    Returns
    -------
    pandas.Series
        Weighted scores, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of the similarity matrix.
    """
    if similarity_df is None:
        # Fall back to the matrix built earlier in the notebook
        similarity_df = item_similarity_df
    if movie_name not in similarity_df.columns:
        raise KeyError(f"{movie_name!r} is not a title in the similarity matrix")
    weighted = similarity_df[movie_name] * rating
    return weighted.sort_values(ascending=False)


# Example Runs

In [27]:
# Build a one-row frame holding the weighted similarity scores for a movie
# the user rated 5. DataFrame.append was removed in pandas 2.0, so the Series
# is turned into a row with to_frame().T instead.
similar_movies = (
    get_similar("(500) Days of Summer (2009)", 5)
    .to_frame()
    .T
    .reset_index(drop=True)
)

# Total score per title across all rows, best 20 first
similar_movies.sum().sort_values(ascending=False).head(20)

title
(500) Days of Summer (2009)           5.000000
Silver Linings Playbook (2012)        2.509796
Adventureland (2009)                  2.314041
Up in the Air (2009)                  2.259921
50/50 (2011)                          2.247781
                                        ...   
Stargate (1994)                      -0.285740
Madness of King George, The (1994)   -0.288990
Disclosure (1994)                    -0.321149
Postman, The (Postino, Il) (1994)    -0.322746
Clear and Present Danger (1994)      -0.419620
Name: (500) Days of Summer (2009), Length: 9719, dtype: float64


(500) Days of Summer (2009)                5.000000
Silver Linings Playbook (2012)             2.509796
Adventureland (2009)                       2.314041
Up in the Air (2009)                       2.259921
50/50 (2011)                               2.247781
Descendants, The (2011)                    2.165548
Crazy, Stupid, Love. (2011)                2.153462
About Time (2013)                          2.101287
Toy Story 3 (2010)                         2.100772
Secret Life of Walter Mitty, The (2013)    2.086095
Zodiac (2007)                              2.072927
Scott Pilgrim vs. the World (2010)         2.064043
Alice in Wonderland (2010)                 2.039978
Yes Man (2008)                             2.036678
Holiday, The (2006)                        2.033995
Hangover, The (2009)                       2.030105
Darjeeling Limited, The (2007)             2.013405
I Love You, Man (2009)                     2.002299
Kick-Ass (2010)                            1.995874
Marley & Me 

In [21]:
# Same recommendation run as a single-row frame. The Series returned by
# get_similar becomes the row via to_frame().T, because DataFrame.append
# no longer exists in pandas 2.0+.
similar_movies = (
    get_similar("(500) Days of Summer (2009)", 5)
    .to_frame()
    .T
    .reset_index(drop=True)
)

# Sum the scores per title and display the 20 highest
similar_movies.sum().sort_values(ascending=False).head(20)

(500) Days of Summer (2009)                5.000000
Silver Linings Playbook (2012)             2.509796
Adventureland (2009)                       2.314041
Up in the Air (2009)                       2.259921
50/50 (2011)                               2.247781
Descendants, The (2011)                    2.165548
Crazy, Stupid, Love. (2011)                2.153462
About Time (2013)                          2.101287
Toy Story 3 (2010)                         2.100772
Secret Life of Walter Mitty, The (2013)    2.086095
Zodiac (2007)                              2.072927
Scott Pilgrim vs. the World (2010)         2.064043
Alice in Wonderland (2010)                 2.039978
Yes Man (2008)                             2.036678
Holiday, The (2006)                        2.033995
Hangover, The (2009)                       2.030105
Darjeeling Limited, The (2007)             2.013405
I Love You, Man (2009)                     2.002299
Kick-Ass (2010)                            1.995874
Marley & Me 