In [40]:
# Importing Libraries

import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [41]:
#Importing the datset

ratings = pd.read_csv("ratings.csv")

In [42]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [43]:
ratings.isnull()

Unnamed: 0,userId,movieId,rating,timestamp
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
100831,False,False,False,False
100832,False,False,False,False
100833,False,False,False,False
100834,False,False,False,False


In [44]:
ratings.isnull().count()

userId       100836
movieId      100836
rating       100836
timestamp    100836
dtype: int64

## User-Based Similarity

### 2. Read the “ratings.csv” file and create a pivot table with index=‘userId’, columns=‘movieId’, values = “rating".

In [45]:
rating_pivot = pd.pivot_table(ratings,values = "rating", index = "userId", columns = "movieId")
rating_pivot

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [46]:
rating_pivot.fillna(value=0,inplace = True)
rating_pivot

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3. sklearn.metrics.pairwise_distances can be used to compute distance between all pairs of users. pairwise_distances() takes a metric parameter for what distance measure to use. Use cosine similarity for finding similarity among users.
### Use the following packages.
#### 4.from sklearn.metrics import pairwise_distances
#### 5.from scipy.spatial.distance import cosine, correlation

In [47]:
rating_pivot = rating_pivot.to_numpy()
rating_pair = pairwise_distances(rating_pivot, metric = 'cosine')
rating_pair

array([[0.        , 0.97271713, 0.94027974, ..., 0.70890263, 0.90642807,
        0.85467919],
       [0.97271713, 0.        , 1.        , ..., 0.95378905, 0.9724346 ,
        0.89757325],
       [0.94027974, 1.        , 0.        , ..., 0.97887154, 1.        ,
        0.96788125],
       ...,
       [0.70890263, 0.95378905, 0.97887154, ..., 0.        , 0.87800729,
        0.67794514],
       [0.90642807, 0.9724346 , 1.        , ..., 0.87800729, 0.        ,
        0.94677454],
       [0.85467919, 0.89757325, 0.96788125, ..., 0.67794514, 0.94677454,
        0.        ]])

In [48]:
df = pd.DataFrame(rating_pair)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,600,601,602,603,604,605,606,607,608,609
0,0.000000,0.972717,0.940280,0.805605,0.870920,0.871848,0.841256,0.863032,0.935737,0.983125,...,0.919446,0.835545,0.778514,0.929331,0.846375,0.835809,0.730611,0.708903,0.906428,0.854679
1,0.972717,0.000000,1.000000,0.996274,0.983386,0.974667,0.972415,0.972743,1.000000,0.932555,...,0.797329,0.983134,0.988003,1.000000,1.000000,0.971571,0.987052,0.953789,0.972435,0.897573
2,0.940280,1.000000,0.000000,0.997749,0.994980,0.996064,1.000000,0.995059,1.000000,1.000000,...,0.994952,0.995108,0.975008,1.000000,0.989306,0.987007,0.980753,0.978872,1.000000,0.967881
3,0.805605,0.996274,0.997749,0.000000,0.871341,0.911509,0.884880,0.937031,0.988639,0.968837,...,0.914062,0.871727,0.692027,0.947015,0.915416,0.799605,0.868254,0.850142,0.967802,0.892317
4,0.870920,0.983386,0.994980,0.871341,0.000000,0.699651,0.891658,0.570925,1.000000,0.969389,...,0.931952,0.581253,0.889852,0.741227,0.851242,0.893565,0.847134,0.864465,0.738768,0.939208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,0.835809,0.971571,0.987007,0.799605,0.893565,0.897877,0.799965,0.900612,0.924102,0.911037,...,0.821916,0.883466,0.699331,0.933968,0.851859,0.000000,0.846937,0.737442,0.930378,0.798896
606,0.730611,0.987052,0.980753,0.868254,0.847134,0.837818,0.813886,0.814858,0.988156,0.989549,...,0.907475,0.800090,0.796460,0.862166,0.881220,0.846937,0.000000,0.716919,0.850810,0.860886
607,0.708903,0.953789,0.978872,0.850142,0.864465,0.821191,0.676459,0.812767,0.899565,0.922576,...,0.841645,0.802486,0.767229,0.844694,0.821858,0.737442,0.716919,0.000000,0.878007,0.677945
608,0.906428,0.972435,1.000000,0.967802,0.738768,0.785766,0.909160,0.576007,1.000000,0.978234,...,0.964347,0.664769,0.938059,0.763399,0.902390,0.930378,0.850810,0.878007,0.000000,0.946775


### 6. Find the 5 most similar user for user with user Id 10

In [49]:
top5 = df.loc[10].sort_values(ascending = False)
pd.DataFrame(top5[0:5])

Unnamed: 0,10
305,1.0
441,1.0
391,1.0
396,1.0
397,1.0


### 7. Use the “movies” dataset to find out the names of movies, user 2 and user 338 have watched in common and how they have rated each one of them.

In [50]:
#Importing the datset

movies = pd.read_csv("movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [51]:
comm = pd.DataFrame(pd.merge(ratings, movies, on="movieId"))
comm

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [52]:
user2_movie = pd.DataFrame(comm[comm["userId"] == 2][["title","rating"]])
user2_movie.head()

Unnamed: 0,title,rating
2267,Tommy Boy (1995),4.0
15657,Gladiator (2000),4.0
16296,"Shawshank Redemption, The (1994)",3.0
16613,Good Will Hunting (1997),4.5
16754,Kill Bill: Vol. 1 (2003),4.0


In [54]:
user338_movie = pd.DataFrame(comm[comm["userId"] == 338][["title","rating"]])
user338_movie.head()

Unnamed: 0,title,rating
692,"Usual Suspects, The (1995)",4.5
1997,Pulp Fiction (1994),4.5
3684,Schindler's List (1993),5.0
4462,"Silence of the Lambs, The (1991)",4.0
14435,Fight Club (1999),4.5


In [56]:
comm_movies = pd.merge(user2_movie,user338_movie,on="title")
comm_movies

Unnamed: 0,title,rating_x,rating_y
0,"Shawshank Redemption, The (1994)",3.0,5.0
1,Kill Bill: Vol. 1 (2003),4.0,4.5


"Shawshank Redemption" and "Kill Bill: Vol.1" are the common movies watched by both user2 and user338

### 8. Use the movies dataset to find out the common movie names between user 2 and user 338 with least rating of 4.0

In [57]:
comm_movies[(comm_movies["rating_x"] >= 4) & ((comm_movies["rating_y"] >= 4))]

Unnamed: 0,title,rating_x,rating_y
1,Kill Bill: Vol. 1 (2003),4.0,4.5


## Item-Based Similarity

### 9. Create a pivot table for representing the similarity among movies using correlation.

In [58]:
df_movie = pd.DataFrame(comm.pivot_table(index='title',columns='userId',values='rating'))
df_movie.corr()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,,0.079819,0.207983,0.268749,-0.291636,-0.118773,0.469668,0.918559,-0.037987,...,0.091574,-2.071128e-16,-0.061503,-0.407556,-0.164871,0.066378,0.174557,0.268070,-0.175412,-0.032086
2,,1.000000,,,,,-0.991241,,,0.037796,...,-0.387347,,-1.000000,,,0.583333,,-0.125000,,0.623288
3,0.079819,,1.000000,,,,,,,,...,,,0.433200,,,-0.791334,-0.333333,-0.395092,,0.569562
4,0.207983,,,1.000000,-0.336525,0.148498,0.542861,0.117851,,0.485794,...,-0.222113,3.966413e-01,0.090090,-0.080296,0.400124,0.144603,0.116518,-0.170501,-0.277350,-0.043786
5,0.268749,,,-0.336525,1.000000,0.043166,0.158114,0.028347,,-0.777714,...,0.000000,1.533034e-01,0.234743,0.067791,-0.364156,0.244321,0.231080,-0.020546,0.384111,0.040582
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.066378,0.583333,-0.791334,0.144603,0.244321,-0.049192,0.137771,0.253582,0.572700,-0.382955,...,0.290490,1.406134e-01,0.318473,0.682949,0.167062,1.000000,0.114191,0.240842,0.533002,0.389185
607,0.174557,,-0.333333,0.116518,0.231080,0.255639,0.402792,0.251280,,-0.241121,...,0.698241,2.172105e-01,0.192787,0.035806,-0.299641,0.114191,1.000000,0.200814,0.190117,0.106605
608,0.268070,-0.125000,-0.395092,-0.170501,-0.020546,0.125428,0.008081,0.434423,0.336625,-0.571043,...,0.473967,2.976461e-01,0.086423,0.053986,-0.075673,0.240842,0.200814,1.000000,0.488929,0.147606
609,-0.175412,,,-0.277350,0.384111,0.193649,0.420288,0.141860,,,...,1.000000,1.885115e-01,0.343303,0.641624,-0.550000,0.533002,0.190117,0.488929,1.000000,-0.521773


In [59]:
df_movie.fillna(value=0, inplace=True)
df_movie.corr()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.019396,0.053052,0.176911,0.120862,0.104406,0.143785,0.128542,0.055263,-0.000307,...,0.066248,0.149934,0.186959,0.056523,0.134402,0.121958,0.254192,0.262225,0.085430,0.098693
2,0.019396,1.000000,-0.002595,-0.003808,0.013181,0.016252,0.021564,0.023748,-0.003450,0.061877,...,0.198547,0.010885,-0.004038,-0.005348,-0.007923,0.011290,0.005809,0.032723,0.024371,0.089321
3,0.053052,-0.002595,1.000000,-0.004559,0.001886,-0.004581,-0.005637,0.001701,-0.003112,-0.005504,...,0.000148,-0.000588,0.011203,-0.004824,0.003674,-0.003255,0.012881,0.008089,-0.002964,0.015953
4,0.176911,-0.003808,-0.004559,1.000000,0.121014,0.065707,0.100595,0.054231,0.002412,0.015607,...,0.072841,0.114280,0.281852,0.039692,0.065483,0.164812,0.115109,0.116843,0.023926,0.062498
5,0.120862,0.013181,0.001886,0.121014,1.000000,0.294134,0.101721,0.426575,-0.004187,0.023468,...,0.061908,0.414929,0.095386,0.254115,0.141073,0.090149,0.145760,0.122600,0.258288,0.040361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.121958,0.011290,-0.003255,0.164812,0.090149,0.047476,0.172484,0.081904,0.057979,0.054858,...,0.153879,0.084190,0.224593,0.035234,0.106729,1.000000,0.115978,0.188312,0.052375,0.093788
607,0.254192,0.005809,0.012881,0.115109,0.145760,0.142158,0.173287,0.178130,0.003252,-0.004817,...,0.080027,0.187581,0.173008,0.126261,0.101129,0.115978,1.000000,0.258232,0.142529,0.098496
608,0.262225,0.032723,0.008089,0.116843,0.122600,0.137932,0.305429,0.175906,0.086221,0.048357,...,0.136304,0.174056,0.164440,0.133722,0.144878,0.188312,0.258232,1.000000,0.109556,0.248902
609,0.085430,0.024371,-0.002964,0.023926,0.258288,0.207121,0.084491,0.421626,-0.003940,0.014980,...,0.029660,0.331051,0.045991,0.232113,0.089806,0.052375,0.142529,0.109556,1.000000,0.033702


### 10. Find the top 5 movies which are similar to the movie “Godfather”

In [60]:
df_movie[df_movie.index.str.contains('Godfather')]

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Godfather, The (1972)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,5.0,0.0,0.0,4.0,4.0,5.0,0.0,5.0
"Godfather: Part II, The (1974)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,5.0,0.0,0.0,4.0,0.0,4.5,0.0,5.0
"Godfather: Part III, The (1990)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
The Godfather Trilogy: 1972-1990 (1992),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Tokyo Godfathers (2003),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
