# Importing libraries

In [6]:
import pandas as pd
from scipy import sparse
from sklearn.metrics import pairwise_distances   # Take care of all those cosine equations

# Load Data Frames

In [7]:
movies = pd.read_csv('movies.dat', sep = '::', names=['movieId', 'title', 'categories'])
movies.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,movieId,title,categories
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
ratings = pd.read_csv('ratings.dat', sep = '::', names=['userId', 'movieId', 'ratings', 'timestape'])
ratings.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,userId,movieId,ratings,timestape
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


# Drop 'categories' and 'timestape' column from movies and ratings respectively

In [9]:
movies.drop('categories', axis =1, inplace = True)

ratings.drop('timestape', axis =1, inplace = True)

movies.head()


Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [11]:
ratings.head()

Unnamed: 0,userId,movieId,ratings
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


# Merge Data Frames

In [12]:
movie_rating = pd.merge(movies, ratings, on='movieId')
movie_rating.head()

Unnamed: 0,movieId,title,userId,ratings
0,1,Toy Story (1995),1,5
1,1,Toy Story (1995),6,4
2,1,Toy Story (1995),8,4
3,1,Toy Story (1995),9,5
4,1,Toy Story (1995),10,5


In [13]:
movie_rating.shape

(1000209, 4)

# Create Pivot Table

In [14]:
pivot = movie_rating.pivot_table(index = 'userId', columns = 'title', values = 'ratings')

pivot.head()

title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kj�rlighetens kj�tere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [15]:
pivot.shape

(6040, 3706)

# List of all the ratings by the number 1 user (userId 1)

In [16]:
pivot.loc[1, :].sort_values(ascending = False) 

title
Last Days of Disco, The (1998)                  5.0
Ben-Hur (1959)                                  5.0
Mary Poppins (1964)                             5.0
Dumbo (1941)                                    5.0
One Flew Over the Cuckoo's Nest (1975)          5.0
Cinderella (1950)                               5.0
Pocahontas (1995)                               5.0
Christmas Story, A (1983)                       5.0
Bug's Life, A (1998)                            5.0
Saving Private Ryan (1998)                      5.0
Schindler's List (1993)                         5.0
Rain Man (1988)                                 5.0
Beauty and the Beast (1991)                     5.0
Toy Story (1995)                                5.0
Sound of Music, The (1965)                      5.0
Back to the Future (1985)                       5.0
Awakenings (1990)                               5.0
Apollo 13 (1995)                                5.0
Ferris Bueller's Day Off (1986)                 4.0
Fargo 

# All the ratings for a particular movie (Saving Private Ryan (1998))

In [17]:
pivot.loc[:, 'Saving Private Ryan (1998)'].sort_values() 

userId
2813    1.0
3993    1.0
1495    1.0
175     1.0
4883    1.0
4685    1.0
5811    1.0
4539    1.0
3311    1.0
2692    1.0
2748    1.0
4456    1.0
2800    1.0
2847    1.0
5530    1.0
5572    1.0
1102    1.0
1521    1.0
1749    1.0
785     1.0
3145    1.0
3131    1.0
3842    1.0
5603    1.0
5605    1.0
3433    2.0
4133    2.0
1242    2.0
4115    2.0
2432    2.0
       ... 
5995    NaN
5996    NaN
5997    NaN
5999    NaN
6000    NaN
6001    NaN
6004    NaN
6005    NaN
6006    NaN
6011    NaN
6012    NaN
6013    NaN
6014    NaN
6015    NaN
6016    NaN
6017    NaN
6019    NaN
6020    NaN
6021    NaN
6025    NaN
6026    NaN
6028    NaN
6029    NaN
6030    NaN
6031    NaN
6032    NaN
6034    NaN
6035    NaN
6038    NaN
6039    NaN
Name: Saving Private Ryan (1998), Length: 6040, dtype: float64

# Mean of All the ratings for "Saving Private Ryan (1998)" movie

In [18]:
pivot.loc[:, 'Saving Private Ryan (1998)'].mean() 

4.337353938937053

# Create Sparse Matrix

Note: Sparse matrix only keep track of all non NaN values.

In [19]:
pivot_sparse = sparse.csr_matrix(pivot.fillna(0).values)

In [21]:
print pivot_sparse

  (0, 84)	4.0
  (0, 89)	4.0
  (0, 188)	4.0
  (0, 195)	5.0
  (0, 239)	5.0
  (0, 258)	5.0
  (0, 277)	4.0
  (0, 324)	5.0
  (0, 346)	5.0
  (0, 376)	4.0
  (0, 557)	5.0
  (0, 671)	5.0
  (0, 679)	5.0
  (0, 713)	3.0
  (0, 870)	4.0
  (0, 988)	4.0
  (0, 1002)	5.0
  (0, 1007)	4.0
  (0, 1054)	4.0
  (0, 1122)	4.0
  (0, 1147)	4.0
  (0, 1331)	4.0
  (0, 1336)	4.0
  (0, 1528)	4.0
  (0, 1611)	4.0
  :	:
  (6039, 3458)	4.0
  (6039, 3476)	4.0
  (6039, 3479)	4.0
  (6039, 3504)	4.0
  (6039, 3511)	4.0
  (6039, 3517)	3.0
  (6039, 3518)	4.0
  (6039, 3526)	2.0
  (6039, 3541)	3.0
  (6039, 3548)	3.0
  (6039, 3551)	4.0
  (6039, 3574)	1.0
  (6039, 3575)	1.0
  (6039, 3578)	5.0
  (6039, 3583)	4.0
  (6039, 3596)	5.0
  (6039, 3608)	5.0
  (6039, 3615)	1.0
  (6039, 3641)	5.0
  (6039, 3655)	4.0
  (6039, 3656)	5.0
  (6039, 3659)	5.0
  (6039, 3662)	4.0
  (6039, 3685)	5.0
  (6039, 3693)	4.0


# Create pairwise distance matrix (Every movie compare to every other movies (correlation))

In [22]:
distances = pairwise_distances(pivot_sparse.T, metric = 'cosine')

In [23]:
distances.shape

(3706, 3706)

# Convert pairwise distance matrix to DataFrame

In [24]:
distance_df = pd.DataFrame(distances, index = pivot.columns, columns = pivot.columns)

distance_df.head()

title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kj�rlighetens kj�tere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"$1,000,000 Duck (1971)",0.0,0.927643,0.962989,0.920709,0.939162,1.0,0.941381,0.810035,0.827746,0.905215,...,0.961275,0.923526,1.0,0.955926,1.0,0.95472,0.960605,1.0,0.879758,0.972997
'Night Mother (1986),0.927643,0.0,0.88471,0.884455,0.840474,1.0,0.923202,0.852563,0.904078,0.888587,...,0.94699,0.912172,0.936242,0.864038,1.0,0.90885,0.925213,1.0,1.0,0.922193
'Til There Was You (1997),0.962989,0.88471,0.0,0.901244,0.933699,0.91975,0.872105,0.887346,0.87433,0.920885,...,0.9708,0.937107,1.0,0.920813,1.0,0.977406,0.920739,1.0,0.952474,0.936716
"'burbs, The (1989)",0.920709,0.884455,0.901244,0.0,0.85638,1.0,0.807809,0.753073,0.824115,0.829281,...,0.886614,0.792103,0.980038,0.861936,1.0,0.944296,0.838826,1.0,0.966433,0.889475
...And Justice for All (1979),0.939162,0.840474,0.933699,0.85638,0.0,1.0,0.924907,0.805846,0.883621,0.794514,...,0.910002,0.846994,0.932991,0.890971,1.0,0.91392,0.889133,0.925683,1.0,0.88896


# Comparing 'Saving Private Ryan (1998)' movie to the rest of the movies (Return First 11)

In [25]:
distance_df.loc[:, 'Saving Private Ryan (1998)'].sort_values()[1:11]

title
Braveheart (1995)                                        0.345817
Schindler's List (1993)                                  0.375801
Matrix, The (1999)                                       0.376191
Fugitive, The (1993)                                     0.378371
Shawshank Redemption, The (1994)                         0.385128
Star Wars: Episode V - The Empire Strikes Back (1980)    0.387776
Terminator 2: Judgment Day (1991)                        0.390493
Silence of the Lambs, The (1991)                         0.390791
Raiders of the Lost Ark (1981)                           0.400807
Star Wars: Episode IV - A New Hope (1977)                0.404300
Name: Saving Private Ryan (1998), dtype: float64

# Building a function that will find movie title's that match with 'Godfather'

In [26]:
guess = 'Godfather'   
titles = movies.loc[movies['title'].str.contains(guess), 'title'].values
for title in titles:
    print title
    print 'Average Ratings:', pivot[title].mean()
    print 'Number of Ratings:' , pivot[title].count()
    print ''
    print 'Similar movies:', 
    print distance_df[title].sort_values()[1:11]
    print ''
    print ''

Godfather, The (1972)
Average Ratings: 4.52496626181
Number of Ratings: 2223

Similar movies: title
Godfather: Part II, The (1974)                           0.230362
Star Wars: Episode IV - A New Hope (1977)                0.396638
Star Wars: Episode V - The Empire Strikes Back (1980)    0.406930
Raiders of the Lost Ark (1981)                           0.411374
Fargo (1996)                                             0.423501
Jaws (1975)                                              0.435121
Alien (1979)                                             0.445621
GoodFellas (1990)                                        0.445913
Saving Private Ryan (1998)                               0.447108
Terminator, The (1984)                                   0.447472
Name: Godfather, The (1972), dtype: float64


Godfather: Part II, The (1974)
Average Ratings: 4.35756501182
Number of Ratings: 1692

Similar movies: title
Godfather, The (1972)                                    0.230362
Star Wars: Episode 

# Building a function that finds movie title's that match with 'Harry' 

In [27]:
guess = 'Harry'   
titles = movies.loc[movies['title'].str.contains(guess), 'title'].values
for title in titles:
    print title
    print 'Average Ratings:', pivot[title].mean()
    print 'Number of Ratings:' , pivot[title].count()
    print ''
    print 'Similar movies:', 
    print distance_df[title].sort_values(ascending = False)[1:11]
    print ''
    print ''

When Harry Met Sally... (1989)
Average Ratings: 4.07334183673
Number of Ratings: 1568

Similar movies: title
Mutters Courage (1995)                                         1.0
Mirage (1995)                                                  1.0
I, Worst of All (Yo, la peor de todas) (1990)                  1.0
White Boys (1999)                                              1.0
Rent-a-Kid (1995)                                              1.0
Scorta, La (1993)                                              1.0
Resurrection Man (1998)                                        1.0
In God's Hands (1998)                                          1.0
Carriers Are Waiting, The (Les Convoyeurs Attendent) (1999)    1.0
Torso (Corpi Presentano Tracce di Violenza Carnale) (1973)     1.0
Name: When Harry Met Sally... (1989), dtype: float64


Deconstructing Harry (1997)
Average Ratings: 3.41549295775
Number of Ratings: 284

Similar movies: title
Time Tracers (1995)                                         1

# Building a function that finds movie title's that match with 'Alien'

In [28]:
guess = 'Alien'  
titles = movies.loc[movies['title'].str.contains(guess), 'title'].values
for title in titles:
    print title
    print 'Average Ratings:', pivot[title].mean()
    print 'Number of Ratings:' , pivot[title].count()
    print ''
    print 'Similar movies:', 
    print distance_df[title].sort_values()[1:11]
    print ''
    print ''

Aliens (1986)
Average Ratings: 4.12582417582
Number of Ratings: 1820

Similar movies: title
Alien (1979)                                             0.256007
Terminator, The (1984)                                   0.259805
Star Wars: Episode V - The Empire Strikes Back (1980)    0.317103
Predator (1987)                                          0.330657
Terminator 2: Judgment Day (1991)                        0.346826
Blade Runner (1982)                                      0.355283
Star Wars: Episode IV - A New Hope (1977)                0.355760
Star Wars: Episode VI - Return of the Jedi (1983)        0.367781
Matrix, The (1999)                                       0.368420
Total Recall (1990)                                      0.376522
Name: Aliens (1986), dtype: float64


Alien (1979)
Average Ratings: 4.15958498024
Number of Ratings: 2024

Similar movies: title
Aliens (1986)                                            0.256007
Terminator, The (1984)                               