# MOVIE RECOMMENDATION SYSTEM
## Hack It Hackathon
### Lakshay Arora(laarora) & Pranjal Surana(prasuran)

In [1]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.decomposition import TruncatedSVD

### Preparing the data

In [2]:
# Ratings file: tab-delimited; the column labels are supplied explicitly via `names`.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
frame = pd.read_csv(
    'u.data',
    sep='\t',
    names=columns,
)
frame.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
# Column labels for u.item: identifying fields first, then one column per genre.
columns = [
    'item_id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# The file is pipe-delimited and read with the latin-1 encoding.
movies = pd.read_csv('u.item', sep='|', names=columns, encoding='latin-1')

# Only the item_id -> title pairing is kept for the steps that follow.
movie_names = movies.loc[:, ['item_id', 'movie title']]
movie_names.head()

Unnamed: 0,item_id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
# Attach the movie title to every rating row by joining the two frames on item_id.
combined_movies_data = frame.merge(movie_names, on='item_id')
combined_movies_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [5]:
# Number of ratings recorded for each item, shown from most-rated downwards.
ratings_per_item = combined_movies_data.groupby('item_id')['rating'].count()
ratings_per_item.sort_values(ascending=False).head()

item_id
50     583
258    509
100    508
181    507
294    485
Name: rating, dtype: int64

In [6]:
# Look up the title behind item_id 50, the item with the most ratings in the
# count table above.
# The mask is deliberately not called `filter`: binding that name at notebook
# level would shadow Python's built-in filter() for every later cell.
is_top_item = combined_movies_data['item_id'] == 50

# A single .loc call selects rows and column together (no chained indexing).
combined_movies_data.loc[is_top_item, 'movie title'].unique()

array(['Star Wars (1977)'], dtype=object)

### Building a Utility Matrix

In [7]:
# Utility matrix: one row per user_id, one column per movie title, cells hold
# the rating. pivot_table applies its default aggregation when a user/title
# pair occurs more than once; pairs with no rating are filled with 0.
rating_crosstab = pd.pivot_table(
    combined_movies_data,
    index='user_id',
    columns='movie title',
    values='rating',
    fill_value=0,
)
rating_crosstab.head()

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0


### Transposing the Matrix

In [8]:
# (rows, columns) of the utility matrix: rows are indexed by user_id and
# columns by movie title, as set up in the pivot_table call.
rating_crosstab.shape

(943, 1664)

In [9]:
# Flip the utility matrix so that each row is a movie title and each column a
# user_id; the decomposition below is fitted on this movie-by-user layout.
X = rating_crosstab.transpose()
X

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-900 (1994),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (1996),2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),5.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
187 (1997),0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Young Guns II (1990),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
"Young Poisoner's Handbook, The (1995)",0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zeus and Roxanne (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unknown,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Decomposing the Matrix

In [10]:
# Compress every row of X down to 12 components with truncated SVD;
# random_state pins the estimator's seed.
SVD = TruncatedSVD(
    n_components=12,
    random_state=17,
)

# Fit on X and return the reduced representation in one step.
resultant_matrix = SVD.fit_transform(X)

resultant_matrix.shape

(1664, 12)

In [11]:
# Display the array returned by SVD.fit_transform(X): one row per row of X,
# 12 components per row.
resultant_matrix

array([[ 1.03999361e+00,  6.59899708e-01,  4.56859274e-02, ...,
        -7.33808694e-01, -3.07421385e-01, -5.02487842e-01],
       [ 4.36584337e-01, -2.57258527e-01,  3.52956364e-01, ...,
        -2.35075707e-01,  3.56347813e-01, -9.50982027e-02],
       [ 1.25437438e+01,  5.66918707e+00, -4.90783556e+00, ...,
         3.86562509e+00,  5.90287093e-01, -8.41714795e-01],
       ...,
       [ 3.58929614e-01,  3.71252120e-01,  2.29706590e-02, ...,
        -7.59160967e-02,  1.83632841e-01, -3.40754159e-02],
       [ 1.42428013e+00,  8.14958817e-01, -4.90234641e-01, ...,
         1.58403380e-01,  5.52850038e-01, -6.16944743e-01],
       [ 2.29210339e-01, -6.22315746e-03,  2.73168369e-01, ...,
         8.24582381e-02, -7.02952067e-02, -1.69566591e-01]])

### Generating a Correlation Matrix

In [13]:
# Correlation coefficient between every pair of rows of resultant_matrix.
# rowvar=True is np.corrcoef's default (each row is one variable); it is
# spelled out here to make the orientation explicit.
corr_mat = np.corrcoef(resultant_matrix, rowvar=True)
corr_mat

array([[ 1.        , -0.10875097,  0.52265963, ...,  0.39286484,
         0.21820479,  0.51111162],
       [-0.10875097,  1.        ,  0.0653973 , ...,  0.15737062,
         0.51273503,  0.24591037],
       [ 0.52265963,  0.0653973 ,  1.        , ...,  0.76769554,
         0.44401258,  0.20018928],
       ...,
       [ 0.39286484,  0.15737062,  0.76769554, ...,  1.        ,
         0.18141816,  0.11120279],
       [ 0.21820479,  0.51273503,  0.44401258, ...,  0.18141816,
         1.        ,  0.1940996 ],
       [ 0.51111162,  0.24591037,  0.20018928, ...,  0.11120279,
         0.1940996 ,  1.        ]])

### Isolating 12 Angry Men From the Correlation Matrix

In [14]:
# NOTE: this rebinds `movie_names` (earlier the item_id / movie title DataFrame)
# to the Index of titles that labels the columns of rating_crosstab.
movie_names = rating_crosstab.columns
movies_list = movie_names.tolist()

# Position of the query title within the column order. X is the transpose of
# rating_crosstab, so the same position addresses that title's row in the
# decomposed matrix and in the correlation matrix.
movie = movies_list.index('12 Angry Men (1957)')
movie

3

In [15]:
# Row of the correlation matrix belonging to the query movie: its correlation
# with every title, in column order of rating_crosstab.
corr_movie = corr_mat[movie]
corr_movie.shape

(1664,)

### Recommending a Highly Correlated Movie

In [16]:
# Titles whose correlation with the query movie exceeds 0.9.
# The query movie is removed by position rather than with `corr_movie < 1.0`:
# np.corrcoef does not guarantee that the diagonal is exactly 1.0 after
# floating-point rounding, so a value test can let the movie recommend itself,
# and it would also discard any other title whose correlation is exactly 1.0.
is_similar = corr_movie > 0.9   # fresh boolean array; corr_movie is untouched
is_similar[movie] = False       # never recommend the query movie itself
list(movie_names[is_similar])

['2001: A Space Odyssey (1968)',
 'African Queen, The (1951)',
 'Amadeus (1984)',
 'Annie Hall (1977)',
 'Apartment, The (1960)',
 'Arsenic and Old Lace (1944)',
 'Beans of Egypt, Maine, The (1994)',
 'Being There (1979)',
 'Bonnie and Clyde (1967)',
 "Breakfast at Tiffany's (1961)",
 'Bridge on the River Kwai, The (1957)',
 'Bringing Up Baby (1938)',
 'Butch Cassidy and the Sundance Kid (1969)',
 'Casablanca (1942)',
 'Chinatown (1974)',
 'Citizen Kane (1941)',
 'Cool Hand Luke (1967)',
 'Deer Hunter, The (1978)',
 'Dial M for Murder (1954)',
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)',
 'East of Eden (1955)',
 'Gandhi (1982)',
 'Gone with the Wind (1939)',
 'Graduate, The (1967)',
 'Great Escape, The (1963)',
 'High Noon (1952)',
 'His Girl Friday (1940)',
 'It Happened One Night (1934)',
 "It's a Wonderful Life (1946)",
 'Lawrence of Arabia (1962)',
 'Maltese Falcon, The (1941)',
 'Man Who Would Be King, The (1975)',
 'Manchurian Candidate, The (19

In [17]:
# Stricter cut: titles whose correlation with the query movie exceeds 0.95.
# The query movie is removed by position rather than with `corr_movie < 1.0`:
# np.corrcoef does not guarantee that the diagonal is exactly 1.0 after
# floating-point rounding, so a value test can let the movie recommend itself,
# and it would also discard any other title whose correlation is exactly 1.0.
is_very_similar = corr_movie > 0.95   # fresh boolean array; corr_movie is untouched
is_very_similar[movie] = False        # never recommend the query movie itself
list(movie_names[is_very_similar])

['Casablanca (1942)',
 'Citizen Kane (1941)',
 'Gandhi (1982)',
 'Graduate, The (1967)',
 'North by Northwest (1959)',
 'Rear Window (1954)',
 "Singin' in the Rain (1952)",
 'Some Like It Hot (1959)',
 'To Kill a Mockingbird (1962)',
 'Vertigo (1958)']