In [2]:
import pandas as pd
import numpy as np

from sklearn.decomposition import TruncatedSVD

In [5]:
# Load the MovieLens 100k ratings file. It is tab-separated and the column
# names are supplied here rather than read from the file.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
users = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=columns,
)
# NOTE: despite its name, `users` holds one row per (user_id, item_id) rating.
users.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [16]:
# Field names for u.item, written as a '|'-separated list.
col_str = '''movie id | movie title | release date | video release date |
              IMDb URL | unknown | Action | Adventure | Animation |
              Children's | Comedy | Crime | Documentary | Drama | Fantasy |
              Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
              Thriller | War | Western |'''
# Normalise the separators: pipes become commas and line breaks are dropped.
col_str = col_str.replace('|', ',').replace('\n', '')

# Turn every non-empty field into an identifier-style column name:
# trim whitespace, replace spaces and hyphens with underscores, drop apostrophes.
columns = [
    name
    for name in (
        raw.strip().replace(' ', '_').replace('-', '_').replace("'", '')
        for raw in col_str.split(',')
    )
    if name != ''
]
columns

['movie_id',
 'movie_title',
 'release_date',
 'video_release_date',
 'IMDb_URL',
 'unknown',
 'Action',
 'Adventure',
 'Animation',
 'Childrens',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film_Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci_Fi',
 'Thriller',
 'War',
 'Western']

In [18]:
# Read the movie metadata file: pipe-delimited, decoded as Latin-1, with the
# column names taken from `columns` instead of a header row.
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=columns,
    encoding='latin-1',
)
movies.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [19]:
# Attach the movie metadata to every rating row. The inner join keeps only
# ratings whose item_id has a matching movie_id.
big_df = users.merge(
    movies,
    how='inner',
    left_on='item_id',
    right_on='movie_id',
)
big_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western
0,196,242,3,881250949,242,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,...,0,0,0,0,0,0,0,0,0,0
1,63,242,3,875747190,242,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,...,0,0,0,0,0,0,0,0,0,0
2,226,242,5,883888671,242,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,...,0,0,0,0,0,0,0,0,0,0
3,154,242,3,879138235,242,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,...,0,0,0,0,0,0,0,0,0,0
4,306,242,5,876503793,242,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Number of rating rows per item, most-rated first (top five shown).
# After count(), the 'user_id' column holds the per-item count, not an id.
(
    big_df
    .groupby('item_id', as_index=False)['user_id']
    .count()
    .sort_values('user_id', ascending=False)
    .head()
)

Unnamed: 0,item_id,user_id
49,50,583
257,258,509
99,100,508
180,181,507
293,294,485


In [30]:
# Show the first merged row for item 50 (first six columns) to read off its title.
big_df.loc[big_df['item_id'] == 50].iloc[:1, :6]

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,movie_title
50711,290,50,5,880473582,50,Star Wars (1977)


In [63]:
# Build the user x title rating matrix.
# - Ratings that share the same (user_id, movie_title) pair are combined with
#   pivot_table's default aggregation (mean).
# - fill_value=0 stores "not rated" as a rating of 0.
movie_tab = pd.pivot_table(
    big_df,
    values='rating',
    index='user_id',
    columns='movie_title',
    fill_value=0,
)
movie_tab.head()

movie_title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0


In [64]:
# Dimensions of the rating matrix: (number of user_id rows, number of movie_title columns).
movie_tab.shape

(943, 1664)

In [65]:
# Flip the rating matrix so that each row is a movie title and each column a user.
X = np.transpose(movie_tab.values)
X.shape

(1664, 943)

# Movie similarity using 12 SVD components

In [66]:
# Reduce each movie's rating vector to 12 latent components.
# random_state is pinned because TruncatedSVD's default solver is randomized:
# without a seed the factors, and therefore the similarity ranking derived
# from them, change from one run of the notebook to the next.
SVD_model = TruncatedSVD(n_components=12, random_state=42)

In [67]:
# Fit the SVD on the movie x user matrix and keep each movie's reduced representation.
result = SVD_model.fit_transform(X)

In [68]:
# Expect one row per movie and one column per SVD component.
result.shape

(1664, 12)

In [69]:
# Pearson correlation between every pair of movie vectors. Each row of
# `result` is treated as one variable (rowvar=True is NumPy's default),
# which gives a square movie x movie matrix.
corr_mat = np.corrcoef(result, rowvar=True)
corr_mat.shape

(1664, 1664)

In [71]:
# Titles in the column order of movie_tab, which is also the row order of X and corr_mat.
movie_list = list(movie_tab.columns)
len(movie_list)

1664

In [72]:
# Position of Star Wars in movie_list; used to select its row of corr_mat.
star_wars= movie_list.index('Star Wars (1977)')

In [80]:
# Rank every other movie by its correlation with Star Wars.
# The column key 'coorelation' is misspelled in the original and is kept
# unchanged so that any code reading this frame keeps working.
star_wars_df = (
    pd.DataFrame({
        'movie': movie_list,
        'coorelation': corr_mat[star_wars].tolist(),
    })
    # Drop Star Wars itself (its self-correlation is trivially the maximum).
    .loc[lambda frame: frame['movie'] != 'Star Wars (1977)']
    .sort_values('coorelation', ascending=False)
    .reset_index(drop=True)
)
# Show only the movies whose correlation is at least 0.9.
star_wars_df[star_wars_df['coorelation'] >= .9]

Unnamed: 0,movie,coorelation
0,Return of the Jedi (1983),0.988113
1,Terminator 2: Judgment Day (1991),0.937521
2,Toy Story (1995),0.936438
3,"Terminator, The (1984)",0.927054
4,Raiders of the Lost Ark (1981),0.924068
5,"Empire Strikes Back, The (1980)",0.922732
6,"Fugitive, The (1993)",0.913665
7,Die Hard (1988),0.905669


# Movie similarity using 10 SVD components

In [81]:
# Repeat the Star Wars similarity ranking with 10 latent components.
# random_state is pinned because TruncatedSVD's default solver is randomized:
# without a seed the factors and the resulting ranking change between runs,
# which makes the comparison against the 12-component run unreliable.
SVD_model = TruncatedSVD(n_components=10, random_state=42)
result = SVD_model.fit_transform(X)

# Movie x movie Pearson correlation of the reduced vectors (rows are variables).
corr_mat = np.corrcoef(result)

# Titles in the row order of X / corr_mat, and the position of Star Wars in it.
movie_list = movie_tab.columns.tolist()
star_wars = movie_list.index('Star Wars (1977)')

# Rank every other movie by its correlation with Star Wars.
# The column key 'coorelation' is misspelled in the original and is kept
# unchanged so that any code reading this frame keeps working.
star_wars_df = (
    pd.DataFrame({
        'movie': movie_list,
        'coorelation': corr_mat[star_wars].tolist(),
    })
    .loc[lambda frame: frame['movie'] != 'Star Wars (1977)']
    .sort_values('coorelation', ascending=False)
    .reset_index(drop=True)
)
# Show only the movies whose correlation is at least 0.9.
star_wars_df[star_wars_df['coorelation'] >= .9]

Unnamed: 0,movie,coorelation
0,Return of the Jedi (1983),0.989209
1,Terminator 2: Judgment Day (1991),0.95004
2,"Terminator, The (1984)",0.945823
3,Die Hard (1988),0.941495
4,Toy Story (1995),0.935673
5,Raiders of the Lost Ark (1981),0.92955
6,Star Trek: First Contact (1996),0.928196
7,"Empire Strikes Back, The (1980)",0.92148
8,Blade Runner (1982),0.920105
9,"Fugitive, The (1993)",0.918699
