In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# Both .dat files share the same layout: "::"-delimited, latin-1 encoded.
# A multi-character separator requires the slower python parsing engine.
dat_format = dict(sep="::", engine="python", encoding="latin-1")

# Movies are keyed by MovieID so the ratings can be joined onto them later.
movies = pd.read_csv("data/movies.dat", index_col="MovieID", **dat_format)
ratings = pd.read_csv("data/ratings.dat", **dat_format)


In [19]:
# Display the movies table (the rendered output elides the middle rows).
movies

Unnamed: 0_level_0,Title,Genres
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy
...,...,...
3948,Meet the Parents (2000),Comedy
3949,Requiem for a Dream (2000),Drama
3950,Tigerland (2000),Drama
3951,Two Family House (2000),Drama


In [20]:
# Attach every rating to its movie. The left join keeps all rows of `movies`;
# NOTE(review): a movie with no ratings would get NaN UserID/Rating/Timestamp,
# which is presumably why those columns render as floats below -- confirm.
df = pd.merge(movies, ratings, on="MovieID", how="left")

In [21]:
# Display the merged movie/rating rows (one row per movie-rating pair).
df

Unnamed: 0,MovieID,Title,Genres,UserID,Rating,Timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,5.0,9.788243e+08
1,1,Toy Story (1995),Animation|Children's|Comedy,6.0,4.0,9.782370e+08
2,1,Toy Story (1995),Animation|Children's|Comedy,8.0,4.0,9.782335e+08
3,1,Toy Story (1995),Animation|Children's|Comedy,9.0,5.0,9.782260e+08
4,1,Toy Story (1995),Animation|Children's|Comedy,10.0,5.0,9.782265e+08
...,...,...,...,...,...,...
1000381,3952,"Contender, The (2000)",Drama|Thriller,5812.0,4.0,9.920721e+08
1000382,3952,"Contender, The (2000)",Drama|Thriller,5831.0,3.0,9.862231e+08
1000383,3952,"Contender, The (2000)",Drama|Thriller,5837.0,4.0,1.011903e+09
1000384,3952,"Contender, The (2000)",Drama|Thriller,5927.0,1.0,9.798525e+08


In [31]:
# Number of rows (i.e. ratings) per title, as a one-column frame indexed by title.
title_frequencies = df["Title"].value_counts()
comment_counts = title_frequencies.to_frame()

In [32]:
# Show the 20 most frequently rated titles (value_counts() sorts descending).
comment_counts.head(20)

Unnamed: 0_level_0,count
Title,Unnamed: 1_level_1
American Beauty (1999),3428
Star Wars: Episode IV - A New Hope (1977),2991
Star Wars: Episode V - The Empire Strikes Back (1980),2990
Star Wars: Episode VI - Return of the Jedi (1983),2883
Jurassic Park (1993),2672
Saving Private Ryan (1998),2653
Terminator 2: Judgment Day (1991),2649
"Matrix, The (1999)",2590
Back to the Future (1985),2583
"Silence of the Lambs, The (1991)",2578


In [33]:
# Titles with at most this many ratings are treated as "rare" and filtered out later.
rare_rating_limit = 70
is_rare = comment_counts["count"] <= rare_rating_limit
rare_movies = comment_counts.loc[is_rare].index

In [34]:
# Inspect the index of titles flagged as rarely rated.
rare_movies

Index(['Braindead (1992)',
       'Girl on the Bridge, The (La Fille sur le Pont) (1999)',
       'Broken Hearts Club, The (2000)', 'Play it to the Bone (1999)',
       'Governess, The (1998)', ''Night Mother (1986)', 'Carrington (1995)',
       'Soldier's Daughter Never Cries, A (1998)',
       'House of the Spirits, The (1993)', 'Repulsion (1965)',
       ...
       'Baton Rouge (1988)', 'Mascara (1999)', 'To Cross the Rubicon (1991)',
       'Outside Ozona (1998)', 'Reach the Rock (1997)',
       'Portraits Chinois (1996)', 'For Ever Mozart (1996)',
       'Innocent Sleep, The (1995)', 'Mondo (1996)',
       'Prince of Central Park, The (1999)'],
      dtype='object', name='Title', length=1621)

In [36]:
# Keep only the rating rows whose title is NOT in the rare list.
is_common = ~df["Title"].isin(rare_movies)
common_movies = df.loc[is_common]
common_movies

Unnamed: 0,MovieID,Title,Genres,UserID,Rating,Timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,5.0,9.788243e+08
1,1,Toy Story (1995),Animation|Children's|Comedy,6.0,4.0,9.782370e+08
2,1,Toy Story (1995),Animation|Children's|Comedy,8.0,4.0,9.782335e+08
3,1,Toy Story (1995),Animation|Children's|Comedy,9.0,5.0,9.782260e+08
4,1,Toy Story (1995),Animation|Children's|Comedy,10.0,5.0,9.782265e+08
...,...,...,...,...,...,...
1000381,3952,"Contender, The (2000)",Drama|Thriller,5812.0,4.0,9.920721e+08
1000382,3952,"Contender, The (2000)",Drama|Thriller,5831.0,3.0,9.862231e+08
1000383,3952,"Contender, The (2000)",Drama|Thriller,5837.0,4.0,1.011903e+09
1000384,3952,"Contender, The (2000)",Drama|Thriller,5927.0,1.0,9.798525e+08


In [39]:
# Build the user x title rating matrix: one row per user, one column per title.
# pivot_table's default aggregation (mean) collapses any repeated user/title pair.
user_movie_df = pd.pivot_table(
    common_movies,
    values="Rating",
    index=["UserID"],
    columns=["Title"],
)

In [40]:
# Peek at the first users; a cell is NaN where the user has no rating for that title.
user_movie_df.head()

Title,"'burbs, The (1989)",...And Justice for All (1979),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),"13th Warrior, The (1999)",2 Days in the Valley (1996),20 Dates (1998),"20,000 Leagues Under the Sea (1954)",...,You've Got Mail (1998),Young Doctors in Love (1982),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Your Friends and Neighbors (1998),Zero Effect (1998),eXistenZ (1999)
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,5.0,4.0,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,


In [62]:
# Pick one user reproducibly (fixed random_state) to act as the target of the analysis.
# Extract the sampled value with .iloc[0] before converting to int: calling int() on the
# 1-element `.values` array is deprecated in recent NumPy and emitted a warning here.
sampled_user_id = pd.Series(user_movie_df.index).sample(1, random_state=47)
random_user = int(sampled_user_id.iloc[0])
random_user

  random_user = int(pd.Series(user_movie_df.index).sample(1, random_state=47).values)


5237

In [63]:
# Slice out the sampled user's row (kept as a one-row frame, not a Series).
is_target_user = user_movie_df.index == random_user
random_user_df = user_movie_df.loc[is_target_user]

# Number of title columns in the matrix.
random_user_df.shape[1]

2262

In [64]:
# A title counts as "watched" when the sampled user's row holds a non-NaN rating for it.
has_rating = random_user_df.notna().any(axis=0)
movies_watched = random_user_df.columns[has_rating].tolist()
len(movies_watched)

55

In [65]:
# List the titles the sampled user has rated.
movies_watched

['Air Force One (1997)',
 'Alien (1979)',
 'Batman (1989)',
 'Benny & Joon (1993)',
 'Crocodile Dundee (1986)',
 'Dances with Wolves (1990)',
 'Die Hard (1988)',
 'Easy Rider (1969)',
 'Escape from New York (1981)',
 'Eyes Wide Shut (1999)',
 "Ferris Bueller's Day Off (1986)",
 'Fox and the Hound, The (1981)',
 'Galaxy Quest (1999)',
 'GoldenEye (1995)',
 'Goldfinger (1964)',
 'Good Will Hunting (1997)',
 'Green Mile, The (1999)',
 'Grumpy Old Men (1993)',
 'Highlander (1986)',
 'Indiana Jones and the Last Crusade (1989)',
 'Indiana Jones and the Temple of Doom (1984)',
 'Jaws (1975)',
 'Jurassic Park (1993)',
 'King Kong (1933)',
 'Last of the Mohicans, The (1992)',
 'Liar Liar (1997)',
 'Lost World: Jurassic Park, The (1997)',
 'Mad Max (1979)',
 'Mission: Impossible (1996)',
 'Monty Python and the Holy Grail (1974)',
 'Mummy, The (1999)',
 'Pelican Brief, The (1993)',
 'Predator (1987)',
 'Raiders of the Lost Ark (1981)',
 'Rock, The (1996)',
 'Rocky (1976)',
 'Romancing the Stone (

In [66]:
# Restrict the full user x title matrix to the titles the sampled user rated.
movies_watched_df = user_movie_df.loc[:, movies_watched]


In [67]:
# For each user, count how many of the sampled user's titles they have also rated
# (row-wise count of non-null cells).
user_movie_count = movies_watched_df.notnull().sum(axis=1)
user_movie_count

UserID
1.0        5
2.0       15
3.0       16
4.0       14
5.0        8
          ..
6036.0    34
6037.0    15
6038.0     3
6039.0     4
6040.0    21
Length: 6040, dtype: int64

In [68]:
# Turn the per-user overlap counts into a two-column frame (UserID, count).
# The counts are rebuilt from movies_watched_df rather than resetting
# `user_movie_count` onto itself: the self-referential form is not idempotent --
# re-running the cell would add an extra "index" column and break the column
# rename that follows.
user_movie_count = movies_watched_df.notnull().sum(axis=1).reset_index()
user_movie_count

Unnamed: 0,UserID,0
0,1.0,5
1,2.0,15
2,3.0,16
3,4.0,14
4,5.0,8
...,...,...
6035,6036.0,34
6036,6037.0,15
6037,6038.0,3
6038,6039.0,4


In [69]:
# Give the two columns descriptive names: the user id and the number of shared titles.
user_movie_count = user_movie_count.set_axis(["userId", "movie_count"], axis=1)
user_movie_count

Unnamed: 0,userId,movie_count
0,1.0,5
1,2.0,15
2,3.0,16
3,4.0,14
4,5.0,8
...,...,...
6035,6036.0,34
6036,6037.0,15
6037,6038.0,3
6038,6039.0,4


In [70]:
# Users sharing more than 20 titles with the sampled user, largest overlap first.
(
    user_movie_count
    .query("movie_count > 20")
    .sort_values(by="movie_count", ascending=False)
    .head(20)
)

Unnamed: 0,userId,movie_count
5366,5367.0,55
5236,5237.0,55
1940,1941.0,54
5830,5831.0,54
2908,2909.0,52
5794,5795.0,52
1448,1449.0,52
1180,1181.0,52
4168,4169.0,52
1446,1447.0,51


In [71]:
# How many users rated every title the sampled user rated?
# Compare against len(movies_watched) instead of a hard-coded 55 so the check keeps
# working if a different user is sampled.
user_movie_count[user_movie_count["movie_count"] == len(movies_watched)].count()

userId         2
movie_count    2
dtype: int64

In [72]:
# Ids of users who share more than 20 rated titles with the sampled user.
has_enough_overlap = user_movie_count["movie_count"] > 20
users_same_movies = user_movie_count.loc[has_enough_overlap, "userId"]
users_same_movies.head()

9     10.0
12    13.0
17    18.0
18    19.0
21    22.0
Name: userId, dtype: float64

In [73]:
# NOTE(review): this cell repeats the preceding cell's computation; it is harmless
# to re-run but is a candidate for removal.
# Ids of users who share more than 20 rated titles with the sampled user.
users_same_movies = user_movie_count.query("movie_count > 20")["userId"]
users_same_movies.head()

9     10.0
12    13.0
17    18.0
18    19.0
21    22.0
Name: userId, dtype: float64

In [74]:
# Ratings of the sufficiently-overlapping users, restricted to the sampled user's titles,
# stacked with the sampled user's own row so they are guaranteed to be present.
overlapping_users_df = movies_watched_df[movies_watched_df.index.isin(users_same_movies)]
final_df = pd.concat([overlapping_users_df, random_user_df[movies_watched]])

# The sampled user can already be among the overlapping users (they are in this run),
# in which case the concat lists them twice. Keep exactly one row per UserID so the
# index stays unique for the correlation step.
final_df = final_df[~final_df.index.duplicated(keep="first")]
final_df.head()

Title,Air Force One (1997),Alien (1979),Batman (1989),Benny & Joon (1993),Crocodile Dundee (1986),Dances with Wolves (1990),Die Hard (1988),Easy Rider (1969),Escape from New York (1981),Eyes Wide Shut (1999),...,Star Wars: Episode VI - Return of the Jedi (1983),Superman (1978),Superman II (1980),Terminator 2: Judgment Day (1991),"Terminator, The (1984)",Total Recall (1990),Toy Story 2 (1999),True Lies (1994),Who Framed Roger Rabbit? (1988),"Wizard of Oz, The (1939)"
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.0,,4.0,4.0,5.0,5.0,5.0,,,4.0,,...,4.0,5.0,,4.0,5.0,,4.0,,,5.0
13.0,,,3.0,,3.0,4.0,,,3.0,,...,4.0,3.0,3.0,5.0,,3.0,,3.0,3.0,4.0
18.0,,,4.0,,3.0,4.0,3.0,,,,...,5.0,,,5.0,5.0,3.0,,3.0,5.0,5.0
19.0,3.0,2.0,4.0,,,,5.0,,,,...,4.0,3.0,,4.0,3.0,,4.0,4.0,4.0,4.0
22.0,1.0,4.0,2.0,2.0,2.0,,4.0,,,,...,3.0,,,4.0,4.0,4.0,,4.0,4.0,3.0


In [77]:
# Drop rows whose rating values are exact duplicates of an earlier row.
unique_rows = final_df.drop_duplicates()

# Transpose so each user becomes a column; corr() then yields the user x user
# correlation matrix over the shared titles.
user_corr_matrix = unique_rows.T.corr()

# Flatten to a (UserID, UserID)-indexed Series and order by correlation, ascending.
corr_df = user_corr_matrix.unstack().sort_values()

In [78]:
# Inspect the sorted pairwise correlations (NaN pairs appear at the end).
corr_df

UserID  UserID
4819.0  713.0    -1.0
713.0   4819.0   -1.0
4373.0  575.0    -1.0
2380.0  808.0    -1.0
669.0   2943.0   -1.0
                 ... 
6040.0  272.0     NaN
        1132.0    NaN
        1835.0    NaN
        3414.0    NaN
        3902.0    NaN
Length: 1833316, dtype: float64

In [None]:
# Wrap the flattened correlations in a single-column frame named "corr".
corr_column_labels = ["corr"]
corr_df = pd.DataFrame(data=corr_df, columns=corr_column_labels)