# Item-Based Collaborative Filtering

In [2]:
import pandas as pd
pd.set_option('display.max_columns', 500)

# Section I - Data Preparation

### Step #1
Read Data

In [6]:
# Load the two raw tables from CSV: `movie` and `rating`.
# They are joined on `movieId` in the next step.
movie = pd.read_csv('datasets/movie_lens_dataset/movie.csv')
rating = pd.read_csv('datasets/movie_lens_dataset/rating.csv')


### Step #2
Merge data

In [8]:
# Attach every rating row to its movie record. The left join keeps movies
# that have no ratings at all (their rating columns are NaN).
df = pd.merge(movie, rating, on="movieId", how="left")
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41


# Section II - User Movie Dataframe Preparation

In [9]:
# Number of distinct titles in the merged table.
df["title"].nunique()

27262

In [11]:
# Rows per title, most frequent first (first five shown).
df["title"].value_counts().head()

Pulp Fiction (1994)                 67310
Forrest Gump (1994)                 66172
Shawshank Redemption, The (1994)    63366
Silence of the Lambs, The (1991)    63299
Jurassic Park (1993)                59715
Name: title, dtype: int64

In [12]:
# Tally how many rows each title has in the merged table, as a one-column frame
# indexed by title.
comment_counts = df["title"].value_counts().to_frame()

In [13]:
# Identify rare movies: titles with 1000 rows or fewer.
# The single counts column is selected by position because its label differs
# between pandas versions ("title" before 2.0, "count" from 2.0 onwards).
rare_movies = comment_counts[comment_counts.iloc[:, 0] <= 1000].index


In [14]:
# Keep only the rows whose title is not in the rare set, then check the size.
common_movies = df.loc[~df["title"].isin(rare_movies)]
common_movies.shape

(17766015, 6)

In [15]:
# Distinct titles that remain after removing the rare ones.
common_movies["title"].nunique()

3159

In [16]:
# Distinct titles in the unfiltered table, for comparison with the filtered count.
df["title"].nunique()

27262

In [17]:
# Reshape to a user x title matrix of ratings (one row per userId, one column
# per title); cells are NaN where the user has no rating for that title.
user_movie_df = pd.pivot_table(
    common_movies,
    values="rating",
    index=["userId"],
    columns=["title"],
)

In [18]:
# (number of users, number of titles) in the pivoted matrix.
user_movie_df.shape

(138493, 3159)

In [19]:
# Column labels of the matrix are the movie titles.
user_movie_df.columns

Index([''burbs, The (1989)', '(500) Days of Summer (2009)',
       '*batteries not included (1987)', '...And Justice for All (1979)',
       '10 Things I Hate About You (1999)', '10,000 BC (2008)',
       '101 Dalmatians (1996)',
       '101 Dalmatians (One Hundred and One Dalmatians) (1961)',
       '102 Dalmatians (2000)', '12 Angry Men (1957)',
       ...
       'Zero Dark Thirty (2012)', 'Zero Effect (1998)', 'Zodiac (2007)',
       'Zombieland (2009)', 'Zoolander (2001)', 'Zulu (1964)', '[REC] (2007)',
       'eXistenZ (1999)', 'xXx (2002)', '¡Three Amigos! (1986)'],
      dtype='object', name='title', length=3159)

# Section III - Item-Based Movie Recommendations

In [23]:
# Correlate every title's rating column with the ratings of one reference movie
# and show the ten highest correlations (the reference movie itself scores 1.0).
# The title string and its ratings column are kept under separate names so
# `movie_name` always holds a title.
movie_name = "Matrix, The (1999)"
movie_ratings = user_movie_df[movie_name]
user_movie_df.corrwith(movie_ratings).sort_values(ascending=False).head(10)

title
Matrix, The (1999)                                           1.000000
Matrix Reloaded, The (2003)                                  0.516906
Matrix Revolutions, The (2003)                               0.449588
Animatrix, The (2003)                                        0.367151
Blade (1998)                                                 0.334493
Terminator 2: Judgment Day (1991)                            0.333882
Minority Report (2002)                                       0.332434
Edge of Tomorrow (2014)                                      0.326762
Mission: Impossible (1996)                                   0.320815
Lord of the Rings: The Fellowship of the Ring, The (2001)    0.318726
dtype: float64

In [24]:
# Same correlation ranking for a randomly chosen title. The draw is unseeded,
# so each run picks a different movie.
# The title string and its ratings column are kept under separate names so
# `movie_name` always holds a title.
movie_name = pd.Series(user_movie_df.columns).sample(1).values[0]
movie_ratings = user_movie_df[movie_name]
user_movie_df.corrwith(movie_ratings).sort_values(ascending=False).head(10)

title
Sullivan's Travels (1941)                                               1.000000
Fireworks (Hana-bi) (1997)                                              0.607394
Throne of Blood (Kumonosu jô) (1957)                                    0.579674
Sanjuro (Tsubaki Sanjûrô) (1962)                                        0.567440
How the Grinch Stole Christmas! (1966)                                  0.562402
Death at a Funeral (2007)                                               0.551511
Prophet, A (Un Prophète) (2009)                                         0.548685
Nausicaä of the Valley of the Wind (Kaze no tani no Naushika) (1984)    0.535331
Lady Eve, The (1941)                                                    0.532797
My Man Godfrey (1936)                                                   0.530149
dtype: float64

In [25]:
def check_film(keyword, user_movie_df, case_sensitive=True):
    """Return the column labels of ``user_movie_df`` that contain ``keyword``.

    Parameters
    ----------
    keyword : str
        Substring to look for in each column label.
    user_movie_df : pandas.DataFrame
        Frame whose column labels are searched (labels must be strings).
    case_sensitive : bool, default True
        When False, the comparison ignores letter case.

    Returns
    -------
    list
        Matching column labels, in column order; empty if nothing matches.
    """
    if case_sensitive:
        return [col for col in user_movie_df.columns if keyword in col]
    needle = keyword.casefold()
    return [col for col in user_movie_df.columns if needle in col.casefold()]

# List every title in the matrix that contains the keyword "Insomnia".
check_film("Insomnia", user_movie_df)

['Insomnia (1997)', 'Insomnia (2002)']

# Section IV - Project Script

In [27]:
def create_user_movie_df(movie_path='datasets/movie_lens_dataset/movie.csv',
                         rating_path='datasets/movie_lens_dataset/rating.csv',
                         rare_threshold=1000):
    """Build the user x title rating matrix from the movie and rating CSVs.

    Parameters
    ----------
    movie_path : str or path-like
        CSV with at least the ``movieId`` and ``title`` columns.
    rating_path : str or path-like
        CSV with at least the ``movieId``, ``userId`` and ``rating`` columns.
    rare_threshold : int, default 1000
        Titles with this many rows or fewer in the merged table are dropped.
        (A movie without any rating still contributes one row because the
        tables are left-joined.)

    Returns
    -------
    pandas.DataFrame
        One row per ``userId`` and one column per remaining title. Cells hold
        the rating (``pivot_table``'s default aggregation) and are NaN where
        the user has not rated the title.
    """
    movie = pd.read_csv(movie_path)
    rating = pd.read_csv(rating_path)
    df = movie.merge(rating, how="left", on="movieId")
    # Filter on the counts Series directly. Wrapping it in a DataFrame and
    # indexing by column label breaks across pandas versions, because the
    # column is called "title" before pandas 2.0 and "count" afterwards.
    title_counts = df["title"].value_counts()
    rare_movies = title_counts[title_counts <= rare_threshold].index
    common_movies = df[~df["title"].isin(rare_movies)]
    return common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")

# Rebuild the user x title matrix through the packaged function
# (this re-reads both CSV files and repeats the merge, filter and pivot).
user_movie_df = create_user_movie_df()

In [31]:
def item_based_recommender(movie_name, user_movie_df, top_n=9):
    """Rank the other titles by how strongly their ratings correlate with one movie.

    Parameters
    ----------
    movie_name : str
        Column label of the reference movie in ``user_movie_df``.
    user_movie_df : pandas.DataFrame
        User x title rating matrix (rows are users, columns are titles).
    top_n : int, default 9
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Correlation with the reference movie, indexed by title, sorted from
        highest to lowest, never containing ``movie_name`` itself.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of ``user_movie_df``.
    """
    movie_ratings = user_movie_df[movie_name]
    correlations = user_movie_df.corrwith(movie_ratings)
    # Remove the reference movie by label rather than by skipping position 0:
    # another title can tie at 1.0 (or the self-correlation can be NaN), in
    # which case the reference movie is not guaranteed to sort first.
    return (
        correlations
        .drop(labels=movie_name)
        .sort_values(ascending=False)
        .head(top_n)
    )

# Recommendations for the same reference movie used in Section III.
item_based_recommender("Matrix, The (1999)", user_movie_df)

title
Matrix Reloaded, The (2003)                                  0.516906
Matrix Revolutions, The (2003)                               0.449588
Animatrix, The (2003)                                        0.367151
Blade (1998)                                                 0.334493
Terminator 2: Judgment Day (1991)                            0.333882
Minority Report (2002)                                       0.332434
Edge of Tomorrow (2014)                                      0.326762
Mission: Impossible (1996)                                   0.320815
Lord of the Rings: The Fellowship of the Ring, The (2001)    0.318726
dtype: float64

In [39]:
# Draw one title at random, show which one was picked, then list its recommendations.
movie_name = pd.Series(user_movie_df.columns).sample(1).iloc[0]
print(movie_name)
item_based_recommender(movie_name, user_movie_df)

Out-of-Towners, The (1999)


title
[REC] (2007)                                                            0.706472
Ghost in the Shell 2: Innocence (a.k.a. Innocence) (Inosensu) (2004)    0.672256
Jumper (2008)                                                           0.663527
Expendables, The (2010)                                                 0.640561
Look Who's Talking Too (1990)                                           0.639817
Just Married (2003)                                                     0.612288
Flightplan (2005)                                                       0.602856
Beethoven (1992)                                                        0.601059
S.W.A.T. (2003)                                                         0.592729
dtype: float64