In [1]:
import numpy as np 
import pandas as pd

from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_absolute_error

sathwick18 from GitHub
AnmolNarang from Kaggle

# Load dataset 

In [2]:
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

In [3]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
movieid_dict = pd.Series(data = movies.title.values, index=movies.movieId).to_dict()

In [5]:
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523
5,1,110,4.0,1217896150
6,1,150,3.0,1217895940
7,1,161,4.0,1217897864
8,1,165,3.0,1217897135
9,1,204,0.5,1217895786


# Preprocessing

In [6]:
#Remove the year from title and add onto a separate column
movies['year'] = movies.title.str.extract('(\(\d\d\d\d\))',expand=False)
movies['year']=movies.year.str.extract('(\d\d\d\d)',expand=False)
movies['title']=movies.title.str.replace('(\(\d\d\d\d\))','')
movies['title']=movies.title.apply(lambda x:x.strip())
#movies_df['title']=movies_df.title.str.strip() is an alternative 
movies.head()

  after removing the cwd from sys.path.


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [7]:
def clean_feature_and_return_ndarray(genres):
    lemmatizer = WordNetLemmatizer()
    li=[]
    for i in range(len(genres)):
        temp = genres[i].lower()
        temp = temp.split("|")
        temp = [lemmatizer.lemmatize(word) for word in temp]
        li.append(" ".join(temp))
        
    cv = CountVectorizer()
    genres_matrix = pd.DataFrame(cv.fit_transform(li).toarray(),
                                 columns=cv.get_feature_names(),)
    return genres_matrix

In [8]:
genres_matrix = clean_feature_and_return_ndarray(movies['genres'])

In [9]:
movies_with_genres_df = pd.concat([movies.drop(columns='genres'),genres_matrix], axis= 1)
movies_with_genres_df.head()

Unnamed: 0,movieId,title,year,action,adventure,animation,child,comedy,crime,documentary,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
movie_genre_matrix = movies_with_genres_df.set_index('movieId').drop(columns=['title','year'])
movie_genre_matrix.head()

Unnamed: 0_level_0,action,adventure,animation,child,comedy,crime,documentary,drama,fantasy,fi,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Recommendation

In [11]:
def get_user_ratings(uid):
    """Get the id of the highest rated movie watched by user from the rating dataset"""
    rate_list = ratings.loc[ratings["userId"]==uid,["movieId","rating"]]
    return rate_list.sort_values(by="rating", ascending=False)

def get_user_genre_matrix(user_ratings):
    user_genre_matrix = movies_with_genres_df[movies_with_genres_df['movieId'].isin(user_ratings['movieId'].tolist())]
    user_genre_matrix = user_genre_matrix.drop(columns=['title','movieId','year','genres'])
    return user_genre_matrix

def get_user_profile(user_ratings, user_genre_matrix):
    user_profile = user_genre_matrix.T.values @ user_ratings['rating']
    user_profile = pd.Series(data=user_profile, 
                             index=user_genre_matrix.columns)
    return user_profile

def get_movie_by_id(mv_id):
    return movieid_dict[mv_id]

## Get user rating data 

In [12]:
user1 = get_user_ratings(2)
user1.sort_values(by="rating", ascending=False)[:10]

Unnamed: 0,movieId,rating
113,1,5.0
139,805,5.0
117,17,5.0
120,36,5.0
123,62,5.0
130,608,5.0
141,1356,4.0
116,14,4.0
121,52,4.0
122,58,4.0


## Get user-genre matrix

In [13]:
user_genre_matrix = get_user_genre_matrix(user1)
user_genre_matrix

Unnamed: 0,action,adventure,animation,child,comedy,crime,documentary,drama,fantasy,fi,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,0,1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
16,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
24,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
31,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,1,1,0,0
35,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
48,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
53,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


## Compute user genre weight vector 

In [14]:
user_profile = get_user_profile(user1, user_genre_matrix)
user_profile

action         33.0
adventure      38.0
animation       9.0
child          11.0
comedy         43.0
crime          12.0
documentary     0.0
drama          45.0
fantasy        14.0
fi             16.0
film            0.0
horror          0.0
imax            0.0
listed          0.0
musical         6.0
mystery         8.0
no              0.0
noir            0.0
romance        33.0
sci            16.0
thriller       43.0
war             0.0
western         0.0
dtype: float64

## Calculate predicted rating with movie genre matrix 

In [15]:
movie_genre_matrix.head()

Unnamed: 0_level_0,action,adventure,animation,child,comedy,crime,documentary,drama,fantasy,fi,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
predicted_ratings = ((movie_genre_matrix*user_profile).sum(axis=1))*5/(user_profile.sum())
predicted_ratings

movieId
1         1.758410
2         0.963303
3         1.162080
4         1.850153
5         0.657492
            ...   
146684    0.963303
146878    0.657492
148238    0.657492
148626    0.688073
149532    0.000000
Length: 10329, dtype: float64

## Bring it all together with function 

In [17]:
def predict_rating(uid):
    rating = get_user_ratings(uid)
    genre_matrix = get_user_genre_matrix(rating)
    user_profile = get_user_profile(rating,genre_matrix)
    
    predicted_rating = ((movie_genre_matrix*user_profile).sum(axis=1))*5/(user_profile.sum())
    
    return predicted_rating

def recommend_new_movie(userID, no_of_movies = 10):
    """ Get recommendation lists for new movies """
    watched_movie_id = get_user_ratings(userID)['movieId'].values #Get the ID of watched movie
    predicted_ratings = predict_rating(userID) #Get predicted movie from users
    
    movies_not_watched = predicted_ratings.drop(watched_movie_id) #Remove watched movie from the predicted movie list
    
    movie_names = list(map(get_movie_by_id,movies_not_watched.index)) #Get movies titles
    movies_not_watched.index = movie_names #Assign the movie titles as the new index
    movies_not_watched.sort_values(ascending = False, inplace = True) # Sort ratings from high to low
    
    return movies_not_watched.head(no_of_movies)

In [18]:
recommend_new_movie(2, 20)

Stunt Man, The (1980)                                                  3.593272
Lupin III: Sweet Lost Night (Rupan Sansei: Sweet Lost Night) (2008)    3.455657
Lupin III: First Contact (Rupan Sansei: Faasuto Kontakuto) (2002)      3.455657
Interstate 60 (2002)                                                   3.409786
Rubber (2010)                                                          3.394495
Osmosis Jones (2001)                                                   3.333333
Getaway, The (1994)                                                    3.119266
Chase, The (1994)                                                      3.088685
Hunting Party, The (2007)                                              3.088685
Casanova (2005)                                                        2.935780
King Solomon's Mines (1937)                                            2.935780
Northwest Passage (1940)                                               2.935780
Day After Tomorrow, The (2004)          

# Evaluate 

In [19]:
def evaluate(userID):
    user_rating_df = ratings[ratings.userId == userID] # Get user rating df
    
    true_rating = user_rating_df['rating'] # User rating
    
    user_rating = predict_rating(userID)
    predicted_rating = user_rating_df['movieId'].apply(lambda x:user_rating[x]) # Get rating from recommendation
    
    MAE = mean_absolute_error(true_rating,predicted_rating) 
    
    return MAE

## Evaluate on top 20 users with most reviews  

In [20]:
# get the users that watch the most movies
test_usersID = ratings['userId'].value_counts().head(20)
test_usersID = test_usersID.index

test_usersID

Int64Index([668, 575, 458, 232, 310, 475, 128, 224, 607,  63, 451, 627, 413,
             62, 461, 164, 354, 402, 220, 109],
           dtype='int64')

In [21]:
MAE_from_top_20_users = []

for user in test_usersID:
    MAE_from_top_20_users.append(evaluate(user))
    
MAE_table = pd.Series(data = MAE_from_top_20_users, index=test_usersID)
pd.DataFrame(MAE_table, columns = ["MAE"])
CB_MAE_table = pd.DataFrame(MAE_table, columns = ["MAE"])
CB_MAE_table.index.name = "user_Id"
CB_MAE_table

Unnamed: 0_level_0,MAE
user_Id,Unnamed: 1_level_1
668,1.593775
575,1.728695
458,2.048477
232,2.656871
310,1.681317
475,2.047654
128,2.096211
224,2.331586
607,2.033536
63,1.767417


In [22]:
print("mean:",MAE_table.mean())
print("std:",MAE_table.std())

mean: 2.0737807395119505
std: 0.4024415404630671


In [None]:
MAE_table.hist()