In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [None]:
!pip3 install surprise

In [61]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from collections import defaultdict

In [2]:
movies = pd.read_csv('https://raw.githubusercontent.com/nchichilidze/RS-with-GE/main/preprocessed_movielens_1m/movies.csv')
tags = pd.read_csv('https://raw.githubusercontent.com/nchichilidze/RS-with-GE/main/preprocessed_movielens_1m/tags.csv')
ratings = pd.read_csv('https://raw.githubusercontent.com/nchichilidze/RS-with-GE/main/preprocessed_movielens_1m/ratings.csv')
users = pd.read_csv('https://raw.githubusercontent.com/nchichilidze/RS-with-GE/main/preprocessed_movielens_1m/users.csv')

movies = movies.iloc[: , 1:]
tags = tags.iloc[: , 1:]
ratings = ratings.iloc[: , 1:]
users = users.iloc[: , 1:]

In [3]:
main = pd.DataFrame(ratings)

# Feature 1: Content-Based Filtering 

Get all the movies that a user likes - and use content-based filtering to generate a list of similar movies that the user might also like 

The movies get grouped by analysing the tags associated with each movie (tags are a combination of user comments and movie genres)

Cast the columns in movies and tags

In [4]:
tags = tags.astype({'tags' : 'string'})
tags = tags.astype({'title' : 'string'})
tags = tags.astype({'genres' : 'string'})
tags.dtypes

movie_id     int64
tags        string
title       string
genres      string
year         int64
dtype: object

In [5]:
movies = movies.astype({'title' : 'string'})
movies = movies.astype({'genres' : 'string'})
movies.dtypes

movie_id     int64
title       string
genres      string
year         int64
dtype: object

In [6]:
# an example of a tag
tag_example = tags['tags'][0]
tag_example

"pixar pixar pixar animation pixar animated fun toy toys pixar rated g pixar pixar national film registry time travel pixar pixar funny imdb top  animation very good ya boy pixar time travel animation erlend s dvds cgi disney family pixar toys bright daring rescues fanciful heroic mission humorous light rousing toys come to life unlikely friendships warm witty pixar the boys almost favorite é ä é animation children disney pixar clever want to see again cartoon disney d computer animation disney animated feature pixar animation want buddy movie animation pixar tim allen tom hanks animation disney pixar toys computer animation cartoon pixar animation computer animation pixar toys tumey s to see again tumey s vhs classic disney engaging avi buy action figure action figures buzz lightyear cg animation toy toys woody disney pixar tim allen tom hanks animation pixar animation kids movie pixar tim allen time travel animation cgi adventure animation children comedy animation pixar lots of hear

In [7]:
# make sure that every movie has a tag
tags.isna().sum()

movie_id    0
tags        0
title       0
genres      0
year        0
dtype: int64

In [8]:
# how many ratings we have 
print(len(ratings['movie_id']))

# how many unique movies we have in ratings
movies_in_ratings = ratings['movie_id'].drop_duplicates()
print(len(movies_in_ratings))

1000209
3706


In [9]:
# it's redundant to draw information about movies that are not in the ratings df, so I will drop them 
print("Tags shape before dropping extra movies: " + str(tags.shape))
tags = tags[tags.movie_id.map(lambda x: np.isin(x, movies_in_ratings).all())]
print("Tags shape after dropping extra movies: " + str(tags.shape))
# drop any movie that we have twice
tags = tags.drop_duplicates(subset=['title'])
print("Tags shape after dropping duplicate movies: " + str(tags.shape))

Tags shape before dropping extra movies: (3883, 5)
Tags shape after dropping extra movies: (3706, 5)
Tags shape after dropping duplicate movies: (3664, 5)


## Content-Based Filtering Engine

In [10]:
tags = tags.reset_index()
tags = tags.drop(columns=['index'])
tags.tail()

When building the matrix, the issue is that since the length of the tags dataframe is not the same as the maximum index of movie in the dataframe

In [11]:
print("max movie id: " + str(tags['movie_id'].max()))
print("length of the tags dataframe: " + str(len(tags['movie_id'])))

max movie id: 3952
length of the tags dataframe: 3664


In [49]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
matrix = tf.fit_transform(tags['tags'])

cosine_similarities = linear_kernel(matrix,matrix)
movie_title = tags['title']
indices = pd.Series(tags.index, index=tags['title'])

def recommend_movies(original_title):

    idx = indices[original_title]
    sim_scores = list(enumerate(cosine_similarities[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:51]

    movie_indices = [i[0] for i in sim_scores]
    titles = movie_title.iloc[movie_indices]
    return titles, movie_indices

In [50]:
t, i  = recommend_movies('Clueless')
print(t)

27                                             Persuasion
2833                                       Mansfield Park
786                                                  Emma
16                                  Sense and Sensibility
62                                          Two if by Sea
113                                          If Lucy Fell
278                                    Nina Takes a Lover
285                                     Perez Family, The
286                            Pyromaniac's Love Story, A
368                                            Speechless
433                                            Favor, The
552                                     Naked in New York
761       Rendezvous in Paris (Rendez-vous de Paris, Les)
1341                        Beautician and the Beast, The
1343                                        Hotel de Love
1349                                     That Old Feeling
1353                                           Booty Call
1422          

Clueless is based on 'Emma' by Jane Austen, Persuasion is also a Jane Austen book, therefore I think CBF works quite well :) 

## Generate similar movies

In [51]:
# any movies in ratings that we don't have tags for? 
print(list(set(ratings['movie_id']) - set(tags['movie_id'])))
movies_in_tags = tags['movie_id'].drop_duplicates()

[2561, 2820, 2565, 2059, 3723, 3598, 915, 1941, 2455, 3096, 2078, 2719, 2848, 3616, 3744, 2212, 2085, 2729, 1460, 3126, 2364, 957, 2367, 1344, 3784, 2633, 2634, 2764, 3404, 3022, 2512, 1873, 2389, 2645, 2135, 2136, 2398, 3935, 3947, 3823, 2553, 2430]


for each movie in ratings, generate a list of n similar movies

In [52]:
similar_movies_map = {} 
no_similar_movies_count = 0

# getting rid of any movies we don't have tags for 
movies_in_ratings = list(set(movies_in_ratings) and set(movies_in_tags))

for movie in movies_in_ratings: 
  title = str(tags[tags['movie_id'] == movie]['title'].values[0])
  # actual_index = tags.index[tags['movie_id'] == movie]
  t, ind = recommend_movies(title)
  similar_movies_map[movie] = ind;
  if (len(ind) == 0): 
    no_similar_movies_count += 1

print("No similar movies found for " + str(no_similar_movies_count) + " movies")

No similar movies found for 0 movies


## Add feature to dataframe

get favourite movies by each user

In [53]:
#  get movies that users rated positively 
positive_ratings = pd.DataFrame()
positive_ratings['user_id'] = users['user_id']

In [54]:
positive_ratings['movies'] = None

for index, row in positive_ratings.iterrows():
    user = int(row['user_id'])
    # get a list of movies that the user rated with 4 or 5
    rated_movies = list(ratings.loc[(ratings['user_id'] == user) & (ratings['rating'] >= 3)]['movie_id'].values)
    positive_ratings.at[index, 'movies'] = rated_movies


In [55]:
positive_ratings.head()

Unnamed: 0,user_id,movies
0,1,"[1193, 661, 914, 3408, 2355, 1197, 1287, 2804,..."
1,2,"[1357, 3068, 1537, 647, 2194, 648, 2268, 2628,..."
2,3,"[3421, 648, 1394, 3534, 104, 2735, 1210, 1431,..."
3,4,"[3468, 1210, 2951, 1214, 1036, 260, 2028, 480,..."
4,5,"[2987, 2333, 1175, 39, 2337, 1535, 1392, 1466,..."


In [56]:
positive_ratings['CBF'] = None
count = 0
for index, row in positive_ratings.iterrows(): 
    if (index % 1000 == 0): 
        print(index)
    movies = list(row['movies'])
    all_similar_movies = []
    # for each movie that the user likes 
    for movie in movies: 
        # get similar movies and add the
        similar_movies = similar_movies_map.get(int(movie))
        if (similar_movies != None): 
            all_similar_movies += similar_movies 
    positive_ratings.at[index, 'CBF'] = all_similar_movies

0
1000
2000
3000
4000
5000
6000


In [40]:
positive_ratings.head()

Unnamed: 0,user_id,movies,CBF
0,1,"[1193, 3408, 2355, 1287, 2804, 594, 919, 595, ...","[844, 1755, 1772, 1750, 1756, 1752, 1744, 1748..."
1,2,"[1357, 3068, 1537, 2194, 648, 2268, 3468, 1210...","[911, 2233, 126, 176, 484, 598, 660, 736, 922,..."
2,3,"[3421, 1394, 104, 2735, 1210, 1079, 1615, 1291...","[1128, 315, 3054, 3196, 2561, 1203, 2562, 342,..."
3,4,"[3468, 2951, 1214, 1036, 260, 2028, 480, 1198,...","[2263, 2822, 81, 178, 1495, 274, 26, 29, 88, 1..."
4,5,"[2987, 2333, 1175, 2337, 1535, 1392, 866, 2770...","[1106, 575, 466, 1108, 253, 1909, 1817, 709, 1..."


Add a new feature 'CBF' which will have binary values 


*   1 - this movie is in the list of recommended movies generated by the content based filtering algorithm
*   0 - this movie is not in the list of recommended movies generated by the content based filtering algorithm



In [57]:
# set CBF as 0 by default 
main['CBF'] = 0
main.head()

Unnamed: 0,user_id,movie_id,rating,CBF
0,1,1193,5,0
1,1,661,3,0
2,1,914,3,0
3,1,3408,4,0
4,1,2355,5,0


In [58]:
counter = 0

for index, row in main.iterrows(): 
  if (index % 100000 == 0): 
    print(index)
  user = int(row['user_id'])
  movie = int(row['movie_id'])

  movies_recommended_for_user = list(positive_ratings[positive_ratings['user_id'] == user]['CBF'])
  if (movie in movies_recommended_for_user):
    main.at[index, 'CBF'] = 1
    counter += 1

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000


In [59]:
print(counter)

0


In [31]:
main.head()

Unnamed: 0,user_id,movie_id,rating,CBF
0,1,1193,5,0
1,1,661,3,0
2,1,914,3,0
3,1,3408,4,0
4,1,2355,5,0


In [32]:
main['CBF'].value_counts()

0    1000209
Name: CBF, dtype: int64

# Feature 2: Collaborative Filtering

## Collaborative Filtering Engine

In [62]:
reader = Reader()
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)

svd = SVD()

cross_validate(svd, data, measures=['RMSE', 'MAE'], cv = 10, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8630  0.8690  0.8630  0.8668  0.8646  0.8661  0.8672  0.8675  0.8643  0.8663  0.8658  0.0019  
MAE (testset)     0.6768  0.6819  0.6769  0.6809  0.6774  0.6796  0.6818  0.6790  0.6771  0.6791  0.6791  0.0019  
Fit time          54.69   50.69   50.65   50.65   50.85   51.01   50.73   51.97   50.71   52.05   51.40   1.21    
Test time         1.67    1.63    1.61    1.62    2.77    1.64    2.53    1.65    1.62    2.81    1.95    0.49    


{'test_rmse': array([0.8629891 , 0.8690223 , 0.86304055, 0.86684521, 0.86464375,
        0.86614412, 0.86724455, 0.86750302, 0.86425468, 0.86633886]),
 'test_mae': array([0.67683727, 0.68186612, 0.67689372, 0.68094434, 0.67737252,
        0.67964311, 0.68179128, 0.67902747, 0.67708388, 0.67913192]),
 'fit_time': (54.6890983581543,
  50.68660879135132,
  50.650737047195435,
  50.65039587020874,
  50.845054149627686,
  51.01071810722351,
  50.7323682308197,
  51.96642088890076,
  50.710285902023315,
  52.05427312850952),
 'test_time': (1.6722769737243652,
  1.6343700885772705,
  1.6094567775726318,
  1.6170730590820312,
  2.771545171737671,
  1.6354148387908936,
  2.5269908905029297,
  1.6509969234466553,
  1.6170470714569092,
  2.808896780014038)}

In [63]:
trainset = data.build_full_trainset()
svd.fit(trainset)
testset = trainset.build_anti_testset()
predictions = svd.test(testset)

In [64]:
def predict_rating_CF(user_id, movie_id, true_rating): 
  return svd.predict(user_id, movie_id, true_rating)

In [65]:
main['CF'] = 0

for index, row in main.iterrows():
    user = row['user_id']
    movie = row['movie_id']
    rating = row['rating']
    predicted_rating = predict_rating_CF(user, movie, rating)
    predicted_rating = predicted_rating[3]
    
    main.at[index,'CF'] = predicted_rating

In [66]:
main.head()

Unnamed: 0,user_id,movie_id,rating,CBF,CF
0,1,1193,5,0,4
1,1,661,3,0,3
2,1,914,3,0,4
3,1,3408,4,0,4
4,1,2355,5,0,4
