In [2]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [1]:
!pip3 install sklearn

  Running setup.py install for sklearn ... [?25ldone
[?25hSuccessfully installed joblib-1.1.0 scikit-learn-1.0.2 scipy-1.7.3 sklearn-0.0 threadpoolctl-3.1.0
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
movies = pd.read_csv('https://raw.githubusercontent.com/nchichilidze/RS-with-GE/main/preprocessed_movielens_1m/movies.csv')
tags = pd.read_csv('https://raw.githubusercontent.com/nchichilidze/RS-with-GE/main/preprocessed_movielens_1m/tags.csv')
ratings = pd.read_csv('https://raw.githubusercontent.com/nchichilidze/RS-with-GE/main/preprocessed_movielens_1m/ratings.csv')
users = pd.read_csv('https://raw.githubusercontent.com/nchichilidze/RS-with-GE/main/preprocessed_movielens_1m/users.csv')

movies = movies.iloc[: , 1:]
tags = tags.iloc[: , 1:]
ratings = ratings.iloc[: , 1:]
users = users.iloc[: , 1:]

In [4]:
main = pd.DataFrame(ratings)

# Feature 1: Content-Based Filtering 

Get all the movies that a user likes - and use content-based filtering to generate a list of similar movies that the user might also like 

The movies get grouped by analysing the tags associated with each movie (tags are a combination of user comments and movie genres)

Cast the columns in movies and tags

In [5]:
tags = tags.astype({'tags' : 'string'})
tags = tags.astype({'title' : 'string'})
tags = tags.astype({'genres' : 'string'})
tags.dtypes

movie_id     int64
tags        string
title       string
genres      string
year         int64
dtype: object

In [6]:
movies = movies.astype({'title' : 'string'})
movies = movies.astype({'genres' : 'string'})

movies.dtypes

movie_id     int64
title       string
genres      string
year         int64
dtype: object

In [7]:
#importing necessary libraries
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [8]:
# an example of a tag
tag_example = tags['tags'][0]
tag_example

"pixar pixar pixar animation pixar animated fun toy toys pixar rated g pixar pixar national film registry time travel pixar pixar funny imdb top  animation very good ya boy pixar time travel animation erlend s dvds cgi disney family pixar toys bright daring rescues fanciful heroic mission humorous light rousing toys come to life unlikely friendships warm witty pixar the boys almost favorite é ä é animation children disney pixar clever want to see again cartoon disney d computer animation disney animated feature pixar animation want buddy movie animation pixar tim allen tom hanks animation disney pixar toys computer animation cartoon pixar animation computer animation pixar toys tumey s to see again tumey s vhs classic disney engaging avi buy action figure action figures buzz lightyear cg animation toy toys woody disney pixar tim allen tom hanks animation pixar animation kids movie pixar tim allen time travel animation cgi adventure animation children comedy animation pixar lots of hear

In [9]:
# make sure that every movie has a tag
tags.isna().sum()

movie_id    0
tags        0
title       0
genres      0
year        0
dtype: int64

In [10]:
# how many ratings we have 
print(len(ratings['movie_id']))

# how many unique movies we have in ratings
movies_in_ratings = ratings['movie_id'].drop_duplicates()
print(len(movies_in_ratings))

1000209
3706


In [11]:
# it's redundant to draw information about movies that are not in the ratings df, so I will drop them 
print("Tags shape before dropping extra movies: " + str(tags.shape))
tags = tags[tags.movie_id.map(lambda x: np.isin(x, movies_in_ratings).all())]
print("Tags shape after dropping extra movies: " + str(tags.shape))
# drop any movie that we have twice
tags = tags.drop_duplicates(subset=['title'])
print("Tags shape after dropping duplicate movies: " + str(tags.shape))

Tags shape before dropping extra movies: (3883, 5)
Tags shape after dropping extra movies: (3706, 5)
Tags shape after dropping duplicate movies: (3664, 5)


## Content-Based Filtering Engine

In [12]:
tags = tags.reset_index()
tags = tags.drop(columns=['index'])

In [13]:
tags.tail()

Unnamed: 0,movie_id,tags,title,genres,year
3659,3948,ben stiller comedy hilarious owen wilson ben s...,Meet the Parents,comedy,2000
3660,3949,ass to ass heroin psychology depressing drugs ...,Requiem for a Dream,drama,2000
3661,3950,colin farrell drama drama drama drama drama,Tigerland,drama,2000
3662,3951,in netflix queue in netflix queue r drama dram...,Two Family House,drama,2000
3663,3952,politics vice president washington dc gary old...,"Contender, The",drama thriller,2000


When building the matrix, the issue is that since the length of the tags dataframe is not the same as the maximum index of movie in the dataframe

In [14]:
print("max movie id: " + str(tags['movie_id'].max()))
print("length of the tags dataframe: " + str(len(tags['movie_id'])))

max movie id: 3952
length of the tags dataframe: 3664


In [27]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
matrix = tf.fit_transform(tags['tags'])

cosine_similarities = linear_kernel(matrix,matrix)
movie_title = tags['title']
indices = pd.Series(tags.index, index=tags['title'])

def recommend_movies(original_title):

    idx = indices[original_title]
    sim_scores = list(enumerate(cosine_similarities[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:11]

    movie_indices = [i[0] for i in sim_scores]
    # titles = movie_title.iloc[movie_indices]
    return movie_indices
    # return titles, movie_indices

In [16]:
t, i  = recommend_movies('Clueless')
print(t)

27                                           Persuasion
2833                                     Mansfield Park
786                                                Emma
16                                Sense and Sensibility
62                                        Two if by Sea
113                                        If Lucy Fell
278                                  Nina Takes a Lover
285                                   Perez Family, The
286                          Pyromaniac's Love Story, A
368                                          Speechless
433                                          Favor, The
552                                   Naked in New York
761     Rendezvous in Paris (Rendez-vous de Paris, Les)
1341                      Beautician and the Beast, The
1343                                      Hotel de Love
Name: title, dtype: string


Clueless is based on 'Emma' by Jane Austen, Persuasion is also a Jane Austen book, therefore I think CBF works quite well :) 

## Generate similar movies

In [17]:
tags.tail()

Unnamed: 0,movie_id,tags,title,genres,year
3659,3948,ben stiller comedy hilarious owen wilson ben s...,Meet the Parents,comedy,2000
3660,3949,ass to ass heroin psychology depressing drugs ...,Requiem for a Dream,drama,2000
3661,3950,colin farrell drama drama drama drama drama,Tigerland,drama,2000
3662,3951,in netflix queue in netflix queue r drama dram...,Two Family House,drama,2000
3663,3952,politics vice president washington dc gary old...,"Contender, The",drama thriller,2000


In [18]:
# any movies in ratings that we don't have tags for? 

print(list(set(ratings['movie_id']) - set(tags['movie_id'])))

movies_in_tags = tags['movie_id'].drop_duplicates()


[2561, 2820, 2565, 2059, 3723, 3598, 915, 1941, 2455, 3096, 2078, 2719, 2848, 3616, 3744, 2212, 2085, 2729, 1460, 3126, 2364, 957, 2367, 1344, 3784, 2633, 2634, 2764, 3404, 3022, 2512, 1873, 2389, 2645, 2135, 2136, 2398, 3935, 3947, 3823, 2553, 2430]


for each movie in ratings, generate a list of n similar movies

In [19]:
similar_movies_map = {} 
no_similar_movies_count = 0

# getting rid of any movies we don't have tags for 
movies_in_ratings = list(set(movies_in_ratings) and set(movies_in_tags))

for movie in movies_in_ratings: 
  title = str(tags[tags['movie_id'] == movie]['title'].values[0])
  # actual_index = tags.index[tags['movie_id'] == movie]
  t, ind = recommend_movies(title)
  similar_movies_map[movie] = ind;
  if (len(ind) == 0): 
    no_similar_movies_count += 1

print("No similar movies found for " + str(no_similar_movies_count) + " movies")

No similar movies found for 0 movies


## Add feature to dataframe

get favourite movies by each user

In [20]:
main.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [23]:
!pip3 install tqdm
from tqdm import tqdm

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [None]:
users_in_ratings = main['user_id'].drop_duplicates()
users_in_ratings = list(users_in_ratings.values)

user_similar_movies_map = {}
no_of_users = len(users_in_ratings) 
print("all users: " + str(no_of_users))
for i in range(no_of_users): 
  user = int(users_in_ratings[i])
  print(user)
  # get the movies rated by the user 
  movies_rated_by_user = main.loc[main['user_id'] == int(user)]
  # pick the movies that the user have a positive (>3) rating to
  movies_rated_by_user = movies_rated_by_user[movies_rated_by_user['rating'] > 3]
  movies_rated_by_user = movies_rated_by_user['movie_id'] 
  
  # getting rid of any movies we don't have tags for 
  movies_rated_by_user = list(set(movies_rated_by_user) and set(movies_in_tags))

  # for each positively rated movie, generate similar movies 
  similar_movies = []
  for movie_id in movies_rated_by_user: 
    title = str(tags[tags['movie_id'] == movie_id]['title'].values[0])
    recommended_movies = recommend_movies(title)
    similar_movies += list(recommended_movies)
  
  user_similar_movies_map[user] = similar_movies

In [31]:
def save_dict_to_file(dic):
    f = open('dict.txt','w')
    f.write(str(dic))
    f.close() 

save_dict_to_file(user_similar_movies_map)

In [34]:
import json 

a_file = open("data.json", "w")
json.dump(user_similar_movies_map, a_file)
a_file.close()

a_file = open("data.json", "r")
output = a_file.read()
print(output)

Add a new feature 'CBF' which will have binary values 


*   1 - this movie is in the list of recommended movies generated by the content based filtering algorithm
*   0 - this movie is not in the list of recommended movies generated by the content based filtering algorithm



In [None]:
# set CBF as 0 by default 
main['CBF'] = 0
main.head()

In [None]:
counter = 0

for index, row in main.iterrows(): 
  user = int(row['user_id'])
  movie = int(row['movie_id'])

  movies_recommended_for_user = user_similar_movies_map.get(user)
  if (movie in movies_recommended_for_user):
    main.at[index, 'CBF'] = 1
    counter += 1

# Feature 2: Collaborative Filtering

## Collaborative Filtering Engine