In [12]:
import ast

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

#  upload the datasets 


In [13]:
# Load the raw ratings table (columns shown below: userId, movieId, rating, timestamp).
# NOTE(review): absolute, machine-specific paths -- the notebook only runs on the author's machine as written.
rating_path = r'C:\Users\nirro\Desktop\machine learning\interviews\data tapas\ratings.csv'
rating_data = pd.read_csv(rating_path)
# Load the movie metadata, keeping only the three columns used in this notebook.
movies_metadata_path = r'C:\Users\nirro\Desktop\machine learning\interviews\data tapas\movies_metadata.csv'
movies_metadata_data = pd.read_csv(movies_metadata_path, usecols=['id', 'title', 'genres'])


In [14]:
movies_metadata_data.head()

Unnamed: 0,genres,id,title
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,Toy Story
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,Jumanji
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,Grumpier Old Men
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,Waiting to Exhale
4,"[{'id': 35, 'name': 'Comedy'}]",11862,Father of the Bride Part II


In [15]:
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


#  take a sub-sample of the rating data to make the process easy 


Use a 30% sample.
Because different users have rated different numbers of movies, the sample is drawn per user so that every user stays represented:
group the ratings by `userId`, then take the same fraction of each user's ratings.

In [16]:
# Draw 30% of every user's ratings so heavy and light raters stay
# proportionally represented.  The fixed seed makes the sub-sample, and every
# number derived from it further down, reproducible after a kernel restart.
group_userid_rating_data = rating_data.groupby("userId")
sub_rating_data_30 = (
    group_userid_rating_data
    .sample(frac=0.3, random_state=42)
    # Renumber the rows 0..n-1; the original row labels are not needed.
    .reset_index(drop=True)
)

In [17]:
sub_rating_data_30.shape

(7802471, 4)

In [18]:
# Cache the sub-sample on disk. The index is written too (to_csv default), which is
# why the reload cell below has to drop an 'Unnamed: 0' column.
sub_rating_data_30.to_csv(r'C:\Users\nirro\Desktop\machine learning\interviews\data tapas\sub_rating_data_30.csv')


In [24]:
# Reload the cached 30% sub-sample and discard the index column that was saved with it.
sub_rating_data_30_path = r'C:\Users\nirro\Desktop\machine learning\interviews\data tapas\sub_rating_data_30.csv'
sub_rating_data_30 = pd.read_csv(sub_rating_data_30_path)
sub_rating_data_30 = sub_rating_data_30.drop(columns=['Unnamed: 0'])

In [25]:
sub_rating_data_30.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,4226,4.0,1425942228
1,1,69844,5.0,1425942139
2,1,99114,4.0,1425941667
3,1,98809,0.5,1425942640
4,1,58559,4.0,1425942007


first will drop all users that have less then 10 ratings

# top 10 movies 

create a list of the top 10 movies
use group-by and aggregate function for this

In [33]:
# Per-movie rating statistics: mean, median and number of ratings,
# with movieId turned back into a regular column.
ratings_per_movie = sub_rating_data_30.groupby('movieId')['rating']
group_rates_df = ratings_per_movie.agg(
    mean_rating='mean',
    median_rating='median',
    count_rating='count',
).reset_index()

In [34]:
group_rates_df.head()

Unnamed: 0,movieId,mean_rating,median_rating,count_rating
0,1,3.891974,4.0,19824
1,2,3.236553,3.0,7827
2,3,3.174914,3.0,4668
3,4,2.877574,3.0,874
4,5,3.089495,3.0,4598


In [37]:
movies_metadata_data = movies_metadata_data.rename(columns={'id': 'movieId'})

# Some rows carry an invalid id such as '1997-08-20'.  Coerce the ids to numbers
# and keep only the rows where that succeeds.  A boolean mask is used instead of
# collecting positions and dropping them as labels, so this no longer depends on
# the frame having a default 0..n-1 index, and any non-numeric id is rejected --
# not only those that contain '-'.
numeric_movie_id = pd.to_numeric(movies_metadata_data['movieId'], errors='coerce')
has_valid_id = numeric_movie_id.notna()
movies_metadata_data = movies_metadata_data.loc[has_valid_id].assign(
    movieId=numeric_movie_id[has_valid_id].astype(int)
)

## using IMDB's equation 
I saw that there is another method, IMDB's weighted rating (WR):

$$WR = \frac{v}{v + m} \, R + \frac{m}{v + m} \, C$$

where
v is the number of votes for the movie;
m is the minimum number of votes required to be listed in the chart;
R is the average rating of the movie; and
C is the mean vote across the whole report.

In [38]:
def calculate_rating_imdb(rating_data, quantile):
    """Score every movie with IMDB's weighted rating and sort best-first.

    WR = v / (v + m) * R + m / (v + m) * C, where v is the movie's number of
    ratings, R its mean rating, m the ``quantile`` of the rating counts and C
    the mean of the per-movie mean ratings.

    Parameters
    ----------
    rating_data : pandas.DataFrame
        Must contain the columns ``mean_rating`` and ``count_rating``.
    quantile : float
        Quantile of ``count_rating`` used as the vote threshold ``m``.

    Returns
    -------
    pandas.DataFrame
        A new frame with an added ``total_rating_score`` column, sorted by that
        column in descending order.  The input frame is left unmodified.
    """
    C = rating_data['mean_rating'].mean()
    m = rating_data['count_rating'].quantile(quantile)

    v = rating_data['count_rating']
    R = rating_data['mean_rating']
    # Calculation based on the IMDB formula
    wr = (v/(v+m) * R) + (m/(m+v) * C)
    # assign() returns a new frame, so the caller's DataFrame is not altered.
    scored = rating_data.assign(total_rating_score=wr)
    return scored.sort_values(by='total_rating_score', ascending=False)

group_rates_df = calculate_rating_imdb(rating_data=group_rates_df, quantile=0.90)

# Take the ids by column name (not by column position) from the best-first frame.
top_ten_movies_indices = group_rates_df['movieId'].head(10).tolist()

# isin() alone returns matches in metadata order; re-sort them by their rank so the
# printed list really is ordered from best to worst.
rank_by_movie = {movie_id: rank for rank, movie_id in enumerate(top_ten_movies_indices)}
top_ten_matches = movies_metadata_data.loc[movies_metadata_data['movieId'].isin(top_ten_movies_indices)]
top_ten_movies_names = (
    top_ten_matches
    .assign(_rank=top_ten_matches['movieId'].map(rank_by_movie))
    .sort_values('_rank', kind='mergesort')['title']
)
# NOTE(review): the recorded output lists only 6 titles for 10 ids, so some rated
# movieIds have no metadata row -- confirm that ratings.csv `movieId` and
# movies_metadata.csv `id` use the same id system (a mapping table may be needed).
print(top_ten_movies_names)

286            Once Were Warriors
534          Sleepless in Seattle
2649      The Thomas Crown Affair
4020     The Million Dollar Hotel
8546              Murder She Said
11922              License to Wed
Name: title, dtype: object


assign the rating score values to movies_metadata_data

In [39]:
# Attach each movie's weighted score to its metadata row (left join on movieId,
# so metadata rows without a score are kept).
movie_scores = group_rates_df[['movieId', 'total_rating_score']]
movies_metadata_data = pd.merge(movies_metadata_data, movie_scores, how='left', on='movieId')

The two ways produce different lists.
I didn't consider the timestamp variable:
whether a movie is old or new matters, because a new movie has had less time to collect ratings and can therefore get a low `count_rating`.

join the two tables by the id key

In [41]:
# Left join: every sampled rating is kept; ratings whose movieId has no metadata
# row end up with missing genres/title/total_rating_score.
# NOTE(review): confirm that `movieId` means the same thing in both tables before trusting the joined titles.
all_data = sub_rating_data_30.merge(movies_metadata_data, how='left', on='movieId')

In [None]:
# ----------- missing values ---------------
# Count the missing values per column.
all_data_null = all_data.isnull().sum()
# There are a lot of missing values in the genres and title columns.
# It would have been preferable to remove them at the beginning, before sub-sampling, but they are removed here.
all_data = all_data.dropna()


Next, write functions to extract the required information from genres.

In [44]:
def _genre_names(raw_genres):
    """Return the space-joined genre names found in one raw ``genres`` cell.

    The cell is expected to hold the text of a Python list of dicts, e.g.
    "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}]".
    Anything that cannot be parsed that way yields an empty string.
    """
    if not isinstance(raw_genres, str):
        return ''
    try:
        entries = ast.literal_eval(raw_genres)
    except (ValueError, SyntaxError):
        return ''
    if not isinstance(entries, (list, tuple)):
        return ''
    return ' '.join(
        str(entry['name'])
        for entry in entries
        if isinstance(entry, dict) and 'name' in entry
    )


def extract_genres(data):
    """Replace the raw ``genres`` column with plain, space-separated genre names.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``genres`` column holding list-of-dict literals as text.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``genres`` column holds strings such as
        ``'Animation Comedy'`` (``''`` for an empty or unparsable cell).
        The input frame is not modified.
    """
    raw_column = data['genres']
    # Many rows share the same genres string, so parse each distinct value once.
    parsed_by_raw = {raw: _genre_names(raw) for raw in raw_column.dropna().unique()}
    # Values missing from the lookup (NaN cells) map to NaN and become ''.
    return data.assign(genres=raw_column.map(parsed_by_raw).fillna(''))

# Renumber the rows 0..n-1, then flatten the raw genres column into plain genre names.
all_data = all_data.reset_index(drop=True)
all_data = extract_genres(all_data)

In [46]:
all_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,title,total_rating_score
0,1,4226,4.0,1425942228,Comedy,Shriek If You Know What I Did Last Friday the ...,4.138367
1,1,58559,4.0,1425942007,Drama,Confession of a Child of the Century,4.153002
2,1,1968,4.0,1425942148,Drama Comedy Romance,Fools Rush In,3.789477
3,2,339,5.0,867041296,Comedy Drama,Night on Earth,3.497475
4,2,377,4.0,867041121,Horror,A Nightmare on Elm Street,3.473982


In [47]:
all_data.shape

(3428649, 7)

missing values

In [42]:
# Number of missing values in each column of the joined table.
all_data_null = all_data.isnull().sum()


There are a lot of missing values in the genres and title columns.
It would have been preferable to remove them at the beginning, before sub-sampling, but I will do it now.

In [43]:
# Drop every row that has a missing value in any column.
all_data = all_data.dropna()

There are a few things that can be done to the training data that could quickly improve a recommender system.
1. Remove popular items from the training data.
(This is appropriate in cases where users can discover these items on their own, and may not find these recommendations useful).

2. Scale item ratings by the user’s value, such as average transaction value.
This can help a model learn to recommend items that lead to loyal or high-value customers.

In my case, because I used hit-rate style evaluation, I split the data set as follows:
train - the favorite (highest-rated) movie of each user.
test - all the remaining movies that the user rated.
The evaluation uses the top 10 recommended movies.
First, all users that have fewer than 10 ratings are dropped.

In [48]:
def drop_less_ten(data, min_ratings=10):
    """Remove every user that has fewer than ``min_ratings`` ratings.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``userId`` and ``rating``.
    min_ratings : int, default 10
        Users whose number of non-null ratings is below this value are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining rows, re-indexed 0..n-1.  The input
        frame is not modified.
    """
    rating_counts = data.groupby('userId')['rating'].count()
    sparse_users = rating_counts.index[rating_counts < min_ratings]
    # Filter with a boolean mask instead of materialising the list of row labels to drop.
    return data.loc[~data['userId'].isin(sparse_users)].reset_index(drop=True)
# Despite its name, this frame holds the users that are KEPT (those with at least ten ratings).
all_data_less_then_ten = drop_less_ten(data=all_data)

In [49]:
all_data_less_then_ten.shape

(2838355, 7)

##  build the recommendations functions with movie title and rating 
##  Hybrid Recommender
I will try to build a simple hybrid recommender that brings together techniques in the content based and collaborative filter based engines.

model 1: 
This is how it will work:

Input: User ID and the Title of favorite Movie

Output: return Similar movies sorted by the best top 10 (rating in the similar group), in my case by average rating of that item.

model 2: 
This is how it will work:

Input: User ID and the genres of favorite Movie

Output: same as model 1, return Similar movies sorted by the best top 10 (rating in the similar group), in my case by average rating of that item.

it is possible of to sort on the basis of expected ratings by that particular user.

## TF-IDF matrix 

use scikit-learn built-in TfIdfVectorizer class that produces the TF-IDF matrix
step 1. Define a TF-IDF Vectorizer Object. 
Remove all english stop words such as 'the', 'a'

step2. Construct the required TF-IDF matrix by fitting and transforming the data
6055 different words were used to describe the 2,285,612 title rows (one row per user rating).
Of course there are duplicates (the same movie appears once for every user who rated it);
I am going to deal with the duplicates later.

step 3.  compute a similarity score
With this matrix in hand, we can now compute a similarity score.
There are several candidates for this; such as the euclidean, the Pearson and the cosine similarity scores.
Different scores work well in different scenarios.

it is often a good idea to experiment with different metrics.

Because I used the TF-IDF vectorizer (whose rows are L2-normalised by default), calculating the dot product directly gives the cosine similarity score.
Therefore, I am using sklearn's linear_kernel() instead of cosine_similarity(), since it is faster.


Compute the cosine similarity matrix

In [None]:
def get_similarity_matrix(data, column_name):
    """Build the pairwise similarity matrix of the distinct values in a text column.

    The distinct values of ``data[column_name]`` are turned into TF-IDF vectors
    (English stop words removed) and compared with each other through
    ``linear_kernel``.  Row/column ``i`` of the result refers to the i-th
    distinct value in order of first appearance.
    """
    distinct_values = data[column_name].unique()
    vectorizer = TfidfVectorizer(stop_words='english')
    weights = vectorizer.fit_transform(distinct_values)
    return linear_kernel(weights, weights)

define a function that takes in a movie title and the rating scores as an input and outputs a list of the 10 recomended movies.

In [None]:
# Given an item (a movie title or a genres string) and a similarity matrix, return
# the best-rated titles among the items most similar to it.
def get_recommendations_by_item(item_column_name, item, cosine_sim, data,
                                n_candidates=20, top_n=10):
    """Recommend titles that are similar to ``item`` and highly rated.

    Parameters
    ----------
    item_column_name : str
        Column of ``data`` the similarity matrix was built from.
    item : str
        Value of that column to find neighbours for.
    cosine_sim : 2-D array-like
        Pairwise similarities between the distinct values of the column, in
        order of first appearance in ``data``.
    data : pandas.DataFrame
        Must contain ``item_column_name``, ``title`` and ``total_rating_score``.
    n_candidates : int, default 20
        Number of most similar items considered.
    top_n : int, default 10
        Number of titles returned.

    Returns
    -------
    pandas.Series or None
        Up to ``top_n`` titles, ordered by ``total_rating_score`` descending.
        ``None`` (after printing a message) when ``item`` does not occur in
        ``data[item_column_name]``.
    """
    # Position of every distinct item, in order of first appearance.
    indices = data[[item_column_name]].drop_duplicates().reset_index(drop=True)

    # Explicit lookup instead of a bare ``except`` so unrelated errors are not hidden.
    matches = indices.index[indices[item_column_name] == item]
    if len(matches) == 0:
        print('ValueError: ', item, ' is not in list, try different item')
        return None
    idx = matches[0]

    # Similarity of every OTHER item to the query item.  The query is excluded by
    # position: dropping "the first sorted entry" would remove the wrong item
    # whenever a different item ties with it at the top.
    sim_scores = [(pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    items_indices = [pos for pos, _ in sim_scores[:n_candidates]]

    # Among the most similar items, rank by overall rating score so the result
    # is "the best rated of the similar ones".
    top_items = indices.iloc[items_indices]
    top_df = (
        data.loc[data[item_column_name].isin(top_items[item_column_name])]
        .drop_duplicates(subset=[item_column_name])
        .sort_values(by='total_rating_score', ascending=False)
    )
    return top_df['title'].iloc[:top_n]

# top_ten_by_title_and_rates = get_recommendations_by_item(item='Confession of a Child of the Century',
#                                                          item_column_name='title' , cosine_sim=cosine_sim_matrix_title, 
#                                                          data=all_data)

## get recommendations by user 

Create a function that takes a user id and returns one of the user's favorite movies (title and genres).

In [None]:
def get_user_favorite_movie(data, user_id):
    """Return the title and genres of the movie ``user_id`` rated highest.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``userId``, ``rating``, ``title`` and ``genres``.
    user_id
        Value looked up in ``data.userId``.

    Returns
    -------
    tuple
        ``(title, genres)`` of the top-rated row.  When several rows share the
        top rating, the one that appears first in ``data`` is used.

    Raises
    ------
    IndexError
        If ``user_id`` has no rows in ``data``.
    """
    user_df = data.loc[data.userId == user_id]
    if user_df.empty:
        raise IndexError('no ratings found for user {!r}'.format(user_id))
    # A stable sort keeps the original row order among equal ratings, which makes
    # the choice deterministic.
    favorite_row = user_df.sort_values(by='rating', ascending=False, kind='mergesort').iloc[0]
    # Read the fields by column name so the result does not depend on column order.
    return favorite_row['title'], favorite_row['genres']
# get_user_favorite_movie(data=all_data, user_id=1)

get the recomendation of a user using the rating and title or genres

In [None]:
def get_recommendation_by_user(user, data, item_column_name, similarity_matrix=None):
    """Recommend titles for one user, seeded by that user's favorite movie.

    Parameters
    ----------
    user
        User id looked up in ``data.userId``.
    data : pandas.DataFrame
        Ratings joined with movie metadata.
    item_column_name : {'title', 'genres'}
        Which attribute of the favorite movie drives the similarity search.
    similarity_matrix : 2-D array-like, optional
        Precomputed result of ``get_similarity_matrix(data, item_column_name)``.
        It is computed here when omitted; pass it in when calling this function
        for many users so the matrix is not rebuilt on every call.

    Returns
    -------
    pandas.Series or None
        Whatever ``get_recommendations_by_item`` returns for the seed item.

    Raises
    ------
    ValueError
        If ``item_column_name`` is neither ``'title'`` nor ``'genres'``.
    """
    if item_column_name not in ('title', 'genres'):
        raise ValueError("item_column_name must be 'title' or 'genres', got {!r}".format(item_column_name))

    best_movie, best_genres = get_user_favorite_movie(data=data, user_id=user)
    seed_item = best_movie if item_column_name == 'title' else best_genres

    if similarity_matrix is None:
        similarity_matrix = get_similarity_matrix(data=data, column_name=item_column_name)

    return get_recommendations_by_item(item=seed_item, item_column_name=item_column_name,
                                       cosine_sim=similarity_matrix, data=data)


# movie, movie_tf = get_recommendation_by_user(user=2, data=all_data, item_column_name='title')
# movie2, movie_tf2 = get_recommendation_by_user(user=2, data=all_data, item_column_name='genres')

While the system has done a decent job of finding movies with similar plot descriptions,
the quality of recommendations is not that great.
it is possible to take not only the user favorite movie, 
but take all the movies a user watch or maybe take only the ones he liked.
the quality of my recommendation system would be increased with the usage of better metadata.

In [None]:
def get_recommendation_for_all(data, item):
    """Build the recommendation list of every user found in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Ratings joined with movie metadata.
    item : {'title', 'genres'}
        Attribute of each user's favorite movie used for the similarity search.

    Returns
    -------
    pandas.DataFrame
        One row per user with the columns ``userId`` and ``recommendation``
        (a list of titles; empty when no recommendation could be produced).

    Raises
    ------
    ValueError
        If ``item`` is neither ``'title'`` nor ``'genres'``.
    """
    if item not in ('title', 'genres'):
        raise ValueError("item must be 'title' or 'genres', got {!r}".format(item))

    user_ids = data.userId.unique()
    # The similarity matrix depends only on (data, item), so build it once
    # instead of once per user.
    similarity_matrix = get_similarity_matrix(data=data, column_name=item)

    list_top_ten = []
    for user in user_ids:
        best_movie, best_genres = get_user_favorite_movie(data=data, user_id=user)
        seed_item = best_movie if item == 'title' else best_genres
        top_ten = get_recommendations_by_item(item=seed_item, item_column_name=item,
                                              cosine_sim=similarity_matrix, data=data)
        # A failed lookup yields None; record it as "no recommendations".
        list_top_ten.append([] if top_ten is None else list(top_ten))

    user_df = pd.DataFrame({'userId': user_ids})
    user_df['recommendation'] = list_top_ten
    return user_df
# Recommendations are built from the first 10,000 rating rows only (presumably to keep run time and memory manageable).
# NOTE(review): a positional row slice can cut the last user's ratings short -- consider selecting whole users instead.
data_title_recommendation = get_recommendation_for_all(data=all_data_less_then_ten.iloc[:10000, :], item='title')
data_genres_recommendation = get_recommendation_for_all(data=all_data_less_then_ten.iloc[:10000, :], item='genres')

##  Evaluation Metrics for Recommender Systems 

For evaluating recommendation engines, we can use the following metrics

## Recall:
What proportion of items that a user likes were actually recommended
It is given by: recall = tp / (tp + fn)

tp represents the number of items recommended to a user that he/she likes and tp+fn represents the total items that a user likes
If a user likes 5 items and the recommendation engine decided to show 3 of them, then the recall will be 0.6
Larger the recall, better are the recommendations


In [None]:
def calculate_recall(user_rating_data, results):
    """Share of a user's rated titles that appear among the recommendations.

    Parameters
    ----------
    user_rating_data : list
        Titles the user rated.
    results : list
        A list whose first element is the list of recommended titles.

    Returns
    -------
    float
        ``hits / len(user_rating_data)``; ``0.0`` when the user has no rated
        titles or there is no recommendation list.
    """
    if len(user_rating_data) == 0 or len(results) == 0:
        return 0.0
    rated_titles = set(user_rating_data)  # constant-time membership tests
    tp = sum(1 for item in results[0] if item in rated_titles)
    return tp / len(user_rating_data)

def calculate_recall_for_all(data_user, data_results, item):
    """Add a ``recall`` column to ``data_results`` (one value per user).

    Parameters
    ----------
    data_user : pandas.DataFrame
        Ratings with the columns ``userId`` and ``title``.
    data_results : pandas.DataFrame
        One row per user with the columns ``userId`` and ``recommendation``.
    item : str
        Kept for interface compatibility.  Recommendations are lists of titles
        for both the 'title' and the 'genres' model, so the comparison is
        always made against the titles the user rated.

    Returns
    -------
    pandas.DataFrame
        ``data_results`` itself, with the ``recall`` column added in place.
    """
    # Collect every evaluated user's rated titles in one pass instead of
    # scanning the whole ratings frame once per user.
    evaluated_rows = data_user.loc[data_user['userId'].isin(data_results['userId'])]
    titles_by_user = evaluated_rows.groupby('userId')['title'].apply(list)

    recall_list = []
    for user, recommendation in zip(data_results['userId'], data_results['recommendation']):
        user_rate_data = titles_by_user.get(user, [])
        # calculate_recall expects the recommendation list wrapped in an outer list.
        recall_list.append(calculate_recall(user_rate_data, [recommendation]))
    data_results['recall'] = recall_list
    return data_results
# Recall is measured against each user's complete rating history in all_data_less_then_ten.
data_title_recommendation = calculate_recall_for_all(data_user=all_data_less_then_ten, data_results=data_title_recommendation, item='title')
data_genres_recommendation = calculate_recall_for_all(data_user=all_data_less_then_ten, data_results=data_genres_recommendation, item='genres')

In [None]:
print('mean recall with genres prediction', data_genres_recommendation.recall.mean())

In [None]:
print('mean recall with title prediction', data_title_recommendation.recall.mean())

## Precision:

Out of all the recommended items, how many did the user actually like?
It is given by: precision = tp / (tp + fp)

again tp represents the number of items recommended to a user that he/she likes and tp+fp represents the total items recommended to a user
If 5 items were recommended to the user out of which he liked 4, then precision will be 0.8
Larger the precision, better the recommendations

our aim should be to maximize both precision and recall.

In [None]:
def calculate_precision(user_rating_data, results):
    """Share of the recommended titles that the user actually rated.

    Parameters
    ----------
    user_rating_data : list
        Titles the user rated.
    results : list
        A list whose first element is the list of recommended titles.

    Returns
    -------
    float
        ``hits / number of recommended titles``; ``0.0`` when nothing was
        recommended.
    """
    recommended = results[0] if len(results) else []
    if len(recommended) == 0:
        return 0.0
    rated_titles = set(user_rating_data)  # constant-time membership tests
    tp = sum(1 for item in recommended if item in rated_titles)
    # Divide by the number of recommended titles, not by len(results): the
    # latter is the length of the outer wrapper list.
    return tp / len(recommended)


def calculate_precision_for_all(data_user, data_results, item):
    """Add a ``precision`` column to ``data_results`` (one value per user).

    Parameters
    ----------
    data_user : pandas.DataFrame
        Ratings with the columns ``userId`` and ``title``.
    data_results : pandas.DataFrame
        One row per user with the columns ``userId`` and ``recommendation``.
    item : str
        Kept for interface compatibility.  Recommendations are lists of titles
        for both the 'title' and the 'genres' model, so the comparison is
        always made against the titles the user rated.

    Returns
    -------
    pandas.DataFrame
        ``data_results`` itself, with the ``precision`` column added in place.
    """
    # Collect every evaluated user's rated titles in one pass instead of
    # scanning the whole ratings frame once per user.
    evaluated_rows = data_user.loc[data_user['userId'].isin(data_results['userId'])]
    titles_by_user = evaluated_rows.groupby('userId')['title'].apply(list)

    precision_list = []
    for user, recommendation in zip(data_results['userId'], data_results['recommendation']):
        user_rate_data = titles_by_user.get(user, [])
        # calculate_precision expects the recommendation list wrapped in an outer list.
        precision_list.append(calculate_precision(user_rate_data, [recommendation]))
    data_results['precision'] = precision_list
    return data_results

# Precision is measured against each user's complete rating history in all_data_less_then_ten.
data_title_recommendation = calculate_precision_for_all(data_user=all_data_less_then_ten, data_results=data_title_recommendation, item='title')
data_genres_recommendation = calculate_precision_for_all(data_user=all_data_less_then_ten, data_results=data_genres_recommendation, item='genres')


In [None]:
print('mean precision with genres prediction', data_genres_recommendation.precision.mean())


In [None]:
print('mean precision with title prediction', data_title_recommendation.precision.mean())

## What is the pros and cons of using each model?

Advantages of first model:
it's obvious that the first model (from question 1) is more simple and less complex, which makes it easier to implement ,
another good thing about this model is for new users which we don't have data on them it will be a good start to use it.

Disadvantages of first model:
its a generic model, it will give the same prediction to every user regarding the user likes or dislikes.

Advantages of second model (without genres):
It is more personalized than the first: it uses the title of a movie the user liked. However, it relies on similarity between titles (here, between their word vectors),
which is not really accurate.

Disadvantages of second model (without genres):
Again, it can give output that is wrong and does not relate to the user.

Advantages of third model (with genres):
It is more personalized than the first and the second: it uses the genres of a movie the user liked,
and it is more accurate than the rest,
because it is a hybrid model: after finding the most similar movies by genres, it sorts them and takes the top-rated ones.

Disadvantages of third model (with genres):
As with all of them, more personal data could make it more accurate; the third model is still very simple.
i didn't get into taking data from similar users for predictions, i think this is very importent if you want to recommend new items to user

Another important issue is that I used only one movie from each user. It is simpler and quicker that way, but if the company has more powerful computers it is better to use all of the user's history.


## is it justified? Should the company use a static model (from question 1) or a more advanced and personal model like in question 3?
i think the first model is a start model for new user and the last model is more advanced for old users, still they both need more tuning.

## How would you monitor and measure the effectiveness of these models once they go live?
of course we need to update the user data every time we want to recommend in a new item.
i would say precision is a good kpi to start with. 