In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval

In [2]:
metadata = pd.read_csv('movies_metadata.csv', low_memory=False)
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

## General top-N Recommendation


In [3]:
c = metadata['vote_average'].mean()
m = metadata['vote_count'].quantile(0.90)
selected_movies = metadata.copy().loc[metadata['vote_count'] >= m]
selected_movies.shape

(4555, 24)

In [4]:
def weighted_rating(x, m=m, c=c):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * c)

selected_movies['score'] = selected_movies.apply(weighted_rating, axis=1)
selected_movies = selected_movies.sort_values('score', ascending=False)

#Print the top 15 movies
selected_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


## Content based recommendation

In [5]:
#I used 5000 first movies due to the computation cost
metadataCB = metadata[:5000]
indices = pd.Series(metadataCB.index, index=metadataCB['title']).drop_duplicates()

### similarity based on overview

In [6]:
metadataCB['overview'] = metadataCB['overview'].fillna('')

# compute similarity matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(metadataCB['overview'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadataCB['overview'] = metadataCB['overview'].fillna('')


### similarity based on features

In [7]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadataCB['id'] = metadataCB['id'].astype('int')
metadataCB = metadataCB.merge(credits, on='id')
metadataCB = metadataCB.merge(keywords, on='id')

# parse the stringified features
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadataCB[feature] = metadataCB[feature].apply(literal_eval)

# extract director name
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

# extract a list of other features
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        if len(names) > 3:
            names = names[:3]
        return names
    return []

metadataCB['director'] = metadataCB['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    metadataCB[feature] = metadataCB[feature].apply(get_list)

# make all the words lower case and remove spaces to prevent ambiguity
# (Example) jam in 'traffic jam' is not equal to jam in 'bread jam'
def clean_data(x):
    # checks for ['cast', 'keywords', 'genres'] features
    if isinstance(x, list): 
        return [str.lower(i.replace(" ", "")) for i in x]
    # checks for director
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadataCB[feature] = metadataCB[feature].apply(clean_data)


def aggregateAll(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

metadataCB['aggregateAll'] = metadataCB.apply(aggregateAll, axis=1)

# compute similarity matrix
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadataCB['aggregateAll'])
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadataCB['id'] = metadataCB['id'].astype('int')


### results

In [8]:
def get_recommendations(title, cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]

    return metadata['title'].iloc[movie_indices]

In [10]:
print('recommendation based on overview:')
get_recommendations('Toy Story', cosine_sim)

recommendation based on overview:


2997              Toy Story 2
1071    Rebel Without a Cause
3057          Man on the Moon
485                    Malice
1932                Condorman
448         For Love or Money
1032            The Sunchaser
2157        Indecent Proposal
3252          Bound for Glory
4078                Losin' It
Name: title, dtype: object

In [11]:
print('recommendation based on features:')
get_recommendations('Toy Story', cosine_sim2)

recommendation based on features:


3012        Sweet and Lowdown
3324               Bloodsport
1730        The Odd Couple II
1123         The Ruling Class
1429         Cats Don't Dance
734     Vermont Is for Lovers
405      Addams Family Values
581                   Aladdin
608            The Aristocats
702       Switchblade Sisters
Name: title, dtype: object