In [1]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from nltk.stem import PorterStemmer

# Demographic Filtering

Load data

In [2]:
# Load both CSV files from the local data/ directory.
movies_csv_path = 'data/tmdb_5000_movies.csv'
credits_csv_path = 'data/tmdb_5000_credits.csv'

movies = pd.read_csv(movies_csv_path)
# The name `credits` is kept because later cells refer to it.
credits = pd.read_csv(credits_csv_path)

Merge dataframes

In [3]:
# Join the credits columns onto the movie rows via the shared 'title' column
# (pandas' default inner join).
# NOTE(review): a join on 'title' pairs every combination of rows that share a
# title, so repeated titles would multiply rows — confirm titles are unique or
# join on an id column instead. Re-running this cell merges `movies` again,
# because the input name is overwritten.
movies = pd.merge(movies, credits, on='title')

Keep necessary columns

In [4]:
# Columns needed for the vote-based (demographic) ranking.
demographic_columns = ['movie_id', 'title', 'vote_average', 'vote_count', 'genres']

# Take an explicit copy: later cells clean this frame, and it should be an
# independent object rather than a slice that still points back at `movies`.
movies_demographic = movies[demographic_columns].copy()

Handle missing values

In [5]:
# Discard rows that lack either vote statistic. The result is rebound instead
# of using inplace=True, so the cell does not mutate a frame derived from
# `movies` and gives the same result when re-run.
movies_demographic = movies_demographic.dropna(subset=['vote_average', 'vote_count'])

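Calculate C (the mean vote average across all movies) and m (the 90th-percentile vote count, used as the minimum number of votes required) for demographic filtering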

In [6]:
# C: mean of the vote averages over every remaining movie.
C_demographic = movies_demographic.loc[:, 'vote_average'].mean()

# m: the vote count at the 90th percentile.
m_demographic = movies_demographic.loc[:, 'vote_count'].quantile(q=0.9)

Filter movies: keep only those whose vote count is at least m

In [7]:
# Keep the rows whose vote count reaches the threshold, as an independent copy.
has_enough_votes = movies_demographic['vote_count'] >= m_demographic
q_movies_demographic = movies_demographic.loc[has_enough_votes].copy()

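Define weighted rating function for demographic filtering: $WR = \frac{v}{v+m} R + \frac{m}{v+m} C$, where $v$ is the movie's vote count and $R$ is its vote average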

In [8]:
def weighted_rating_demographic(x, m=m_demographic, C=C_demographic):
    """Blend a movie's own rating with the overall mean, weighted by vote count.

    Parameters
    ----------
    x : mapping-like
        Must provide ``'vote_count'`` and ``'vote_average'``.
    m : number
        Vote-count weight given to the overall mean ``C``; defaults to the
        value of ``m_demographic`` at definition time.
    C : number
        Overall mean rating; defaults to the value of ``C_demographic`` at
        definition time.

    Returns
    -------
    The value ``v / (v + m) * R + m / (v + m) * C``.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total_weight = votes + m
    own_share = votes / total_weight
    prior_share = m / total_weight
    return own_share * rating + prior_share * C

Calculate score for demographic filtering

In [9]:
# Evaluate the weighted rating for each row, then store it as the 'score' column.
demographic_scores = q_movies_demographic.apply(weighted_rating_demographic, axis=1)
q_movies_demographic['score'] = demographic_scores

Sort movies based on score for demographic filtering

In [10]:
# Order the qualifying movies from highest to lowest score.
q_movies_demographic = q_movies_demographic.sort_values(by='score', ascending=False)

Save demographic filtering results

In [11]:
# Persist the ranked table. The context manager ensures the file handle is
# flushed and closed even if pickling raises; the original left the handle
# returned by open() dangling.
with open('artifacts/movie_demographic.pkl', 'wb') as demographic_file:
    pickle.dump(q_movies_demographic, demographic_file)

# Content-based filtering

Keep important columns

In [12]:
# Columns used to build the content-based features.
content_columns = [
    'movie_id', 'title', 'overview', 'genres', 'keywords',
    'cast', 'crew', 'vote_average', 'vote_count',
]

# Take an explicit copy: the following cells overwrite several of these
# columns, and those writes should target an independent frame rather than a
# slice of `movies`.
movies_content = movies[content_columns].copy()

Handle missing values

In [13]:
# Drop every row that has a missing value in any selected column. The result
# is rebound rather than using inplace=True, so the cell does not mutate a
# frame derived from `movies`.
movies_content = movies_content.dropna()

Convert string representations to lists

In [14]:
def convert(text):
    """Parse a Python-literal string and collect the ``'name'`` of each entry.

    Parameters
    ----------
    text : str
        String that ``ast.literal_eval`` turns into an iterable of mappings,
        each containing a ``'name'`` key.

    Returns
    -------
    list
        The ``'name'`` values, in their original order.
    """
    names = []
    for entry in ast.literal_eval(text):
        names.append(entry['name'])
    return names

Handle missing values

In [15]:
# NOTE(review): this repeats the missing-value drop already performed in an
# earlier cell, so it should find nothing left to remove. The result is rebound
# rather than using inplace=True.
movies_content = movies_content.dropna()

Convert string representations to lists

In [16]:
# NOTE(review): `convert` is already defined identically in an earlier cell;
# one of the two definitions can be removed.
def convert(text):
    """Return the ``'name'`` value of every entry in a stringified list of dicts.

    ``text`` is parsed with ``ast.literal_eval``; the names are returned as a
    list in their original order.
    """
    parsed_entries = ast.literal_eval(text)
    return list(map(lambda entry: entry['name'], parsed_entries))

In [17]:
# Replace the stringified genre and keyword lists with plain lists of names.
for list_column in ('genres', 'keywords'):
    movies_content[list_column] = movies_content[list_column].apply(convert)

In [18]:
def convert_cast(text, top_n=3):
    """Return the names of the first ``top_n`` entries of a stringified cast list.

    Parameters
    ----------
    text : str
        String that ``ast.literal_eval`` turns into an iterable of mappings,
        each containing a ``'name'`` key.
    top_n : int, default 3
        Maximum number of leading entries to keep. The default reproduces the
        previously hard-coded limit of three.

    Returns
    -------
    list
        Up to ``top_n`` names, in the order they appear in ``text``.
    """
    names = []
    for position, member in enumerate(ast.literal_eval(text)):
        # Stop as soon as the limit is reached; later entries are never needed.
        if position >= top_n:
            break
        names.append(member['name'])
    return names

In [19]:
# Swap the stringified cast list for the list of leading cast names.
leading_cast = movies_content['cast'].apply(convert_cast)
movies_content['cast'] = leading_cast

In [20]:
def fetch_director(text):
    """Return the names of all entries whose ``'job'`` equals ``'Director'``.

    Parameters
    ----------
    text : str
        String that ``ast.literal_eval`` turns into an iterable of mappings,
        each containing ``'job'`` and ``'name'`` keys.

    Returns
    -------
    list
        Names of the matching entries, in their original order (empty when
        none match).
    """
    directors = []
    for member in ast.literal_eval(text):
        if member['job'] == 'Director':
            directors.append(member['name'])
    return directors

In [21]:
# Reduce each stringified crew list to the names of its directors.
director_names = movies_content['crew'].apply(fetch_director)
movies_content['crew'] = director_names

Process overview: split each overview into a list of words

In [22]:
# Break each overview string into whitespace-separated tokens.
overview_tokens = movies_content['overview'].apply(lambda text: text.split())
movies_content['overview'] = overview_tokens

Remove spaces in names

In [23]:
def remove_space(L):
    """Return a new list with every space character removed from each string.

    Only the plain space character ``" "`` is removed; other whitespace is
    left untouched.
    """
    return ["".join(entry.split(" ")) for entry in L]

# Strip the spaces out of every name in each list-valued column.
for name_column in ('cast', 'crew', 'genres', 'keywords'):
    movies_content[name_column] = movies_content[name_column].apply(remove_space)

Concatenate all features into 'tags'

In [24]:
# Concatenate the per-row lists, starting from the overview tokens and
# appending genres, keywords, cast and crew in that order.
combined_tags = movies_content['overview']
for feature_column in ('genres', 'keywords', 'cast', 'crew'):
    combined_tags = combined_tags + movies_content[feature_column]
movies_content['tags'] = combined_tags

Drop unnecessary columns

In [25]:
# Keep only the identifier, the title and the combined tags. The explicit copy
# makes this an independent frame, so the later writes to its 'tags' column do
# not target a slice of `movies_content`.
new_df_content = movies_content[['movie_id', 'title', 'tags']].copy()

Convert list to string

In [26]:
# Collapse each list of tags into one space-separated string.
joined_tags = new_df_content['tags'].apply(" ".join)
new_df_content['tags'] = joined_tags

Convert to lowercase

In [27]:
# Lower-case the tag strings.
lowercase_tags = new_df_content['tags'].str.lower()
new_df_content['tags'] = lowercase_tags

Stemming

In [28]:
ps = PorterStemmer()


def _stem_tags(text):
    """Stem every whitespace-separated word of ``text`` and re-join with spaces."""
    return " ".join(ps.stem(token) for token in text.split())


# Replace each tag string with its stemmed equivalent.
new_df_content['tags'] = new_df_content['tags'].apply(_stem_tags)

Text Vectorization

In [29]:
# Bag-of-words counts over the tag strings: vocabulary capped at 5000 terms,
# English stop words removed.
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_count_matrix = cv.fit_transform(new_df_content['tags'])

# Convert the fitted count matrix to a dense array.
vector_content = tag_count_matrix.toarray()

Cosine Similarity

In [30]:
# Pairwise cosine similarity between every pair of rows of the count matrix.
similarity_content = cosine_similarity(X=vector_content)

Save processed data and similarity matrix for content-based filtering

In [31]:
# Persist the tag table and the similarity matrix. Each file is opened in a
# context manager so its handle is flushed and closed even if pickling raises;
# the original left both handles returned by open() dangling.
with open('artifacts/movie_list_content.pkl', 'wb') as movie_list_file:
    pickle.dump(new_df_content, movie_list_file)

with open('artifacts/similarity_content.pkl', 'wb') as similarity_file:
    pickle.dump(similarity_content, similarity_file)