In [None]:
# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

from ast import literal_eval 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Importing the Dataset
data = pd.read_csv('C:/Pocs/Python/PythonRecomendationSystem/movies_metadata.csv', low_memory = False)

In [3]:
# Building the Content Based Recommender
# Calculating the average vote rate
vote_rate = data['vote_average'].mean()

In [4]:
# Calculating the minimum number of votes to be in the chart
min_votes = data['vote_count'].quantile(0.90)

In [5]:
# Filtering out all qualified movies into a new DataFrame
new_moviesdf = data.copy().loc[data['vote_count'] >= min_votes]

In [6]:
# Computing the score(rating) of each movie
def score(x, min_votes = min_votes, vote_rate = vote_rate):
    vote_cnt = x['vote_count']
    vote_avg = x['vote_average']
    # Calculation based on the IMDB formula
    return (vote_cnt/(vote_cnt+min_votes) * vote_avg) + (min_votes/(min_votes+vote_cnt) * vote_rate)

In [7]:
# Defining a new feature 'score' and calculate its value
new_moviesdf['score'] = new_moviesdf.apply(score, axis=1)

In [8]:
# Sorting the movies based on score calculated above
new_moviesdf = new_moviesdf.sort_values('score', ascending=False)

In [9]:
# Load keywords and credits
credits = pd.read_csv('C:/Pocs/Python/PythonRecomendationSystem/credits.csv')
keywords = pd.read_csv('C:/Pocs/Python/PythonRecomendationSystem/keywords.csv')

In [10]:
# Remove rows with bad IDs.
data = data.drop([19730, 29503, 35587])

# Convert IDs to int (Merging Purpose)
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
data['id'] = data['id'].astype('int')

# Merge keywords and credits into main 'data' dataframe
data = data.merge(credits, on='id')
data = data.merge(keywords, on='id')

In [11]:
# Parsing the string features into their corresponding python objects
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    data[feature] = data[feature].apply(literal_eval)

In [12]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [13]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        # Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names
    # Return empty list in case of missing/malformed data
    return []

In [14]:
# Define new director, cast, genres and keywords features that are in a suitable form.
data['director'] = data['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    data[feature] = data[feature].apply(get_list)

In [15]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        # Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [16]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    data[feature] = data[feature].apply(clean_data)

In [17]:
def merge(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

In [18]:
data['merge'] = data.apply(merge, axis=1)

In [19]:
# Create the count matrix
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(data['merge'])

In [21]:
# Compute the Cosine Similarity matrix based on the count_matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

MemoryError: Unable to allocate 16.2 GiB for an array with shape (46628, 46628) and data type float64

In [None]:
# Reset index of your main DataFrame and construct reverse mapping as before
data = data.reset_index()
indices = pd.Series(data.index, index = data['title'])

In [None]:
# Function that takes in movie title as input and outputs most similar movies
def recommend_movies(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return data['title'].iloc[movie_indices]

In [None]:
recommend_movies('The Godfather')

3012                       Toy Story 2
15444                      Toy Story 3
29156                  Superstar Goofy
25951       Toy Story That Time Forgot
22064             Toy Story of Terror!
3324                 Creature Comforts
25949                  Partysaurus Rex
27560                            Anina
43059    Dexter's Laboratory: Ego Trip
27959                    Radiopiratene
Name: title, dtype: object