# THE KNOWLEDGE-BASED RECOMMENDER

In [13]:
import pandas as pd
import numpy as np

# Load the raw movie metadata. low_memory=False makes pandas parse the file in
# one pass, so each column gets a single inferred dtype instead of the
# chunk-by-chunk inference that can produce a mixed-type warning.
df = pd.read_csv('../data/movies_metadata.csv', low_memory=False)
df.columns

  df = pd.read_csv('../data/movies_metadata.csv')


Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [14]:
# Only keep those features that we require. The explicit .copy() makes df an
# independent frame, so the column assignments in later cells write to df itself
# rather than to a slice of the full metadata table (avoids SettingWithCopyWarning).
df = df[['title', 'genres', 'release_date', 'runtime', 'vote_average', 'vote_count']].copy()
df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0


In [15]:
# Convert release_date into pandas datetime format; errors='coerce' turns
# unparseable values into NaT instead of raising
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Extract year from the datetime. The .dt accessor yields NaN for NaT rows.
# (A `x != np.nan` check cannot detect missing dates: NaN never compares equal
# to anything, so that comparison is True for every value.)
df['year'] = df['release_date'].dt.year

In [16]:
# Helper function to convert NaT to 0 and all other years to integers.
def convert_int(x):
    """Return ``int(x)``, or 0 when ``x`` cannot be converted to an integer.

    Parameters
    ----------
    x : object
        Value to convert (e.g. a year as a string or a number).

    Returns
    -------
    int
        ``int(x)`` on success; 0 if the conversion raises ``TypeError``,
        ``ValueError`` or ``OverflowError`` (e.g. ``None``, ``'NaT'``, NaN, inf).
    """
    try:
        return int(x)
    # Catch only conversion failures so KeyboardInterrupt/SystemExit and
    # unrelated errors are not silently turned into a year of 0.
    except (TypeError, ValueError, OverflowError):
        return 0

# Run every entry of the year feature through convert_int
df['year'] = [convert_int(value) for value in df['year']]

In [17]:
# Drop the release_date column
df = df.drop('release_date', axis=1)

# Display the dataframe
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()

                         title  \
0                    Toy Story   
1                      Jumanji   
2             Grumpier Old Men   
3            Waiting to Exhale   
4  Father of the Bride Part II   

                                              genres  runtime  vote_average  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...     81.0           7.7   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...    104.0           6.9   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...    101.0           6.5   
3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...    127.0           6.1   
4                     [{'id': 35, 'name': 'Comedy'}]    106.0           5.7   

   vote_count  year  
0      5415.0  1995  
1      2413.0  1995  
2        92.0  1995  
3        34.0  1995  
4       173.0  1995  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  --

## Genres

In [18]:
# Show the raw genres entry of the first movie
df['genres'].iloc[0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [19]:
# Import the literal_eval function from ast
from ast import literal_eval

# Sample input: a list of genre dictionaries stored as a plain string
a = "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

# Parse the string into the Python object it spells out
b = literal_eval(a)

# Output the type before and after parsing, one per line
print(type(a), type(b), sep='\n')

<class 'str'>
<class 'list'>


In [20]:
# Turn the stringified genres column into lists of genre names, in three steps:
#   1. missing entries become the string '[]' so they parse to an empty list
#   2. literal_eval parses each string into a list of dictionaries
#   3. each dictionary is reduced to its 'name'; anything that is not a list
#      becomes an empty list
df.loc[:, 'genres'] = (
    df['genres']
    .fillna('[]')
    .apply(literal_eval)
    .apply(lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else [])
)

df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[Animation, Comedy, Family]",81.0,7.7,5415.0,1995
1,Jumanji,"[Adventure, Fantasy, Family]",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[Romance, Comedy]",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[Comedy, Drama, Romance]",127.0,6.1,34.0,1995
4,Father of the Bride Part II,[Comedy],106.0,5.7,173.0,1995


In [21]:
# Create a new feature by exploding genres: Series.explode emits one entry per
# list element, repeating the movie's row index, and turns an empty list into a
# single NaN entry. This replaces building a separate pd.Series for every row.
s = df['genres'].explode()

# Name the new feature as 'genre'
s.name = 'genre'

# Create a new dataframe gen_df by dropping the old 'genres' feature and adding the new 'genre'.
# The join is on the row index, so a movie with several genres appears once per genre.
gen_df = df.drop('genres', axis=1).join(s)

# Print the head of the new gen_df
print(gen_df.head())

       title  runtime  vote_average  vote_count  year      genre
0  Toy Story     81.0           7.7      5415.0  1995  Animation
0  Toy Story     81.0           7.7      5415.0  1995     Comedy
0  Toy Story     81.0           7.7      5415.0  1995     Family
1    Jumanji    104.0           6.9      2413.0  1995  Adventure
1    Jumanji    104.0           6.9      2413.0  1995    Fantasy


In [22]:
def build_chart(gen_df, percentile = 0.8, genre = 'Animation', low_time = 30,
                high_time = 120, low_year = 1990, high_year = 2005):
    """Build a ranked chart of movies matching a genre, runtime and year filter.

    Movies are scored with the IMDB weighted-rating formula

        score = v / (v + m) * R + m / (v + m) * C

    where v is the movie's vote count, R its vote average, C the mean vote
    average of the filtered movies and m the ``percentile`` quantile of their
    vote counts.

    Parameters
    ----------
    gen_df : pandas.DataFrame
        One row per (movie, genre) pair with the columns 'genre', 'runtime',
        'year', 'vote_average' and 'vote_count'. It is not modified.
    percentile : float, default 0.8
        Quantile of vote counts used as the cutoff m; only movies with at
        least m votes are kept.
    genre : str, default 'Animation'
        Genre the movies must belong to.
    low_time, high_time : number, defaults 30 and 120
        Inclusive runtime bounds.
    low_year, high_year : int, defaults 1990 and 2005
        Inclusive release-year bounds.

    Returns
    -------
    pandas.DataFrame
        The qualifying rows plus a 'score' column, sorted by score in
        descending order. Empty (but still with a 'score' column) when no
        movie passes the filters.
    """
    # Boolean-mask indexing returns a new frame, so gen_df is left untouched
    matches = ((gen_df['genre'] == genre) &
               (gen_df['runtime'] >= low_time) &
               (gen_df['runtime'] <= high_time) &
               (gen_df['year'] >= low_year) &
               (gen_df['year'] <= high_year))
    movies = gen_df[matches]

    # Compute the values of C and m for the filtered movies
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)

    # Only consider movies that have at least m votes; copy so that adding the
    # score column does not write into a slice of `movies`
    q_movies = movies.loc[movies['vote_count'] >= m].copy()

    # Calculate score using the IMDB formula, vectorised over whole columns
    # (this also works when q_movies is empty)
    votes = q_movies['vote_count']
    q_movies['score'] = (votes / (votes + m) * q_movies['vote_average']) + (m / (m + votes) * C)

    # Sort movies in descending order of their scores
    return q_movies.sort_values('score', ascending=False)

# Rank the animation movies with the default cutoff and show the five best.
build_chart(gen_df, percentile=0.8).head(5)

Unnamed: 0,title,runtime,vote_average,vote_count,year,genre,score
9698,Howl's Moving Castle,119.0,8.2,2049.0,2004,Animation,7.994823
359,The Lion King,89.0,8.0,5520.0,1994,Animation,7.926672
0,Toy Story,81.0,7.7,5415.0,1995,Animation,7.6375
6232,Finding Nemo,100.0,7.6,6292.0,2003,Animation,7.549423
546,The Nightmare Before Christmas,76.0,7.6,2135.0,1993,Animation,7.4605


In [23]:
# Write the cleaned metadata to disk, leaving out the row index
df.to_csv(path_or_buf='../data/metadata_clean.csv', index=False)