# Chapter 3 : Building an IMDB Top 250 Clone with Pandas

In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('data/movies_metadata.csv',low_memory=False)
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


# Building the simple recommender is fairly straightforward. The steps are as follows:
1. Choose a metric (or score) to rate the movies on
2. Decide on the prerequisites for the movie to be featured on the chart
3. Calculate the score for every movie that satisfies the conditions
4. Output the list of movies in decreasing order of their scores

<image src='data/IMDB Formula.PNG'>

In [4]:
#Calculate M to be number of votes garnered by the 80th percentile movie
m = df['vote_count'].quantile(0.80)
m

50.0

In [5]:
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [6]:
#filter over vote count and runtime
conditioned_df = df[df['vote_count'] >= m]
conditioned_df = conditioned_df[(conditioned_df['runtime'] >= 45) & (conditioned_df['runtime'] <=300) ]
conditioned_df.shape

(8963, 24)

In [7]:
#calculate C
C = df['vote_average'].mean()
C

5.618207215133889

In [8]:
def Cal_WR(x,C=C,m=m):
    v = x['vote_count']
    R = x['vote_average']
    WR =  (v / (v+m) * R) + (m/(m+v) )*C
    return WR

In [13]:
conditioned_df['score'] = conditioned_df.apply(Cal_WR,axis=1)
conditioned_df.sort_values(by='score',ascending=False)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,score
10309,False,,13200000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,19404,tt0112870,hi,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",...,1.000000e+08,190.0,"[{'iso_639_1': 'hi', 'name': 'हिन्दी'}]",Released,Come... Fall In Love,Dilwale Dulhania Le Jayenge,False,9.1,661.0,8.855148
314,False,,25000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",,278,tt0111161,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,...,2.834147e+07,142.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Fear can hold you prisoner. Hope can set you f...,The Shawshank Redemption,False,8.5,8358.0,8.482863
834,False,"{'id': 230, 'name': 'The Godfather Collection'...",6000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",http://www.thegodfather.com/,238,tt0068646,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",...,2.450664e+08,175.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,An offer you can't refuse.,The Godfather,False,8.5,6024.0,8.476278
40251,False,,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 16, ...",https://www.funimationfilms.com/movie/yourname/,372058,tt5311514,ja,君の名は。,High schoolers Mitsuha and Taki are complete s...,...,3.552983e+08,106.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Your Name.,False,8.5,1030.0,8.366584
12481,False,"{'id': 263, 'name': 'The Dark Knight Collectio...",185000000,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",http://thedarkknight.warnerbros.com/dvdsite/,155,tt0468569,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,...,1.004558e+09,152.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Why So Serious?,The Dark Knight,False,8.3,12269.0,8.289115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17708,False,"{'id': 225961, 'name': 'Birdemic Collection', ...",0,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...",http://www.birdemic.com/,40016,tt1316037,en,Birdemic: Shock and Terror,A platoon of eagles and vultures attacks the r...,...,0.000000e+00,95.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Why did the eagles and vultures attack?,Birdemic: Shock and Terror,False,2.1,69.0,3.578238
12911,False,,25000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",http://www.disastermovie.net/,13805,tt1213644,en,Disaster Movie,"In DISASTER MOVIE, the filmmaking team behind ...",...,1.410928e+07,87.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Your favorite movies are going to be destroyed.,Disaster Movie,False,3.1,250.0,3.519701
11557,False,,20000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,9760,tt0799949,en,Epic Movie,"When Edward, Peter, Lucy and Susan each follow...",...,8.686556e+07,86.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,We know it's big. We measured.,Epic Movie,False,3.2,334.0,3.514871
3471,False,,44000000,"[{'id': 28, 'name': 'Action'}, {'id': 878, 'na...",,5491,tt0185183,en,Battlefield Earth,"In the year 3000, man is no match for the Psyc...",...,2.140000e+07,118.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Take Back The Planet,Battlefield Earth,False,3.0,259.0,3.423658


# Build Knowledge Based Recommender
1. Ask the user for the genres of movies he/she is looking for
2. Ask the user for the duration
3. Ask the user for the timeline of the movies recommended
4. Using the information collected, recommend movies to the user that have a high
weighted rating (according to the IMDB formula) and that satisfy the preceding
conditions

In [14]:
df = df[['title','genres', 'release_date', 'runtime', 'vote_average',
'vote_count']]
df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0


In [15]:
df['release_date'] = pd.to_datetime(df['release_date'],errors='coerce')
df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)


In [17]:
# Helper function to convert NaT to 0 and all other years to integers.
def convert_int(x):
    try :
        return int(x)
    except : 
        return 0

df['year'] = df['year'].apply(convert_int)


In [18]:
df.drop('release_date',axis=1,inplace=True)
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",81.0,7.7,5415.0,1995
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",127.0,6.1,34.0,1995
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",106.0,5.7,173.0,1995


# Working on Genres Columns 

In [21]:
df.iloc[0]['genres']

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [25]:
from ast import literal_eval 
literal_eval(df.iloc[0]['genres'])[0]['name']

'Animation'

In [26]:
df['genres'] = df['genres'].fillna('[]')

df['genres'] = df['genres'].apply(literal_eval)
df['genres']

0        [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
1        [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
2        [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
3        [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4                           [{'id': 35, 'name': 'Comedy'}]
                               ...                        
45461    [{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...
45462                        [{'id': 18, 'name': 'Drama'}]
45463    [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...
45464                                                   []
45465                                                   []
Name: genres, Length: 45466, dtype: object

In [27]:
df['genres'] = df['genres'].apply(lambda x : [i['name'] for i in x] if isinstance(x,list) else [])
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[Animation, Comedy, Family]",81.0,7.7,5415.0,1995
1,Jumanji,"[Adventure, Fantasy, Family]",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[Romance, Comedy]",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[Comedy, Drama, Romance]",127.0,6.1,34.0,1995
4,Father of the Bride Part II,[Comedy],106.0,5.7,173.0,1995


In [29]:
#Create a new feature by exploding genres
s = df.apply(lambda x:pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
#Name the new feature as 'genre'
s.name = 'genre'
#Create a new dataframe gen_df which by dropping the old 'genres' feature and adding the new 'genre'.
gen_df = df.drop('genres', axis=1).join(s)
#Print the head of the new gen_df
gen_df.head()


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,title,runtime,vote_average,vote_count,year,genre
0,Toy Story,81.0,7.7,5415.0,1995,Animation
0,Toy Story,81.0,7.7,5415.0,1995,Comedy
0,Toy Story,81.0,7.7,5415.0,1995,Family
1,Jumanji,104.0,6.9,2413.0,1995,Adventure
1,Jumanji,104.0,6.9,2413.0,1995,Fantasy


# The build_chart function
1. Get user input on their preferences
2. Extract all movies that match the conditions set by the user
3. Calculate the values of m and C for only these movies and proceed to build the
chart as in the previous section

In [43]:
# def build_chart(gen_df,percentile=0.8):
#     print("Input preferred genre")
#     genre = input()

#     print("Input Shortest Duration")
#     low_time = int(input())

#     print("Input Longest Duration")
#     high_time = int(input())

#     print("Input Earliest Year")
#     low_year = int(input())
    
#     print("Input Latest Year")
#     high_year = int(input())

#     movies = gen_df.copy()
#     movies = movies[(movies['genre'] == genre)& (movies['runtime'] >= low_time)& 
#                     (movies['runtime'] <= high_time) & (movies['year'] >= low_year)& 
#                     (movies['year'] <= high_year) ]
    
#     C = movies['vote_average'].mean()
#     m = movies['vote_count'].quantile(percentile)
#     #Only consider movies that have higher than m votes. Save this in a newdataframe q_movies
#     q_movies = movies.copy().loc[movies['vote_count'] >= m]
#     #Calculate score using the IMDB formula
#     q_movies['score'] = q_movies.apply(lambda x:(x['vote_count']/(x['vote_count']+m) * x['vote_average'])+ (m/(m+x['vote_count']) * C),axis=1)
#     #Sort movies in descending order of their scores
#     q_movies = q_movies.sort_values('score', ascending=False)
#     return q_movies     

In [46]:
def build_chart(gen_df, percentile=0.8):
    #Ask for preferred genres
    print("Input preferred genre")
    genre = input()
    #Ask for lower limit of duration
    print("Input shortest duration")
    low_time = int(input())
    #Ask for upper limit of duration
    print("Input longest duration")
    high_time = int(input())
    #Ask for lower limit of timeline
    print("Input earliest year")
    low_year = int(input())
    #Ask for upper limit of timeline
    print("Input latest year")
    high_year = int(input())
    #Define a new movies variable to store the preferred movies. Copy the contents of gen_df to movies
    movies = gen_df.copy()
    #Filter based on the condition
    movies = movies[(movies['genre'] == genre) &
    (movies['runtime'] >= low_time) &
    (movies['runtime'] <= high_time) &
    (movies['year'] >= low_year) &
    (movies['year'] <= high_year)]
    #Compute the values of C and m for the filtered movies
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)
    #Only consider movies that have higher than m votes. Save this in a new dataframe q_movies
    q_movies = movies.copy().loc[movies['vote_count'] >= m]
    #Calculate score using the IMDB formula
    q_movies['score'] = q_movies.apply(lambda x:(x['vote_count']/(x['vote_count']+m) * x['vote_average'])+ (m/(m+x['vote_count']) * C),axis=1)
    #Sort movies in descending order of their scores
    q_movies = q_movies.sort_values('score', ascending=False)
    return q_movies

In [48]:
build_chart(gen_df).head()

# END