In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/movies_metadata.csv')

# Print all the features/columns of the DataFrame

df.columns

  df = pd.read_csv('data/movies_metadata.csv')


Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [2]:
# Only keep features we require

df = df[['title', 'genres', 'release_date',
         'runtime', 'vote_average', 'vote_count']]

df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0


In [7]:
# Convert release_date into pandas datetime formate
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Extract year from the datetime
df['year'] = df['release_date'].dt.year.astype('Int64')

df['year']

0        1995
1        1995
2        1995
3        1995
4        1995
         ... 
45461    <NA>
45462    2011
45463    2003
45464    1917
45465    2017
Name: year, Length: 45466, dtype: Int64

In [None]:
#Convert NaT to 0 
df['year'] = df['year'].fillna(0)

In [8]:
# Drop the release_date column
df = df.drop('release_date', axis=1)

# Display the dataframe
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",81.0,7.7,5415.0,1995
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",127.0,6.1,34.0,1995
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",106.0,5.7,173.0,1995


In [9]:
# Print genres of the first movie
df.iloc[0]['genres']

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [10]:
# trying to learn about literal_eval function from ast library

from ast import literal_eval

# Define a stringified list and output its type
a = "[1,2,3]"
print(type(a))

# Apply literal_eval and output type
b = literal_eval(a)
print(type(b))

<class 'str'>
<class 'list'>


In [None]:
# Convert all NaN into stringified empty lists
df['genres'] = df['genres'].fillna('[]')

# Apply literal_eval to convert to the list object
df['genres'] = df['genres'].apply(literal_eval)

# Convert list of dictionaries to a list of strings
df['genres'] = df['genres'].apply(
    lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])



In [13]:
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[animation, comedy, family]",81.0,7.7,5415.0,1995
1,Jumanji,"[adventure, fantasy, family]",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[romance, comedy]",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[comedy, drama, romance]",127.0,6.1,34.0,1995
4,Father of the Bride Part II,[comedy],106.0,5.7,173.0,1995


In [14]:
# Create a new feature by exploding genres
df = df.explode('genres')

# rename the column
df = df.rename(columns={'genres': 'genre'})

df.head()

Unnamed: 0,title,genre,runtime,vote_average,vote_count,year
0,Toy Story,animation,81.0,7.7,5415.0,1995
0,Toy Story,comedy,81.0,7.7,5415.0,1995
0,Toy Story,family,81.0,7.7,5415.0,1995
1,Jumanji,adventure,104.0,6.9,2413.0,1995
1,Jumanji,fantasy,104.0,6.9,2413.0,1995


In [19]:
def build_chart(df, percentile=0.8):
    # Ask for preferred genres
    print("Input preferred genre")
    genre = input()

    # Ask for lower limit of duration
    print("Input shortest duration")
    low_time = int(input())

    # Ask for upper limit of duration
    print("Input longest duration")
    high_time = int(input())

    # Ask for lower limit of timeline
    print("Input earliest year")
    low_year = int(input())

    # Ask for upper limit of timeline
    print("Input latest year")
    high_year = int(input())

    # define a new movies variable to store the preferred movies
    movies = df.copy()

    # Filter based on the condition
    movies = movies[(movies['genre'] == genre) &
                    (movies['runtime'] >= low_time) &
                    (movies['runtime'] <= high_time) &
                    (movies['year'] >= low_year) &
                    (movies['year'] <= high_year)]

    # Compute the values of C and m for the filtered movies
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)

    # Only consider movies that have higher than m votes.
    q_movies = movies.copy().loc[movies['vote_count'] >= m]

    # Calculate score using the IMDB formula
    q_movies['score'] = (q_movies['vote_count']/(q_movies['vote_count']+m)
                         * q_movies['vote_average'] + m/(q_movies['vote_count'] + m)*C)

    # Sort the movies in descending order of their scores
    q_movies = q_movies.sort_values('score', ascending=False)

    return q_movies

In [20]:
# Generate the chart for the top action movies and display top 5.
build_chart(df).head()

Input preferred genre
Input shortest duration
Input longest duration
Input earliest year
Input latest year


Unnamed: 0,title,genre,runtime,vote_average,vote_count,year,score
723,Ghost in the Shell,action,83.0,7.8,854.0,1995,7.521643
550,True Romance,action,120.0,7.5,762.0,1993,7.23198
3902,"O Brother, Where Art Thou?",action,106.0,7.3,1144.0,2000,7.131617
348,The Crow,action,102.0,7.3,980.0,1994,7.106412
3871,"Crouching Tiger, Hidden Dragon",action,120.0,7.2,949.0,2000,7.011634


In [21]:
# for animated movies between 30 minutes and 2 hours in length and released anywhere between 1990 and 2005
build_chart(df).head()

Input preferred genre
Input shortest duration
Input longest duration
Input earliest year
Input latest year


Unnamed: 0,title,genre,runtime,vote_average,vote_count,year,score
9698,Howl's Moving Castle,animation,119.0,8.2,2049.0,2004,7.994823
359,The Lion King,animation,89.0,8.0,5520.0,1994,7.926672
0,Toy Story,animation,81.0,7.7,5415.0,1995,7.6375
6232,Finding Nemo,animation,100.0,7.6,6292.0,2003,7.549423
546,The Nightmare Before Christmas,animation,76.0,7.6,2135.0,1993,7.4605
