# Problems
- What makes a movie good or bad? How would you define those two classes? (Classification)
- Does the popularity of the cast, the crew, and the movie itself affect the quality of the movie?
- Logistic Regression vs Decision Tree vs Random Forest Classifier vs XGB Classifier vs SVM Classifier

# Import necessary modules

In [1]:
# For data processing
import numpy as np
import pandas as pd

# For API usage
import requests as r

# For progress bar
from tqdm import tqdm

# Helper functions for data extraction

In [23]:
def compute_top_avg(df, col, p):
    """
    NOTE: This function is deprecated. In the main function, 
          we use the compute_top_five_avg function instead.
    
    Computes the average value of the top p share of a 
    certain column (col) in the DataFrame (df).
    
    Parameters:
        df:  DataFrame holding the column.
        col: Name of the column to average.
        p:   Share of the rows to keep. The number of rows kept is
             int(len(df) * p), so p is a fraction (0.1 keeps the top 10%).
    
    Minimum entries to be included is 1.
    
    Returns the average of the selected values, or np.nan when it
    cannot be computed (missing column, empty column, unusable values).
    """
    
    try:
        # Sort the column in descending order
        ranked = df[col].sort_values(ascending=False)
        
        # Selects the top share by position, independent of the index labels
        top = ranked.iloc[:max(int(len(df) * p), 1)]
        
        # Nothing to average: avoid a 0 / 0 division
        if len(top) == 0:
            return np.nan
        
        return top.sum() / len(top)
    except (KeyError, TypeError, ValueError, ZeroDivisionError):
        # Best effort: an unusable input yields NaN instead of an error
        return np.nan
    
    
def compute_top_five_avg(df, col):
    """
    Computes the average of the top 5 values of a 
    column (col) in the DataFrame (df).

    If there are less than 5 entries in df, it will use
    the top len(df) entries in df.

    Returns the average of the selected values, or np.nan when it
    cannot be computed (missing column, empty column, unusable values).
    """
    
    try:
        # Sort the column in descending order
        ranked = df[col].sort_values(ascending=False)
        
        # Selects up to 5 entries by position; a positional slice simply
        # stops early when fewer than 5 entries exist
        top = ranked.iloc[:5]
        
        # Nothing to average: avoid a 0 / 0 division
        if len(top) == 0:
            return np.nan
        
        return top.sum() / len(top)
    except (KeyError, TypeError, ValueError, ZeroDivisionError):
        # Best effort: an unusable input yields NaN instead of an error
        return np.nan
    

def get_popularity_stats(resp, col):
    """
    Gets the column's (col):
        1. Average popularity value of the top 5 entries
        2. Total popularity value of all the entries
        3. Highest popularity value
    from the response (resp).
    
    resp must provide a json() method whose result holds the list of
    entries under the key col; each entry is expected to carry a
    'popularity' value.
    
    Returns a dictionary containing the 3 data above under the keys
    'avg', 'sum' and 'top'. A value that cannot be computed is np.nan,
    so the dictionary always has all 3 keys.
    """
    
    # Start from "unknown" so every early exit returns a complete dictionary
    result = {'avg': np.nan, 'sum': np.nan, 'top': np.nan}
    
    try:
        data = pd.DataFrame(resp.json()[col])
    except (KeyError, TypeError, ValueError):
        # Key missing from the body, body that is not a mapping, or body /
        # entries that cannot be decoded into a table
        return result
    
    # An empty list of entries produces a DataFrame without any column
    if 'popularity' not in data.columns:
        return result
    
    popularity = data['popularity']
    
    # Computes the average popularity value of the top 5 entries
    result['avg'] = compute_top_five_avg(data, 'popularity')
    
    # Computes the total and the highest popularity value of all the entries
    for key, aggregate in (('sum', popularity.sum), ('top', popularity.max)):
        try:
            result[key] = aggregate()
        except (TypeError, ValueError):
            # Values that cannot be aggregated: keep NaN for this statistic
            continue
    
    return result

In [24]:
import os

# SECURITY: an API key is hard-coded in this file. Set the TMDB_API_KEY
# environment variable to use your own key; the literal below is kept only
# as a fallback so existing runs keep working. A key that has been committed
# to source control should be revoked and the literal removed.
API_KEY = os.environ.get('TMDB_API_KEY') or 'db6bd34e31c99738cfb114ed7ad6d566'


def get_movies_data_init():
    """
    Returns a DF containing:
        1. Title
        2. ID
        3. Popularity value
        4. Vote count
        5. Vote average (or rating)
    for each movie.
    
    The movies come from the discover endpoint, requested with
    sort_by=revenue.desc. Every result page is downloaded once and the
    pages are concatenated into one DF with a fresh 0..n-1 index.
    """
    
    url = "https://api.themoviedb.org/3/discover/movie"
    
    # Relevant movie columns
    cols = ['title', 'id', 'popularity', 'vote_count', 'vote_average']
    
    def fetch_page(page):
        # Returns the decoded JSON body of one result page
        params = {
            'api_key': API_KEY,
            'language': 'en-US',
            'sort_by': 'revenue.desc',
            'include_adult': 'false',
            'include_video': 'false',
            'page': page,
        }
        return r.get(url, params=params).json()
    
    # The first page also tells how many pages exist, so it is reused
    # instead of being downloaded a second time
    first_page = fetch_page(1)
    total_pages = first_page['total_pages']
    
    # Passing columns= selects the relevant columns and also copes with a
    # page that has no results
    frames = [pd.DataFrame(first_page['results'], columns=cols)]
    
    for page in tqdm(range(2, total_pages + 1)):
        frames.append(pd.DataFrame(fetch_page(page)['results'], columns=cols))
    
    # One concat instead of DataFrame.append per page: append was removed in
    # pandas 2.0 and copied the whole DF on every iteration
    return pd.concat(frames, ignore_index=True)


def get_movies_stats(movie_df):
    """
    Gets the casts' and crews':
        1. Average popularity value of the top 5 entries
        2. Total popularity value of all the entries
        3. Highest popularity value
    for each movie and its budget.
    
    movie_df must have an 'id' column; two requests are sent per movie
    (credits and movie details).
    
    Returns a DF containing the above 7 data, one row per row of movie_df
    and sharing movie_df's index so both can be concatenated side by side.
    A budget that is missing or 0 is stored as np.nan.
    """
    
    cols = ['top_casts_popularity_avg', 'casts_popularity_sum', 'top_cast_popularity',
            'top_crews_popularity_avg', 'crews_popularity_sum', 'top_crew_popularity', 
            'budget']
    
    base_url = 'https://api.themoviedb.org/3/movie'
    params = {'api_key': API_KEY, 'language': 'en-US'}
    
    rows = []
    
    # Iterate over the ID values themselves so the lookup does not depend on
    # movie_df having a 0..n-1 index
    for movie_id in tqdm(movie_df['id'].tolist()):
        resp = r.get(f'{base_url}/{movie_id}/credits', params=params)
        
        # Get casts' and crews' popularity statistics from the movie
        casts_stats = get_popularity_stats(resp, 'cast')
        crews_stats = get_popularity_stats(resp, 'crew')
        
        # Get the movie's budget
        resp = r.get(f'{base_url}/{movie_id}', params=params)
        
        try:
            budget = resp.json()['budget']
        except (KeyError, TypeError, ValueError):
            # No budget in the body, or body that cannot be decoded
            budget = np.nan
        
        rows.append([
            casts_stats['avg'], casts_stats['sum'], casts_stats['top'],
            crews_stats['avg'], crews_stats['sum'], crews_stats['top'],
            # A budget of 0 (or None) is treated as missing
            budget if budget else np.nan,
        ])
    
    # Create a DF containing all the data for all movies
    return pd.DataFrame(rows, columns=cols, index=movie_df.index)


def get_movies_data_all():
    """
    Builds the complete DF: the basic movie data from
    get_movies_data_init() side by side with the per-movie statistics
    from get_movies_stats().
    
    The columns are reordered so that the 4th and 5th columns of the
    combined DF (vote_count and vote_average) end up last.
    """
    
    # Basic movie data, then the statistics computed for those movies
    base_df = get_movies_data_init()
    extra_df = get_movies_stats(base_df)
    
    # Put both DFs next to each other
    combined = pd.concat([base_df, extra_df], axis=1)
    
    # Split the column names into: first 3, the 2 vote columns, the rest
    names = list(combined.columns)
    leading, votes, trailing = names[:3], names[3:5], names[5:]
    
    # Move the vote columns behind everything else
    return combined[leading + trailing + votes]

In [25]:
# Build the complete dataset through the API and store it on disk.
# In the recorded run below, the 500 discover pages took about 3 minutes
# and the per-movie loop over 10000 movies took about 1.5 hours.
df = get_movies_data_all()
df.to_csv('movie-full-dataset.csv')

100%|██████████| 500/500 [02:49<00:00,  2.95it/s]
100%|██████████| 10000/10000 [1:30:21<00:00,  1.84it/s]


In [3]:
# Reload the dataset from disk, using the first CSV column as the index.
# NOTE(review): this reads 'movie-full-dataset2.csv', while the cell that
# builds the dataset writes 'movie-full-dataset.csv' — confirm which file
# is the intended one.
df = pd.read_csv('movie-full-dataset2.csv', index_col=0)
df

Unnamed: 0,title,id,popularity,top_casts_popularity_avg,casts_popularity_sum,top_cast_popularity,top_crews_popularity_avg,crews_popularity_sum,top_crew_popularity,budget,vote_count,vote_average
0,Avengers: Endgame,299534,320.418,30.150200,700.041,43.074,10.2326,458.614,14.353,356000000.0,17697,8.3
1,Avatar,19995,110.340,7.812800,90.086,9.458,4.3720,716.166,4.712,237000000.0,23132,7.5
2,Titanic,597,88.331,11.909400,194.031,24.581,4.4182,118.140,4.943,200000000.0,18936,7.9
3,Star Wars: The Force Awakens,140607,56.382,14.362000,354.880,17.539,5.5236,213.616,11.795,245000000.0,15647,7.4
4,Avengers: Infinity War,299536,337.938,30.150200,523.251,43.074,9.7936,492.311,14.353,300000000.0,21403,8.3
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,The Poison Rose,529983,18.002,14.087600,115.494,18.526,1.5426,60.225,2.866,,179,4.4
9996,Wintersleepers,781,4.079,2.632600,17.451,3.455,1.1760,12.480,1.484,,27,7.0
9997,The Cement Garden,14832,8.430,1.677400,11.987,2.392,1.0844,6.022,1.547,,53,7.2
9998,Behind the Blue Door,425722,1.988,1.288000,10.640,1.960,0.6320,2.528,0.728,,19,5.6


In [5]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,title,id,popularity,top_casts_popularity_avg,casts_popularity_sum,top_cast_popularity,top_crews_popularity_avg,crews_popularity_sum,top_crew_popularity,budget,vote_count,vote_average
0,Avengers: Endgame,299534,320.418,30.1502,700.041,43.074,10.2326,458.614,14.353,356000000.0,17697,8.3
1,Avatar,19995,110.34,7.8128,90.086,9.458,4.372,716.166,4.712,237000000.0,23132,7.5
2,Titanic,597,88.331,11.9094,194.031,24.581,4.4182,118.14,4.943,200000000.0,18936,7.9
3,Star Wars: The Force Awakens,140607,56.382,14.362,354.88,17.539,5.5236,213.616,11.795,245000000.0,15647,7.4
4,Avengers: Infinity War,299536,337.938,30.1502,523.251,43.074,9.7936,492.311,14.353,300000000.0,21403,8.3


In [6]:
# Show each column's dtype and non-null count to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   title                     10000 non-null  object 
 1   id                        10000 non-null  int64  
 2   popularity                10000 non-null  float64
 3   top_casts_popularity_avg  9937 non-null   float64
 4   casts_popularity_sum      9937 non-null   float64
 5   top_cast_popularity       9937 non-null   float64
 6   top_crews_popularity_avg  9917 non-null   float64
 7   crews_popularity_sum      9917 non-null   float64
 8   top_crew_popularity       9917 non-null   float64
 9   budget                    7123 non-null   float64
 10  vote_count                10000 non-null  int64  
 11  vote_average              10000 non-null  float64
dtypes: float64(9), int64(2), object(1)
memory usage: 1015.6+ KB


In [7]:
# Data Cleaning
# Drop the title, id and budget columns, then discard every row that still
# has a missing value, and save the result.
# NOTE(review): in the df.info() output budget is non-null for only 7123 of
# 10000 rows, which is presumably why the column is dropped instead of
# dropping those rows — confirm.
cleaned_df = df.drop(columns=['title', 'id', 'budget']).dropna()
cleaned_df.to_csv('cleaned-movie-dataset.csv')

In [8]:
# Reload the cleaned dataset from disk, using the first CSV column as the index
cleaned_df = pd.read_csv('cleaned-movie-dataset.csv', index_col=0)
cleaned_df

Unnamed: 0,popularity,top_casts_popularity_avg,casts_popularity_sum,top_cast_popularity,top_crews_popularity_avg,crews_popularity_sum,top_crew_popularity,vote_count,vote_average
0,320.418,30.150200,700.041,43.074,10.2326,458.614,14.353,17697,8.3
1,110.340,7.812800,90.086,9.458,4.3720,716.166,4.712,23132,7.5
2,88.331,11.909400,194.031,24.581,4.4182,118.140,4.943,18936,7.9
3,56.382,14.362000,354.880,17.539,5.5236,213.616,11.795,15647,7.4
4,337.938,30.150200,523.251,43.074,9.7936,492.311,14.353,21403,8.3
...,...,...,...,...,...,...,...,...,...
9995,18.002,14.087600,115.494,18.526,1.5426,60.225,2.866,179,4.4
9996,4.079,2.632600,17.451,3.455,1.1760,12.480,1.484,27,7.0
9997,8.430,1.677400,11.987,2.392,1.0844,6.022,1.547,53,7.2
9998,1.988,1.288000,10.640,1.960,0.6320,2.528,0.728,19,5.6
