### Predicting the Emmys
- Assumes Python 3.7 only
- Assumes the `omdbapi` package is already installed

In [1]:
# Setting API KEY (belongs to Shoop)
# The key is read from the OMDB_API_KEY environment variable when it is set (and non-empty),
# so a personal credential does not have to be committed with the notebook.
# NOTE(review): the literal fallback is kept only so existing runs keep working;
# rotate this key and remove the fallback before sharing the notebook.
import os

OMDB_API_KEY = os.environ.get("OMDB_API_KEY") or "bb88b350"

In [19]:
import os
import requests

import numpy as np
import pandas as pd

from omdbapi.movie_search import GetMovie

The original Kaggle dataset has 8 columns: `id, year, category, nominee, staff, company, producer, win (True/False)`

In [3]:
######################################################################################
# DATA SOURCES
######################################################################################
# Historical nominees: Kaggle dataset (https://www.kaggle.com/unanimad/emmy-awards, License: CC0)
# 2020 nominees: manually created dataset
# Both files are read from the notebook's working directory.
historical_csv, nominees_2020_csv = 'the_emmy_awards(1949-2019).csv', 'emmy_noms_raw_2020.csv'
noms_df = pd.read_csv(historical_csv)
noms_2020_df = pd.read_csv(nominees_2020_csv)

In [4]:
######################################################################################
# DATA CLEANING
######################################################################################
# Dropping unnecessary 'id' column
noms_df = noms_df.drop(columns='id')

# Ensuring we have ALL "Outstanding Drama Series" nominees
# (the older spelling of the category is folded into the current one)
noms_df['category'] = noms_df['category'].replace({"OUTSTANDING DRAMATIC SERIES": "Outstanding Drama Series"})

# Filtering to ONLY 2 categories: "Outstanding Comedy Series" and "Outstanding Drama Series"
noms_df = noms_df[noms_df['category'].isin(["Outstanding Comedy Series", "Outstanding Drama Series"])]

# Appending the 2020 nominees dataset
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# the original index labels are kept, exactly as append did by default.
# NOTE: The DF we will perform feature engineering on will have "award year" in DESCENDING order
# This is important to remember for the (!!!) WARNING (!!!) remarks in Feature Engineering section
noms_df = pd.concat([noms_df, noms_2020_df])
noms_df = noms_df.sort_values(by=['year', 'category'], ascending=False)

# Ensuring correct data-format for DataRobot (Emmys award show typically held around mid-September)
# Every award year is pinned to September 15th; values that cannot be parsed become NaT.
noms_df['year'] = noms_df['year'].astype(str) + '-09-15'
noms_df['year'] = pd.to_datetime(noms_df['year'], format='%Y-%m-%d', errors='coerce')

# Renaming
noms_df = noms_df.rename(columns={'year': 'award_year', 'company': 'network'})

In [5]:
# Sanity check: number of nominee rows in each of the two retained categories
noms_df['category'].value_counts()

Outstanding Drama Series     266
Outstanding Comedy Series    264
Name: category, dtype: int64

In [6]:
# Balance of the 'win' column: how many nominee rows are winners vs. non-winners
noms_df['win'].value_counts()

False    434
True      96
Name: win, dtype: int64

In [7]:
######################################################################################
# FEATURE ENGINEERING
######################################################################################
# Using OMDb API to fetch detailed series information for each nominee title.
# Results are stored in series_data_dict (title -> dict of the requested OMDb fields);
# each series' poster is saved as images/<imdbID>.jpg when it can be downloaded.

# open() below does not create directories, so make sure the target folder exists
os.makedirs("images", exist_ok=True)

series_data_dict = {}
# .unique() already yields each title once, so no membership check is needed
for title in noms_df['nominee'].unique():
    series = GetMovie(title=title, api_key=OMDB_API_KEY, plot="full")
    series_data_dict[title] = series.get_data(
        "Year", "Rated", "Released", "Runtime", "Genre", "Director", "Writer", "Actors",
        "Plot", "Language", "Country", "Awards", "imdbRating", "imdbVotes", "imdbID", "totalSeasons",
    )
    # DOWNLOADING POSTERS
    imdb_id = series_data_dict[title].get('imdbID', 'N/A')
    # Skip titles without a usable IMDb id: 'N/A' is the default used above, and
    # "key not found!" is the placeholder seen in this data (presumably emitted by omdbapi).
    # Either one would produce a meaningless request and an invalid file name.
    if imdb_id in ('N/A', 'key not found!'):
        continue
    # NOTE(review): the API key travels over plain HTTP here; switch to https if the poster host supports it.
    poster_url = f"http://img.omdbapi.com/?i={imdb_id}&h=1200&apikey={OMDB_API_KEY}"
    try:
        # A timeout keeps one stalled connection from hanging the whole cell
        poster_request = requests.get(poster_url, timeout=30)
    except requests.RequestException as exc:
        # Posters are best-effort: report the failure and keep the series metadata
        print(f"Poster download failed for {title!r} ({imdb_id}): {exc}")
        continue
    if poster_request.status_code == 200:
        with open(f"images/{imdb_id}.jpg", 'wb') as f:
            f.write(poster_request.content)


In [8]:
# Spot-check the fields fetched from OMDb for a single title
series_data_dict['Ozark']

{'Year': '2017–',
 'Rated': 'TV-MA',
 'Released': '21 Jul 2017',
 'Runtime': '60 min',
 'Genre': 'Crime, Drama, Thriller',
 'Director': 'N/A',
 'Writer': 'Bill Dubuque, Mark Williams',
 'Actors': 'Jason Bateman, Laura Linney, Sofia Hublitz, Skylar Gaertner',
 'Plot': 'The Byrdes and their teenage kids, Charlotte and Jonah, are, for all intents and purposes, an ordinary family with ordinary lives. Except for the job of Marty, a Chicago financial advisor who also serves as the top money launderer for the second largest drug cartel in Mexico. When things go awry, Marty must uproot his family from the skyscrapers of Chicago and relocate to the lazy lake region of the Missouri Ozarks.',
 'Language': 'English',
 'Country': 'USA',
 'Awards': 'Nominated for 2 Golden Globes. Another 8 wins & 74 nominations.',
 'imdbRating': '8.4',
 'imdbVotes': '170,435',
 'imdbID': 'tt5071412',
 'totalSeasons': '4'}

In [9]:
# Collect one value per nominee row (in noms_df row order) into parallel lists,
# which are attached to noms_df as new columns afterwards.
rated, released, runtime, genre, director = [], [], [], [], []
writer, actors, plot, language, country = [], [], [], [], []
awards, imdb_rating, imdb_votes, total_seasons = [], [], [], []
has_won_before, number_times_won_before = [], []
received_gg_nom_before, received_gg_win_before = [], []
images_path = []

# OMDb fields that are copied as-is, paired with the list each one is collected into
verbatim_fields = (
    ('Rated', rated),
    ('Runtime', runtime),
    ('Genre', genre),
    ('Director', director),
    ('Writer', writer),
    ('Actors', actors),
    ('Plot', plot),
    ('Language', language),
    ('Country', country),
    ('Awards', awards),
    ('imdbRating', imdb_rating),
    ('imdbVotes', imdb_votes),
    ('totalSeasons', total_seasons),
)

for row in noms_df.itertuples(index=False):
    series_info = series_data_dict[row.nominee]

    for omdb_key, collected in verbatim_fields:
        collected.append(series_info.get(omdb_key, 'N/A'))
    nom_awards = awards[-1]  # awards text of the current row, reused for the Golden Globe flags

    # date formatting: "21 Jul 2017" -> "21-Jul-2017"
    released.append(series_info.get('Released', 'N/A').replace(' ', '-'))

    # feature engineering for whether nominee won previous Emmy award and number of times won
    # (!!!) WARNING (!!!): for these features, HAVE TO BE CAREFUL ABOUT TARGET LEAKAGE
    # Only rows of the same nominee with a strictly earlier award year are considered.
    earlier_noms = noms_df[
        (noms_df['nominee'] == row.nominee) & (noms_df['award_year'] < row.award_year)
    ]
    earlier_wins = earlier_noms['win']
    has_won_before.append(earlier_wins.any())
    number_times_won_before.append(earlier_wins.sum())

    # feature engineering for whether nominee won/was nominated for Golden Globe award
    # (!!!) WARNING (!!!): be careful about target leakage, but due to not knowing
    # the dates when GG awards were given unsure how to best prevent leakage :(
    # Both flags require the awards text to mention the Golden Globes at all.
    mentions_gg = any(label in nom_awards for label in ("Golden Globe", "Golden Globes"))
    received_gg_nom_before.append(mentions_gg and "Nominated" in nom_awards)
    received_gg_win_before.append(mentions_gg and "Won" in nom_awards)

    # POSTER IMAGES: record the local poster path when the file exists, "N/A" otherwise
    imdb_id = series_info.get('imdbID', 'N/A')
    poster_file = f"images/{imdb_id}.jpg"
    images_path.append(poster_file if os.path.exists(poster_file) else "N/A")


In [10]:
# Attach the collected per-row lists to noms_df, in this column order
engineered_columns = {
    'rated': rated,
    'released': released,
    'runtime': runtime,
    'genre': genre,
    'director': director,
    'writer': writer,
    'actors': actors,
    'plot': plot,
    'language': language,
    'country': country,
    'awards': awards,
    'imdb_rating': imdb_rating,
    'imdb_votes': imdb_votes,
    'total_seasons': total_seasons,
    'has_won_before': has_won_before,
    'number_times_won_before': number_times_won_before,
    'received_gg_nom_before': received_gg_nom_before,
    'received_gg_win_before': received_gg_win_before,
    'images': images_path,
}
for column_name, column_values in engineered_columns.items():
    noms_df[column_name] = column_values

# date converting: "21-Jul-2017" style strings become timestamps, anything unparseable becomes NaT
noms_df['released'] = pd.to_datetime(noms_df['released'], format='%d-%b-%Y', errors='coerce')

In [11]:
# Some values come back as the literal string "key not found!" (origin not confirmed --
# presumably a placeholder from the OMDb lookup); replace all occurrences with "N/A"
# so that missing values share a single marker across every column.
noms_df = noms_df.replace({"key not found!": "N/A"})

In [28]:
# Preview nominee/win for the first 20 rows (rows are sorted with the latest award year first)
noms_df[['nominee', 'win']].head(20)

Unnamed: 0,nominee,win
0,Better Call Saul,False
1,The Crown,False
2,The Handmaid's Tale,True
3,Killing Eve,False
4,The Mandalorian,False
5,Ozark,False
6,Stranger Things,False
7,Succession,False
8,Curb Your Enthusiasm,False
9,Dead To Me,False


In [30]:
# Work on an independent copy: plain assignment would only alias noms_df, so the manual
# edits below would also change the frame that the other export cells read from.
noms2_df = noms_df.copy()

# Manually flag two 2020 nominees as winners for the "for SHAP" export.
# The boolean True is assigned (not the string 'True') so the 'win' column keeps one dtype;
# both values are written to CSV as "True", so the exported file is unchanged.
# NOTE(review): assumes 'win' holds booleans as read from the source CSVs -- confirm.
flagged_2020_winners = ["The Handmaid's Tale", 'The Marvelous Mrs. Maisel']
after_2019_ceremony = noms2_df['award_year'] > '2019-09-15'
noms2_df.loc[after_2019_ceremony & noms2_df['nominee'].isin(flagged_2020_winners), 'win'] = True
noms2_df[['nominee', 'win']].head(20)

Unnamed: 0,nominee,win
0,Better Call Saul,False
1,The Crown,False
2,The Handmaid's Tale,True
3,Killing Eve,False
4,The Mandalorian,False
5,Ozark,False
6,Stranger Things,False
7,Succession,False
8,Curb Your Enthusiasm,False
9,Dead To Me,False


In [31]:
# Remove the raw 'awards' text column from the manually labelled frame
noms_no_awards_df = noms2_df.drop(columns=['awards'])

In [32]:
# Remove the poster path column as well
noms_no_posters_df = noms_no_awards_df.drop(columns=['images'])

In [35]:
# Write the frame (with a header row, which is the to_csv default, and without the index)
noms_no_posters_df.to_csv('emmy_noms(1966-2020, for SHAP).csv', index=False)

In [12]:
######################################################################################
# OUTPUT DATA FOR DATAROBOT
######################################################################################
# Columns dropped from the saved output: awards, images
noms_no_awards_df = noms_df.drop(columns=['awards'])
noms_no_posters_df = noms_no_awards_df.drop(columns=['images'])

# Rows dated 2020-09-15 form the testing set (saved without the 'win' target);
# every other row goes to the training set.
is_2020_row = noms_no_posters_df['award_year'] == '2020-09-15'
noms_training_df = noms_no_posters_df[~is_2020_row]
noms_testing_df = noms_no_posters_df[is_2020_row].drop(columns=['win'])

# DF of available posters: non-2020 rows that have a poster path recorded
has_poster_outside_2020 = (
    (noms_no_awards_df['award_year'] != '2020-09-15') & (noms_no_awards_df['images'] != 'N/A')
)
noms_with_posters_df = noms_no_awards_df[has_poster_outside_2020]

# CSV output (file name -> frame), written in this order
csv_outputs = {
    'noms_full_dataset.csv': noms_no_posters_df,
    'emmy_noms_training(1966-2019).csv': noms_training_df,
    'emmy_noms_testing(2020).csv': noms_testing_df,
    'emmy_noms_with_posters(1966-2019).csv': noms_with_posters_df,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, index=False, header=True)

In [13]:
######################################################################################
# PLAYGROUND
######################################################################################
# movie = GetMovie(title="Atlanta", api_key=OMDB_API_KEY, plot="full")
# movie.get_all_data()
# movie.get_data(
#     "Year", "Rated", "Released", "Runtime", "Genre", "Director", "Writer",
#     "Actors", "Plot", "Language", "Country", "Awards", "imdbRating", "imdbVotes", "totalSeasons"
# )