# Extracting Data from TheMovieDB for all nominated and winners at the Academy Awards (Oscars)

In [29]:
import numpy as np
import pandas as pd

# For API usage
import requests as r

#our lifesaver, the progress bar
from tqdm import tqdm

In [30]:
#API Key 
API_KEY = 'b4c353d61e8791dac39e77e69a2c08d3' 

##This is to extract our the whole database from their daily dump
simple_extract = pd.read_json('datasets/movie_ids_03_25_2023.json', lines=True)
nominated = pd.read_csv('DataSet Work/the_oscar_award.csv')
print(simple_extract.dtypes)
print(simple_extract.head(5))
print(nominated.dtypes)
print(nominated.head(5))

adult                bool
id                  int64
original_title     object
popularity        float64
video                bool
dtype: object
   adult     id                      original_title  popularity  video
0  False   3924                             Blondie       3.502  False
1  False   6124                 Der Mann ohne Namen       0.600  False
2  False   8773                 L'Amour à vingt ans       2.722  False
3  False  25449  New World Disorder 9: Never Enough       1.097  False
4  False  31975      Sesame Street: Elmo Loves You!       0.600   True
year_film         int64
year_ceremony     int64
ceremony          int64
category         object
name             object
film             object
winner             bool
dtype: object
   year_film  year_ceremony  ceremony category                 name  \
0       1927           1928         1    ACTOR  Richard Barthelmess   
1       1927           1928         1    ACTOR        Emil Jannings   
2       1927           1928        

In [31]:
nominated = nominated[['year_film','category','film','winner']]
print(nominated.head(5))

   year_film category              film  winner
0       1927    ACTOR         The Noose   False
1       1927    ACTOR  The Last Command    True
2       1927  ACTRESS   A Ship Comes In   False
3       1927  ACTRESS        7th Heaven    True
4       1927  ACTRESS    Sadie Thompson   False


In [32]:
nominated_clean = nominated.dropna().reset_index(drop=True)

In [33]:
nominated_clean_noCat = nominated_clean.drop(columns=['category'])

In [34]:
nominated_clean_noCat = nominated_clean_noCat.drop_duplicates().reset_index(drop=True)
nominated_clean_noCat

Unnamed: 0,year_film,film,winner
0,1927,The Noose,False
1,1927,The Last Command,True
2,1927,A Ship Comes In,False
3,1927,7th Heaven,True
4,1927,Sadie Thompson,False
...,...,...,...
5852,2022,Top Gun: Maverick,True
5853,2022,Avatar: The Way of Water,True
5854,2022,Glass Onion: A Knives Out Mystery,False
5855,2022,Women Talking,True


### Some films may have won only some but there will be a repeat entry of false if they did not win every award possible, making for loop to remove these duplicates as well.

In [35]:
# If there are duplicates films with both True and false entry, we drop the false entry as the film did at least win an oscar
# To do this, groupby first to sort by TRUE To make sure the true version is kept
nominated_clean_noCat = nominated_clean_noCat.sort_values(by=['winner'], ascending=False)

nominated_clean_noCat = nominated_clean_noCat.drop_duplicates(subset=['film','year_film'], keep='first')
# Now that there are no more duplicates, we can resort by original index and then reset the index
nominated_clean_noCat = nominated_clean_noCat.sort_index().reset_index(drop=True)

In [36]:
nominated_clean_noCat 

Unnamed: 0,year_film,film,winner
0,1927,The Noose,False
1,1927,The Last Command,True
2,1927,A Ship Comes In,False
3,1927,7th Heaven,True
4,1927,Sadie Thompson,False
...,...,...,...
5093,2022,Top Gun: Maverick,True
5094,2022,Avatar: The Way of Water,True
5095,2022,Glass Onion: A Knives Out Mystery,False
5096,2022,Women Talking,True


In [37]:
# Duplicating the dataframe for later use
original_nominated_cleaned = nominated_clean_noCat.copy()

Unnamed: 0,year_film,film,winner
0,1927,The Noose,False
1,1927,The Last Command,True
2,1927,A Ship Comes In,False
3,1927,7th Heaven,True
4,1927,Sadie Thompson,False
...,...,...,...
5093,2022,Top Gun: Maverick,True
5094,2022,Avatar: The Way of Water,True
5095,2022,Glass Onion: A Knives Out Mystery,False
5096,2022,Women Talking,True


# Downloading info from TheMovieDB for movies that had been nominated or won

In [None]:
# Downloading the data from the API directly from nominated_clean_noCat films

# This is the function that will be used to extract the data from the API
https://api.themoviedb.org/3/search/movie?api_key=<<api_key>>&language=en-US&page=1&include_adult=false


In [38]:
# convert both set of titles to all uppercase and remove non-alphanumeric characters

nominated_clean_noCat['film'] = nominated_clean_noCat['film'].str.upper().str.replace(r'[^A-Z0-9]', '')
simple_extract['original_title'] = simple_extract['original_title'].str.upper().str.replace(r'[^A-Z0-9]', '')


  nominated_clean_noCat['film'] = nominated_clean_noCat['film'].str.upper().str.replace(r'[^A-Z0-9]', '')
  simple_extract['original_title'] = simple_extract['original_title'].str.upper().str.replace(r'[^A-Z0-9]', '')


In [39]:
simple_extract = simple_extract.dropna().drop_duplicates(subset=['original_title'], keep='first')
simple_extract

Unnamed: 0,adult,id,original_title,popularity,video
0,False,3924,BLONDIE,3.502,False
1,False,6124,DERMANNOHNENAMEN,0.600,False
2,False,8773,LAMOURVINGTANS,2.722,False
3,False,25449,NEWWORLDDISORDER9NEVERENOUGH,1.097,False
4,False,31975,SESAMESTREETELMOLOVESYOU,0.600,True
...,...,...,...,...,...
791501,False,1104366,PETSSEASON1,0.000,False
791503,False,1104369,MIYUKIWATANABEMIRUNEKO,0.000,False
791504,False,1104370,CAFFRIO,0.000,False
791506,False,1104372,REDMEATHAND,0.000,False


In [12]:
# Downloading from TheMovieDB with API
# We will get the movie id from the simple_extract dataframe and then use that to get the movie details
# We will then use the movie id to extract movie details from TheMovieDB
movie_ids = []
notFoundMovies = []

for i in tqdm(range(len(nominated_clean_noCat))):
    # Get the movie id from the simple_extract dataframe
    movie_id = simple_extract[simple_extract['original_title'] == nominated_clean_noCat['film'][i]]['id']

    if not movie_id.empty:  # Check if the movie_id is found in simple_extract dataframe
        movie_ids.append(int(movie_id))  # Convert to integer and append to the list
    else:
        notFoundMovies.append(nominated_clean_noCat['film'][i])

print(len(notFoundMovies))
notFoundMovies
# Get the movie details from TheMovieDB
#movie_details = r.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id, API_KEY)).json()

100%|██████████| 5098/5098 [02:22<00:00, 35.84it/s]

521





['CHANG',
 'FOURDEVILS',
 'HOLLYWOODREVUE',
 'SONGOFTHEFLAME',
 'BIGBROADCASTOF1936',
 'WINGSOVERMTEVEREST',
 'BRIDEOFFRANKENSTEIN',
 'TRAILOFTHELONESOMEPINE',
 'OLDMILLPOND',
 'WALTERWANGERSVOGUESOF1938',
 'ONEHUNDREDMENANDAGIRL',
 'GRANDILLUSION',
 'THEDARKCOMMAND',
 'THEDEVILPAYSOFF',
 'MRGARDENIAJONES',
 'MOSCOWSTRIKESBACK',
 'PRELUDETOWAR',
 'UNITEDSTATESMARINEBAND',
 'THEBATTLEOFRUSSIA',
 'SILENTVILLAGE',
 'BISMARCKCONVOYSMASHED',
 'THE500HATSOFBARTHOLOMEWCUBBINS',
 'YANKEEDOODLEMOUSE',
 'CAVALCADEOFDANCEWITHVELOZANDYOLANDA',
 'HOLLYWOODINUNIFORM',
 'ARTUROTOSCANINI',
 'WOMANOFTHETOWN',
 'ANDTOTHINKISAWITONMULBERRYSTREET',
 'THESULLIVANS',
 'GIJOE',
 'MIGHTYMOUSEINGYPSYLIFE',
 'THEPOETANDPEASANT',
 'SCREENSNAPSHOTS25THANNIVERSARY',
 'THREEISAFAMILY',
 'PARAMOUNTNEWSISSUE37TWENTIETHANNIVERSARYISSUE19271947',
 'CHOPINSMUSICALMOMENTS',
 'VACATIONFROMMARRIAGE',
 'CHILDRENOFPARADISE',
 'OPENCITY',
 'AVOICEISBORNTHESTORYOFNIKLOSGAFNI',
 'ACAGEOFNIGHTINGALES',
 'SYMPHONYOFACITY',
 'THEA

In [40]:
original_nominated_cleaned

Unnamed: 0,year_film,film,winner
0,1927,The Noose,False
1,1927,The Last Command,True
2,1927,A Ship Comes In,False
3,1927,7th Heaven,True
4,1927,Sadie Thompson,False
...,...,...,...
5093,2022,Top Gun: Maverick,True
5094,2022,Avatar: The Way of Water,True
5095,2022,Glass Onion: A Knives Out Mystery,False
5096,2022,Women Talking,True


In [44]:
# Create new list of not found that includes year from the original_nominated_cleaned dataframe
original_notFoundMovies_df = pd.DataFrame(columns=original_nominated_cleaned.columns)

# Iterate through notFoundMovies
for i in range(len(notFoundMovies)):
    # Filter rows from original_nominated_cleaned where the simplified name matches notFoundMovies[i]
    filtered_df = original_nominated_cleaned[original_nominated_cleaned['film'].str.upper().str.replace(r'[^A-Z0-9]', '') == notFoundMovies[i]]

    if not filtered_df.empty:
        original_notFoundMovies_df = original_notFoundMovies_df.append(filtered_df, ignore_index=True)

  filtered_df = original_nominated_cleaned[original_nominated_cleaned['film'].str.upper().str.replace(r'[^A-Z0-9]', '') == notFoundMovies[i]]
  original_notFoundMovies_df = original_notFoundMovies_df.append(filtered_df, ignore_index=True)


In [46]:
original_notFoundMovies_df
# how many winners in the not found movies
original_notFoundMovies_df['winner'].value_counts()

False    437
True     106
Name: winner, dtype: int64

In [69]:
movie_title = ""
movie_year = ""
notFoundMovies = []
from urllib.parse import quote
# Extract movie ID from TheMovieDB API by appending to original movie_ids list
for i in tqdm(range(len(original_notFoundMovies_df))):
    # Get the movie id from the simple_extract dataframe
    movie_title = original_notFoundMovies_df['film'][i]
    title_encoded = quote(movie_title)
    movie_year = original_notFoundMovies_df['year_film'][i]

    resp = r.get(f'https://api.themoviedb.org/3/search/movie?api_key={API_KEY}&language=en-US&query={title_encoded}&page=1&include_adult=false&year={movie_year}')
    try:
        movie_id = resp.json()['results'][0]['id']
    except IndexError:
        try:
            movie_year += 1
            resp = r.get(f'https://api.themoviedb.org/3/search/movie?api_key={API_KEY}&language=en-US&query={title_encoded}&page=1&include_adult=false&year={movie_year}')
            movie_id = resp.json()['results'][0]['id']
        except IndexError:
            try:
                movie_year -= 2
                resp = r.get(f'https://api.themoviedb.org/3/search/movie?api_key={API_KEY}&language=en-US&query={title_encoded}&page=1&include_adult=false&year={movie_year}')
                movie_id = resp.json()['results'][0]['id']
            except IndexError:
                try:
                    resp = r.get(f'https://api.themoviedb.org/3/search/movie?api_key={API_KEY}&language=en-US&query={title_encoded}&page=1&include_adult=false')
                    movie_id = resp.json()['results'][0]['id']
                except IndexError:
                    movie_id = movie_title
    #movie_ids.append(movie_id)
    if movie_id != movie_title:
        movie_ids.append(movie_id)
    else:
        notFoundMovies.append(movie_title)

100%|██████████| 543/543 [01:03<00:00,  8.56it/s]


### Printing out number of nominated/winners that were not found on database

In [70]:
# Check how many not found
print(len(notFoundMovies))
notFoundMovies

22


["Walter Wanger's Vogues of 1938",
 'One Hundred Men and a Girl',
 'Mr. Gardenia Jones',
 'Paramount News Issue #37 (Twentieth Anniversary Issue!  1927.....1947)',
 "Chopin's Musical Moments",
 'The Strauss Fantasy',
 'The Running, Jumping and Standing-Still Film',
 'Song without End (The Story of Franz Liszt)',
 'La Grande Olimpiade (Olympic Games 1960)',
 "Meredith Willson's The Music Man",
 "The Cliff Dwellers (formerly titled 'One Plus One')",
 "Jacques-Yves Cousteau's World without Sun",
 'A Year toward Tomorrow',
 'Herb Alpert and the Tijuana Brass Double Feature',
 'The Legendary Champions',
 'The Further Adventures of Uncle Sam: Part Two',
 'The Doonesbury Special',
 'Mama Turns a Hundred',
 'The Yellow Star - The Persecution of the Jews in Europe 1933-45',
 "Eyes on the Prize: America's Civil Rights Years/Bridge to Freedom 1965",
 'Senzeni Na? (What Have We Done?)',
 "Birdnesters of Thailand (aka 'Shadow Hunters')"]

## Extracting the rest of the data

In [72]:
print(len(movie_ids))

5656


In [71]:
#Method for getting stats

def compute_top_seven_avg(df, col):
    try:
        top = df[col].sort_values(ascending=False)
        top = top[:min(len(df), 7)]
        return top.sum() / len(top)
    except:
        return np.nan
    
def get_popularity_stats(resp, col):
    result = {}

    try:
        data = pd.DataFrame(resp.json()[col])
        
        try:
            result['avg'] = compute_top_seven_avg(data, 'popularity')
        except:
            result['avg'] = np.nan

        try:
            result['sum'] = data['popularity'].sum()
        except:
            result['sum'] = np.nan

        try:
            result['top'] = data['popularity'].max()
        except:
            result['top'] = np.nan

    except KeyError:
        result['avg'] = np.nan
        result['sum'] = np.nan
        result['top'] = np.nan
        
    return result


In [75]:
#movie ID gets stats of movie
#credit get 

#Get Movie Stats through MovieID
rows = []

col = ['id', 'title', 'budget', 'revenue', 'release_date', 'popularity', 'vote_average', 'runtime', 'top_casts_popularity_avg', 'casts_popularity_sum', 'top_cast_popularity',
       'top_crews_popularity_avg', 'crews_popularity_sum', 'top_crew_popularity']
for i in tqdm(range(len(movie_ids))):
    row = []
    j = movie_ids[i]
    resp = r.get(f'https://api.themoviedb.org/3/movie/{j}?api_key={API_KEY}')
    try:
        budget = resp.json()['id']
        if budget == 0:
            row.append(np.nan)
        else:
            row.append(budget)
    except KeyError:
        row.append(np.nan)
    try:
        budget = resp.json()['title']
        if budget == 0:
            row.append(np.nan)
        else:
            row.append(budget)
    except KeyError:
        row.append(np.nan)
    try:
        budget = resp.json()['budget']
        if budget == 0:
            row.append(np.nan)
        else:
            row.append(budget)
    except KeyError:
        row.append(np.nan)
    try:
        revenue = resp.json()['revenue']
        if revenue == 0:
            row.append(np.nan)
        else:
            row.append(revenue)
    except KeyError:
        row.append(np.nan)
    try:
        release = resp.json()['release_date'][0:4]
        row.append(release)
    except:
        row.append(np.nan)
    try:
        remaining = resp.json()['popularity']
        if(remaining == 0):
            row.append(np.nan)
        else:
            row.append(remaining)
    except:
        row.append(np.nan)

    try:
        voteA = resp.json()['vote_average']
        if(voteA == 0):
            row.append(np.nan)
        else:
            row.append(voteA)
    except:
        row.append(np.nan)

    try:
        runtime = resp.json()['runtime']
        if(runtime == 0):
            row.append(np.nan)
        else:
            row.append(runtime)
    except:
        row.append(np.nan)

    #credits part
    resp = r.get(f'https://api.themoviedb.org/3/movie/{j}/credits?api_key={API_KEY}&language=en-US')
    casts_stats = get_popularity_stats(resp, 'cast')

    row.append(casts_stats['avg'])
    row.append(casts_stats['sum'])
    row.append(casts_stats['top'])
        
    # Get crews' popularity statistics from the movie
    crews_stats = get_popularity_stats(resp, 'crew')
        
    # Append to row
    row.append(crews_stats['avg'])
    row.append(crews_stats['sum'])
    row.append(crews_stats['top'])

    rows.append(row)

property = pd.DataFrame(rows, columns = col)

#combine = pd.concat([movie_data, property], axis=1)


print(property.head(5))

100%|██████████| 5656/5656 [1:00:37<00:00,  1.55it/s]

         id             title     budget    revenue release_date  popularity  \
0  113167.0         The Noose        NaN        NaN         1928       0.600   
1   43197.0  The Last Command        NaN        NaN         1955       2.108   
2  104212.0   A Ship Comes In        NaN        NaN         1928       0.790   
3   82474.0        7th Heaven      119.0        NaN         1927       5.695   
4   42538.0    Sadie Thompson  1000000.0  7000000.0         1928       1.643   

   vote_average  runtime  top_casts_popularity_avg  casts_popularity_sum  \
0           NaN     65.0                  1.582286                15.598   
1          6.40    110.0                 11.207714               109.128   
2          5.50     70.0                  1.531000                11.317   
3          7.37    110.0                  2.758571                25.023   
4          6.80     97.0                  2.442286                18.896   

   top_cast_popularity  top_crews_popularity_avg  crews_popula




In [76]:

property.to_csv('datasets/oscar_nom_extract.csv', index=False)

In [78]:
original_nominated_cleaned.to_csv('Nominated/winner_nom.csv')