# Extracting data from the MovieAPI database
The API has a usage limit. As a workaround, we start from the daily data dump that MovieAPI publishes instead of enumerating movies through the API.

In [8]:
# Standard library
import os

# For data processing
import numpy as np
import pandas as pd

# For API usage
import requests as r

#our lifesaver, the progress bar
from tqdm import tqdm

In [4]:
# API key for the movie API.
# Taken from the TMDB_API_KEY environment variable when it is set; the literal
# below is only a fallback so that existing runs keep working.
# SECURITY: the fallback key is stored in the notebook itself, so it should be
# treated as leaked -- rotate it and then remove the fallback.
API_KEY = os.environ.get('TMDB_API_KEY') or 'b4c353d61e8791dac39e77e69a2c08d3'

# Load the whole database listing from the daily dump.
# The file holds one JSON object per line, hence lines=True.
simple_extract = pd.read_json('datasets/movie_ids_03_25_2023.json', lines=True)
print(simple_extract.dtypes)
print(simple_extract.head(5))
 

adult                bool
id                  int64
original_title     object
popularity        float64
video                bool
dtype: object
   adult     id                      original_title  popularity  video
0  False   3924                             Blondie       3.502  False
1  False   6124                 Der Mann ohne Namen       0.600  False
2  False   8773                 L'Amour à vingt ans       2.722  False
3  False  25449  New World Disorder 9: Never Enough       1.097  False
4  False  31975      Sesame Street: Elmo Loves You!       0.600   True


In [5]:
# Rows flagged as adult content or as video entries are excluded.
is_adult = simple_extract['adult'].eq(True)
is_video = simple_extract['video'].eq(True)
cleanedSimple_data = simple_extract.loc[~(is_adult | is_video)]
print(cleanedSimple_data.head(5))

# Sanity check: both flag columns should now contain only False.
for flag_column in ('adult', 'video'):
    print(cleanedSimple_data[flag_column].unique())

   adult     id                      original_title  popularity  video
0  False   3924                             Blondie       3.502  False
1  False   6124                 Der Mann ohne Namen       0.600  False
2  False   8773                 L'Amour à vingt ans       2.722  False
3  False  25449  New World Disorder 9: Never Enough       1.097  False
5  False      2                               Ariel       9.553  False
[False]
[False]


In [22]:
# Build the list of movies to query: the flag columns are no longer needed,
# and rows are ordered by id.
# Movies whose popularity is at or below this cutoff are dropped.
MIN_POPULARITY = 8

movie_data = (
    cleanedSimple_data
    .drop(columns=['adult', 'video'])
    .sort_values(by=['id'])
)

# Written as "not (<= cutoff)" so rows with a missing popularity are kept,
# exactly as when dropping only the rows that match the comparison.
movie_data = movie_data[~(movie_data['popularity'] <= MIN_POPULARITY)]
movie_data = movie_data.reset_index(drop=True)
print(movie_data.count())

id                18350
original_title    18350
popularity        18350
dtype: int64


In [23]:
#Method for getting stats

def compute_top_seven_avg(df, col):
    """Average the seven largest values of ``df[col]``.

    When the column holds fewer than seven values, all of them are averaged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to summarise.
    col : str
        Name of the column to average.

    Returns
    -------
    float
        Sum of the (up to) seven largest values divided by how many values
        were taken, or ``np.nan`` when the column is missing, empty, or cannot
        be sorted or summed.
    """
    try:
        # head() is positional, so the result does not depend on index labels.
        top = df[col].sort_values(ascending=False).head(7)
        if top.empty:
            return np.nan
        return top.sum() / len(top)
    # Only data problems become NaN. A bare ``except`` would also swallow
    # KeyboardInterrupt, which makes a long-running loop hard to stop.
    except (KeyError, TypeError, ValueError, AttributeError, ZeroDivisionError):
        return np.nan
    
def get_popularity_stats(resp, col):
    """Summarise the ``popularity`` values listed under one key of a JSON response.

    Parameters
    ----------
    resp : object with a ``json()`` method
        Response whose decoded body is indexed with ``col``.
    col : str
        Key of the JSON body holding the records to summarise
        (the callers in this notebook pass ``'cast'`` and ``'crew'``).

    Returns
    -------
    dict
        ``{'avg': ..., 'sum': ..., 'top': ...}`` where ``avg`` is the result of
        ``compute_top_seven_avg``, ``sum`` the total and ``top`` the maximum of
        the ``popularity`` column. Every statistic that cannot be computed
        (body not JSON, key missing, no records, no ``popularity`` field) is
        ``np.nan``.
    """
    stats = {'avg': np.nan, 'sum': np.nan, 'top': np.nan}

    try:
        people = pd.DataFrame(resp.json()[col])
    # KeyError: key absent; TypeError: body is not a mapping;
    # ValueError: body is not valid JSON or cannot be tabulated.
    except (KeyError, TypeError, ValueError):
        return stats

    # An empty record list yields a frame without any columns.
    if 'popularity' not in people.columns:
        return stats

    popularity = people['popularity']
    reducers = (
        ('avg', lambda: compute_top_seven_avg(people, 'popularity')),
        ('sum', popularity.sum),
        ('top', popularity.max),
    )
    # Each statistic is computed independently so that one failure does not
    # blank out the others.
    for name, reduce in reducers:
        try:
            stats[name] = reduce()
        except (TypeError, ValueError, ZeroDivisionError):
            pass  # leave the NaN default in place

    return stats


In [24]:
# Build one row of statistics per movie id in `movie_data`.
#
# Two requests are made per movie: the movie details endpoint (budget, revenue,
# release year, popularity, vote average, runtime) and the credits endpoint
# (cast / crew popularity statistics). With this many ids the loop runs for
# hours, so a failed request or unusable body yields NaN values for that movie
# instead of aborting the run.

TMDB_MOVIE_URL = 'https://api.themoviedb.org/3/movie'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the loop forever

col = ['budget', 'revenue', 'release_date', 'popularity', 'vote_average', 'runtime', 'top_casts_popularity_avg', 'casts_popularity_sum', 'top_cast_popularity',
       'top_crews_popularity_avg', 'crews_popularity_sum', 'top_crew_popularity']


def _fetch(session, url, **params):
    """GET `url` with the API key attached; return the response, or None if the request failed."""
    try:
        return session.get(url, params={'api_key': API_KEY, **params}, timeout=REQUEST_TIMEOUT)
    except r.exceptions.RequestException:
        return None


def _json_object(resp):
    """Decode a response body once; anything that is not a JSON object becomes {}."""
    if resp is None:
        return {}
    try:
        payload = resp.json()
    except ValueError:
        return {}
    return payload if isinstance(payload, dict) else {}


def _nan_if_zero(details, key):
    """Return details[key], or NaN when the key is absent or its value is 0."""
    value = details.get(key, np.nan)
    return np.nan if value == 0 else value


def _safe_popularity_stats(resp, key):
    """Return get_popularity_stats(resp, key), or all-NaN stats when there is no usable response."""
    if resp is not None:
        try:
            return get_popularity_stats(resp, key)
        except ValueError:
            pass  # body was not valid JSON
    return {'avg': np.nan, 'sum': np.nan, 'top': np.nan}


rows = []

# One session for the whole run so the connection to the API host is reused.
with r.Session() as session:
    for movie_id in tqdm(movie_data['id']):
        # Movie details: the body is decoded once and every field is read from it.
        details = _json_object(_fetch(session, f'{TMDB_MOVIE_URL}/{movie_id}'))
        release_date = details.get('release_date')
        # Only the first four characters (the year) of the release date are kept.
        release_year = release_date[0:4] if isinstance(release_date, str) else np.nan

        # Credits: cast and crew statistics come from the same response.
        credits_resp = _fetch(session, f'{TMDB_MOVIE_URL}/{movie_id}/credits', language='en-US')
        casts_stats = _safe_popularity_stats(credits_resp, 'cast')
        crews_stats = _safe_popularity_stats(credits_resp, 'crew')

        # Values are appended in the same order as the names in `col`.
        rows.append([
            _nan_if_zero(details, 'budget'),
            _nan_if_zero(details, 'revenue'),
            release_year,
            _nan_if_zero(details, 'popularity'),
            _nan_if_zero(details, 'vote_average'),
            _nan_if_zero(details, 'runtime'),
            casts_stats['avg'],
            casts_stats['sum'],
            casts_stats['top'],
            crews_stats['avg'],
            crews_stats['sum'],
            crews_stats['top'],
        ])

# Named `movie_properties` rather than `property` so the builtin is not shadowed.
movie_properties = pd.DataFrame(rows, columns=col)

# NOTE: `movie_data` already has a 'popularity' column, so `combine` contains
# two columns with that name (the dump value first, then the API value).
combine = pd.concat([movie_data, movie_properties], axis=1)

print(combine.head(5))


  1%|▏         | 233/18350 [07:16<9:25:47,  1.87s/it] 


KeyboardInterrupt: 

In [21]:
# Save the combined movie table as CSV.
csv_path = 'datasets/test.csv'
combine.to_csv(csv_path)