In [None]:
# Nicole's code starts here 
# Importing "The Numbers" data & cleaning it up

In [None]:
import pandas as pd
import numpy as np
from config import OMB_api_key
import requests
import json
from pprint import pprint

In [None]:
# Read the raw "The Numbers" export and show its dimensions plus the first row.
raw_numbers_path = 'DataFiles/TheNumbers_Original.csv'
numbers_df = pd.read_csv(raw_numbers_path)
print(numbers_df.shape)
numbers_df.head(n=1)

In [None]:
# Convert both release-date columns to datetimes and expose the domestic release
# month as its own column at position 3.
# (The month could also be derived on demand from the datetime column.)
for date_col in ('Domestic Release Date', 'Worldwide Release Date'):
    numbers_df[date_col] = numbers_df[date_col].astype('datetime64[ns]')

month = numbers_df['Domestic Release Date'].dt.month

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on a second run. Insert only the first time; afterwards just
# refresh the values so the column keeps its position (later cells select
# columns by position).
if 'Month Released (Domestic)' in numbers_df.columns:
    numbers_df['Month Released (Domestic)'] = month
else:
    numbers_df.insert(3, 'Month Released (Domestic)', month)
numbers_df.head(1)

In [None]:
# Convert columns 11+ (currency strings such as "$1,234") to 64-bit integers.
money_cols = numbers_df.columns[11:]


def _strip_currency(col):
    """Return the column as strings with every '$' and ',' removed."""
    # regex=False: '$' is a regex metacharacter (end-of-string anchor), and the
    # default for `regex` differs between pandas versions, so ask for a literal
    # match explicitly.
    # astype(str) is a no-op for string columns and lets the cell be re-run
    # after the columns have already been converted to integers.
    return (
        col.astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )


numbers_df[money_cols] = numbers_df[money_cols].apply(_strip_currency).astype(np.int64)

In [None]:
# Adding available oscar count per year. Somebody please check these calculations if we use this!
# Year -> count mapping used below:
#   1980                   -> 22
#   1981-1994 and 1999     -> 23
#   2001-2019              -> 25
#   every other year       -> 24
# Computed for the whole column at once instead of looping over rows, which also
# gives the column an integer dtype rather than object.
release_year = numbers_df['Year Released (Domestic)']
numbers_df['Total Oscars Awarded in Year'] = np.select(
    [
        release_year == 1980,
        release_year.isin(list(range(1981, 1995))) | (release_year == 1999),
        release_year.isin(list(range(2001, 2020))),
    ],
    [22, 23, 25],
    default=24,
)

In [None]:
# Build a search-friendly copy of the title for the API queries, keeping the
# original 'Title' column untouched so it can be used later.
# Currently handled: colons are removed and "Ep." is expanded to "Episode".
# Apostrophes and ellipses are NOT stripped yet.
# The replacements are applied one after another, in the order written.
query_titles = (
    numbers_df['Title']
    # regex=False on both calls: as a regex, "Ep." means "Ep" followed by ANY
    # character, which would turn e.g. "Epic" into "Episodec".
    .str.replace(':', '', regex=False)
    .str.replace('Ep.', 'Episode', regex=False)
)

# DataFrame.insert raises ValueError if the column already exists; only insert on
# the first run so the cell can be re-executed, and keep the column at position 6.
if 'Query_Title' in numbers_df.columns:
    numbers_df['Query_Title'] = query_titles
else:
    numbers_df.insert(6, 'Query_Title', query_titles)

In [None]:
# Limit movies to 40 per year - defined as top 40 by adjusted gross ***
# Step 1: order rows by release year (oldest first) and, within each year, by
# inflation-adjusted domestic gross (largest first); then renumber the index.
numbers_df = (
    numbers_df
    .sort_values(
        by=['Year Released (Domestic)', 'Infl. Adj. Dom. Box Office'],
        ascending=[True, False],
    )
    .reset_index(drop=True)
)

In [None]:
# Still limiting...
# Step 2: number the movies 1, 2, 3, ... within each release year, following the
# current row order (the frame is expected to be sorted by year and adjusted
# gross already, so 1 = highest-grossing movie of that year).
# groupby().cumcount() does not rely on the data starting in 1980 or on every
# year being present, unlike a manual counter that advances one year at a time.
numbers_df['Year Index'] = (
    numbers_df.groupby('Year Released (Domestic)').cumcount() + 1
)

In [None]:
# ... a little more & voila!
# Step 3: keep rows ranked 40 or better within their year, restricted to the
# columns needed downstream, ordered by adjusted gross (largest first).
top_40_columns = [
    'Title',
    'Query_Title',
    'Domestic Release Date',
    'Year Released (Domestic)',
    'Month Released (Domestic)',
    'Infl. Adj. Dom. Box Office',
    'Domestic Box Office',
    'Genre',
    'Theatrical Distributor',
    'Total Oscars Awarded in Year',
]
in_top_40 = numbers_df['Year Index'] <= 40
top_40_df = (
    numbers_df.loc[in_top_40, top_40_columns]
    .sort_values('Infl. Adj. Dom. Box Office', ascending=False)
    .reset_index(drop=True)
)

# Persist the cleaned table (the index is written as the first CSV column).
top_40_df.to_csv('DataFiles/TheNumbers_Cleaned.csv')
print(top_40_df.shape)
top_40_df.head(1)

In [None]:
# *********  This is the end of data_cleaning & start of request tests ***********

In [None]:
# Working copy of the top-40 table with one empty-string column per OMDb field
# that the request cells fill in.
omdb_df = top_40_df.copy()
for omdb_column in ('Awards', 'Metascore', 'IMDB', 'Rotten Tomatoes',
                    'Rated', 'Director', 'Runtime', 'Country'):
    omdb_df[omdb_column] = ''

In [None]:
# *** The following cells are just for testing and can eventually be removed from code

In [None]:
# REQUEST TESTING (Okay to remove cell)
# Sample JSON in case you want to run one specific movie title
movie_title = "The battle of the five armies"
params = {'type': 'movie', 'apikey': OMB_api_key, 't': movie_title}
# Base endpoint only: requests builds the query string from `params`. Ending the
# URL with "?t=" made every request carry an extra, empty title parameter.
url = 'http://www.omdbapi.com/'
response = requests.get(url, params=params).json()
pprint(response)

In [None]:
# REQUEST TESTING (Okay to remove cell)
# Sample JSON in case you want to run one specific movie title,
# narrowed down with the release year ('y' parameter).
movie_title = "Epic"
year = 2013
params = {'type': 'movie', 'apikey': OMB_api_key, 't': movie_title, 'y': year}
# Base endpoint only: requests builds the query string from `params`. Ending the
# URL with "?t=" made every request carry an extra, empty title parameter.
url = 'http://www.omdbapi.com/'
response = requests.get(url, params=params).json()
pprint(response)

In [None]:
# REQUEST TESTING (Okay to remove cell)
# Small sample for trial requests: the rows at positions 25 through 38, all columns.
test_subset = omdb_df.iloc[25:39]


In [None]:
# REQUEST TESTING (Okay to remove cell)
# Testing Requests on subset.
# One request is made per row of test_subset; the values found are written into
# omdb_df at the same index label.
# Each field is copied independently, so a single missing value no longer stops
# the remaining fields of that movie from being stored. The movie is still
# reported as having missing data.
# If we find a lot of missing movies, we could look into adding a year parameter.
#     It looks like it returns the first movie found (ie: 'Star Wars' returns 'Star Wars IV')

params = {"type": "movie", "apikey": OMB_api_key}
# Base endpoint only: requests builds the query string from `params`. Ending the
# URL with "?t=" made every request carry an extra, empty title parameter.
url = "http://www.omdbapi.com/"

# omdb_df column -> top-level key in the OMDb JSON response.
omdb_fields = {
    'Awards': 'Awards',
    'Metascore': 'Metascore',
    'IMDB': 'imdbRating',
    'Rated': 'Rated',
    'Director': 'Director',
    'Runtime': 'Runtime',
    'Country': 'Country',
}

count = 0
for index, row in test_subset.iterrows():
    params["t"] = row["Query_Title"]
    response = requests.get(url, params=params).json()
    if response.get('Response') == 'True':
        has_missing = False
        for column, key in omdb_fields.items():
            if key in response:
                omdb_df.loc[index, column] = response[key]
            else:
                has_missing = True

        # Pick the Rotten Tomatoes entry by its 'Source' label instead of by
        # list position: when a movie lacks that rating, position 1 can hold a
        # different source, whose value would be stored in the wrong column.
        rotten_tomatoes = next(
            (rating.get('Value') for rating in response.get('Ratings', [])
             if rating.get('Source') == 'Rotten Tomatoes'),
            None,
        )
        if rotten_tomatoes is None:
            has_missing = True
        else:
            omdb_df.loc[index, 'Rotten Tomatoes'] = rotten_tomatoes

        if has_missing:
            print(f'{row.Query_Title.upper()} (row {count}) has missing data')
    else:
        print(f'{row.Query_Title.upper()} (row {count}) was not found')
    count += 1

In [None]:
# The request loop writes its results into omdb_df (not into test_subset), so
# display the sampled rows from omdb_df to see the values that were filled in.
omdb_df.loc[test_subset.index].head()

In [None]:
# ***** API requests All Data *****

# params = {"type": "movie", "apikey": OMB_api_key}
# url = "http://www.omdbapi.com/?t="
# count = 0
# for index, row in omdb_df.iterrows():
#     params['t'] = row["Query_Title"]
#     response = requests.get(url, params).json()
#     if response['Response'] == 'True':
#         try:
#             omdb_df.loc[index, 'Awards'] = response['Awards']
#             omdb_df.loc[index, 'Metascore'] = response['Metascore']
#             omdb_df.loc[index, 'IMDB'] = response['imdbRating']
#             omdb_df.loc[index, 'Rotten Tomatoes'] = response['Ratings'][1]['Value']
#             omdb_df.loc[index, 'Rated'] = response['Rated']
#             omdb_df.loc[index, 'Director'] = response['Director']
#             omdb_df.loc[index, 'Runtime'] = response['Runtime']
#             omdb_df.loc[index, 'Country'] = response['Country']
#         except:
#             print(f'{row.Query_Title.upper()} (row {count}) has missing data')
#         count += 1
#     else:
#         print(f'{row.Query_Title.upper()} (row {count}) was not found')
#         count += 1

In [None]:
# print(omdb_df.shape)
# omdb_df.head(1)

In [None]:
# # Saving what we have so far
# omdb_df.to_csv('DataFiles/First_API_Run_BETA.csv', index=False)

# Loading for demo purposes: replace omdb_df with the saved result of the first
# full API run and preview the first 50 rows.
first_api_run_path = 'DataFiles/First_API_Run_BETA.csv'
omdb_df = pd.read_csv(first_api_run_path)
omdb_df.head(n=50)

In [None]:
# Nicole's code ends here

In [None]:
# jason's code starts here

In [None]:
# Split the API results into two working sets:
#   1. clean_test_df - rows with no missing values at all, for John's analysis
#   2. the rows still missing data, for Marianne and Jason to look up on OMDB
#      (built in the next cell)
clean_test_df = omdb_df.dropna(axis=0, how='any')
# clean_test_df.head(50)
# clean_test_df.to_csv('DataFiles/clean_test_data.csv', index=False)

In [None]:
# Rows whose 'Awards' value is missing still need to be looked up; show how many.
awards_missing = omdb_df['Awards'].isna()
to_be_cleaned_df = omdb_df[awards_missing]
to_be_cleaned_df.shape
# to_be_cleaned_df.to_csv('DataFiles/to_be_clean_data.csv', index=False)

In [None]:
# Reload the saved "still to clean" rows and take the first 75 as Jason's batch.
to_be_clean_data = pd.read_csv('DataFiles/to_be_clean_data.csv')
jason_cleanup_df = to_be_clean_data.head(n=75)
jason_cleanup_df
#create a new dataframe with the movies that actually need cleaning
# j_cleaning_df = jason_cleanup_df[pd.isnull(jason_cleanup_df['Metascore'])&pd.isnull(jason_cleanup_df['IMDB'])]
# j_cleaning_df
# j_cleaning_df.shape

In [None]:
# Movies in Jason's batch that have neither a Metascore nor an IMDB rating,
# i.e. the ones that actually need a new lookup.
# NOTE(review): this filter is restored from the definition that is commented
# out elsewhere in this notebook; without it `j_cleaning_df` is undefined on a
# fresh kernel. Confirm it is the intended filter.
needs_lookup = jason_cleanup_df['Metascore'].isna() & jason_cleanup_df['IMDB'].isna()
# .copy() so the assignment below edits an independent frame rather than a
# slice of jason_cleanup_df.
j_cleaning_df = jason_cleanup_df[needs_lookup].copy()

# Manual fix of one query title. A scalar row label is used here; `.at` is meant
# for scalar labels, so the previous list key [10] was not valid.
# NOTE(review): assumes index label 10 is present after the filter above; if it
# is not, .loc appends a new row instead of editing one - verify.
j_cleaning_df.loc[10, 'Query_Title'] = "The battle of the five armies"

j_cleaning_df.shape

In [None]:
# Rows of Jason's batch that do have an IMDB rating: they landed in the NaN set
# only because of the awards column, so set them aside for re-merging later.
has_imdb_rating = jason_cleanup_df['IMDB'].notna()
j_clean_awards_df = jason_cleanup_df[has_imdb_rating]
j_clean_awards_df.shape

# jason_cleanup_df.to_csv('DataFiles/jason_cleanup_df.csv', index=False)

In [None]:
# Stack the rows that needed a new lookup on top of the rows that were already
# fine, renumbering the index from 0.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
new_df = pd.concat([j_cleaning_df, j_clean_awards_df], ignore_index=True)
new_df

In [None]:
# Save Jason's combined batch without writing the index column.
new_df.to_csv(path_or_buf='DataFiles/jason_cleanup_done_df.csv', index=False)

In [None]:
#jason's code ends here

In [None]:
# marianne's code starts here

In [None]:
# marianne's code ends here

In [None]:
# NEXT STEPS
# Pull out NaN values from omdb_df and resave
# Create new dataframes with only NaN values & figure out how to make successful API calls on them
    # (might be a series of dataframes & API calls after tweaking keywords or maybe adding a variable for year)
# Pull out oscar nominations and wins
# Save & review final dataframe