# Phase II: Data Curation, Exploratory Analysis and Plotting (5\%)

### Team Members:
- Logan Lary
- Mark Tran
- Sabrina Valerjev

## Part 1: 
(1%) Expresses the central motivation of the project and explains at least two key questions to be explored. Gives a summary of the data processing pipeline so that a technical expert can easily follow along.

## Project Motivation

## Summary of the Data Processing Pipeline

## Part 2: 
(2\%) Obtains, cleans, and merges all data sources involved in the project.

In [24]:
# adding relevant imports
import json
import pathlib
from dataclasses import dataclass, field
from urllib.parse import quote_plus

import pandas as pd
import requests
from requests_html import HTML

In [29]:
# Source 1: Box Office Mojo
@dataclass
class ScrapeBoxOffice:
    '''Scrapes a Box Office Mojo worldwide chart page into a pandas DataFrame.

    Typical use is ``ScrapeBoxOffice(year=2011, save=True).run()``: the page is
    downloaded, the first element matching ``table_selector`` is parsed, and the
    result is returned (and optionally written to ``<output_dir>/<name>.csv``).
    '''
    # Root of the chart pages; the year (when given) is appended to it.
    base_endpoint: str = "https://www.boxofficemojo.com/year/world/"
    # Chart year. When it is not an int the base endpoint itself is scraped.
    year: int = None
    # Also keep the downloaded page as <output_dir>/html/<name>.html.
    save_raw: bool = False
    # Default for run(): write the parsed table to <output_dir>/<name>.csv.
    save: bool = False
    output_dir: str = "."
    # CSS selector used to locate the table inside the page.
    table_selector: str = '.imdb-scroll-table'
    # Seconds to wait for the server before the request is abandoned.
    timeout: float = 30.0
    # Results of the most recent parse/run. These are per-instance state, so
    # they are excluded from __init__, repr and equality.
    table_data: list = field(default_factory=list, init=False, repr=False, compare=False)
    table_header_names: list = field(default_factory=list, init=False, repr=False, compare=False)
    df: pd.DataFrame = field(default_factory=pd.DataFrame, init=False, repr=False, compare=False)

    @property
    def name(self):
        '''Stem used for output files: the year, or 'world' when no int year is set.'''
        return self.year if isinstance(self.year, int) else 'world'

    def get_endpoint(self):
        '''Returns the URL to scrape (base endpoint plus "<year>/" when a year is set).'''
        endpoint = self.base_endpoint
        if isinstance(self.year, int):
            endpoint = f"{endpoint}{self.year}/"
        return endpoint

    def get_output_dir(self):
        '''Returns the output directory as a pathlib.Path.'''
        return pathlib.Path(self.output_dir)

    def extract_html_str(self, endpoint=None):
        '''Downloads the page and returns a (html_text, status_code) tuple.

        html_text is None unless the status code is exactly 200. When save_raw
        is set, a successful download is also written to
        <output_dir>/html/<name>.html.
        '''
        url = endpoint if endpoint is not None else self.get_endpoint()
        # The context manager closes the streamed response even when the body
        # is never read (non-200), so the connection is not left open.
        with requests.get(url, stream=True, timeout=self.timeout) as r:
            status = r.status_code
            html_text = r.text if status == 200 else None
        if html_text is not None and self.save_raw:
            raw_output_dir = self.get_output_dir() / 'html'
            raw_output_dir.mkdir(exist_ok=True, parents=True)
            # Explicit UTF-8: the platform default encoding may not be able to
            # represent every character in the page.
            with open(raw_output_dir / f"{self.name}.html", 'w', encoding='utf-8') as f:
                f.write(html_text)
        return html_text, status

    def parse_html(self, html_str=''):
        '''Parses the first table matching table_selector.

        Returns (rows, header_names), where rows is a list of lists of cell
        text, or None when no matching table (or no row) is found. The parsed
        values are also stored on table_data / table_header_names.
        '''
        r_html = HTML(html=html_str)
        tables = r_html.find(self.table_selector)
        if not tables:
            return None
        rows = tables[0].find("tr")
        if not rows:
            return None
        # The first row holds the <th> header cells; the rest are data rows.
        header_names = [th.text for th in rows[0].find('th')]
        table_data = [[td.text for td in row.find("td")] for row in rows[1:]]
        self.table_data = table_data
        self.table_header_names = header_names
        return self.table_data, self.table_header_names

    def to_df(self, data=None, columns=None):
        '''Builds a DataFrame from row data and column names (both default to empty).'''
        data = [] if data is None else data
        columns = [] if columns is None else columns
        return pd.DataFrame(data, columns=columns)

    def run(self, save=False):
        '''Downloads, parses and returns the chart as a DataFrame.

        save: when left as False the instance-level ``save`` flag decides
            whether the CSV is written; any other value is used as given.

        Raises Exception when the endpoint does not answer with a 2xx status
        and ValueError when the page contains no table matching table_selector.
        '''
        save = self.save if save is False else save
        endpoint = self.get_endpoint()
        html_str, status = self.extract_html_str(endpoint=endpoint)
        if status not in range(200, 300):
            raise Exception(f"Extraction failed, endpoint status {status} at {endpoint}")
        parsed = self.parse_html(html_str if html_str is not None else '')
        if parsed is None:
            # parse_html signals "no table" with None; fail with a clear
            # message instead of an unpacking TypeError.
            raise ValueError(f"No table matching {self.table_selector!r} found at {endpoint}")
        data, headers = parsed
        self.df = self.to_df(data=data, columns=headers)
        if save:
            output_dir = self.get_output_dir()
            # The directory is otherwise only created as a side effect of save_raw.
            output_dir.mkdir(exist_ok=True, parents=True)
            self.df.to_csv(output_dir / f'{self.name}.csv', index=False)
        return self.df

In [19]:
# Source 2: OMDb
# NOTE(review): the API key is committed in plain text; consider loading it
# from an environment variable or an untracked config file instead.
API_KEY = "f3eb77a3"
# Query URL ending in "t=" so the encoded title can be appended directly
# (get_movie_data builds url + title + "&apikey=" + API_KEY).
URL = "http://www.omdbapi.com/?t="

def get_movie_data(url, movie, timeout=30):
    ''' Fetches the data the API holds for one movie title.

    url: query URL that ends right where the title belongs (e.g. "...?t=").
    movie: movie title as plain text.
    timeout: seconds to wait for the server, so that a single stalled request
        cannot hang a whole batch of lookups.

    Returns the decoded JSON body of the response.
    '''
    movie_link = process_movie_name(movie)
    complete_url = f"{url}{movie_link}&apikey={API_KEY}"
    response = requests.get(complete_url, timeout=timeout)
    return response.json()

def process_movie_name(movie):
    ''' Takes in the name of a movie and encodes it for use in the API query string.

    Runs of whitespace collapse to a single "+", and characters that have a
    meaning inside a URL (such as "&", "#", "?" or "+") are percent-encoded so
    that a title like "Fast & Furious" is not cut off at the "&".
    '''
    normalized_title = ' '.join(movie.split())
    return quote_plus(normalized_title)

# get the list of all movies in a year
# get data on all those movies
# save to a json
def get_year_movie_data(movie_titles, url, year):
    ''' Looks up every title and saves the responses to "MovieData<year>.json".

    movie_titles: iterable of movie titles to look up.
    url: query URL handed to get_movie_data for every title.
    year: year used in the output file name; may be an int or a string.

    The file is written to the current working directory as a JSON list with
    one entry per title, in the order the titles were given.
    '''
    data_list = [get_movie_data(url, movie) for movie in movie_titles]
    # The f-string accepts both 2010 and "2010"; string concatenation would
    # raise TypeError for an int year.
    with open(f"MovieData{year}.json", 'w', encoding='utf-8') as json_file:
        json.dump(data_list, json_file, indent=4)

# year = 2010
# while year < 2024:
#     scrapper = ScrapeBoxOffice(year=year, save=True, save_raw=True, output_dir='data')
#     df_box = scrapper.run()
#     movies_year = df_box["Release Group"].tolist()
#     MovieDataCollection.get_year_movie_data(movies_year, "http://www.omdbapi.com/?t=", str(year))
#     year = year + 1

In [None]:
# scrape data for all years
# NOTE: only the 2011 scrape is active; the other years below are commented out.
# Running this cell requests the 2011 chart over the network and, because
# save and save_raw are set, writes data/2011.csv and data/html/2011.html.
scraper_2011 = ScrapeBoxOffice(year=2011, save=True, save_raw=True, output_dir='data')
df_2011 = scraper_2011.run()

# scraper_2012 = ScrapeBoxOffice(year=2012, save=True, save_raw=True, output_dir='data')
# df_2012 = scraper_2012.run()

# scraper_2013 = ScrapeBoxOffice(year=2013, save=True, save_raw=True, output_dir='data')
# df_2013 = scraper_2013.run()

# scraper_2014 = ScrapeBoxOffice(year=2014, save=True, save_raw=True, output_dir='data')
# df_2014 = scraper_2014.run()

# scraper_2015 = ScrapeBoxOffice(year=2015, save=True, save_raw=True, output_dir='data')
# df_2015 = scraper_2015.run()

# scraper_2016 = ScrapeBoxOffice(year=2016, save=True, save_raw=True, output_dir='data')
# df_2016 = scraper_2016.run()

# scraper_2017 = ScrapeBoxOffice(year=2017, save=True, save_raw=True, output_dir='data')
# df_2017 = scraper_2017.run()

# scraper_2018 = ScrapeBoxOffice(year=2018, save=True, save_raw=True, output_dir='data')
# df_2018 = scraper_2018.run()

# scraper_2019 = ScrapeBoxOffice(year=2019, save=True, save_raw=True, output_dir='data')
# df_2019 = scraper_2019.run()

# scraper_2020 = ScrapeBoxOffice(year=2020, save=True, save_raw=True, output_dir='data')
# df_2020 = scraper_2020.run()

# scraper_2021 = ScrapeBoxOffice(year=2021, save=True, save_raw=True, output_dir='data')
# df_2021 = scraper_2021.run()

# scraper_2022 = ScrapeBoxOffice(year=2022, save=True, save_raw=True, output_dir='data')
# df_2022 = scraper_2022.run()

# scraper_2023 = ScrapeBoxOffice(year=2023, save=True, save_raw=True, output_dir='data')
# df_2023 = scraper_2023.run()

In [33]:
# Merge box office data and movie data
def merge_data(year, box_df):
    '''Merges box office data and movie data for one year.

    year: year of the "MovieData<year>.json" file to load from the current
        working directory; may be an int or a string. The value is also stored
        unchanged in the "Year" column of the result.
    box_df: box office dataframe whose "Release Group" column holds the titles.
        It is not modified.

    Returns the inner join of the two sources on "Title", so only movies
    present in both remain.
    '''
    # The f-string accepts both 2010 and "2010"; string concatenation would
    # raise TypeError for an int year.
    file_path_movie = f"MovieData{year}.json"
    df_movie_data = pd.read_json(file_path_movie)
    # Align the join key name with the "Title" field of the movie data.
    box_df_bet = box_df.rename(columns={"Release Group": 'Title'})
    master_df = pd.merge(df_movie_data, box_df_bet, on="Title", how="inner")
    master_df["Year"] = year
    return master_df

# master_2010 = merge_data("2010", df_2010)
# master_2011 = merge_data("2011", df_2011)
# master_2012 = merge_data("2012", df_2012)
# master_2013 = merge_data("2013", df_2013)
# master_2014 = merge_data("2014", df_2014)
# master_2015 = merge_data("2015", df_2015)
# master_2016 = merge_data("2016", df_2016)
# master_2017 = merge_data("2017", df_2017)
# master_2018 = merge_data("2018", df_2018)
# master_2019 = merge_data("2019", df_2019)
# master_2020 = merge_data("2020", df_2020)
# master_2021 = merge_data("2021", df_2021)
# master_2022 = merge_data("2022", df_2022)
# master_2023 = merge_data("2023", df_2023)

In [None]:
# creating one big dataframe
# dataframes = [master_2010, master_2011, master_2012, master_2013, master_2014, master_2015, master_2016, master_2017,
#               master_2018, master_2019, master_2020, master_2021, master_2022, master_2023]

def merge_dataframes(dfs):
    '''Stacks the given dataframes on top of each other with a fresh 0..n-1 index.'''
    combined = pd.concat(objs=dfs, ignore_index=True)
    return combined

# all_data = merge_dataframes(dataframes)

In [None]:
# cleaning the data
def clean_box_office(df):
    '''Cleans the box office sales columns and adds per-million versions of them.

    Rows whose Domestic value is "-" and rows with a missing Worldwide,
    Domestic or Foreign value are dropped. Dollar signs and commas are removed
    from all three columns. Worldwide is converted to integers; Domestic and
    Foreign stay as cleaned strings (Foreign may still contain "-").

    Three numeric columns are added: Worldwide_millions, Domestic_millions and
    Foreign_millions. Foreign values that are not numeric become NaN there.

    The input dataframe is not modified; a new dataframe is returned.
    '''
    money_columns = ["Worldwide", "Domestic", "Foreign"]
    # Work on an explicit copy: assigning columns on a filtered slice triggers
    # pandas' chained-assignment warning.
    df = df[df["Domestic"] != "-"].dropna(subset=money_columns).copy()
    for column in money_columns:
        df[column] = (
            df[column]
            .astype(str)
            .str.replace("$", "", regex=False)
            .str.replace(",", "", regex=False)
        )
    # Explicit 64-bit integers: worldwide grosses can exceed 2**31 - 1, and the
    # platform default int may be only 32 bits wide.
    df["Worldwide"] = df["Worldwide"].astype("int64")
    # Creating new columns because the raw numbers are too large to process
    df["Worldwide_millions"] = pd.to_numeric(df["Worldwide"]) / 1000000
    df["Domestic_millions"] = pd.to_numeric(df["Domestic"]) / 1000000
    df["Foreign_millions"] = pd.to_numeric(df["Foreign"], errors="coerce") / 1000000
    return df

#cleaned_df = clean_box_office(all_data)

## Part 3:
(2\%) Builds at least two visualizations (graphs/plots) from the data which help to understand or answer the questions of interest. These visualizations will be graded based on how much information they can effectively communicate to readers. Please make sure your visualizations are sufficiently distinct from each other.