# Project 3: Webscraping Data Analysis
## Makayla Marshall
## 4/11/2023

### Imports:

In [1]:
# webscraping imports
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

### Request:

In [2]:
def get_response(url, timeout=30):
    """Download *url* and return its HTML parsed as a BeautifulSoup document.

    Parameters:
        url: address of the page to fetch.
        timeout: seconds requests may wait on the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The parsed page when the server answers with status code 200.
        For any other status the code is printed and the string 'Oops' is
        returned instead, so callers have to check for that value before
        treating the result as a parsed page.
    """
    response = requests.get(url, timeout=timeout)
    status = response.status_code
    if status != 200:
        print(f"Oops! Received status code {status}")
        return 'Oops'

    # Name the parser explicitly: when none is given BeautifulSoup picks
    # whichever parser happens to be installed, and different parsers can
    # build different trees from the same markup.
    return bs(response.text, 'html.parser')

### Get Info:

In [3]:
def get_world_list(soup):
    """Build one record per movie row of the first table found in *soup*.

    Parameters:
        soup: parsed page offering ``find`` (BeautifulSoup style).

    Returns:
        A list of dicts with the keys 'movie', 'international-profit',
        'domestic-profit' and 'foreign-profit'. The three profit values are
        the text of the row's first three money cells, in page order; a
        value is None when the row has fewer than three such cells.
        The list is empty when the page contains no table.
    """
    mov_list = []
    mov_table = soup.find('table')
    if mov_table is None:
        # nothing to scrape on this page
        return mov_list

    # Ask for the <tr> rows explicitly instead of looping over the table's
    # raw children: those can also be whitespace text nodes or a <tbody>
    # wrapper, neither of which is a single row.
    for row in mov_table.find_all('tr'):
        # get name of movie:
        movie = row.find('td', class_='a-text-left mojo-field-type-release_group')
        if movie is None:
            # rows without a movie cell (e.g. the label row) carry no data
            continue

        # get the 3 different profit types (international, domestic, foreign):
        money = [cell.text for cell in row.find_all('td', class_='a-text-right mojo-field-type-money')]
        # pad short rows so the lookups below cannot raise IndexError
        money += [None] * (3 - len(money))

        # save it to the soon-to-be-data-frame:
        mov_list.append({
            'movie': movie.text,
            'international-profit': money[0],
            'domestic-profit': money[1],
            'foreign-profit': money[2],
        })
    return mov_list

In [4]:
def get_dom_list(soup):
    """Build one record per movie row of the first table found in *soup*.

    Parameters:
        soup: parsed page offering ``find`` (BeautifulSoup style).

    Returns:
        A list of dicts with the keys 'movie', 'date' and 'gross', each
        holding the text of the matching cell in that row. 'date' or
        'gross' is None when the row has no such cell. The list is empty
        when the page contains no table.
    """
    mov_list = []
    mov_table = soup.find('table')
    if mov_table is None:
        # nothing to scrape on this page
        return mov_list

    # Ask for the <tr> rows explicitly instead of looping over the table's
    # raw children: those can also be whitespace text nodes or a <tbody>
    # wrapper, neither of which is a single row.
    for row in mov_table.find_all('tr'):
        # get name of movie:
        movie = row.find('td', class_='a-text-left mojo-field-type-release mojo-cell-wide')
        if movie is None:
            # rows without a movie cell (e.g. the label row) carry no data
            continue

        gross = row.find('td', class_='a-text-right mojo-field-type-money')
        date = row.find('td', class_='a-text-left mojo-header-column mojo-truncate mojo-field-type-date_interval mojo-sort-column')

        # save it to the soon-to-be-data-frame; a missing cell becomes None
        # instead of raising AttributeError on `.text`
        mov_list.append({
            'movie': movie.text,
            'date': date.text if date is not None else None,
            'gross': gross.text if gross is not None else None,
        })
    return mov_list

In [5]:
def get_dom2_list(soup):
    """Build one record per movie row of the first table found in *soup*.

    Looks up the gross cell by the 'mojo-estimatable' money class and the
    date cell by the 'named_interval' class.

    Parameters:
        soup: parsed page offering ``find`` (BeautifulSoup style).

    Returns:
        A list of dicts with the keys 'movie', 'date' and 'gross', each
        holding the text of the matching cell in that row. 'date' or
        'gross' is None when the row has no such cell. The list is empty
        when the page contains no table.
    """
    mov_list = []
    mov_table = soup.find('table')
    if mov_table is None:
        # nothing to scrape on this page
        return mov_list

    # Ask for the <tr> rows explicitly instead of looping over the table's
    # raw children: those can also be whitespace text nodes or a <tbody>
    # wrapper, neither of which is a single row.
    for row in mov_table.find_all('tr'):
        # get name of movie:
        movie = row.find('td', class_='a-text-left mojo-field-type-release mojo-cell-wide')
        if movie is None:
            # rows without a movie cell (e.g. the label row) carry no data
            continue

        gross = row.find('td', class_='a-text-right mojo-field-type-money mojo-estimatable')
        date = row.find('td', class_='a-text-left mojo-header-column mojo-truncate mojo-field-type-named_interval mojo-sort-column')

        # save it to the soon-to-be-data-frame; a missing cell becomes None
        # instead of raising AttributeError on `.text`
        mov_list.append({
            'movie': movie.text,
            'date': date.text if date is not None else None,
            'gross': gross.text if gross is not None else None,
        })
    return mov_list

In [6]:
def get_dom3_list(soup):
    """Build one record per movie row of the first table found in *soup*.

    Looks up the date cell by the 'year' class and stores the results
    under year-oriented keys.

    Parameters:
        soup: parsed page offering ``find`` (BeautifulSoup style).

    Returns:
        A list of dicts with the keys '#1 movie', 'year' and 'total gross',
        each holding the text of the matching cell in that row. 'year' or
        'total gross' is None when the row has no such cell. The list is
        empty when the page contains no table.
    """
    mov_list = []
    mov_table = soup.find('table')
    if mov_table is None:
        # nothing to scrape on this page
        return mov_list

    # Ask for the <tr> rows explicitly instead of looping over the table's
    # raw children: those can also be whitespace text nodes or a <tbody>
    # wrapper, neither of which is a single row.
    for row in mov_table.find_all('tr'):
        # get name of movie:
        movie = row.find('td', class_='a-text-left mojo-field-type-release mojo-cell-wide')
        if movie is None:
            # rows without a movie cell (e.g. the label row) carry no data
            continue

        gross = row.find('td', class_='a-text-right mojo-field-type-money')
        date = row.find('td', class_='a-text-left mojo-header-column mojo-truncate mojo-field-type-year mojo-sort-column')

        # save it to the soon-to-be-data-frame; a missing cell becomes None
        # instead of raising AttributeError on `.text`
        mov_list.append({
            '#1 movie': movie.text,
            'year': date.text if date is not None else None,
            'total gross': gross.text if gross is not None else None,
        })
    return mov_list

## Convert the Info Lists into DataFrames:

In [7]:
# save space: fetch a page, parse it and tabulate it in one call
def retrieve_frame(url, second_phase):
    """Download *url*, extract its rows with *second_phase*, return a DataFrame.

    Parameters:
        url: address of the page to scrape; handed to get_response.
        second_phase: function that takes the parsed page and returns a
            list of row dicts (one of the get_*_list functions).

    Returns:
        A pandas DataFrame built from the rows *second_phase* produced.

    Raises:
        RuntimeError: when get_response returned its failure string
            instead of a parsed page.
    """
    soup = get_response(url)
    if isinstance(soup, str):
        # get_response signals a failed request by returning a plain string.
        # Feeding that to a parser fails deep inside it with an unrelated
        # TypeError, so stop here with a message that names the page.
        raise RuntimeError(f"Could not retrieve {url}: no page to parse")
    return pd.DataFrame(second_phase(soup))

In [8]:
# build one dataframe per Box Office Mojo page, pairing each url with the
# parser written for that page's table layout
Worldwide2023 = retrieve_frame(
    url='https://www.boxofficemojo.com/year/world/?ref_=bo_nb_hm_tab',
    second_phase=get_world_list,
)
DomesticDaily2023 = retrieve_frame(
    url='https://www.boxofficemojo.com/date/?ref_=bo_nb_wey_secondarytab',
    second_phase=get_dom_list,
)
DomesticWeekly2023 = retrieve_frame(
    url='https://www.boxofficemojo.com/weekly/?ref_=bo_nb_di_secondarytab',
    second_phase=get_dom_list,
)
DomesticMonthly2023 = retrieve_frame(
    url='https://www.boxofficemojo.com/month/by-year/?grossesOption=calendarGrosses',
    second_phase=get_dom2_list,
)
DomesticYearly = retrieve_frame(
    url='https://www.boxofficemojo.com/year/?ref_=bo_nb_di_secondarytab',
    second_phase=get_dom3_list,
)

## Save each DataFrame as a CSV file for use in the analysis part

In [9]:
def make_csv(file, csv):
    """Write the dataframe *file* out as CSV to the destination *csv*.

    Parameters:
        file: object with a ``to_csv`` method (a pandas DataFrame here).
        csv: destination passed straight to ``to_csv``; the calls in this
            file pass file names.

    Returns:
        None. ``to_csv`` runs with its default options.
    """
    file.to_csv(path_or_buf=csv)

In [10]:
# write every dataframe to its own csv file, named after the dataframe
make_csv(file=Worldwide2023, csv='Worldwide2023.csv')
make_csv(file=DomesticDaily2023, csv='DomesticDaily2023.csv')
make_csv(file=DomesticWeekly2023, csv='DomesticWeekly2023.csv')
make_csv(file=DomesticMonthly2023, csv='DomesticMonthly2023.csv')
make_csv(file=DomesticYearly, csv='DomesticYearly.csv')