In [1]:
import pandas as pd

In [4]:
# Read the movie-plot dataset from the project's data directory.
df = pd.read_csv(filepath_or_buffer='../data/wiki_movie_plots_deduped.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [None]:
# Filter for American movies released in or after 2010.
# `.copy()` makes the result an independent DataFrame: later cells add columns
# to it, and doing that on a bare boolean-mask selection of `df` raises
# SettingWithCopyWarning and leaves it ambiguous which object is modified.
df_american_2010 = df.loc[
    (df['Release Year'] >= 2010)
    &
    (df['Origin/Ethnicity'] == 'American')
].copy()

# Show the filtered DataFrame
df_american_2010.head()


In [None]:
import requests
from bs4 import BeautifulSoup
import numpy as np

def _infobox_value(container, label):
    """Return the text of the infobox data cell that follows the header `label`.

    Looks inside `container` for a `<th class="infobox-label">` whose string is
    exactly `label`, then returns the text of the next
    `<td class="infobox-data">`. Returns `np.nan` when either element is missing.
    """
    header = container.find('th', class_='infobox-label', string=label)
    if header is None:
        return np.nan
    cell = header.find_next('td', class_='infobox-data')
    return cell.text if cell is not None else np.nan


def get_budget_box_office(url, timeout=10):
    """Scrape the raw "Budget" and "Box office" infobox texts from a Wikipedia page.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10). Without a
        timeout a single stalled connection would block the whole scrape.

    Returns
    -------
    tuple
        `(budget_value, box_office_value)`. Each element is the unparsed cell
        text, or `np.nan` when the URL is missing, the request fails, the
        response status is not 200, or the field is absent from the page.
    """
    # Missing URLs (NaN/None/empty) cannot be fetched.
    if not isinstance(url, str) or not url:
        return np.nan, np.nan

    # Make a request to the Wikipedia page; a network failure for one movie
    # must not abort the scrape of all the others.
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return np.nan, np.nan
    if r.status_code != 200:
        return np.nan, np.nan

    # Parse the page content
    soup = BeautifulSoup(r.content, 'html.parser')

    # Prefer the infobox table itself: the first <table> on a page is not
    # necessarily the infobox. Fall back to the whole document if no table
    # carries the "infobox" class.
    infobox = soup.find('table', class_='infobox')
    if infobox is None:
        infobox = soup

    budget_value = _infobox_value(infobox, "Budget")
    box_office_value = _infobox_value(infobox, "Box office")

    return budget_value, box_office_value

# Apply the get_budget_box_office function to the 'Wiki Page' column.
# Each call returns a (budget, box office) pair; expanding the pairs into an
# index-aligned DataFrame (instead of `zip(*...)` unpacking) also works when
# the filtered frame is empty.
scraped_pairs = df_american_2010['Wiki Page'].apply(get_budget_box_office)
scraped = pd.DataFrame(
    scraped_pairs.tolist(),
    index=scraped_pairs.index,
    columns=['Budget', 'Box Office'],
)

# `assign` builds a new DataFrame rather than writing into a filtered slice,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
df_american_2010 = df_american_2010.assign(
    **{column: scraped[column] for column in scraped.columns}
)

# Show the updated DataFrame
df_american_2010.head()


In [None]:
import re

# Define regex pattern to handle monetary ranges and scales.
# A number must start with a digit, so stray punctuation such as "." or ","
# can never be captured as a value on its own.
_NUMBER = r'\d[\d,]*(?:\.\d+)?'
pattern = (
    r'\$?(' + _NUMBER + r')\s?'              # low (or only) amount
    r'(?:[–-]\s?\$?(' + _NUMBER + r'))?'     # optional upper bound after an en dash or hyphen
    r'\s?(million|billion)?'                 # optional scale word
)

# Multipliers for the scale words recognised by `pattern`.
_SCALE_FACTORS = {'million': 1_000_000, 'billion': 1_000_000_000}


def _to_number(digits):
    """Convert a captured amount such as '1,234.5' to a float."""
    return float(digits.replace(',', ''))


def extract_value(text):
    """Parse the first monetary amount found in `text` into a plain number.

    Parenthesised content and bracketed references (e.g. "[1]") are removed
    before matching. A range such as "$40–$45 million" yields the average of
    its two bounds. A trailing "million"/"billion" (any letter case) scales
    the result accordingly.

    Parameters
    ----------
    text : str
        Raw cell text. Any non-string value (for example NaN for a missing
        cell) is treated as "no value".

    Returns
    -------
    float or None
        The first amount in `text`, or None when `text` is not a string or
        contains no number.
    """
    # Missing cells arrive as NaN (a float); there is nothing to parse.
    if not isinstance(text, str):
        return None

    # Remove parentheses and content within, as well as references like [1]
    text = re.sub(r'\(.*?\)|\[.*?\]', '', text)

    # Only the first amount is reported, so search for just that one instead
    # of converting every match in the string.
    match = re.search(pattern, text, flags=re.IGNORECASE)
    if match is None:
        return None  # Return None if no matches

    low_str, high_str, scale = match.groups()

    number = _to_number(low_str)
    # Handle ranges like $40–$45 by taking the average of the range
    if high_str:
        number = (number + _to_number(high_str)) / 2

    # Convert to appropriate scale
    if scale:
        number *= _SCALE_FACTORS[scale.lower()]

    return number

# Apply the cleaning function to the 'Budget' and 'Box Office' columns
def _to_millions(raw_values):
    """Parse a column of raw money strings into numbers expressed in millions.

    Missing cells are skipped (`na_action='ignore'`) so `extract_value` only
    ever receives real values. `pd.to_numeric` then turns any `None` results
    into NaN, so the division is always on a numeric column, even when
    nothing could be parsed.
    """
    parsed = raw_values.map(extract_value, na_action='ignore')
    return pd.to_numeric(parsed, errors='coerce') / 1_000_000  # In millions


# `assign` returns a new DataFrame instead of writing into a filtered slice,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
df_american_2010 = df_american_2010.assign(
    Clean_Budget=_to_millions(df_american_2010['Budget']),
    Clean_Box_Office=_to_millions(df_american_2010['Box Office']),
)

# Show the DataFrame with cleaned values
df_american_2010.head()
