#### Question 1: Scraping Best Picture Data from Wikipedia

* Scrape the Best Picture Wikipedia page.  
    * Extract for each movie:  
        * Year. 
        * Film Title. 
        * Winner (Yes/No). 
* Data cleaning tips:  
    * Ensure that year and film title columns are clean and consistent (no footnotes, parentheses, etc.).  
    * Save the results as best_picture.csv.  


In [23]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from pathlib import Path 

In [26]:
# Page that lists every Best Picture nominee and winner
URL = 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture'

# Send an explicit User-Agent so the request identifies this script instead of
# using the default client string
headers = {
    "User-Agent": "Movie_Agent"
}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as if it were the article
response = requests.get(URL, headers=headers, timeout=30)
response.raise_for_status()

# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser
# happens to be installed (and warns about it), so results can differ between
# machines
soup = BeautifulSoup(response.text, 'html.parser')

In [25]:
# Find Best Picture winners by searching for table rows with the yellow
# (#FAEB86) background. The style is matched with a regular expression so that
# differences in spacing or letter case in the inline style do not hide a winner.
winners = soup.find_all(
    'tr',
    attrs={'style': re.compile(r'background(?:-color)?\s*:\s*#FAEB86', re.IGNORECASE)},
)

# The film title is the text of the first 'td' cell of each winner row.
# Footnote markers such as "[a]" are removed, and a highlighted row without any
# 'td' cell is skipped rather than raising AttributeError.
winning_titles = [
    re.sub(r'\[[^\]]*\]', '', winner.td.get_text()).strip()
    for winner in winners
    if winner.td is not None
]

In [5]:
# Get all wikitables (find_all is the current name of the deprecated findAll)
all_tables = soup.find_all('table', attrs={'class': 'wikitable'})

# Keep just the tables with movie data by filtering on the 'Year of Film Release' column header.
# This excludes the 2 wikitables at the very bottom of the page ('Age superlatives' and
# 'Production companies and distributors with multiple nominations and wins').
movie_tables = []
for table in all_tables:
    header_row = table.find('tr')
    # A table without any row cannot be a movie table; skip it instead of
    # failing on header_row being None
    if header_row is not None and 'Year of Film Release' in header_row.get_text():
        movie_tables.append(table)

In [36]:
# Build one dictionary per nominated film, e.g.
# [{'Title': 'Wings', 'Awards_Year': '1928', 'Winner': 'Yes'}]
movie_info = []

# Footnote markers such as "[a]" or "[12]" that end up in the cell text
_FOOTNOTE = re.compile(r'\[[^\]]*\]')
# A year label: a four-digit year, optionally followed by "/28" or "/2000"
_YEAR_LABEL = re.compile(r'\d{4}(?:/\d{2,4})?')
# Inline style of a winner row, tested after whitespace and case are removed
_WINNER_STYLE = re.compile(r'background(?:-color)?:#faeb86')


def _clean_cell_text(text):
    """Return *text* without footnote markers and surrounding whitespace."""
    return _FOOTNOTE.sub('', text).strip()


def _parse_awards_year(year_cell):
    """Return the year shown in a row's 'th' cell as a string.

    The text of the first link in the cell is tried first and the whole cell
    text second, so a year that is not hyperlinked is still found. For a split
    label only the later year is returned: "1927/28" gives "1928" and
    "1999/2000" gives "2000". If no year-like text is found, the cleaned text
    of the first candidate is returned unchanged.
    """
    candidates = []
    if year_cell.a is not None:
        candidates.append(year_cell.a.get_text())
    candidates.append(year_cell.get_text())

    for text in candidates:
        match = _YEAR_LABEL.search(text)
        if match is None:
            continue
        first_year, _, later_part = match.group(0).partition('/')
        # Keep as many leading digits of the first year as the later part is
        # missing; with no slash later_part is empty and the year is unchanged
        return first_year[:len(first_year) - len(later_part)] + later_part

    return _clean_cell_text(candidates[0])


def _is_winner(row):
    """Return True if the row's inline style sets the yellow winner background."""
    style = ''.join(row.get('style', '').split()).lower()
    return _WINNER_STYLE.search(style) is not None


# Iterate through all tables to extract movie data
for table in movie_tables:

    # Reset for every table so a year never leaks in from the previous table
    # and the name is bound even if the first data row has no year cell
    awards_year = ''

    # Skip the first 'tr' (table row): it only contains the column headers
    for row in table.find_all('tr')[1:]:

        # A 'th' (table header) cell starts a new awards year; rows without
        # one keep the year seen most recently
        year_cell = row.find('th')
        if year_cell is not None:
            awards_year = _parse_awards_year(year_cell)

        # The movie title is the text of the first 'td' cell, if there is one
        title_cell = row.find('td')
        title = _clean_cell_text(title_cell.get_text()) if title_cell is not None else ''

        # Only rows that actually carry a title describe a film
        if title:
            movie_info.append({
                'Title': title,
                'Awards_Year': awards_year,
                'Winner': 'Yes' if _is_winner(row) else 'No',
            })

In [38]:
# Convert movie_info to a pandas DataFrame. The columns are listed explicitly
# so their order is fixed and the frame still has its three columns (and the
# CSV written from it still has a header) if the scrape returned no rows.
movie_info_df = pd.DataFrame(movie_info, columns=['Title', 'Awards_Year', 'Winner'])
# Bare expression: displays the frame when this runs as a notebook cell
movie_info_df

Unnamed: 0,Title,Awards_Year,Winner
0,Wings,1928,Yes
1,7th Heaven,1928,No
2,The Racket,1928,No
3,The Broadway Melody,1929,Yes
4,Alibi,1929,No
...,...,...,...
606,Emilia Pérez,2024,No
607,I'm Still Here,2024,No
608,Nickel Boys,2024,No
609,The Substance,2024,No


In [39]:
# Save the scraped table as best_picture.csv in the data folder, creating that
# folder (and any missing parent folders) first so it exists before the write
filepath = Path('../data/best_picture.csv')
output_dir = filepath.parent
output_dir.mkdir(exist_ok=True, parents=True)
# NOTE(review): to_csv runs with its defaults, so the DataFrame index is
# written as an unnamed first column - confirm that this is wanted
movie_info_df.to_csv(path_or_buf=filepath)