# Part 1: Data Gathering
<b>Scrape the Best Picture wikipedia page.</b>
<br>
<br>
Extract for each year:
- Year
- Film Title
- Winner (Yes/No)

Data cleaning tips:
- Ensure that the year and film title columns are clean and consistent (no footnotes, parentheses, etc.).

Save the results as best_picture.csv.

In [1]:
# IMPORT BUILT-IN LIBRARIES
import re
import requests

# IMPORT 3RD-PARTY LIBRARIES
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
from IPython.core.display import HTML

In [2]:
# Scrape the Best Picture wikipedia page.
url = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture"

# BROWSER-LIKE USER-AGENT SENT WITH THE REQUEST
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"
}

# SEND REQUEST
# THE TIMEOUT (SECONDS) KEEPS THE CELL FROM HANGING INDEFINITELY IF THE SERVER NEVER RESPONDS
response = requests.get(url=url, headers=headers, timeout=30)

# FAIL HERE ON AN HTTP ERROR STATUS INSTEAD OF PARSING AN ERROR PAGE IN LATER CELLS
response.raise_for_status()

# DISPLAY THE STATUS CODE AS THE CELL OUTPUT
response.status_code

200

In [3]:
# PARSE THE DOWNLOADED PAGE TEXT INTO A BEAUTIFUL SOUP TREE USING THE "html.parser" BACKEND
page_html = response.text
soup = BeautifulSoup(page_html, "html.parser")

In [4]:
# MAP EACH YEAR (AS A STRING) TO THE LIST OF FILM TITLES FOUND UNDER THAT YEAR
movies = {}

# MATCHES ANCHOR TITLES SUCH AS "1928 in film" AND CAPTURES THE 4-DIGIT YEAR
year_pattern = re.compile(r"(\d{4}) in film")

# FILTER ALL THE TABLE ELEMENTS
table_list = soup.find_all(name="table", attrs={"class": ['wikitable', 'sortable', 'sticky-header']})

# YEAR TAKEN FROM THE MOST RECENT YEAR HEADER; None UNTIL THE FIRST ONE IS SEEN,
# SO A FILM CELL CAN NEVER BE FILED UNDER AN UNDEFINED OR LEFTOVER YEAR
year = None

# LOOP THROUGH THE TABLES, IGNORING THE LAST 2
# (SLICING SKIPS THEM BY POSITION AND ALSO COPES WITH FEWER THAN 2 TABLES)
for table in table_list[:-2]:

    # LOOP THROUGH THE TABLE ROW ELEMENTS
    for row in table.find_all(name="tr"):

        # LOOP THROUGH THE ROW CHILD ELEMENTS
        for child in row.children:

            # A TABLE HEADER ELEMENT WITH AN ANCHOR TAG MAY BE A YEAR HEADER
            if child.name == "th" and child.a:

                # ANCHORS WITHOUT A TITLE ATTRIBUTE ARE TREATED AS A NON-MATCH
                match_result = year_pattern.match(child.a.attrs.get("title", ""))

                if match_result:
                    year = match_result.group(1)

                    # setdefault KEEPS FILMS ALREADY COLLECTED IF THE SAME YEAR APPEARS AGAIN
                    movies.setdefault(year, [])

            # A TABLE DATA ELEMENT WITH AN I TAG IS TREATED AS A FILM TITLE CELL
            elif child.name == "td" and child.i and year is not None:

                # APPEND THE CELL TEXT TO THE ASSOCIATED YEAR IN THE MOVIES DICT
                movies[year].append(child.text.strip())

# CREATE A DATAFRAME FROM THE DICT: ONE ROW PER (YEAR, FILM TITLE) PAIR
movies_df = (
    pd.DataFrame(data=list(movies.items()), columns=['Year', 'Film Title'])
    .explode("Film Title")
    .reset_index(drop=True)
)

In [5]:
# FILTER YELLOW TABLE ROWS
# NOTE: ONLY ROWS WHOSE STYLE ATTRIBUTE IS EXACTLY "background:#FAEB86" ARE MATCHED
winner_row_list = soup.find_all(name="tr", attrs={"style": "background:#FAEB86"})

# BUILD THE WINNER LIST FROM THE TEXT OF THE FIRST <td> IN EACH YELLOW ROW
# ROWS WITHOUT ANY <td> ARE SKIPPED: row.td IS None THERE AND .text WOULD RAISE AttributeError
winner_list = [
    row.td.text.strip()
    for row in winner_row_list
    if row.td is not None
]

In [6]:
# ADD WINNER COLUMN TO DATAFRAME (AS THE FIRST COLUMN, INITIALLY EMPTY STRINGS)
# THE GUARD KEEPS THIS CELL RE-RUNNABLE: insert() RAISES ValueError IF THE COLUMN ALREADY EXISTS
if "Winner" not in movies_df.columns:
    movies_df.insert(loc=0, column="Winner", value="")

In [7]:
# UPDATE WINNER COLUMN
# TRUE WHERE THE FILM TITLE APPEARS ANYWHERE IN THE WINNER LIST
title_matches = movies_df['Film Title'].isin(winner_list)

# A TITLE-ONLY LOOKUP ALSO FLAGS A LOSING NOMINEE THAT SHARES ITS TITLE WITH ANOTHER
# YEAR'S WINNER (E.G. A REMAKE), SO ONLY THE FIRST MATCHING TITLE IN EACH YEAR COUNTS
# NOTE(review): ASSUMES ONE WINNER PER YEAR, LISTED BEFORE ANY SAME-TITLED NOMINEE - CONFIRM
first_match_in_year = (
    title_matches.astype(int)
    .groupby(movies_df['Year'])
    .cumsum()
    .eq(1)
)

# WRITE "Yes"/"No" FOR EVERY ROW IN ONE VECTORIZED ASSIGNMENT
movies_df['Winner'] = (title_matches & first_match_in_year).map({True: "Yes", False: "No"})

movies_df

Unnamed: 0,Winner,Year,Film Title
0,Yes,1928,Wings
1,No,1928,7th Heaven
2,No,1928,The Racket
3,Yes,1929,The Broadway Melody
4,No,1929,Alibi
...,...,...,...
606,No,2024,Emilia Pérez
607,No,2024,I'm Still Here
608,No,2024,Nickel Boys
609,No,2024,The Substance


In [8]:
# WRITE THE DATAFRAME TO A CSV FILE
# (WITH THE DEFAULT SETTINGS THE ROW INDEX IS WRITTEN AS THE FIRST COLUMN)
movies_df.to_csv(path_or_buf="../data/best_pictures.csv")