#### Bonus 1: Scraping Wikipedia for Best Actor and Best Actress Data

* Scrape the following Wikipedia pages:
    * Best Actor: https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor
    * Best Actress: https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actress
* Each page contains tables of winners and nominees by year.
* Extract the following columns:
    * Year
    * Actor/Actress Name
    * Film Title
    * Winner (Yes/No)
* Data cleaning tips:
    * Remove footnote markers from names and movie titles.
    * Ensure that you save just the release year (e.g., 2009 instead of 2009 (82nd)).
    * Store the cleaned data as two CSV files:
        * best_actor.csv
        * best_actress.csv

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from pathlib import Path 

In [26]:
# Scrape either the 'Best Actor' or 'Best Actress' Wikipedia page and store the
# nominees/winners it lists in a csv file.

# Leading four-digit year, optionally followed by '/YY' (ex. '1927/28')
_YEAR_RE = re.compile(r'(\d{2})(\d{2})(?:/(\d{2}))?')
# Bracketed footnote markers, ex. '[a]' or '[12]'
_FOOTNOTE_RE = re.compile(r'\[[^\]]*\]')
# Parenthesised remarks that follow a name
_PAREN_RE = re.compile(r'\([^)]*\)')
# Characters trimmed from both ends of a cleaned actor name
_NAME_EDGE_CHARS = '§†‡ \t\r\n\xa0'


def _clean_year(raw_year):
    """Return just the four-digit year from a year cell (ex. '2009 (82nd)' -> '2009')."""
    text = raw_year.strip()
    found = _YEAR_RE.match(text)
    if found is None:
        # Unrecognised layout: keep the text rather than crash or invent a year
        return text
    century, first_year, later_year = found.groups()
    # For split years such as '1927/28', keep the later one ('1928')
    return century + (later_year or first_year)


def _strip_footnotes(raw_text):
    """Return the text with every bracketed footnote marker removed."""
    return _FOOTNOTE_RE.sub('', raw_text).strip()


def _clean_actor_name(raw_name):
    """Return a name without footnote markers, parenthesised remarks or marker symbols."""
    name = _FOOTNOTE_RE.sub('', raw_name)
    name = _PAREN_RE.sub('', name)
    return name.strip(_NAME_EDGE_CHARS)


def get_actor_info(url, csv_name):
    """Scrape an Academy Award acting page and write its nominees to a csv file.

    Every table with the 'wikitable' class whose first row mentions 'Role(s)'
    is read row by row. One record is stored per row that has data cells,
    with the columns Year, Actor_Name, Film_Title, Winner and Nominated.
    The year and the actor name are carried over from the previous row when a
    row does not supply its own (they stay empty if none has been seen yet).

    Args:
        url: Address of the page to scrape.
        csv_name: File name of the csv, created inside '../data/'.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    headers = {
        "User-Agent": "Actor_Agent"
    }

    # Fail loudly on a hung connection or an error page instead of parsing it
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    soup = BeautifulSoup(response.text, 'html.parser')

    # Get all wikitables
    all_tables = soup.find_all('table', attrs={'class': 'wikitable'})

    # Keep just the tables whose header row has the 'Role(s)' column
    # This excludes the tables at the bottom of the page in the 'Multiple awards and nominations' section
    actor_tables = []
    for table in all_tables:
        header_row = table.find('tr')
        if header_row is not None and 'Role(s)' in header_row.text:
            actor_tables.append(table)

    # List of dictionaries of actor data, ex. [{'Year': '1928', 'Actor_Name': 'Emil Jannings', 'Film_Title': 'The Last Command', 'Winner':'Yes'}]
    actor_info = []

    # Values carried over to rows that do not repeat them
    awards_year = None
    actor_name = None

    for table in actor_tables:

        # Skip the first 'tr' since it just contains column headers
        for row in table.find_all('tr')[1:]:

            # A 'th' cell in a data row holds the year
            year_cell = row.find('th')
            if year_cell is not None:
                awards_year = _clean_year(year_cell.text)

            # Keep at most three data cells, which drops the 'ref' column if it exists
            columns = row.find_all('td')[:3]

            # Rows without data cells carry no nominee
            if not columns:
                continue

            # Only rows with all three cells start with the actor name
            # NOTE(review): a three-cell row is always read as name/role/film;
            # a row whose name cell is spanned from above but which has a
            # reference cell would be misread - confirm against the live page
            if len(columns) == 3:
                actor_name = _clean_actor_name(columns[0].text)

            # The film title is the last remaining cell
            film_title = _strip_footnotes(columns[-1].text)

            # Winner rows are the ones containing a cell with a yellow background
            highlighted = row.find_all('td', attrs={'style': 'background:#FAEB86;'})
            winner = 'Yes' if highlighted else 'No'

            actor_info.append({'Year': awards_year, 'Actor_Name': actor_name, 'Film_Title': film_title, 'Winner': winner})

    # Build the DataFrame once, after all rows have been collected
    # Naming the columns keeps the csv header intact even if nothing was found
    actor_info_df = pd.DataFrame(actor_info, columns=['Year', 'Actor_Name', 'Film_Title', 'Winner'])

    # Add a 'Nominated' column to indicate all of these actors were nominated for an award
    actor_info_df['Nominated'] = 'Yes'

    # Write the actor_info_df DataFrame to a csv file in the data folder
    filepath = Path(f'../data/{csv_name}')
    filepath.parent.mkdir(parents=True, exist_ok=True)
    actor_info_df.to_csv(filepath, index=False)

In [27]:
# Scrape the Best Actor page and save the results as best_actor.csv
get_actor_info('https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor','best_actor.csv')

In [28]:
# Scrape the Best Actress page and save the results as best_actress.csv
get_actor_info('https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actress','best_actress.csv')