# group project: analysis of movie performance: webscraping
### team: rolls before goals 
### github link: https://github.com/nss-data-science-cohort-9/webscraping-apis-movies-rolls-before-goals#

-----------

In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from IPython.core.display import HTML
import pandas as pd
from io import StringIO

### Part 1: Data Gathering

1. Scrape Best Picture Data.  
    * Scrape the [Best Picture wikipedia page](https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture).  
    * Extract for each year:  
        * Year  
        * Film Title  
        * Winner (Yes/No)  
    * Data cleaning tips:  
        * Ensure that year and film title columns are clean and consistent (no footnotes, parentheses, etc.).
        * Save the results as best_picture.csv.  

In [2]:
# Wikipedia page that lists every Best Picture nominee, one table per decade.
URL = 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture'

# Browser-style User-Agent sent with the request.
# NOTE(review): presumably needed because the default `requests` agent is
# rejected by the site -- confirm before removing.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"
}

# A timeout keeps "Run All" from hanging forever if the connection stalls.
response = requests.get(URL, headers=headers, timeout=30)

# Confirm that the response was successful: raise_for_status() reports 4xx/5xx
# with a descriptive error, and the assert keeps the stricter "exactly 200" check.
response.raise_for_status()
assert response.status_code == 200, f"Unexpected status {response.status_code} for {URL}"

# Convert the page into a soup object. Naming the parser explicitly makes the
# parse reproducible across machines and avoids bs4's GuessedAtParserWarning.
soup = BeautifulSoup(response.text, 'html.parser')

In [3]:
# Strategy 1: pull out every "wikitable" on the page and stack them into one frame.

wiki_tables = soup.find_all('table', attrs={'class': 'wikitable'})
assert wiki_tables, "No tables with class 'wikitable' were found - has the page layout changed?"

# Build the HTML once and reuse it (read_html wants a file-like object).
table_html_str = "\n".join(str(table) for table in wiki_tables)
year_table = pd.read_html(StringIO(table_html_str))

# One concat over the whole list instead of growing a frame inside a loop.
# Row labels are kept as parsed, so they restart at 0 for every source table.
combined_tables = pd.concat(year_table)

# Fail with a readable message if the expected headers are missing.
wanted_columns = ['Year of Film Release', 'Film']
missing_columns = [col for col in wanted_columns if col not in combined_tables.columns]
assert not missing_columns, f"Expected column(s) missing from the scraped tables: {missing_columns}"

# Clean up the merged frame: drop rows lacking a year or a film and keep only
# those two columns. `.copy()` makes the result an independent frame so that
# adding or overwriting columns later does not trigger SettingWithCopyWarning.
all_year_tables = combined_tables.dropna(subset=wanted_columns)[wanted_columns].copy()
all_year_tables

Unnamed: 0,Year of Film Release,Film
0,1927/28 (1st),Wings
1,1927/28 (1st),7th Heaven
2,1927/28 (1st),The Racket
4,1928/29 (2nd) [a],The Broadway Melody
5,1928/29 (2nd) [a],Alibi
...,...,...
43,2024 (97th),Emilia Pérez
44,2024 (97th),I'm Still Here
45,2024 (97th),Nickel Boys
46,2024 (97th),The Substance


In [4]:
## Strategy 2: filter all table rows based on whether they are colored yellow ("winners")
winner_list = soup.find_all('tr', attrs={'style': "background:#FAEB86"})

# Take the text of the first link inside the first <td> of every highlighted row.
# Rows that lack a <td> or a link are skipped instead of crashing the cell.
winner_title = []
for winner_row in winner_list:
    first_cell = winner_row.find('td')
    film_link = first_cell.find('a') if first_cell is not None else None
    if film_link is None:
        continue
    winner_title.append(film_link.text)

# An empty list would silently mark every film as a non-winner downstream, so
# stop here if the highlight style no longer matches anything on the page.
assert winner_title, "No highlighted winner rows were found - has the row style changed?"

In [5]:
## Flag each nominee as winner / non-winner, then tidy the year and title text.

year_col = 'Year of Film Release'

# Remove footnote markers such as "[a]" from the titles *before* matching so a
# footnoted title still compares equal to the winner's link text.
# Parentheses are left alone because they can be part of a real title.
all_year_tables['Film'] = (
    all_year_tables['Film']
    .str.replace(r"\s*\[[^\]]*\]", "", regex=True)
    .str.strip()
)

# Baseline: a film counts as a winner if its title appears in the winners list.
# On its own this also flags same-titled films from other years (e.g. a remake
# nominated decades after the original won).
winner_status = all_year_tables['Film'].isin(winner_title).to_numpy()

# Preferred: pair the i-th winner with the i-th ceremony year (both lists are in
# page order) and compare titles within that year only. The pairing is used
# only when it is self-consistent: one winner per year, and each paired title
# really is one of that year's nominees. Otherwise the baseline above is kept.
ceremony_years = pd.unique(all_year_tables[year_col])
if len(ceremony_years) == len(winner_title):
    winner_for_year = dict(zip(ceremony_years, winner_title))
    # NumPy arrays are compared positionally, so the repeated row labels
    # left over from stacking the tables cannot cause misalignment.
    films = all_year_tables['Film'].to_numpy()
    years = all_year_tables[year_col].to_numpy()
    expected_winner = all_year_tables[year_col].map(winner_for_year).to_numpy()
    paired_status = films == expected_winner
    wins_per_year = pd.Series(paired_status).groupby(years).sum()
    if wins_per_year.eq(1).all():
        winner_status = paired_status

all_year_tables['winner_status'] = winner_status

### Clean up the year column

# Remove anything in parentheses or square brackets *together with* the
# whitespace in front of it, e.g. "1928/29 (2nd) [a]" -> "1928/29".
# Non-greedy matching keeps each removal confined to a single bracket pair.
all_year_tables[year_col] = (
    all_year_tables[year_col]
    .str.replace(r"\s*\(.*?\)", "", regex=True)
    .str.replace(r"\s*\[.*?\]", "", regex=True)
    .str.strip()
)
all_year_tables

Unnamed: 0,Year of Film Release,Film,winner_status
0,1927/28,Wings,True
1,1927/28,7th Heaven,False
2,1927/28,The Racket,False
4,1928/29,The Broadway Melody,True
5,1928/29,Alibi,False
...,...,...,...
43,2024,Emilia Pérez,False
44,2024,I'm Still Here,False
45,2024,Nickel Boys,False
46,2024,The Substance,False


In [6]:
### Write the finished table to the 'data' subfolder of the repository.
# NOTE(review): the task description asks for "best_picture.csv", while this
# writes "best_pictures.csv" -- confirm which name downstream notebooks read.
# The frame's row labels are written as the first, unnamed CSV column.
csv_destination = '../data/best_pictures.csv'
all_year_tables.to_csv(csv_destination)