In [None]:
%load_ext lab_black

Step: Load data

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import numpy as np

In [None]:
def scrape_table(url, table_id):
    """Download ``url`` and return the HTML table ``table_id`` as a DataFrame.

    Column names come from the last ``<tr>`` of the table's ``<thead>``
    (the "Rk" header is skipped). Each body row contributes the text of its
    ``<td>`` cells, skipping any cell whose ``data-stat`` is "ranker". Body
    rows that carry the ``thead`` CSS class are ignored.

    Args:
        url: Page to fetch.
        table_id: ``id`` attribute of the ``<table>`` element to extract.

    Returns:
        pandas.DataFrame holding the raw cell text (no numeric conversion).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the page has no ``<table>`` with the requested id.
    """
    response = requests.get(url)
    # Fail loudly on error responses instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    table = soup.find("table", {"id": table_id})
    if table is None:
        # Without this guard the failure surfaces later as an opaque
        # AttributeError on ``table.thead``.
        raise ValueError(f"No table with id {table_id!r} found at {url}")

    # Use the last header row to get the correct headers
    header_row = table.thead.find_all("tr")[-1]
    column_names = [th.text for th in header_row.find_all("th") if th.text != "Rk"]

    data = []
    for row in table.tbody.find_all("tr"):
        if "thead" in (row.get("class") or []):
            continue
        data.append(
            [td.text for td in row.find_all("td") if td.get("data-stat") != "ranker"]
        )

    return pd.DataFrame(data, columns=column_names)


# Season used by the single-year scraping cells that follow.
year = 2020
# URL template; the ``{}`` placeholder is filled with the season via ``str.format``.
base_url = "https://www.pro-football-reference.com/years/{}/"

In [None]:
# Passing table for the season held in `year`
passing_url = f"{base_url.format(year)}passing.htm"
passing_df = scrape_table(url=passing_url, table_id="passing")
passing_df

In [None]:
# Rushing table for the season held in `year`
rushing_url = f"{base_url.format(year)}rushing.htm"
rushing_df = scrape_table(url=rushing_url, table_id="rushing")
rushing_df

In [None]:
# Receiving table for the season held in `year`
receiving_url = f"{base_url.format(year)}receiving.htm"
receiving_df = scrape_table(url=receiving_url, table_id="receiving")
receiving_df

In [None]:
# Defense table for the season held in `year`
defense_url = f"{base_url.format(year)}defense.htm"
defense_df = scrape_table(url=defense_url, table_id="defense")
defense_df

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time


def scrape_table(url, table_id, session=None):
    """Download ``url`` and return the HTML table ``table_id`` as a DataFrame.

    The request is sent with a browser-like ``User-Agent`` header. Column
    names come from the last ``<tr>`` of the table's ``<thead>`` (the "Rk"
    header is skipped). Each body row contributes the text of its ``<td>``
    cells, skipping any cell whose ``data-stat`` is "ranker". Body rows that
    carry the ``thead`` CSS class are ignored.

    Args:
        url: Page to fetch.
        table_id: ``id`` attribute of the ``<table>`` element to extract.
        session: Optional ``requests.Session`` used to issue the request.
            When omitted, a one-off ``requests.get`` is used, so two-argument
            calls written against the earlier definition of this function
            keep working after this cell has been run.

    Returns:
        pandas.DataFrame holding the raw cell text (no numeric conversion).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the page has no ``<table>`` with the requested id.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
    }
    http = session if session is not None else requests
    response = http.get(url, headers=request_headers)
    # Fail loudly on error responses instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    table = soup.find("table", {"id": table_id})
    if table is None:
        # Without this guard the failure surfaces later as an opaque
        # AttributeError on ``table.thead``.
        raise ValueError(f"No table with id {table_id!r} found at {url}")

    # Use the last header row to get the correct headers
    header_row = table.thead.find_all("tr")[-1]
    column_names = [th.text for th in header_row.find_all("th") if th.text != "Rk"]

    data = []
    for row in table.tbody.find_all("tr"):
        if "thead" in (row.get("class") or []):
            continue
        data.append(
            [td.text for td in row.find_all("td") if td.get("data-stat") != "ranker"]
        )

    return pd.DataFrame(data, columns=column_names)


base_url = "https://www.pro-football-reference.com/years/{}/"

# Each category name doubles as the page stem ("<name>.htm") and the table id.
STAT_CATEGORIES = ("passing", "rushing", "receiving", "defense")
# Pause after every request so the site is not hit in rapid succession.
REQUEST_DELAY_SECONDS = 5

# Create a session
session = requests.Session()

# Collect one frame per season and category, then concatenate once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame inside
# a loop re-copies all previously collected rows on every iteration.
season_frames = {category: [] for category in STAT_CATEGORIES}

# Iterate through the years and collect the data
for year in range(2010, 2023):
    print(f"Processing data for {year}")

    for category in STAT_CATEGORIES:
        category_url = base_url.format(year) + f"{category}.htm"
        season_df = scrape_table(category_url, category, session)
        season_df["Year"] = year
        season_frames[category].append(season_df)

        time.sleep(REQUEST_DELAY_SECONDS)  # Add a delay between requests

passing_data = pd.concat(season_frames["passing"], ignore_index=True)
rushing_data = pd.concat(season_frames["rushing"], ignore_index=True)
receiving_data = pd.concat(season_frames["receiving"], ignore_index=True)
defense_data = pd.concat(season_frames["defense"], ignore_index=True)

print("Data processing completed!")

In [18]:
def remove_duplicate_columns(df):
    """Return ``df`` restricted to the first occurrence of each column label.

    Columns whose label repeats an earlier column's label are dropped; the
    remaining columns keep their original order.
    """
    is_first_occurrence = ~df.columns.duplicated()
    return df.loc[:, is_first_occurrence]


# Drop repeated column labels from each category and write it to its own
# Parquet file.
passing_data = remove_duplicate_columns(df=passing_data)
passing_data.to_parquet("passing_data.parquet")

rushing_data = remove_duplicate_columns(df=rushing_data)
rushing_data.to_parquet("rushing_data.parquet")

receiving_data = remove_duplicate_columns(df=receiving_data)
receiving_data.to_parquet("receiving_data.parquet")

# Fixed: this previously deduplicated `receiving_data` and wrote it to
# "receiving_data.parquet", so the defense data was never saved and the
# in-memory defense frame was replaced by the receiving data.
defense_data = remove_duplicate_columns(df=defense_data)
defense_data.to_parquet("defense_data.parquet")