In [30]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import os
import glob
import shutil

Inicializando

In [31]:
# Season to scrape; every path and URL below is derived from it.
year = 2023

# TODO: improve the dictionary structure so it can be reused.
# Team list. The key is the code substituted into URL_ROSTER; the value is the
# abbreviation used as the output file name and as the "Team" column value.
teams = {
    'buf': 'BUF', 'mia': 'MIA', 'nwe': 'NWE', 'nyj': 'NYJ',
    'pit': 'PIT', 'rav': 'BAL', 'cin': 'CIN', 'cle': 'CLE',
    'htx': 'HOU', 'clt': 'IND', 'oti': 'TEN', 'jax': 'JAX',
    'kan': 'KAN', 'sdg': 'LAC', 'den': 'DEN', 'rai': 'LVR',
    'phi': 'PHI', 'was': 'WAS', 'dal': 'DAL', 'nyg': 'NYG',
    'det': 'DET', 'min': 'MIN', 'gnb': 'GNB', 'chi': 'CHI',
    'atl': 'ATL', 'tam': 'TAM', 'nor': 'NOR', 'car': 'CAR',
    'crd': 'ARI', 'ram': 'LAR', 'sea': 'SEA', 'sfo': 'SFO',
}

# --- Output directories (relative to the notebook's working directory) ---
SEASON = f"data/raw/{year}"
AWARDS = f"data/raw/{year}/Awards"
ROSTERS = f"data/raw/{year}/Rosters"
STATS_PLAYER = f"data/raw/{year}/Stats/Player"
STATS_TEAM = f"data/raw/{year}/Stats/Team"

# --- Source pages ---
# Roster page template: filled with (team code, year) via str.format.
URL_ROSTER = 'https://www.pro-football-reference.com/teams/{}/{}_roster.htm'

# Team-level statistics.
URL_TEAM_OFFENSE = f"https://www.pro-football-reference.com/years/{year}/"
URL_TEAM_DEFENSE = f"https://www.pro-football-reference.com/years/{year}/opp.htm"

# Player-level statistics.
URL_PASSING = f"https://www.pro-football-reference.com/years/{year}/passing.htm"
URL_RUSHING = f"https://www.pro-football-reference.com/years/{year}/rushing.htm"
URL_RECEIVING = f"https://www.pro-football-reference.com/years/{year}/receiving.htm"
URL_DEFENSE = f"https://www.pro-football-reference.com/years/{year}/defense.htm"
URL_KICKING = f"https://www.pro-football-reference.com/years/{year}/kicking.htm"
URL_PUNTING = f"https://www.pro-football-reference.com/years/{year}/punting.htm"
URL_RETURNS = f"https://www.pro-football-reference.com/years/{year}/returns.htm"

# Draft.
URL_DRAFT = f"https://www.pro-football-reference.com/years/{year}/draft.htm"

# Awards.
URL_ALL_PRO = f"https://www.pro-football-reference.com/years/{year}/allpro.htm"
URL_PRO_BOWL = f"https://www.pro-football-reference.com/years/{year}/probowl.htm"

# Coaches and schedule.
URL_COACHES = f"https://www.pro-football-reference.com/years/{year}/coaches.htm"
URL_SCHEDULES = f"https://www.pro-football-reference.com/years/{year}/games.htm"

# Directories that recreate_folders() deletes and re-creates empty.
folders = [AWARDS, ROSTERS, STATS_PLAYER, STATS_TEAM]

Funções

In [32]:
# ------------------------------------------------------------------------------------------------
def scraping_data(url, attrs, file, is_data_append_csv = False, is_append_team = False):
    """Scrape one HTML table with Selenium and save it as ``<file>.csv`` in the working directory.

    The first non-empty row collected is used as the CSV header; duplicate rows
    are dropped before saving.

    Args:
        url: Address of the page to open.
        attrs: Substring matched against element ``id`` attributes. Every ``<tr>``
            below a matching element is collected, except rows whose class
            contains ``over_header``.
        file: Output file name without the ``.csv`` extension. Also used as the
            value of the "Team" column when ``is_append_team`` is true.
        is_data_append_csv: When true, append a ``Player-additional`` column that
            holds the ``data-append-csv`` attribute of the first ``<td>`` in the
            row that has one (empty string when the row has none).
        is_append_team: When true, add a "Team" column filled with ``file``.

    Raises:
        ValueError: If no non-empty row was found for ``attrs`` on the page.
    """
    driver = webdriver.Chrome()

    # Collected rows; the first entry becomes the header.
    lista = []

    try:
        driver.get(url)
        # Fixed pause to give the page time to render before querying the table.
        time.sleep(2)

        rows = driver.find_elements(By.XPATH, f'//*[contains(@id,"{attrs}")]//tr[not(contains(@class, "over_header"))]')

        for row in rows:
            # Text of every cell (header or data) in the row.
            data = [item.text for item in row.find_elements(By.XPATH, ".//*[self::td or self::th]")]

            # Skip rows without any cell.
            if not data:
                continue

            if is_data_append_csv:
                if not lista:
                    # Header row: name the extra identifier column.
                    data.append('Player-additional')
                else:
                    # Data row: append the identifier, or "" when the row has none.
                    append_csv = row.find_elements(By.XPATH, ".//td[@data-append-csv]")
                    data.append(append_csv[0].get_dom_attribute('data-append-csv') if append_csv else "")

            lista.append(data)
    finally:
        # Always close the browser, even when navigation or extraction fails;
        # everything needed has already been copied into plain Python strings.
        driver.quit()

    if not lista:
        raise ValueError(f'No table rows found for id containing "{attrs}" at {url}')

    # Build the DataFrame and promote the first row to column headers.
    df = pd.DataFrame(lista)
    df.columns = df.iloc[0]
    df = df[1:]
    df = df.reset_index(drop=True)

    if is_append_team:
        df["Team"] = file

    df = df.drop_duplicates()

    # Save as CSV in the current working directory.
    df.to_csv(f'{file}.csv', index=False)

# ------------------------------------------------------------------------------------------------
def move_files(destination):
    """Move every ``*.csv`` file from the current working directory into ``destination``.

    The destination directory is created when missing. A file that already
    exists there under the same name is replaced, so the notebook can be re-run
    without failing on leftovers from a previous run.

    Args:
        destination: Path of the target directory.
    """
    # Without this, shutil.move would rename each file to `destination` itself
    # when the directory does not exist yet.
    os.makedirs(destination, exist_ok=True)

    for file in glob.glob("*.csv"):
        if os.path.exists(file):
            # Passing the full target path (not just the directory) lets an
            # existing file be overwritten instead of raising shutil.Error.
            shutil.move(file, os.path.join(destination, os.path.basename(file)))

# ------------------------------------------------------------------------------------------------
def concat_dfs():
    """Combine the per-team CSVs in the working directory into ``SEASON/rosters.csv``.

    Every ``*.csv`` file in the current working directory, except a leftover
    ``rosters.csv``, is read in sorted name order and concatenated. The combined
    file is stored in ``SEASON`` (replacing any previous one) and the remaining
    CSV files are then moved to ``ROSTERS`` via ``move_files``.

    Raises:
        ValueError: If there is no CSV file to concatenate.
    """
    output_name = "rosters.csv"

    # Sorted for a reproducible row order; the output name is excluded so a stale
    # combined file from an interrupted run is not merged into itself.
    files = sorted(name for name in glob.glob("*.csv") if name != output_name)
    if not files:
        raise ValueError("No CSV files found in the working directory to concatenate.")

    df_new = pd.concat([pd.read_csv(name) for name in files], ignore_index=True)
    df_new.to_csv(output_name, index=False)

    # Move with an explicit target path so an existing SEASON/rosters.csv is
    # overwritten instead of making shutil.move raise.
    os.makedirs(SEASON, exist_ok=True)
    shutil.move(output_name, os.path.join(SEASON, output_name))

    move_files(ROSTERS)
    
# ------------------------------------------------------------------------------------------------
def recreate_folders():
    """Reset each directory listed in the module-level ``folders`` to an empty state.

    An existing directory is deleted together with its contents and then created
    again; a missing one (including missing parents) is simply created.
    """
    for target in folders:
        already_present = os.path.exists(target)
        if already_present:
            # Drop everything left over from a previous run.
            shutil.rmtree(target)
        os.makedirs(target, exist_ok=True)


Scraping de Roster

In [33]:
# Destructive: deletes every directory listed in `folders` (with its contents)
# and re-creates it empty before any scraping starts.
recreate_folders()

In [34]:

# Scrape each team's roster page into "<ABBR>.csv" in the working directory,
# tagging every row with the team abbreviation.
for team_code, team_abbr in teams.items():
    scraping_data(
        URL_ROSTER.format(team_code, year),
        'roster',
        team_abbr,
        is_data_append_csv=True,
        is_append_team=True,
    )

# Merge the per-team files into rosters.csv and file everything away.
concat_dfs()

Team Stats (Offense)

In [35]:

# All team-level offensive tables come from the same season page.
# Each pair is (id fragment used to locate the table, output CSV name).
for table_id, csv_name in [
    ('team_stats', 'offense'),
    ('passing', 'passing'),
    ('rushing', 'rushing'),
    ('returns', 'returns'),
    ('kicking', 'kicking'),
    ('punting', 'punting'),
    ('drives', 'off_drives'),
]:
    scraping_data(URL_TEAM_OFFENSE, table_id, csv_name)

# Move every CSV produced above out of the working directory.
move_files(STATS_TEAM)


Team Stats (Defense)

In [36]:

# Team-level defensive tables; each pair is (id fragment, output CSV name).
for table_id, csv_name in [('team_stats', 'defense'), ('drives', 'def_drives')]:
    scraping_data(URL_TEAM_DEFENSE, table_id, csv_name)

# Move every CSV produced above out of the working directory.
move_files(STATS_TEAM)

Player Stats

In [37]:
# One page per statistic; the same name serves as both the table id fragment
# and the output CSV name. Player identifiers are appended to every row.
for stats_url, stat_name in [
    (URL_PASSING, 'passing'),
    (URL_RUSHING, 'rushing'),
    (URL_RECEIVING, 'receiving'),
    (URL_DEFENSE, 'defense'),
    (URL_KICKING, 'kicking'),
    (URL_PUNTING, 'punting'),
    (URL_RETURNS, 'returns'),
]:
    scraping_data(stats_url, stat_name, stat_name, is_data_append_csv=True)

# Move every CSV produced above out of the working directory.
move_files(STATS_PLAYER)

Drafts

In [38]:
# Scrape the draft table (elements whose id contains "drafts") into draft.csv,
# with the player identifier column appended.
scraping_data(URL_DRAFT, 'drafts', 'draft', is_data_append_csv=True)

# Move every CSV in the working directory into the season folder.
move_files(SEASON)

Awards

In [39]:
# Award tables; each entry is (page URL, id fragment, output CSV name).
for award_url, table_id, csv_name in [
    (URL_ALL_PRO, 'all_pro', 'AP'),
    (URL_PRO_BOWL, 'pro_bowl', 'PB'),
]:
    scraping_data(award_url, table_id, csv_name, is_data_append_csv=True)

# Move every CSV produced above out of the working directory.
move_files(AWARDS)

Coaches

In [40]:
# Scrape the coaches table (elements whose id contains "coaches") into coaches.csv.
scraping_data(URL_COACHES, 'coaches', 'coaches')

# Move every CSV in the working directory into the season folder.
move_files(SEASON)

Schedules

In [41]:
# Scrape the schedule table (elements whose id contains "games") into games.csv.
scraping_data(URL_SCHEDULES, 'games', 'games')

# Move every CSV in the working directory into the season folder.
move_files(SEASON)