In [166]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pprint
import re
from dateutil import parser


### Define Fetch Game Log Function
* This function reads the html content for any player for any year using the baseball-reference site

In [167]:
def fetch_game_log(player_id, year):
    # Construct the URL for the player's game log for the given year
    url = f'https://www.baseball-reference.com/players/gl.fcgi?id={player_id}&t=b&year={year}'
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to fetch data for {player_id} in {year}")
        return None
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find the table containing the game logs
    table = soup.find('table', {'id': 'batting_gamelogs'})
    
    # Check if the table is found
    if table is None:
        print(f"No data found for {player_id} in {year}")
        return None
    
    # Read the table into a pandas DataFrame
    df = pd.read_html(str(table))[0]
    
    # Remove rows where 'Rk' is not a number (header rows that repeat in the table)
    df = df[pd.to_numeric(df['Rk'], errors='coerce').notnull()]
    
    # Add the year to the 'Date' column if the year is not already present
    df['Date'] = df['Date'].apply(lambda x: f"{x}, {year}" if '(' not in x else x)
    
    # Extract the value from parentheses (if present) and assign it to a new column 'dbl'
    df['dbl'] = df['Date'].str.extract(r'\((\d+)\)').astype(float)
    
    # Add the year to the 'Date' column for doubleheader dates
    df.loc[df['dbl'].notnull(), 'Date'] = df['Date'] + ', ' + str(year)
    
    # Format 'Date' to 'game_date' in YYYY-MM-DD format
    df['game_date'] = pd.to_datetime(df['Date'], errors='coerce').dt.strftime('%Y-%m-%d')
    
    return df


### Define Clean_date Function
* This function cleans the dates in the fetched data

In [168]:
# Function to clean and parse dates
def clean_date(date_str):
    try:
        # Replace invisible characters like U+00A0 with a space
        date_str = date_str.replace('\xa0', ' ')
        # Remove any null characters and non-printable characters
        date_str = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', date_str)
        # Remove unwanted characters and extra text like "(1)" or "susp"
        date_str = ''.join(char for char in date_str if char.isalnum() or char.isspace() or char == ',')
        # Remove specific unwanted words like "susp"
        date_str = date_str.replace('susp', '').strip()
        # Parse the cleaned string to a date object
        parsed_date = parser.parse(date_str)
        # Force the year to be 2021
        parsed_date = parsed_date.replace(year=2021)
        # Format the date to 'YYYY-MM-DD'
        return parsed_date.strftime('%Y-%m-%d')
    except Exception as e:
        # Print the error for debugging purposes
        print(f"Error parsing date '{date_str}': {e}")
        # Handle any parsing errors by returning None
        return None





### This fetches each players data (year-by-year)

In [165]:
idlist = pd.read_csv('batter_ids.csv')
batter_ids = idlist.key_bbref

game_pks = pd.read_csv('game_pks.csv')

# Define the years you want to process
years = [2021, 2022, 2023, 2024]

for id in batter_ids:

    # Initialize an empty dictionary to store dataframes for each player
    player_data = {}
    player_df = pd.DataFrame()  # Initialize an empty dataframe for the player

    for year in years:

        # Fetch Data for Player and year
        df = fetch_game_log(id, year)

        # If no data for current year
        if df is None:
            continue # Skip this year 

        # Apply the function to the date_column and create a new column
        df['game_date'] = df['Date'].apply(clean_date)
        df['Date'] = df['game_date']

        # Ensure the 'Date' column in df and 'game_date' column in game_pks are in datetime format
        df['Date'] = pd.to_datetime(df['Date'])
        game_pks['game_date'] = pd.to_datetime(game_pks['game_date'])

        # Initialize a new column in df for game_id
        df['game_id'] = None
                
        # Iterate over the rows in df to find the corresponding game_id in game_pks
        for index, row in df.iterrows():
            # Filter the game_pks for the matching date
            game_day_matches = game_pks[game_pks['game_date'] == row['Date']]
            
            # Check the 'dbl' column to assign the correct game_id
            if not game_day_matches.empty:
                if row['dbl'] == 1:
                    # For the first game of a double-header
                    game_id = game_day_matches.iloc[0]['game_id']
                elif row['dbl'] == 2:
                    # For the second game of a double-header
                    game_id = game_day_matches.iloc[1]['game_id']
                else:
                    # For days without double-headers or unmarked double-headers, take the first game
                    game_id = game_day_matches.iloc[0]['game_id']
                df.at[index, 'game_id'] = game_id

        player_df = pd.concat([player_df, df])  # Append the data for the year to player_df

    # Store the concatenated dataframe for the player in the dictionary
    player_data[id] = player_df
        
    # Save the player's data to a CSV file
    player_df.to_csv(f'batters/{id}_batting.csv')


  df = pd.read_html(str(table))[0]


No data found for abbotco01 in 2022
No data found for abbotco01 in 2023
No data found for abbotco01 in 2024
No data found for abramcj01 in 2021


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


No data found for abreuwi02 in 2021
No data found for abreuwi02 in 2022


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


No data found for adamsjo03 in 2021
No data found for adamsjo03 in 2022


  df = pd.read_html(str(table))[0]


No data found for adamsjo03 in 2024


  df = pd.read_html(str(table))[0]


No data found for adamsma01 in 2022
No data found for adamsma01 in 2023
No data found for adamsma01 in 2024


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


No data found for adonjo01 in 2022
No data found for adonjo01 in 2023
No data found for adonjo01 in 2024


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


IndexError: single positional indexer is out-of-bounds

In [152]:
df.to_csv(f'{id}_{year}.csv')

In [154]:
df.to_csv(f'{id}_{year}.csv')

In [156]:
df.to_csv(f'{id}_{year}.csv')