# BATTERS

In [None]:
import requests
import pandas as pd
from datetime import datetime
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
import os

# --- CONFIG ---
# Seasons to download: 2021 through the current calendar year, inclusive.
current_year = datetime.now().year
years = [*range(2021, current_year + 1)]
# Memo of schedule lookups keyed by (team_id, season); filled by get_schedule.
schedule_cache = {}
# Output directory for the per-player CSV files (created if missing).
os.makedirs("batters", exist_ok=True)

# --- Load batter IDs ---
# One row per batter; process_batter reads the `mlbID` and `key_bbref` columns.
batter_df = pd.read_csv("batter_ids.csv")

# --- Helper: fetch schedule for a given team & season ---
def get_schedule(team_id, season):
    """Return a per-game lookup for one team's season, fetching it at most once.

    Args:
        team_id: Team id sent as the ``teamId`` query parameter.
        season: Season year sent as the ``season`` query parameter.

    Returns:
        Dict mapping each ``gamePk`` (as ``int``) to a dict with the keys
        ``Team`` (always ``team_id``), ``Opp`` (the other club's id),
        ``away_id`` and ``home_id``. The result is stored in
        ``schedule_cache`` and the same object is returned on repeat calls.

    Raises:
        requests.RequestException: If the request times out, cannot connect,
            or the API answers with an HTTP error status.
        KeyError: If the response body lacks the expected fields.
    """
    key = (team_id, season)
    cached = schedule_cache.get(key)
    if cached is not None:
        return cached

    response = requests.get(
        "https://statsapi.mlb.com/api/v1/schedule",
        params={"teamId": team_id, "season": season, "sportId": 1},
        # Without a timeout a stalled connection would block the caller
        # (and, in the thread pool, a whole worker) indefinitely.
        timeout=30,
    )
    # Surface 4xx/5xx responses here instead of as a confusing KeyError below.
    response.raise_for_status()
    payload = response.json()

    sched = {}
    for day in payload['dates']:
        for g in day['games']:
            away = g['teams']['away']['team']['id']
            home = g['teams']['home']['team']['id']
            sched[int(g['gamePk'])] = {
                'Team': team_id,
                # The opponent is whichever side of the game is not team_id.
                'Opp': home if away == team_id else away,
                'away_id': away,
                'home_id': home,
            }

    # Only cache after a fully successful fetch so a failure can be retried.
    schedule_cache[key] = sched
    return sched

# --- Helper: fetch full log per player ---
def fetch_player_log(player_id):
    """Build a game-by-game hitting log for one player over all configured seasons.

    For every season in the module-level ``years`` list the player's
    ``gameLog`` hitting splits are downloaded, the stat names are shortened,
    each game is joined with schedule data from ``get_schedule`` (team,
    opponent, away/home ids) and doubleheader games are numbered.

    Args:
        player_id: Id placed in the ``/people/{player_id}/stats`` URL.

    Returns:
        A DataFrame with one row per game and a fixed column order, or
        ``None`` when no season returned any splits. Expected columns the API
        did not supply are still present, filled with NaN. ``dbl`` holds the
        game's sequence number (1.0, 2.0, ...) on dates where the player has
        more than one game and is NaN otherwise.

    Raises:
        requests.RequestException: If a request times out, cannot connect,
            or the API answers with an HTTP error status.
        KeyError: If a response body lacks the expected fields.
    """
    # API stat name -> short column name. Built once, not once per season.
    column_map = {
        'atBats': 'AB', 'hits': 'H', 'runs': 'R', 'doubles': '2B', 'triples': '3B',
        'homeRuns': 'HR', 'rbi': 'RBI', 'baseOnBalls': 'BB', 'strikeOuts': 'SO',
        'stolenBases': 'SB', 'caughtStealing': 'CS', 'hitByPitch': 'HBP',
        'sacBunts': 'SH', 'sacFlies': 'SF', 'intentionalWalks': 'IBB',
        'totalBases': 'TB', 'avg': 'BA', 'obp': 'OBP', 'slg': 'SLG', 'ops': 'OPS',
        'plateAppearances': 'PA',
    }
    ordered = [
        'game_date', 'game_id', 'Team', 'Opp', 'away_id', 'home_id',
        'PA', 'AB', 'H', 'R', '2B', '3B', 'HR', 'RBI',
        'BB', 'SO', 'SB', 'CS', 'HBP', 'SH', 'SF', 'IBB', 'TB',
        'BA', 'OBP', 'SLG', 'OPS', 'dbl',
    ]
    schedule_fields = ('Team', 'Opp', 'away_id', 'home_id')
    url = f"https://statsapi.mlb.com/api/v1/people/{player_id}/stats"
    season_frames = []

    for season in years:
        params = {"stats": "gameLog", "group": "hitting", "season": season}
        # A timeout keeps one stalled connection from hanging a worker forever.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        res = response.json()

        if not res['stats'] or not res['stats'][0]['splits']:
            continue

        rows = []
        for game in res['stats'][0]['splits']:
            row = dict(game['stat'])  # copy so the response payload is not mutated
            row['game_id'] = int(game['game']['gamePk'])
            row['game_date'] = game['date']
            row['team_id'] = game['team']['id']
            row['season'] = season
            rows.append(row)

        df = pd.DataFrame(rows).rename(columns=column_map)

        # One (cached) schedule per team the player appeared for this season,
        # then a single pass to look every game up. Games missing from the
        # schedule get empty values.
        schedules = {
            tid: get_schedule(tid, season)
            for tid in {row['team_id'] for row in rows}
        }
        lookups = [
            schedules[row['team_id']].get(row['game_id'], {}) for row in rows
        ]
        for field in schedule_fields:
            # dtype=object keeps ids as ints next to missing values, so the
            # CSV shows "147" rather than "147.0".
            df[field] = pd.Series(
                [info.get(field) for info in lookups],
                index=df.index,
                dtype=object,
            )

        df['game_date'] = pd.to_datetime(df['game_date'])
        df = df.sort_values(['game_date', 'game_id'])

        # Number the games within each date, but keep the number only on
        # dates with more than one game (doubleheaders).
        by_date = df.groupby('game_date')
        game_number = by_date.cumcount() + 1
        games_that_day = by_date['game_id'].transform('count')
        df['dbl'] = game_number.where(games_that_day > 1).astype(float)

        season_frames.append(df)

    if not season_frames:
        return None

    # reindex (rather than plain column selection) tolerates stats that the
    # API omitted: they become empty columns instead of raising KeyError.
    return pd.concat(season_frames).reindex(columns=ordered)

# --- Threaded Worker ---
def process_batter(row):
    """Download one batter's game log and save it under ``batters/``.

    Written as a thread-pool worker: problems with an individual player
    (bad id, failed download, failed write) are reported in the returned
    text instead of being raised, so one bad row cannot stop the run.

    Args:
        row: Mapping-like record (for example a row of ``batter_df``) that
            provides ``mlbID`` and ``key_bbref``.

    Returns:
        A status line: ``"Saved <file>"``, ``"No data for <bbref_id>"`` or
        ``"Error processing <bbref_id>: <reason>"``.
    """
    bbref_id = row['key_bbref']
    try:
        # Parsed inside the try: a blank (NaN) or non-numeric mlbID must
        # become an error line for this player, not an exception that
        # escapes the worker and aborts the result loop.
        player_id = int(row['mlbID'])
        df = fetch_player_log(player_id)
        if df is None:
            return f"No data for {bbref_id}"
        filename = f"batters/{bbref_id}_batting.csv"
        df.to_csv(filename, index=False)
        return f"Saved {filename}"
    except Exception as e:
        # Deliberately broad: this is the per-player boundary of the batch.
        return f"Error processing {bbref_id}: {e}"

# --- Run Thread Pool ---
# Submit one download task per row of batter_df to a small pool of worker
# threads and print each status line as soon as its task finishes, i.e. in
# completion order rather than CSV order.
max_threads = 4
with ThreadPoolExecutor(max_workers=max_threads) as executor:
    futures = [
        executor.submit(process_batter, row)
        for _, row in batter_df.iterrows()
    ]
    for future in as_completed(futures):
        status = future.result()
        print(status)

# Counts rows submitted, not rows saved successfully.
print(f"Processed {len(batter_df)} batters")


Saved batters/acunajo01_batting.csv
Saved batters/adamsma01_batting.csv
Saved batters/adamsjo03_batting.csv
Saved batters/abreuwi02_batting.csv
Saved batters/adriaeh01_batting.csv
Saved batters/abramcj01_batting.csv
Saved batters/acunaro01_batting.csv
Saved batters/abreujo02_batting.csv
Saved batters/adamsri03_batting.csv
Saved batters/adamewi01_batting.csv
Saved batters/adelljo01_batting.csv
Saved batters/aguilry01_batting.csv
Saved batters/akiyash01_batting.csv
Saved batters/aguilje01_batting.csv
Saved batters/alcanke01_batting.csv
Saved batters/albieoz01_batting.csv
Saved batters/alcanse01_batting.csv
Saved batters/alberha01_batting.csv
Saved batters/alexabl01_batting.csv
Saved batters/alexacj01_batting.csv
Saved batters/alfarjo01_batting.csv
Saved batters/ahmedni01_batting.csv
Saved batters/allenau01_batting.csv
Saved batters/alforan01_batting.csv
Saved batters/allenni02_batting.csv
Saved batters/almonab01_batting.csv
Saved batters/allengr01_batting.csv
Saved batters/almoral01_batt

# PITCHERS

In [5]:
import requests
import pandas as pd
from datetime import datetime
import os

# --- CONFIG ---
# The single pitcher this cell processes: the id used in the stats URL and the
# id used in the output file name.
player_id, bbref_id = 663158, 'suarero01'
# Seasons to download: 2021 through the current calendar year, inclusive.
current_year = datetime.now().year
years = [*range(2021, current_year + 1)]
# Memo of schedule lookups keyed by (team_id, season); filled by get_schedule.
schedule_cache = {}
# Output directory for the pitcher CSV (created if missing).
os.makedirs("pitchers", exist_ok=True)

# --- Field Translation ---
# API stat name -> short column name. Insertion order matters: it becomes the
# order of the stat columns in the output CSV.
# NOTE(review): fetch_pitcher_log ignores API keys not listed here, and listed
# keys the API never returns end up as empty columns -- confirm the key names
# against a live response.
translation_map = {
    # Core line
    'inningsPitched': 'IP', 'hits': 'H', 'runs': 'R', 'earnedRuns': 'ER',
    'baseOnBalls': 'BB', 'strikeOuts': 'SO', 'homeRuns': 'HR',
    'hitByPitch': 'HBP', 'era': 'ERA', 'fip': 'FIP',
    # Workload and pitch counts
    'battersFaced': 'BF', 'pitchesThrown': 'Pit', 'strikes': 'Str',
    'strikesLooking': 'StL', 'strikesSwinging': 'StS',
    # Batted balls and inherited runners
    'groundOuts': 'GB', 'flyOuts': 'FB',
    'inheritedRunners': 'IR', 'inheritedRunnersScored': 'IS',
    # Running game
    'stolenBases': 'SB', 'caughtStealing': 'CS', 'pickoffs': 'PO',
    # Opponent batting detail
    'atBats': 'AB', 'doubles': '2B', 'triples': '3B',
    'intentionalWalks': 'IBB', 'groundIntoDoublePlay': 'GDP',
    'sacFlies': 'SF', 'reachedOnError': 'ROE',
}

# Output layout: game metadata first, then every translated stat, then the
# doubleheader marker.
meta_columns = ['game_date', 'game_id', 'Team', 'Opp', 'away_id', 'home_id']
stat_columns = [*translation_map.values()]
final_columns = [*meta_columns, *stat_columns, 'dbl']

# --- Fetch schedule for mapping team + opponent + home/away ---
def get_schedule(team_id, season):
    """Return a per-game lookup for one team's season, fetching it at most once.

    Args:
        team_id: Team id sent as the ``teamId`` query parameter.
        season: Season year sent as the ``season`` query parameter.

    Returns:
        Dict mapping each ``gamePk`` (as ``int``) to a dict with the keys
        ``Team`` (always ``team_id``), ``Opp`` (the other club's id),
        ``away_id`` and ``home_id``. The result is stored in
        ``schedule_cache`` and the same object is returned on repeat calls.

    Raises:
        requests.RequestException: If the request times out, cannot connect,
            or the API answers with an HTTP error status.
        KeyError: If the response body lacks the expected fields.
    """
    key = (team_id, season)
    cached = schedule_cache.get(key)
    if cached is not None:
        return cached

    response = requests.get(
        "https://statsapi.mlb.com/api/v1/schedule",
        params={"teamId": team_id, "season": season, "sportId": 1},
        # Without a timeout a stalled connection would block the script
        # indefinitely.
        timeout=30,
    )
    # Surface 4xx/5xx responses here instead of as a confusing KeyError below.
    response.raise_for_status()
    payload = response.json()

    sched = {}
    for day in payload['dates']:
        for g in day['games']:
            away = g['teams']['away']['team']['id']
            home = g['teams']['home']['team']['id']
            sched[int(g['gamePk'])] = {
                'Team': team_id,
                # The opponent is whichever side of the game is not team_id.
                'Opp': home if away == team_id else away,
                'away_id': away,
                'home_id': home,
            }

    # Only cache after a fully successful fetch so a failure can be retried.
    schedule_cache[key] = sched
    return sched

# --- Fetch game logs for one pitcher ---
def fetch_pitcher_log(player_id):
    """Build a game-by-game pitching log for one player over all configured seasons.

    For every season in the module-level ``years`` list the player's
    ``gameLog`` pitching splits are downloaded, the stats listed in
    ``translation_map`` are kept under their short names, each game is joined
    with schedule data from ``get_schedule`` (team, opponent, away/home ids)
    and doubleheader games are numbered.

    Args:
        player_id: Id placed in the ``/people/{player_id}/stats`` URL.

    Returns:
        A DataFrame with exactly the columns in ``final_columns`` and one row
        per game, or ``None`` when no season returned any splits. Columns the
        API did not supply are filled with NaN. ``dbl`` holds the game's
        sequence number (1.0, 2.0, ...) on dates where the player has more
        than one game and is NaN otherwise.

    Raises:
        requests.RequestException: If a request times out, cannot connect,
            or the API answers with an HTTP error status.
        KeyError: If a response body lacks the expected fields.
    """
    schedule_fields = ('Team', 'Opp', 'away_id', 'home_id')
    url = f"https://statsapi.mlb.com/api/v1/people/{player_id}/stats"
    season_frames = []

    for season in years:
        params = {"stats": "gameLog", "group": "pitching", "season": season}
        # A timeout keeps one stalled connection from hanging the script.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        res = response.json()

        if not res['stats'] or not res['stats'][0]['splits']:
            continue

        rows = []
        for game in res['stats'][0]['splits']:
            # Keep only the stats we have a short name for.
            row = {
                translation_map[k]: v
                for k, v in game['stat'].items()
                if k in translation_map
            }
            row['game_id'] = int(game['game']['gamePk'])
            row['game_date'] = game['date']
            row['team_id'] = game['team']['id']
            row['season'] = season
            rows.append(row)

        df = pd.DataFrame(rows)

        # One (cached) schedule per team the player appeared for this season,
        # then a single pass to look every game up. Games missing from the
        # schedule get empty values.
        schedules = {
            tid: get_schedule(tid, season)
            for tid in {row['team_id'] for row in rows}
        }
        lookups = [
            schedules[row['team_id']].get(row['game_id'], {}) for row in rows
        ]
        for field in schedule_fields:
            # dtype=object keeps ids as ints next to missing values, so the
            # CSV shows "147" rather than "147.0".
            df[field] = pd.Series(
                [info.get(field) for info in lookups],
                index=df.index,
                dtype=object,
            )

        df['game_date'] = pd.to_datetime(df['game_date'])
        df = df.sort_values(['game_date', 'game_id'])

        # Number the games within each date, but keep the number only on
        # dates with more than one game (doubleheaders).
        by_date = df.groupby('game_date')
        game_number = by_date.cumcount() + 1
        games_that_day = by_date['game_id'].transform('count')
        df['dbl'] = game_number.where(games_that_day > 1).astype(float)

        season_frames.append(df)

    if not season_frames:
        return None

    # reindex both orders the columns and creates any expected column the API
    # never returned (as empty), so the output layout is always the same.
    return pd.concat(season_frames).reindex(columns=final_columns)

# --- Run and save ---
# Fetch the configured pitcher's log and, if anything came back, write it to
# pitchers/<bbref_id>_pitching.csv without the DataFrame index.
df = fetch_pitcher_log(player_id)
if df is None:
    print("No data found")
else:
    filename = f"pitchers/{bbref_id}_pitching.csv"
    df.to_csv(filename, index=False)
    print(f"Saved {filename} with {len(df)} rows and {len(df.columns)} columns")


Saved pitchers/suarero01_pitching.csv with 139 rows and 36 columns
