# Pitcher Physical

This captures pitcher physical traits (height, weight, handedness).

In [9]:
import pandas as pd
import requests
import time
from pathlib import Path

# Read the per-game pitcher stats and collect the distinct pitcher IDs to look up
pitcher_games = pd.read_csv("../../data/processed/pitcher_game_stats.csv")
pitcher_ids = pitcher_games['player_id'].unique()

# Progress is persisted to this file so an interrupted run can pick up where it stopped
checkpoint_file = Path('../../data/interim/pitcher_physical_checkpoint.csv')

if not checkpoint_file.exists():
    # First run: start from an empty table and write it out right away,
    # creating the parent directories when they are missing
    processed_pitchers = set()
    physical_df = pd.DataFrame(columns=['player_id', 'name', 'height_inches', 'weight', 'throws', 'birth_date'])
    checkpoint_file.parent.mkdir(parents=True, exist_ok=True)
    physical_df.to_csv(checkpoint_file, index=False)
else:
    # Resume: reuse whatever an earlier run already saved
    physical_df = pd.read_csv(checkpoint_file)
    print(f'Loaded dataset with {physical_df.shape[0]} entries')
    processed_pitchers = set(physical_df['player_id'])

# Only IDs that are absent from the checkpoint still need to be fetched
remaining_pitchers = []
for pid in pitcher_ids:
    if pid not in processed_pitchers:
        remaining_pitchers.append(pid)

# Track errors (one human-readable message per pitcher that could not be saved)
errors = []

# MLB API endpoint
mlb_api = "https://statsapi.mlb.com/api/v1/people/{}"

# Seconds to wait on a single request; without a timeout requests.get can
# block indefinitely on a stalled connection
request_timeout = 10


def _parse_height_inches(height):
    """Convert a height string such as 6' 7" (feet, then inches) to total inches.

    Returns None when the value is missing or not in that format.
    """
    try:
        parts = height.split("'")
        height_ft = int(parts[0])
        height_in = int(parts[1].strip().strip('"'))
    except (AttributeError, IndexError, ValueError):
        # AttributeError: value is not a string (e.g. None)
        # IndexError: no "'" separator present
        # ValueError: feet or inches part is not a number
        return None
    return (height_ft * 12) + height_in


# Process in batches; the checkpoint is rewritten after every batch
batch_size = 50
for i in range(0, len(remaining_pitchers), batch_size):
    batch = remaining_pitchers[i:i + batch_size]
    batch_results = []

    # Requests are issued one after another (not in parallel); each response
    # is processed immediately so only one is held in memory at a time
    for pitcher_id in batch:
        try:
            response = requests.get(mlb_api.format(pitcher_id), timeout=request_timeout)
        except requests.RequestException as e:
            # A network failure for one pitcher must not abort the whole run;
            # the pitcher stays out of the checkpoint and is retried next run
            errors.append(f"Pitcher {pitcher_id}: Request failed: {str(e)}")
            continue

        try:
            if response.status_code != 200:
                errors.append(f"Pitcher {pitcher_id}: Bad status code {response.status_code}")
                continue

            data = response.json()

            if 'people' not in data or len(data['people']) == 0:
                errors.append(f"Pitcher {pitcher_id}: Missing player data")
                continue

            player = data['people'][0]

            # 'pitchHand' may be absent or null; treat both as "unknown"
            pitch_hand = player.get('pitchHand') or {}

            batch_results.append({
                'player_id': pitcher_id,
                'name': player.get('fullName'),
                'height_inches': _parse_height_inches(player.get('height', '')),
                'weight': player.get('weight'),
                'throws': pitch_hand.get('code'),
                'birth_date': player.get('birthDate')
            })

        except Exception as e:
            # Per-pitcher boundary: record the problem and keep going
            errors.append(f"Pitcher {pitcher_id}: Error processing data: {str(e)}")

    time.sleep(1)  # Rate limiting between batches

    # Nothing new to store: skip the concat and the checkpoint rewrite
    if not batch_results:
        continue

    # Append batch results to dataframe
    batch_df = pd.DataFrame(batch_results)
    physical_df = pd.concat([physical_df, batch_df], ignore_index=True)

    # Save checkpoint
    physical_df.to_csv(checkpoint_file, index=False)

# Write the full table to the processed-data folder and report how many rows it holds
final_path = "../../data/processed/pitcher_physical.csv"
physical_df.to_csv(final_path, index=False)
print(f"Saved physical data for {len(physical_df)} pitchers")

# List every error collected while fetching, one per line (header is always printed)
print("\nErrors found:")
if errors:
    print("\n".join(errors))


Loaded dataset with 10 entries
Saved physical data for 536 pitchers

Errors found:
