# Game Weather

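For each game in the pitcher game stats dataset, fetch that game's weather (condition, temperature, wind speed and wind direction) from the MLB Stats API live feed, and cache the results in a checkpoint CSV so an interrupted run can resume.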

In [10]:
import pandas as pd

# Read the pitcher game stats table from the processed data directory
df = pd.read_csv('../../data/processed/pitcher_game_stats.csv')

# Collapse the game_pk column to its distinct values (one entry per game)
game_pks = pd.unique(df['game_pk'])

In [13]:
import requests
import time
import numpy as np
from pathlib import Path

# On-disk cache of the weather rows collected so far
checkpoint_file = Path('../../data/interim/game_weather_checkpoint.csv')

if not checkpoint_file.exists():
    # First run: make sure the target directory is there, then write a
    # header-only CSV so the checkpoint file exists from here on
    checkpoint_file.parent.mkdir(parents=True, exist_ok=True)
    weather_df = pd.DataFrame(columns=['game_pk', 'condition', 'temp', 'wind_speed', 'wind_direction'])
    weather_df.to_csv(checkpoint_file, index=False)
    processed_games = set()
else:
    # Resume: pick up the rows saved by an earlier run
    weather_df = pd.read_csv(checkpoint_file)
    print(f'Loaded dataset with {weather_df.shape[0]} entries')
    processed_games = set(weather_df['game_pk'])

# Only games without a checkpoint row still need to be fetched
remaining_games = [game_pk for game_pk in game_pks if game_pk not in processed_games]

# Human-readable notes about games that could not be fetched or parsed cleanly
anomalies = []

# Number of games fetched between checkpoint saves / rate-limit pauses
batch_size = 50
# Seconds to wait on a single request before giving up, so one stalled
# connection cannot hang the whole run
request_timeout = 10


def _parse_temperature(raw_temp):
    """Convert the feed's temperature value to an int.

    Returns (temp, problem). On success problem is None; when the value is
    missing or not convertible, temp is np.nan and problem is the text for
    the anomaly log.
    """
    try:
        return int(raw_temp), None
    except (ValueError, TypeError):
        # int(np.nan) raises ValueError, so a missing temperature lands here too
        return np.nan, 'Invalid temperature'


def _parse_wind(wind):
    """Split the feed's wind string into (wind_speed, wind_direction, problem).

    The string is expected to look like "<speed> <unit>, <direction>"
    (format assumed from the parsing below - TODO confirm against the API).
    A speed whose text contains 'kph' is multiplied by 0.621371 and truncated
    to an int. problem is None on success, otherwise the text for the
    anomaly log.
    """
    if not wind:
        # NOTE(review): pandas' default read_csv NA handling treats the literal
        # 'None' as missing, so this value comes back as NaN when the
        # checkpoint is reloaded - confirm downstream code expects that.
        return 0, 'None', None

    wind_parts = wind.split(',')
    if len(wind_parts) != 2:
        return np.nan, np.nan, 'Invalid wind format'

    speed_str = wind_parts[0].strip().lower()
    try:
        # Keep only the digits; int('') raises ValueError when there are none
        speed = int(''.join(c for c in speed_str if c.isdigit()))
    except ValueError:
        return np.nan, np.nan, 'Invalid wind speed'

    if 'kph' in speed_str:
        speed = int(speed * 0.621371)
    return speed, wind_parts[1].strip(), None


def _weather_row(game_pk, data):
    """Build one checkpoint row from a decoded live-feed payload.

    Returns (row, problems). row is None when the payload has no
    gameData/weather section; problems lists the anomaly texts found.
    """
    if 'gameData' not in data or 'weather' not in data['gameData']:
        return None, ['Missing weather data']

    weather = data['gameData']['weather']
    temp, temp_problem = _parse_temperature(weather.get('temp', np.nan))
    wind_speed, wind_direction, wind_problem = _parse_wind(weather.get('wind', ''))

    row = {
        'game_pk': game_pk,
        'condition': weather.get('condition', np.nan),
        'temp': temp,
        'wind_speed': wind_speed,
        'wind_direction': wind_direction
    }
    return row, [problem for problem in (temp_problem, wind_problem) if problem]


# Requests are issued one after another; the session reuses the connection
# to the API host instead of opening a new one per game.
with requests.Session() as session:
    for i in range(0, len(remaining_games), batch_size):
        batch = remaining_games[i:i + batch_size]
        batch_results = []

        for game_pk in batch:
            try:
                response = session.get(
                    f"https://statsapi.mlb.com/api/v1.1/game/{game_pk}/feed/live",
                    timeout=request_timeout,
                )
            except requests.RequestException as e:
                # Network failure or timeout: log it and move on. No row is
                # written, so the game is picked up again on the next run.
                anomalies.append(f"Game {game_pk}: Request failed: {e}")
                continue

            if response.status_code != 200:
                # No row is written here either, so the game is retried next run
                anomalies.append(f"Game {game_pk}: Bad status code {response.status_code}")
                continue

            try:
                row, problems = _weather_row(game_pk, response.json())
            except Exception as e:
                # Undecodable or unexpectedly shaped payload: record an all-NaN
                # row, which marks the game as processed in the checkpoint
                anomalies.append(f"Game {game_pk}: Error processing data: {str(e)}")
                batch_results.append({
                    'game_pk': game_pk,
                    'condition': np.nan,
                    'temp': np.nan,
                    'wind_speed': np.nan,
                    'wind_direction': np.nan
                })
                continue

            anomalies.extend(f"Game {game_pk}: {problem}" for problem in problems)
            if row is not None:
                batch_results.append(row)

        # Append batch results and save the checkpoint; nothing to do when
        # every game in the batch was skipped
        if batch_results:
            batch_df = pd.DataFrame(batch_results)
            if weather_df.empty:
                # Avoid concatenating onto the empty placeholder frame
                # (recent pandas versions warn about that pattern)
                weather_df = batch_df
            else:
                weather_df = pd.concat([weather_df, batch_df], ignore_index=True)
            weather_df.to_csv(checkpoint_file, index=False)

        time.sleep(1)  # Rate limiting between batches

# Report every problem recorded during the run, one per line
print("\nAnomalies found:")
if anomalies:
    print("\n".join(anomalies))

Loaded dataset with 2850 entries

Anomalies found:


In [14]:
# Show the combined weather table: rows loaded from the checkpoint plus rows
# fetched in this run (the rendered output below is truncated in the middle)
weather_df

Unnamed: 0,game_pk,condition,temp,wind_speed,wind_direction
0,706955,Partly Cloudy,81,16,Varies
1,706822,Sunny,79,6,L To R
2,662021,Cloudy,50,20,Out To RF
3,661292,Roof Closed,64,0,
4,661405,Dome,72,0,
...,...,...,...,...,...
8283,748110,Partly Cloudy,72,6,R To L
8284,746896,Sunny,46,8,In From CF
8285,745282,Partly Cloudy,62,8,Out To CF
8286,745849,Clear,84,6,In From LF
