### References 
1. Purucker, M. C.. “Neural network quarterbacking.” IEEE Potentials 15 (1996): 9-15.
2. PREDICTING MLB GAMES USING A MULTILAYER PERCEPTRON NEURAL NETWORK, RYAN LEWIS, MAY 1, 2023

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import pandas as pd


In [2]:
# 1. Load data
sb = pd.read_csv('csv/scoreboard_2024_06_2025.csv')
bt = pd.read_csv('csv/batter_2024_06_2025.csv')
pt = pd.read_csv('csv/pitcher_2024_06_2025.csv')

print("="*70)
print("[STEP 1] Load data")
print("="*70)
print(f"Input: 3 CSV files (scoreboard, batter, pitcher)")
print(f"Expected Output: DataFrames created for all 3 files")
print(f"Real Output:")
print(f"  - scoreboard shape: {sb.shape}, columns: {list(sb.columns[:5])}")
print(f"  - batter shape: {bt.shape}, columns: {list(bt.columns[:5])}")
print(f"  - pitcher shape: {pt.shape}, columns: {list(pt.columns[:5])}")
print()

# Sort by date and binarize win flag
sb = sb.sort_values(['year', 'month', 'day', 'starttime'])
sb['win_binary'] = (sb['result'] == 1).astype(int)

print("="*70)
print("[STEP 2] Sort by date and binarize win flag")
print("="*70)
print(f"Input: scoreboard DataFrame (before sort)")
print(f"Expected Output: date-sorted DataFrame + win_binary column")
print(f"Real Output:")
print(f"  - scoreboard shape: {sb.shape}")
print(f"  - win_binary distribution: {sb['win_binary'].value_counts().to_dict()}")
print(f"  - sample:\n{sb[['year', 'month', 'day', 'result', 'win_binary']].head()}")
print()

# 2. Aggregate batter data to team totals per game (idx)
team_batting = bt.groupby('idx').agg({
    'hit': 'sum',
    'bat_num': 'sum'
}).reset_index()

print("="*70)
print("[STEP 3] Aggregate batter data to team totals per game")
print("="*70)
print(f"Input: batter DataFrame shape {bt.shape}")
print(f"Expected Output: hit and bat_num summed by idx")
print(f"Real Output:")
print(f"  - team_batting shape: {team_batting.shape}")
print(f"  - hit stats: min={team_batting['hit'].min()}, max={team_batting['hit'].max()}, mean={team_batting['hit'].mean():.2f}")
print(f"  - sample:\n{team_batting.head()}")
print()

# Join with base game info
base_df = pd.merge(sb[['idx', 'team', 'year', 'month', 'day', 'home', 'away', 'r', 'win_binary', 'dbheader']], 
                   team_batting, on='idx', how='left')

print("="*70)
print("[STEP 4] Join base game info with team batting data")
print("="*70)
print(f"Input: scoreboard columns + team_batting")
print(f"Expected Output: DataFrame merged on idx")
print(f"Real Output:")
print(f"  - base_df shape: {base_df.shape}")
print(f"  - columns: {list(base_df.columns)}")
print(f"  - sample:\n{base_df[['idx', 'team', 'hit', 'bat_num']].head()}")
print()

# 3. Match opponent runs scored/allowed (includes doubleheader dbheader)
# Add dbheader into game_id to distinguish doubleheaders.
base_df['game_id'] = (base_df['year'].astype(str) + 
                     base_df['month'].astype(str).str.zfill(2) + 
                     base_df['day'].astype(str).str.zfill(2) + "_" + 
                     base_df['home'] + "_" + base_df['away'] + "_" + 
                     base_df['dbheader'].astype(str))

print("="*70)
print("[STEP 5] Create game_id (with doubleheader)")
print("="*70)
print(f"Input: year, month, day, home, away, dbheader")
print(f"Expected Output: game_id formatted as YYYYMMDD_HOME_AWAY_DBHEADER")
print(f"Real Output:")
print(f"  - game_id sample: {base_df['game_id'].iloc[:3].tolist()}")
print(f"  - unique games: {base_df['game_id'].nunique()}")
print()

opp_scores = base_df[['game_id', 'team', 'r']].rename(columns={'team': 'opp_team', 'r': 'runs_allowed'})
base_df = pd.merge(base_df, opp_scores, on='game_id')
base_df = base_df[base_df['team'] != base_df['opp_team']].copy()

print("="*70)
print("[STEP 6] Match opponent runs allowed")
print("="*70)
print(f"Input: base_df game_id, team, r")
print(f"Expected Output: two rows per game (home/away) with runs_allowed")
print(f"Real Output:")
print(f"  - base_df shape (after filter): {base_df.shape}")
print(f"  - runs_allowed stats: min={base_df['runs_allowed'].min()}, max={base_df['runs_allowed'].max()}, mean={base_df['runs_allowed'].mean():.2f}")
print(f"  - sample:\n{base_df[['game_id', 'team', 'opp_team', 'r', 'runs_allowed']].head()}")
print()

# 4. [Features 1, 2, 3] Rolling averages over last 30 games (keep columns via transform)
# Note: if only July games exist and there are fewer than 30, set min_periods to allow NaN.
base_df = base_df.sort_values(['team', 'year', 'month', 'day', 'game_id'])

# Average runs scored and allowed
base_df['f1_avg_runs_scored_30'] = base_df.groupby('team')['r'].transform(lambda x: x.shift(1).rolling(window=30, min_periods=1).mean())
base_df['f2_avg_runs_allowed_30'] = base_df.groupby('team')['runs_allowed'].transform(lambda x: x.shift(1).rolling(window=30, min_periods=1).mean())

print("="*70)
print("[STEP 7-1] Rolling average over last 30 games (runs scored/allowed)")
print("="*70)
print(f"Input: team-sorted r and runs_allowed")
print(f"Expected Output: mean runs scored/allowed over last 30 games (NaN allowed)")
print(f"Real Output:")
print(f"  - f1_avg_runs_scored_30: NaN={base_df['f1_avg_runs_scored_30'].isna().sum()}, min={base_df['f1_avg_runs_scored_30'].min():.2f}, max={base_df['f1_avg_runs_scored_30'].max():.2f}")
print(f"  - f2_avg_runs_allowed_30: NaN={base_df['f2_avg_runs_allowed_30'].isna().sum()}, min={base_df['f2_avg_runs_allowed_30'].min():.2f}, max={base_df['f2_avg_runs_allowed_30'].max():.2f}")
print(f"  - sample:\n{base_df[['team', 'f1_avg_runs_scored_30', 'f2_avg_runs_allowed_30']].head(10)}")
print()

# Team batting average (cumulative sums are more accurate)
rolling_hits = base_df.groupby('team')['hit'].transform(lambda x: x.shift(1).rolling(window=30, min_periods=1).sum())
rolling_ab = base_df.groupby('team')['bat_num'].transform(lambda x: x.shift(1).rolling(window=30, min_periods=1).sum())
base_df['f3_team_batting_avg_30'] = rolling_hits / rolling_ab

print("="*70)
print("[STEP 7-2] Team batting average over last 30 games")
print("="*70)
print(f"Input: 30-game rolling sums of team hit and bat_num")
print(f"Expected Output: team batting average = rolling_hits / rolling_ab")
print(f"Real Output:")
print(f"  - f3_team_batting_avg_30: NaN={base_df['f3_team_batting_avg_30'].isna().sum()}, min={base_df['f3_team_batting_avg_30'].min():.4f}, max={base_df['f3_team_batting_avg_30'].max():.4f}")
print(f"  - sample:\n{base_df[['team', 'hit', 'bat_num', 'f3_team_batting_avg_30']].head(10)}")
print()

# 5. [Feature 4] Starting pitcher season average runs allowed
starters = pt[pt['mound'] == 1][['idx', 'name', 'losescore']].copy()
starters = pd.merge(starters, sb[['idx', 'year', 'month', 'day']], on='idx')
starters = starters.sort_values(['name', 'year', 'month', 'day'])
starters['f4_pitcher_runs_avg'] = starters.groupby('name')['losescore'].transform(lambda x: x.shift(1).expanding().mean())

print("="*70)
print("[STEP 8] Starting pitcher season average runs allowed")
print("="*70)
print(f"Input: pitcher data (filtered to mound==1)")
print(f"Expected Output: cumulative mean runs allowed per pitcher (expanding mean)")
print(f"Real Output:")
print(f"  - starters shape: {starters.shape}")
print(f"  - f4_pitcher_runs_avg: NaN={starters['f4_pitcher_runs_avg'].isna().sum()}, min={starters['f4_pitcher_runs_avg'].min():.2f}, max={starters['f4_pitcher_runs_avg'].max():.2f}")
print(f"  - sample:\n{starters[['name', 'losescore', 'f4_pitcher_runs_avg']].head(10)}")
print()

base_df = pd.merge(base_df, starters[['idx', 'f4_pitcher_runs_avg']], on='idx', how='left')

print("="*70)
print("[STEP 8-2] Add pitcher info to base_df")
print("="*70)
print(f"Input: base_df + starters f4_pitcher_runs_avg")
print(f"Expected Output: merge on idx")
print(f"Real Output:")
print(f"  - base_df shape: {base_df.shape}")
print(f"  - f4_pitcher_runs_avg: NaN={base_df['f4_pitcher_runs_avg'].isna().sum()}")
print()

# 6. [Feature 5] Team overall win rate
base_df['f5_total_win_pct'] = base_df.groupby('team')['win_binary'].transform(lambda x: x.shift(1).expanding().mean())

print("="*70)
print("[STEP 9] Team overall win rate")
print("="*70)
print(f"Input: team win_binary")
print(f"Expected Output: cumulative win rate (expanding mean)")
print(f"Real Output:")
print(f"  - f5_total_win_pct: NaN={base_df['f5_total_win_pct'].isna().sum()}, min={base_df['f5_total_win_pct'].min():.4f}, max={base_df['f5_total_win_pct'].max():.4f}")
print(f"  - sample:\n{base_df[['team', 'win_binary', 'f5_total_win_pct']].head(10)}")
print()

# 7. [Feature 6] Home/away win rate
def calc_ha_win_pct(df):
    df = df.copy()
    df['is_home'] = (df['team'] == df['home'])
    # Compute win rate separately for home vs away for each team
    df['f6_ha_win_pct'] = df.groupby(['team', 'is_home'])['win_binary'].transform(lambda x: x.shift(1).expanding().mean())
    return df

base_df = calc_ha_win_pct(base_df)

print("="*70)
print("[STEP 10] Home/away win rate")
print("="*70)
print(f"Input: team and is_home (home/away flag)")
print(f"Expected Output: cumulative win rate by team-is_home")
print(f"Real Output:")
print(f"  - f6_ha_win_pct: NaN={base_df['f6_ha_win_pct'].isna().sum()}, min={base_df['f6_ha_win_pct'].min():.4f}, max={base_df['f6_ha_win_pct'].max():.4f}")
print(f"  - is_home distribution: {base_df['is_home'].value_counts().to_dict()}")
print(f"  - sample:\n{base_df[['team', 'is_home', 'win_binary', 'f6_ha_win_pct']].head(10)}")
print()

# 8. Build final MLP dataset
feature_cols = ['f1_avg_runs_scored_30', 'f2_avg_runs_allowed_30', 'f3_team_batting_avg_30', 
                'f4_pitcher_runs_avg', 'f5_total_win_pct', 'f6_ha_win_pct']

# Split home-team and away-team data then join
home_df = base_df[base_df['team'] == base_df['home']][['game_id', 'win_binary'] + feature_cols]
home_df.columns = ['game_id', 'home_win'] + ['h_' + c for c in feature_cols]

away_df = base_df[base_df['team'] == base_df['away']][['game_id'] + feature_cols]
away_df.columns = ['game_id'] + ['a_' + c for c in feature_cols]

print("="*70)
print("[STEP 11-1] Split home/away data")
print("="*70)
print(f"Input: all game records in base_df")
print(f"Expected Output: separate rows for home and away teams")
print(f"Real Output:")
print(f"  - home_df shape: {home_df.shape}")
print(f"  - away_df shape: {away_df.shape}")
print(f"  - home_df sample:\n{home_df.head()}")
print(f"  - away_df sample:\n{away_df.head()}")
print()

final_dataset = pd.merge(home_df, away_df, on='game_id').dropna()

print("="*70)
print("[STEP 11-2] Merge home/away data and drop NaN")
print("="*70)
print(f"Input: home_df, away_df (on game_id)")
print(f"Expected Output: rows with both home/away features per game_id")
print(f"Real Output:")
print(f"  - rows before merge: {len(home_df)}")
print(f"  - rows after merge (before dropna): {len(pd.merge(home_df, away_df, on='game_id'))}")
print(f"  - final_dataset shape (after dropna): {final_dataset.shape}")
print(f"  - columns: {list(final_dataset.columns)}")
print()

# Target: 0 if home team wins, 1 if away team wins
final_dataset['target'] = (final_dataset['home_win'] == 0).astype(int)
final_dataset = final_dataset.drop(columns=['home_win'])

print("="*70)
print("[STEP 12] Create target label")
print("="*70)
print(f"Input: home_win (home win=1, away win=0)")
print(f"Expected Output: target (home win=0, away win=1)")
print(f"Real Output:")
print(f"  - target distribution: {final_dataset['target'].value_counts().to_dict()}")
print(f"  - target ratio: home wins (0)={final_dataset[final_dataset['target']==0].shape[0]/len(final_dataset):.2%}, away wins (1)={final_dataset[final_dataset['target']==1].shape[0]/len(final_dataset):.2%}")
print(f"  - sample:\n{final_dataset[['game_id', 'target']].head()}")
print()

# Save
final_dataset.to_csv('kbo_mlp_training_data.csv', index=False)

print("="*70)
print("[STEP 13] Save final dataset")
print("="*70)
print(f"Input: final_dataset")
print(f"Expected Output: kbo_mlp_training_data.csv saved")
print(f"Real Output:")
print(f"  - filename: kbo_mlp_training_data.csv")
print(f"  - final shape: {final_dataset.shape}")
print(f"  - column count: {len(final_dataset.columns)}")
print(f"  - columns: {list(final_dataset.columns)}")
print(f"\nTraining dataset built: {len(final_dataset)} games included")
print("="*70)

[STEP 1] Load data
Input: 3 CSV files (scoreboard, batter, pitcher)
Expected Output: DataFrames created for all 3 files
Real Output:
  - scoreboard shape: (2434, 37), columns: ['idx', 'team', 'result', 'i_1', 'i_2']
  - batter shape: (31465, 26), columns: ['idx', 'name', 'team', 'position', 'i_1']
  - pitcher shape: (11983, 15), columns: ['idx', 'name', 'team', 'mound', 'inning']

[STEP 2] Sort by date and binarize win flag
Input: scoreboard DataFrame (before sort)
Expected Output: date-sorted DataFrame + win_binary column
Real Output:
  - scoreboard shape: (2434, 38)
  - win_binary distribution: {0: 1247, 1: 1187}
  - sample:
   year  month  day  result  win_binary
0  2024      6    1       1           1
1  2024      6    1      -1           0
4  2024      6    1      -1           0
5  2024      6    1       1           1
6  2024      6    1      -1           0

[STEP 3] Aggregate batter data to team totals per game
Input: batter DataFrame shape (31465, 26)
Expected Output: hit and ba

### scikit-learn

In [3]:
# 1. Load preprocessed data
df = pd.read_csv('kbo_mlp_training_data.csv')

In [4]:
# 2. Split features (X) and labels (y)
# game_id is an identifier and excluded; predict target (0: home win, 1: away win)
X = df.drop(columns=['game_id', 'target'])
y = df['target']

In [5]:
# 3. Split data (75% train, 25% test)
# shuffle=False keeps chronological order for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

# 4. Scale data with StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# 5. Compare three optimization algorithms (solvers)
solvers = ['sgd', 'adam', 'lbfgs']
results = {}

for s in solvers:
    # Use config from reference: hidden layer (3,), activation relu, max_iter 1000
    clf = MLPClassifier(
    hidden_layer_sizes=(3,),
    activation='relu',
    solver=s,
    max_iter=1000, 
    random_state=42
    )
    
    # Train model
    clf.fit(X_train_scaled, y_train)
    
    # Predict and compute accuracy
    y_pred = clf.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    results[s] = acc
    print(f"Algorithm: {s.upper()} | Accuracy: {acc:.1%}")

Algorithm: SGD | Accuracy: 52.2%
Algorithm: ADAM | Accuracy: 56.2%
Algorithm: LBFGS | Accuracy: 49.3%


In [7]:
# Identify algorithm with highest accuracy
best_solver = max(results, key=results.get)
print(f"\nRecommended algorithm: {best_solver.upper()} (accuracy {results[best_solver]:.1%})")


Recommended algorithm: ADAM (accuracy 56.2%)


In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# 1. Scale data (critical for neural nets)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. Define parameter grid to test
param_grid = {
    'hidden_layer_sizes': [(3,), (6,), (12,), (6, 3), (12, 6)], # try wider or deeper
    'solver': ['sgd', 'adam', 'lbfgs'],
    'activation': ['relu', 'tanh'],
    'max_iter': [1000, 5000] # allow sufficient iterations
}

# 3. Run grid search (test all combinations to find the best)
mlp = MLPClassifier(random_state=1)
grid_search = GridSearchCV(mlp, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# 4. Report results
print(f"Best CV accuracy: {grid_search.best_score_:.1%}")
print(f"Best params: {grid_search.best_params_}")

# 5. Evaluate best model on test data
best_model = grid_search.best_estimator_
test_acc = best_model.score(X_test_scaled, y_test)
print(f"Final test accuracy: {test_acc:.1%}")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of it

Best CV accuracy: 54.9%
Best params: {'activation': 'tanh', 'hidden_layer_sizes': (6,), 'max_iter': 1000, 'solver': 'lbfgs'}
Final test accuracy: 51.1%


In [9]:
# 1. Load and preprocess data
df = pd.read_csv('kbo_mlp_training_data.csv')
X = df.drop(columns=['game_id', 'target']).values
y = df['target'].values.reshape(-1, 1)

# Split data (keep chronological order)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to tensors
X_train_t = torch.FloatTensor(X_train_scaled)
y_train_t = torch.FloatTensor(y_train)
X_test_t = torch.FloatTensor(X_test_scaled)
y_test_t = torch.FloatTensor(y_test)

train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=72, shuffle=True)

# 2. Model design (add Dropout for regularization)
class AdvancedKBOPredictor(nn.Module):
    def __init__(self):
        super(AdvancedKBOPredictor, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(12, 8),
            nn.Tanh(),
            nn.Dropout(0.2),
            nn.Linear(8, 4),
            nn.Tanh(),
            nn.Linear(4, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.layers(x)

# Model, loss, optimizer
model = AdvancedKBOPredictor()
criterion = nn.BCELoss()
# weight_decay adds L2 regularization to keep weights controlled
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-2)

# 3. Training loop
epochs = 200
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    # Print evaluation every 20 epochs
    if (epoch + 1) % 20 == 0:
        model.eval()
        with torch.no_grad():
            train_outputs = model(X_train_t)
            train_acc = ((train_outputs > 0.5).float() == y_train_t).float().mean()
            
            test_outputs = model(X_test_t)
            test_acc = ((test_outputs > 0.5).float() == y_test_t).float().mean()
            
            print(f"Epoch [{epoch+1}/{epochs}] Loss: {loss.item():.4f} | Train Acc: {train_acc:.1%} | Test Acc: {test_acc:.1%}")

# 4. Final evaluation
model.eval()
with torch.no_grad():
    final_pred = (model(X_test_t) > 0.5).float()
    print(f"\n[Final test accuracy]: {accuracy_score(y_test, final_pred):.2%}")

Epoch [20/200] Loss: 0.6814 | Train Acc: 51.7% | Test Acc: 54.8%
Epoch [40/200] Loss: 0.6784 | Train Acc: 53.5% | Test Acc: 48.8%
Epoch [60/200] Loss: 0.6905 | Train Acc: 53.7% | Test Acc: 50.7%
Epoch [80/200] Loss: 0.6946 | Train Acc: 53.7% | Test Acc: 51.2%
Epoch [100/200] Loss: 0.6976 | Train Acc: 54.6% | Test Acc: 53.5%
Epoch [120/200] Loss: 0.6813 | Train Acc: 54.0% | Test Acc: 53.9%
Epoch [140/200] Loss: 0.6842 | Train Acc: 54.5% | Test Acc: 56.7%
Epoch [160/200] Loss: 0.7154 | Train Acc: 55.4% | Test Acc: 55.8%
Epoch [180/200] Loss: 0.6792 | Train Acc: 54.5% | Test Acc: 54.4%
Epoch [200/200] Loss: 0.6980 | Train Acc: 54.8% | Test Acc: 55.3%

[Final test accuracy]: 55.30%


### Result: 
Loss
- How wrong the model is(lower is better)

Train Accuracy
- Practice test score
- 54: the model has only barely learned a pattern

Test Accuracy
- Real exam score
- 55
Current model status : Not overfitting, Not a lack of training epochs, The information, features, is weak, or the model is too simple

232425 notebook has more data, but performance with 24June - 2025 data is better