In [24]:
# 1. Import necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
import os


# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [25]:
# Check the current working directory
current_dir = os.getcwd()

# Define the file paths
hitters_file_path = os.path.join(current_dir, '..', 'data', 'cleaned_matched_hitters.csv')
pitchers_file_path = os.path.join(current_dir, '..', 'data', 'cleaned_matched_pitchers.csv')
# Load the datasets
hitters_df = pd.read_csv(hitters_file_path)
pitchers_df = pd.read_csv(pitchers_file_path)

In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Define a function to apply label encoding
def label_encode(df, columns):
    label_encoder = LabelEncoder()
    for col in columns:
        df[col] = label_encoder.fit_transform(df[col].astype(str))  # Ensuring it's a string before encoding
    return df

def preprocess_hitters(hitters_df, filter_years=True):
    # Step 1: Filter out rows with Year value of 2023 or 2024 if filter_years is True
    if filter_years:
        hitters_df = hitters_df[~hitters_df['Year'].isin([2023, 2024])]

    # Step 2: Drop irrelevant columns except 'Name'
    drop_cols = ['Avg EV', 'Hard Hit%', 'Max EV', 'ETA', 'Risk', 'Sign Yr', 'Year', 'MLB_Debut', 'Games_Played', 'Years_Before_Debut']
    hitters_df = hitters_df.drop(columns=drop_cols)

    # Step 3: Impute placeholder value for players who have not debuted (Years_Played == 0)
    hitters_df['Years_Played'] = hitters_df['Years_Played'].replace(0, np.nan)
    hitters_df['Years_Played'] = hitters_df['Years_Played'].fillna(-1)

    # Step 4: Calculate WAR_per_Year
    hitters_df['WAR_per_Year'] = hitters_df.apply(lambda row: row['WAR_Sum'] / row['Years_Played'] if row['Years_Played'] > 0 else 0, axis=1)
    hitters_df = hitters_df.drop(columns=['WAR_Sum', 'Years_Played'])

    # Step 5: Handle missing values in Top 100 (fill missing with 999 for unranked players)
    hitters_df['Top 100'] = hitters_df['Top 100'].fillna(999)

    # Step 6: Impute or handle other missing values
    hitters_df['Bat Ctrl'] = hitters_df['Bat Ctrl'].fillna(-1)
    hitters_df['Pitch Sel'] = hitters_df['Pitch Sel'].fillna(-1)
    hitters_df['Fld'] = hitters_df['Fld'].fillna(hitters_df['Fld'].median())
    hitters_df['Bonus'] = hitters_df['Bonus'].fillna(0)
    
    for col in ['Age', 'Ht', 'Wt']:
        hitters_df[col] = hitters_df[col].fillna(hitters_df[col].median())

    # Fill categorical values with 'Unknown' for missing entries
    hitters_df['B'] = hitters_df['B'].fillna('Unknown')
    hitters_df['T'] = hitters_df['T'].fillna('Unknown')
    hitters_df['Sign Mkt'] = hitters_df['Sign Mkt'].fillna('Unknown')

    # Step 7: Label encode categorical columns
    categorical_cols = ['Pos', 'Org', 'B', 'T', 'Sign Mkt']
    hitters_df = label_encode(hitters_df, categorical_cols)

    # Step 8: Scale the features
    scaler = StandardScaler()
    hitters_df_scaled = pd.DataFrame(scaler.fit_transform(hitters_df.drop(columns=['Name'])), columns=hitters_df.drop(columns=['Name']).columns)

    return hitters_df_scaled, scaler

def preprocess_pitchers(pitchers_df, filter_years=True):
    # Step 1: Filter out rows with Year value of 2023 or 2024 if filter_years is True
    if filter_years:
        pitchers_df = pitchers_df[~pitchers_df['Year'].isin([2023, 2024])]

    # Step 2: Drop irrelevant columns except 'Name'
    drop_cols = ['RPM Break', 'RPM FB', 'TJ Date', 'ETA', 'Risk', 'Sign Yr', 'Year', 'MLB_Debut', 'Fld', 'Avg FB Velo', 'Games_Played','Years_Before_Debut']
    pitchers_df = pitchers_df.drop(columns=drop_cols)

    # Step 3: Impute placeholder value for players who have not debuted (Years_Played == 0)
    pitchers_df['Years_Played'] = pitchers_df['Years_Played'].replace(0, np.nan)
    pitchers_df['Years_Played'] = pitchers_df['Years_Played'].fillna(-1)

    # Step 4: Calculate WAR_per_Year
    pitchers_df['WAR_per_Year'] = pitchers_df.apply(lambda row: row['WAR_Sum'] / row['Years_Played'] if row['Years_Played'] > 0 else 0, axis=1)
    pitchers_df = pitchers_df.drop(columns=['WAR_Sum', 'Years_Played'])

    # Step 5: Handle missing values in Top 100 (fill missing with 999 for unranked players)
    pitchers_df['Top 100'] = pitchers_df['Top 100'].fillna(999)

    # Step 6: Handle missing values for pitch-specific skills
    pitch_skill_cols = ['SL', 'CB', 'CH', 'Tops']
    for col in pitch_skill_cols:
        pitchers_df[col] = pitchers_df[col].fillna(0)
    
    pitchers_df['CMD'] = pitchers_df['CMD'].fillna(pitchers_df['CMD'].median())
    pitchers_df['Bonus'] = pitchers_df['Bonus'].fillna(0)

    for col in ['Age', 'Ht', 'Wt']:
        pitchers_df[col] = pitchers_df[col].fillna(pitchers_df[col].median())
    
    pitchers_df['B'] = pitchers_df['B'].fillna('Unknown')
    pitchers_df['T'] = pitchers_df['T'].fillna('Unknown')
    pitchers_df['FB Type'] = pitchers_df['FB Type'].fillna('Unknown')
    pitchers_df['Sign Mkt'] = pitchers_df['Sign Mkt'].fillna('Unknown')

    # Step 7: Label encode categorical columns
    categorical_cols = ['Org', 'Pos', 'B', 'T', 'FB Type', 'Sign Mkt']
    pitchers_df = label_encode(pitchers_df, categorical_cols)

    # Step 8: Scale the features
    scaler = StandardScaler()
    pitchers_df_scaled = pd.DataFrame(scaler.fit_transform(pitchers_df.drop(columns=['Name'])), columns=pitchers_df.drop(columns=['Name']).columns)

    return pitchers_df_scaled, scaler
# Apply preprocessing functions
hitters_df, scaler_hitters = preprocess_hitters(hitters_df)
pitchers_df, scaler_pitchers = preprocess_pitchers(pitchers_df)

# Extract the target variable and features
# For hitters
y_train_hit = hitters_df['WAR_per_Year']
X_train_hit = hitters_df.drop(columns=['WAR_per_Year'])

# For pitchers
y_train_pitch = pitchers_df['WAR_per_Year']
X_train_pitch = pitchers_df.drop(columns=['WAR_per_Year'])

# Split data into training and validation sets
X_train_hit, X_val_hit, y_train_hit, y_val_hit = train_test_split(X_train_hit, y_train_hit, test_size=0.2, random_state=42)
X_train_pitch, X_val_pitch, y_train_pitch, y_val_pitch = train_test_split(X_train_pitch, y_train_pitch, test_size=0.2, random_state=42)

# Ensure all columns are numeric
X_train_hit = X_train_hit.apply(pd.to_numeric, errors='ignore')
X_val_hit = X_val_hit.apply(pd.to_numeric, errors='ignore')
X_train_pitch = X_train_pitch.apply(pd.to_numeric, errors='ignore')
X_val_pitch = X_val_pitch.apply(pd.to_numeric, errors='ignore')

# Fill NaN values with the median of each column
X_train_hit = X_train_hit.fillna(X_train_hit.median())
X_val_hit = X_val_hit.fillna(X_val_hit.median())
X_train_pitch = X_train_pitch.fillna(X_train_pitch.median())
X_val_pitch = X_val_pitch.fillna(X_val_pitch.median())

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class WARPerYearPredictor(nn.Module):
    def __init__(self, input_dim):
        super(WARPerYearPredictor, self).__init__()
        
        # Define the layers
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 32)
        self.fc6 = nn.Linear(32, 1)
        
        # Define dropout layers for regularization
        self.dropout = nn.Dropout(p=0.5)
        
    def forward(self, x):
        # Apply the first fully connected layer and ReLU activation
        x = F.relu(self.fc1(x))
        
        # Apply dropout
        x = self.dropout(x)
        
        # Apply the second fully connected layer and ReLU activation
        x = F.relu(self.fc2(x))
        
        # Apply dropout
        x = self.dropout(x)
        
        # Apply the third fully connected layer and ReLU activation
        x = F.relu(self.fc3(x))
        
        # Apply dropout
        x = self.dropout(x)
        
        # Apply the fourth fully connected layer and ReLU activation
        x = F.relu(self.fc4(x))
        
        # Apply dropout
        x = self.dropout(x)
        
        # Apply the fifth fully connected layer and ReLU activation
        x = F.relu(self.fc5(x))
        
        # Apply the final fully connected layer
        x = self.fc6(x)
        
        return x

In [28]:
import torch.optim as optim

# Define the input dimension based on the number of features in the dataset
input_dim_hitters = X_train_hit.shape[1]
input_dim_pitchers = X_train_pitch.shape[1]

# Instantiate the model for hitters and pitchers
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_hitters = WARPerYearPredictor(input_dim=input_dim_hitters).to(device)
model_pitchers = WARPerYearPredictor(input_dim=input_dim_pitchers).to(device)

# Define hyperparameters
learning_rate = 0.001
num_epochs = 100
batch_size = 32

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer_hitters = optim.Adam(model_hitters.parameters(), lr=learning_rate)
optimizer_pitchers = optim.Adam(model_pitchers.parameters(), lr=learning_rate)


In [29]:
def train_model(model, optimizer, criterion, X_train, y_train, X_val, y_val, num_epochs, batch_size, device):
    model.train()
    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        epoch_train_loss = 0.0
        for i in range(0, len(X_train), batch_size):
            X_batch = torch.tensor(X_train[i:i+batch_size].values, dtype=torch.float32).to(device)
            y_batch = torch.tensor(y_train[i:i+batch_size].values, dtype=torch.float32).to(device).view(-1, 1)

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            epoch_train_loss += loss.item()

        train_losses.append(epoch_train_loss / len(X_train))

        # Validation phase
        model.eval()
        epoch_val_loss = 0.0
        with torch.no_grad():
            for i in range(0, len(X_val), batch_size):
                X_batch = torch.tensor(X_val[i:i+batch_size].values, dtype=torch.float32).to(device)
                y_batch = torch.tensor(y_val[i:i+batch_size].values, dtype=torch.float32).to(device).view(-1, 1)

                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                epoch_val_loss += loss.item()

        val_losses.append(epoch_val_loss / len(X_val))

        # Print loss every 5 epochs
        if (epoch + 1) % 5 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}')

    return train_losses, val_losses

# Train the model for hitters
train_losses_hitters, val_losses_hitters = train_model(
    model=model_hitters,
    optimizer=optimizer_hitters,
    criterion=criterion,
    X_train=X_train_hit,
    y_train=y_train_hit,
    X_val=X_val_hit,
    y_val=y_val_hit,
    num_epochs=num_epochs,
    batch_size=batch_size,
    device=device
)

# Train the model for pitchers
train_losses_pitchers, val_losses_pitchers = train_model(
    model=model_pitchers,
    optimizer=optimizer_pitchers,
    criterion=criterion,
    X_train=X_train_pitch,
    y_train=y_train_pitch,
    X_val=X_val_pitch,
    y_val=y_val_pitch,
    num_epochs=num_epochs,
    batch_size=batch_size,
    device=device
)

Epoch [5/100], Train Loss: 0.0271, Val Loss: 0.0226
Epoch [10/100], Train Loss: 0.0255, Val Loss: 0.0234
Epoch [15/100], Train Loss: 0.0248, Val Loss: 0.0226
Epoch [20/100], Train Loss: 0.0239, Val Loss: 0.0220
Epoch [25/100], Train Loss: 0.0234, Val Loss: 0.0227
Epoch [30/100], Train Loss: 0.0209, Val Loss: 0.0213
Epoch [35/100], Train Loss: 0.0214, Val Loss: 0.0210
Epoch [40/100], Train Loss: 0.0197, Val Loss: 0.0215
Epoch [45/100], Train Loss: 0.0194, Val Loss: 0.0207
Epoch [50/100], Train Loss: 0.0190, Val Loss: 0.0204
Epoch [55/100], Train Loss: 0.0183, Val Loss: 0.0199
Epoch [60/100], Train Loss: 0.0163, Val Loss: 0.0189
Epoch [65/100], Train Loss: 0.0178, Val Loss: 0.0195
Epoch [70/100], Train Loss: 0.0171, Val Loss: 0.0190
Epoch [75/100], Train Loss: 0.0169, Val Loss: 0.0183
Epoch [80/100], Train Loss: 0.0164, Val Loss: 0.0186
Epoch [85/100], Train Loss: 0.0162, Val Loss: 0.0192
Epoch [90/100], Train Loss: 0.0156, Val Loss: 0.0191
Epoch [95/100], Train Loss: 0.0144, Val Loss: 0

In [41]:
# Reload the datasets
new_hitters_df = pd.read_csv(hitters_file_path)
new_pitchers_df = pd.read_csv(pitchers_file_path)

# Filter the data for rows where the "Year" is 2024
hitters_2024_df = new_hitters_df[new_hitters_df['Year'] == 2024]
pitchers_2024_df = new_pitchers_df[new_pitchers_df['Year'] == 2024]

# Extract the Name column
hitters_names_2024 = hitters_2024_df['Name']
pitchers_names_2024 = pitchers_2024_df['Name']

# Preprocess the data without filtering out the year 2024
hitters_2024_df, scaler_hitters_2024 = preprocess_hitters(hitters_2024_df, filter_years=False)
pitchers_2024_df, scaler_pitchers_2024 = preprocess_pitchers(pitchers_2024_df, filter_years=False)

# Ensure all columns are numeric
hitters_2024_df = hitters_2024_df.apply(pd.to_numeric, errors='ignore')
pitchers_2024_df = pitchers_2024_df.apply(pd.to_numeric, errors='ignore')

# Extract features (drop the target variable if it exists)
X_hitters_2024 = hitters_2024_df.drop(columns=['WAR_per_Year'], errors='ignore')
X_pitchers_2024 = pitchers_2024_df.drop(columns=['WAR_per_Year'], errors='ignore')

# Print the shape of the features to ensure they match the model's expectations
print("Shape of X_hitters_2024:", X_hitters_2024.shape)
print("Shape of X_pitchers_2024:", X_pitchers_2024.shape)
# Print column names
print(X_hitters_2024.columns)
print(X_pitchers_2024.columns)

# Convert to tensors
X_hitters_2024_tensor = torch.tensor(X_hitters_2024.values, dtype=torch.float32).to(device)
X_pitchers_2024_tensor = torch.tensor(X_pitchers_2024.values, dtype=torch.float32).to(device)

# Set the models to evaluation mode
model_hitters.eval()
model_pitchers.eval()

# Make predictions
with torch.no_grad():
    predictions_hitters_2024 = model_hitters(X_hitters_2024_tensor).cpu().numpy()
    predictions_pitchers_2024 = model_pitchers(X_pitchers_2024_tensor).cpu().numpy()

# Extrapolate the predicted WAR per year to 6 years
predicted_war_6_years_hitters = predictions_hitters_2024 * 6
predicted_war_6_years_pitchers = predictions_pitchers_2024 * 6

# Unscale the data
hitters_2024_unscaled = pd.DataFrame(scaler_hitters_2024.inverse_transform(hitters_2024_df), columns=hitters_2024_df.columns)
pitchers_2024_unscaled = pd.DataFrame(scaler_pitchers_2024.inverse_transform(pitchers_2024_df), columns=pitchers_2024_df.columns)

# Create DataFrames with the Name, Top 100, and extrapolated WAR
hitters_2024_results = pd.DataFrame({
    'Name': hitters_names_2024.values[:len(predicted_war_6_years_hitters)],
    'Predicted_WAR_6_Years': predicted_war_6_years_hitters.flatten(),
    'Top 100': hitters_2024_unscaled['Top 100'].values[:len(predicted_war_6_years_hitters)]
})

pitchers_2024_results = pd.DataFrame({
    'Name': pitchers_names_2024.values[:len(predicted_war_6_years_pitchers)],
    'Predicted_WAR_6_Years': predicted_war_6_years_pitchers.flatten(),
    'Top 100': pitchers_2024_unscaled['Top 100'].values[:len(predicted_war_6_years_pitchers)]
})

# Remove rows with Top 100 value of 999
hitters_2024_results = hitters_2024_results[hitters_2024_results['Top 100'] != 999]
pitchers_2024_results = pitchers_2024_results[pitchers_2024_results['Top 100'] != 999]

# Sort by Top 100
hitters_2024_results = hitters_2024_results.sort_values(by='Top 100')
pitchers_2024_results = pitchers_2024_results.sort_values(by='Top 100')

# Save the results to CSV files with a different encoder to maintain Latin players' names
hitters_2024_results.to_csv('../data/generated/hitters_2024_predicted_war_6_years.csv', index=False, encoding='utf-8-sig')
pitchers_2024_results.to_csv('../data/generated/pitchers_2024_predicted_war_6_years.csv', index=False, encoding='utf-8-sig')

Shape of X_hitters_2024: (237, 19)
Shape of X_pitchers_2024: (249, 19)
Index(['Bat Ctrl', 'Fld', 'Game', 'Hit', 'Org', 'Pitch Sel', 'Pos', 'Raw',
       'Spd', 'Age', 'B', 'Bonus', 'FV', 'Ht', 'Org Rk', 'Sign Mkt', 'T',
       'Top 100', 'Wt'],
      dtype='object')
Index(['CB', 'CH', 'CMD', 'FB', 'FB Type', 'Org', 'Pos', 'SL', 'Tops', 'Age',
       'B', 'Bonus', 'FV', 'Ht', 'Org Rk', 'Sign Mkt', 'T', 'Top 100', 'Wt'],
      dtype='object')


OSError: Cannot save file into a non-existent directory: 'data\generated'

In [38]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Function to calculate permutation feature importance
def permutation_feature_importance(model, X_val, y_val, device, n_repeats=5):
    baseline_preds = model(torch.tensor(X_val.values, dtype=torch.float32).to(device)).cpu().detach().numpy()
    baseline_error = mean_squared_error(y_val, baseline_preds)
    
    feature_importances = np.zeros(X_val.shape[1])
    
    for col in range(X_val.shape[1]):
        errors = []
        for _ in range(n_repeats):
            X_val_permuted = X_val.copy()
            X_val_permuted.iloc[:, col] = np.random.permutation(X_val_permuted.iloc[:, col])
            permuted_preds = model(torch.tensor(X_val_permuted.values, dtype=torch.float32).to(device)).cpu().detach().numpy()
            permuted_error = mean_squared_error(y_val, permuted_preds)
            errors.append(permuted_error)
        feature_importances[col] = np.mean(errors) - baseline_error
    
    return feature_importances

# Calculate feature importance for hitters
hitters_feature_importances = permutation_feature_importance(model_hitters, X_val_hit, y_val_hit, device)

# Calculate feature importance for pitchers
pitchers_feature_importances = permutation_feature_importance(model_pitchers, X_val_pitch, y_val_pitch, device)

# Create DataFrames for feature importances
hitters_feature_importance_df = pd.DataFrame({
    'Feature': X_val_hit.columns,
    'Importance': hitters_feature_importances
}).sort_values(by='Importance', ascending=False)

pitchers_feature_importance_df = pd.DataFrame({
    'Feature': X_val_pitch.columns,
    'Importance': pitchers_feature_importances
}).sort_values(by='Importance', ascending=False)

# Display the top 10 most important features for hitters and pitchers
print("Top 10 most important features for hitters:")
print(hitters_feature_importance_df.head(10))

print("\nTop 10 most important features for pitchers:")
print(pitchers_feature_importance_df.head(10))

Top 10 most important features for hitters:
     Feature  Importance
12        FV    0.158152
1        Fld    0.128054
18        Wt    0.087454
4        Org    0.074764
13        Ht    0.074427
17   Top 100    0.071089
3        Hit    0.070858
2       Game    0.067161
11     Bonus    0.064880
15  Sign Mkt    0.064323

Top 10 most important features for pitchers:
   Feature  Importance
13      Ht    0.196904
12      FV    0.178889
18      Wt    0.111948
5      Org    0.085500
11   Bonus    0.074292
2      CMD    0.073813
7       SL    0.073296
3       FB    0.064504
16       T    0.063721
1       CH    0.056556
