In [4]:
import pandas as pd
import numpy as np
import torch
from pathlib import Path
import sys
from dataclasses import dataclass
from typing import List, Dict, Optional
sys.path.append('..')

# Define configs first to avoid pickle error
# (presumably the checkpoint loaded below contains pickled instances of these
# classes, so they must exist before torch.load runs -- confirm against the
# training script).
@dataclass
class DataConfig:
    """Sequence-window settings and the list of position labels.

    ``positions`` may be omitted (or passed as ``None``), in which case the
    default eight-position list is filled in by ``__post_init__``. Each
    instance gets its own list object, so mutating one config's positions
    never affects another.
    """
    seq_length: int = 3
    min_seq_length: int = 1
    # None is a sentinel meaning "use the default list"; it is replaced in
    # __post_init__, hence Optional rather than a bare List[str].
    positions: Optional[List[str]] = None

    def __post_init__(self):
        """Replace a missing ``positions`` value with the default position list."""
        if self.positions is None:
            self.positions = ['1B', '2B', '3B', 'SS', 'LF', 'CF', 'RF', 'DH']

@dataclass
class ModelConfig:
    """Hyperparameters for the position model.

    The field names mirror the keyword arguments of the ``PositionPredictor``
    constructor, which is how this cell consumes them when rebuilding the
    model from a checkpoint.
    """
    input_dim: int = 5  # features per timestep: inn, drs, uzr, oaa, age
    hidden_dim: int = 128  # LSTM hidden state size
    num_layers: int = 2  # number of stacked LSTM layers
    output_dim: int = 8  # num positions (one output per position)
    dropout: float = 0.1  # dropout value handed to the LSTM
    
class PositionPredictor(torch.nn.Module):
    """LSTM followed by a linear head applied to the last timestep.

    Args:
        input_dim: Number of features per timestep.
        hidden_dim: LSTM hidden state size.
        num_layers: Number of stacked LSTM layers.
        output_dim: Size of the output vector produced by the linear head.
        dropout: Dropout between stacked LSTM layers (ignored for one layer).

    NOTE(review): the state dict stored in ``../models/position_predictor.pt``
    does not match this definition -- it carries ``metric_embed``,
    ``pos_embed``, ``attention.*`` and ``position_classifier.*`` parameters
    plus ``*_reverse`` (bidirectional) LSTM weights, and has no ``fc`` layer.
    ``load_state_dict`` therefore fails with this class. The original model
    class from the training code should be imported instead; its forward pass
    cannot be reconstructed from the key names alone.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, dropout=0.1):
        super().__init__()
        # Use torch.nn explicitly: this file imports `torch` but never binds
        # the bare name `nn`.
        # LSTM dropout only acts between stacked layers; passing a non-zero
        # value with a single layer has no effect and triggers a warning.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = torch.nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map a ``(batch, seq, input_dim)`` tensor to ``(batch, output_dim)``.

        Only the LSTM output at the final timestep is fed to the linear head.
        """
        lstm_out, _ = self.lstm(x)
        return self.fc(lstm_out[:, -1, :])

# Load pretrained model
# map_location='cpu' lets a checkpoint that was saved from a GPU be opened on
# a machine without CUDA; the model below is built on the CPU either way.
# NOTE(security): torch.load unpickles Python objects (the config entries are
# read via attribute access, so they are pickled objects rather than plain
# tensors). Only load checkpoints from a trusted source. Newer PyTorch
# releases default to weights_only=True and may require weights_only=False
# for this file -- confirm against the installed version.
checkpoint = torch.load('../models/position_predictor.pt', map_location='cpu')
model_config = checkpoint['model_config']
data_config = checkpoint['data_config']

model = PositionPredictor(
    input_dim=model_config.input_dim,
    hidden_dim=model_config.hidden_dim,
    num_layers=model_config.num_layers,
    output_dim=model_config.output_dim,
    dropout=model_config.dropout,
)

# NOTE(review): this call raises RuntimeError with the checkpoint as it
# stands, because the saved parameters belong to a different architecture
# (embeddings + attention + bidirectional LSTM + position_classifier) than
# the PositionPredictor class defined in this cell. Do not paper over it with
# strict=False -- shape mismatches still raise and the remaining weights
# would be randomly initialised. Import the model class used for training.
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Read one fielding-prediction CSV per season (2025 through 2039 inclusive)
# into a dict keyed by the season's year.
years = range(2025, 2040)
fielding_predictions = {
    year: pd.read_csv(f"../data/fielding_predictions_{year}.csv")
    for year in years
}


# Calculate position percentages using IDfg:
# each row's share of its player's total innings in the 2025 file.
df_2025 = fielding_predictions[2025].copy()
df_2025['TotalInn'] = df_2025.groupby('IDfg')['Inn'].transform('sum')
# Players with zero total innings produce 0/0 = NaN here; they are excluded
# from the utility mask below instead of being counted as utility players.
df_2025['InnPct'] = df_2025['Inn'] / df_2025['TotalInn']

# Identify utility players: no single row accounts for more than 80% of the
# player's innings. The string alias 'max' is used because passing the
# builtin `max` to transform is deprecated in recent pandas.
max_pos_share = df_2025.groupby('IDfg')['InnPct'].transform('max')
utility_mask = df_2025['TotalInn'].gt(0) & ~max_pos_share.gt(0.8)
utility_ids = df_2025.loc[utility_mask, 'IDfg'].unique()

# Create filtered dataset: every season restricted to the 2025 utility players
utility_data = {
    year: df[df['IDfg'].isin(utility_ids)]
    for year, df in fielding_predictions.items()
}

# Display sample (with both IDfg and Name): first five utility players,
# one column per position, innings share as the cell value.
sample_display = df_2025[df_2025['IDfg'].isin(utility_ids[:5])].pivot_table(
    index=['IDfg', 'Name'],
    columns='Pos',
    values='InnPct',
    fill_value=0
).round(3)

print(f"Found {len(utility_ids)} utility players")
print("\nSample utility player position distributions:")
print(sample_display)

RuntimeError: Error(s) in loading state_dict for PositionPredictor:
	Missing key(s) in state_dict: "fc.weight", "fc.bias". 
	Unexpected key(s) in state_dict: "metric_embed.weight", "metric_embed.bias", "pos_embed.weight", "attention.q_proj.weight", "attention.q_proj.bias", "attention.k_proj.weight", "attention.k_proj.bias", "attention.v_proj.weight", "attention.v_proj.bias", "attention.o_proj.weight", "attention.o_proj.bias", "attention.norm.weight", "attention.norm.bias", "position_classifier.0.weight", "position_classifier.0.bias", "position_classifier.3.weight", "position_classifier.3.bias", "lstm.weight_ih_l0_reverse", "lstm.weight_hh_l0_reverse", "lstm.bias_ih_l0_reverse", "lstm.bias_hh_l0_reverse", "lstm.weight_ih_l1_reverse", "lstm.weight_hh_l1_reverse", "lstm.bias_ih_l1_reverse", "lstm.bias_hh_l1_reverse". 
	size mismatch for lstm.weight_ih_l0: copying a param with shape torch.Size([512, 1024]) from checkpoint, the shape in current model is torch.Size([512, 5]).
	size mismatch for lstm.weight_ih_l1: copying a param with shape torch.Size([512, 256]) from checkpoint, the shape in current model is torch.Size([512, 128]).