In [1]:
"""
MLB Trade Simulator - Value Determination Module
Author: Niels Christoffersen
Version: 1.0
Last Updated: 12/23/2024

This module calculates player values based on WAR projections and contract status.
It handles data loading, cleaning, and value calculations for MLB players.
"""

# Standard library imports
import os
from pathlib import Path
from typing import List, Dict, Optional

# Third-party imports
import pandas as pd
import numpy as np
from pandas import DataFrame
import logging
from enum import Enum
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Type aliases
PathLike = str | Path

# Global constants
ROOT_DIR = Path(os.getcwd()).parent  # Changed from Path(__file__).parent.parent
DATA_DIR = ROOT_DIR / 'data'
GENERATED_DIR = DATA_DIR / 'generated'
OUTPUT_DIR = GENERATED_DIR / 'value_by_year'
HITTER_COLUMNS = [
    'Name', 'Age', 'IDfg', 'BB%', 'K%', 'AVG', 'OBP', 'SLG', 'wOBA', 
    'wRC+', 'Off', 'BsR', 'Def', 'WAR', 'HR', '2B', '3B', 'SB', 'CS', 'R', 'RBI'
]

PITCHER_COLUMNS = [
    'Name', 'Age', 'IDfg', 'ERA','FIP', 'SIERA', 'K%', 'BB%', 'WAR'
]


class PlayerStatus(Enum):
    PRE_ARB = "Pre-ARB"
    ARB1 = "ARB1"
    ARB2 = "ARB2"
    ARB3 = "ARB3"
    FREE_AGENT = "FA"
    SIGNED = "Signed"
    UNKNOWN = "Unknown"

STATUS_MAPPINGS = {
    'PRE-ARB': PlayerStatus.PRE_ARB,
    'PRE ARB': PlayerStatus.PRE_ARB,
    'ROOKIE': PlayerStatus.PRE_ARB,
    'MIN': PlayerStatus.PRE_ARB,
    'ARB 1': PlayerStatus.ARB1,
    'ARB1': PlayerStatus.ARB1,
    'ARB 2': PlayerStatus.ARB2,
    'ARB2': PlayerStatus.ARB2,
    'ARB 3': PlayerStatus.ARB3,
    'ARB3': PlayerStatus.ARB3,
    'ARB 4': PlayerStatus.ARB3,
    'UFA': PlayerStatus.FREE_AGENT,
    'FA': PlayerStatus.FREE_AGENT,
}
# Ensure required directories exist
for directory in [DATA_DIR, GENERATED_DIR, OUTPUT_DIR]:
    directory.mkdir(parents=True, exist_ok=True)
    logger.debug(f"Verified directory exists: {directory}")

logger.info("Initialized MLB Trade Simulator value determination module")

2025-01-21 23:20:19,368 - INFO - Initialized MLB Trade Simulator value determination module


In [2]:
# Constants
PREDICTION_YEARS = range(2025, 2040)
REQUIRED_COLUMNS = {
    'predictions': ['Name', 'IDfg', 'Age', 'WAR'],
    'salary': ['Name', 'IDfg', 'Salary', 'Contract_Status']
}

def validate_files_exist(pattern: str, years: range) -> None:
    """Validate prediction files exist for given years."""
    missing_files = [
        f"{pattern}_{year}.csv" 
        for year in years 
        if not (GENERATED_DIR / f"{pattern}_{year}.csv").exists()
    ]
    if missing_files:
        raise FileNotFoundError(f"Missing files: {', '.join(missing_files)}")

def load_prediction_files(pattern: str, years: range = PREDICTION_YEARS) -> DataFrame:
    """Load and combine prediction CSVs with validation."""
    validate_files_exist(pattern, years)
    dfs = []
    
    for year in years:
        file_path = GENERATED_DIR / f"{pattern}_{year}.csv"
        logger.info(f"Loading {file_path}")
        
        try:
            df = pd.read_csv(file_path)
            missing_cols = set(REQUIRED_COLUMNS['predictions']) - set(df.columns)
            if missing_cols:
                raise ValueError(f"Missing columns in {file_path}: {missing_cols}")
                
            df['prediction_year'] = year
            dfs.append(df)
            
        except Exception as e:
            logger.error(f"Error loading {file_path}: {str(e)}")
            raise
    
    return pd.concat(dfs, ignore_index=True)

# Main data loading
try:
    sp_data = load_prediction_files('SP_Predictions')
    rp_data = load_prediction_files('RP_Predictions')
    batter_data = load_prediction_files('Batter_Predictions')
    salary_data = pd.read_csv(DATA_DIR / 'MLB_SALARY_DATA.csv')
    
    logger.info(f"Loaded {len(sp_data)} SP, {len(rp_data)} RP, "
                f"{len(batter_data)} batter, and {len(salary_data)} salary records")
    
except Exception as e:
    logger.error(f"Critical error during data loading: {str(e)}")
    raise

2025-01-21 23:20:19,402 - INFO - Loading c:\Users\User\Desktop\LSTMLB\data\generated\SP_Predictions_2025.csv
2025-01-21 23:20:19,414 - INFO - Loading c:\Users\User\Desktop\LSTMLB\data\generated\SP_Predictions_2026.csv


2025-01-21 23:20:19,422 - INFO - Loading c:\Users\User\Desktop\LSTMLB\data\generated\SP_Predictions_2027.csv
2025-01-21 23:20:19,431 - INFO - Loading c:\Users\User\Desktop\LSTMLB\data\generated\SP_Predictions_2028.csv
2025-01-21 23:20:19,441 - INFO - Loading c:\Users\User\Desktop\LSTMLB\data\generated\SP_Predictions_2029.csv
2025-01-21 23:20:19,450 - INFO - Loading c:\Users\User\Desktop\LSTMLB\data\generated\SP_Predictions_2030.csv
2025-01-21 23:20:19,459 - INFO - Loading c:\Users\User\Desktop\LSTMLB\data\generated\SP_Predictions_2031.csv
2025-01-21 23:20:19,469 - INFO - Loading c:\Users\User\Desktop\LSTMLB\data\generated\SP_Predictions_2032.csv
2025-01-21 23:20:19,475 - INFO - Loading c:\Users\User\Desktop\LSTMLB\data\generated\SP_Predictions_2033.csv
2025-01-21 23:20:19,484 - INFO - Loading c:\Users\User\Desktop\LSTMLB\data\generated\SP_Predictions_2034.csv
2025-01-21 23:20:19,491 - INFO - Loading c:\Users\User\Desktop\LSTMLB\data\generated\SP_Predictions_2035.csv
2025-01-21 23:20:19

def integrate_historical_data(batter_data: pd.DataFrame,
                            sp_data: pd.DataFrame,
                            rp_data: pd.DataFrame,
                            batting_history: pd.DataFrame,
                            pitching_history: pd.DataFrame) -> tuple:
    """Integrate historical stats with prediction data."""
    
    # Get player IDs from prediction data
    batter_ids = set(batter_data['IDfg'].unique())
    sp_ids = set(sp_data['IDfg'].unique())
    rp_ids = set(rp_data['IDfg'].unique())
    
    # Filter historical data to prediction players only
    batting_history = batting_history[batting_history['IDfg'].isin(batter_ids)]
    historical_sp = pitching_history[pitching_history['IDfg'].isin(sp_ids)]
    historical_rp = pitching_history[pitching_history['IDfg'].isin(rp_ids)]
    
    # Standardize column names
    sp_data = sp_data.copy().rename(columns={'Season': 'Year'})
    rp_data = rp_data.copy().rename(columns={'Season': 'Year'})
    batting_history = batting_history.rename(columns={'Season': 'Year'})
    historical_sp = historical_sp.rename(columns={'Season': 'Year'})
    historical_rp = historical_rp.rename(columns={'Season': 'Year'})
    
    # Get common columns
    batting_cols = [col for col in batter_data.columns if col in batting_history.columns]
    pitching_cols = [col for col in sp_data.columns if col in historical_sp.columns]
    
    # Combine data
    combined_batting = pd.concat([
        batter_data[batting_cols],
        batting_history[batting_cols]
    ], ignore_index=True)
    
    combined_sp = pd.concat([
        sp_data[pitching_cols],
        historical_sp[pitching_cols]
    ], ignore_index=True)
    
    combined_rp = pd.concat([
        rp_data[pitching_cols],
        historical_rp[pitching_cols]
    ], ignore_index=True)
    
    # Sort and deduplicate
    for df in [combined_batting, combined_sp, combined_rp]:
        df.sort_values(['IDfg', 'Year'], inplace=True)
        df.drop_duplicates(subset=['IDfg', 'Year'], keep='first', inplace=True)
    
    logger.info(f"Added {len(combined_batting) - len(batter_data)} historical batting records")
    logger.info(f"Added {len(combined_sp) - len(sp_data)} historical SP records")
    logger.info(f"Added {len(combined_rp) - len(rp_data)} historical RP records")
    
    return combined_batting, combined_sp, combined_rp

try:
    batting_history = pd.read_csv('../data/mlb_batting_data_2000_2024.csv')
    pitching_history = pd.read_csv('../data/mlb_pitching_data_2000_2024.csv')
    
    batter_data, sp_data, rp_data = integrate_historical_data(
        batter_data,
        sp_data,
        rp_data,
        batting_history,
        pitching_history
    )
    
except Exception as e:
    logger.error(f"Failed to integrate historical data: {str(e)}")
    raise

Group Positions, and merge data

In [3]:
def analyze_status_values(salary_df):
    """Print unique status values from salary data"""
    unique_statuses = salary_df['Status'].unique()
    print("Unique status values found:")
    print(unique_statuses)

analyze_status_values(salary_data)

Unique status values found:
[nan 'Pre-Arb' 'ARB 1' 'ARB 2' 'ARB 3' 'Club' 'UFA' 'Vesting' 'Opt-Out'
 'Player' 'Estimate' 'PRE-ARB' '-' 'ARB 4' 'Arb Avoided' '$3,516,480'
 '$16,000,000' '$8,000,000' '$9,000,000' '$3,500,000' '$9,500,000'
 '$860,000' '$820,000' 'ARB 1 (S2)' '$11,000,000' '$3,844,100'
 '$18,000,000' '$13,000,000' '$8,500,000' '$15,600,000' '$18,600,000'
 '$10,750,000' 'Mutual' '$2,250,000' '$14,000,000' '$26,000,000'
 '$28,000,000' '$27,000,000' '$21,000,000' '$19,000,000' '$8,600,000'
 '$17,100,000' '$15,100,000' '$1,950,000' '$1,750,000' 'RFA / QO'
 '$22,000,000' '$17,000,000' '$23,000,000' '$25,000,000' '$4,000,000'
 '$10,000,000' '$15,000,000' '$32,500,000' '$12,500,000' '$15,950,000'
 '$17,700,000' '$6,000,000' 'Arb Avoided #' '$17,666,667' '$37,166,667'
 '$37,216,667' '$37,116,666' '$37,116,674' '$10,015,872' '$28,071,428'
 '$36,571,428' '$38,571,428' '$38,571,432' '$21,225,000' '$2,370,968'
 '$2,500,000' '$2,000,000' '$22,500,000' '$30,000,000' '$35,000,000'
 'arbi

In [4]:
"""
Position Grouping and Data Merging
- Groups player positions into SP/RP/POS categories
- Merges prediction datasets
- Validates data quality
- Provides summary statistics
"""

# Add position grouping
sp_data['position_group'] = 'SP'
rp_data['position_group'] = 'RP'
batter_data['position_group'] = 'POS'

sp_data['Position'] = sp_data['position_group']
rp_data['Position'] = rp_data['position_group']

def merge_prediction_data(sp_df, rp_df, batter_df):
    """Merge prediction datasets with validation."""
    required_cols = [
        'Name', 'IDfg', 'position_group', 'Age', 
        'prediction_year', 'WAR', 'Position'
    ]
    
    logger.info(f"SP columns: {sp_df.columns.tolist()}")
    logger.info(f"RP columns: {rp_df.columns.tolist()}")
    logger.info(f"Batter columns: {batter_df.columns.tolist()}")
    
    # Combine datasets
    player_predictions = pd.concat([
        sp_df[sp_df.columns.intersection(required_cols + ['Position'])],
        rp_df[rp_df.columns.intersection(required_cols + ['Position'])],
        batter_df[batter_df.columns.intersection(required_cols + ['Position'])]
    ], ignore_index=True)
    
    # Verify Position exists
    if 'Position' not in player_predictions.columns:
        raise ValueError("Position column lost during merge")
        
    return player_predictions

# Merge data and generate summary
try:
    player_predictions = merge_prediction_data(sp_data, rp_data, batter_data)
    
    
    
    # Validate no missing values
    missing_values = player_predictions.isnull().sum()
    if missing_values.any():
        logger.warning(f"\nMissing values found:\n{missing_values[missing_values > 0]}")
        
    logger.info(f"Successfully merged {len(player_predictions)} player predictions")
    
except Exception as e:
    logger.error(f"Error merging prediction data: {str(e)}")
    raise

2025-01-21 23:20:20,118 - INFO - SP columns: ['Name', 'Season', 'Age', 'Role', 'IDfg', 'ERA', 'FIP', 'SIERA', 'K%', 'BB%', 'HR/9', 'SwStr%', 'Contact%', 'O-Swing%', 'Z-Contact%', 'F-Strike%', 'Zone%', 'CSW%', 'CStr%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'Soft%', 'Med%', 'Hard%', 'FBv', 'IP', 'GS', 'G', 'WAR', 'prediction_year', 'position_group', 'Position']
2025-01-21 23:20:20,120 - INFO - RP columns: ['Name', 'Season', 'Age', 'Role', 'IDfg', 'ERA', 'FIP', 'SIERA', 'K%', 'BB%', 'HR/9', 'SwStr%', 'Contact%', 'O-Swing%', 'Z-Contact%', 'F-Strike%', 'Zone%', 'CSW%', 'CStr%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'Soft%', 'Med%', 'Hard%', 'FBv', 'IP', 'GS', 'G', 'WAR', 'prediction_year', 'position_group', 'Position']
2025-01-21 23:20:20,122 - INFO - Batter columns: ['Name', 'Age', 'Year', 'IDfg', 'BB%', 'K%', 'BABIP', 'AVG', 'OBP', 'SLG', 'wOBA', 'wRC+', 'def_value', 'Position', 'BsR', 'wSB', 'UBR', 'wGDP', 'SB', 'CS', 'Off', 'Def', 'WAR', 'PA', 'G', 'HR', '2B', '3B', 'RBI', 'R', 'prediction_year', '

In [5]:
"""
Salary Data Processing Module
- Cleans and standardizes salary data
- Preserves contract status information
- Handles missing and invalid values
- Validates data quality
"""

def clean_salary_data(df: DataFrame) -> DataFrame:
    """
    Clean and standardize salary data from Sportrac.
    
    Args:
        df (DataFrame): Raw salary data with Payroll and Status columns
        
    Returns:
        DataFrame: Cleaned salary data with standardized values
    """
    logger.info("Starting salary data cleaning process")
    
    cleaned_df = df.copy()
    
    try:
        # Remove non-player rows (options, buyouts, etc)
        cleaned_df = cleaned_df[~cleaned_df['Player Name'].str.contains(
            'OPT-OUT|UFA|PLAYER OPT|CLUB OPT', 
            na=False, 
            case=False
        )]
        
        # Clean Year column
        cleaned_df['Year'] = pd.to_numeric(cleaned_df['Year'], errors='coerce')
        cleaned_df = cleaned_df.dropna(subset=['Year'])
        
        # Clean Payroll column - two-step process
        payroll = (cleaned_df['Payroll']
                  .astype(str)
                  .str.replace('$', '', regex=False)
                  .str.replace(',', '', regex=False)
                  .str.replace('-', '', regex=False))
        
        cleaned_df['Payroll'] = pd.to_numeric(payroll, errors='coerce')
        
        # Status validation and cleaning
        if 'Status' not in cleaned_df.columns:
            logger.warning("Status column missing from input data")
        else:
            status_counts = cleaned_df['Status'].value_counts()
            logger.info("\nStatus distribution:")
            logger.info(status_counts)
        
        # Generate summary statistics
        stats = {
            'original_rows': len(df),
            'cleaned_rows': len(cleaned_df),
            'valid_salary_rows': cleaned_df['Payroll'].notna().sum(),
            'min_salary': cleaned_df['Payroll'].min(),
            'max_salary': cleaned_df['Payroll'].max(),
            'mean_salary': cleaned_df['Payroll'].mean()
        }
        
        logger.info("\nSalary cleaning summary:")
        for key, value in stats.items():
            logger.info(f"{key}: {value:,.2f}" if isinstance(value, float) else f"{key}: {value}")
            
        return cleaned_df[['Player Name', 'Year', 'Team', 'Payroll', 'Status']].copy()
        
    except Exception as e:
        logger.error(f"Error cleaning salary data: {str(e)}")
        raise

# Execute cleaning
try:
    salary_data_clean = clean_salary_data(salary_data)
    
    print("\nSample of cleaned salary data:")
    print(salary_data_clean.head())
    
    print("\nData validation:")
    print(f"Null values:\n{salary_data_clean.isnull().sum()}")
    
except Exception as e:
    logger.error(f"Failed to process salary data: {str(e)}")
    raise

2025-01-21 23:20:20,203 - INFO - Starting salary data cleaning process
2025-01-21 23:20:20,270 - INFO - 
Status distribution:
2025-01-21 23:20:20,272 - INFO - Status
Estimate       871
PRE-ARB        740
UFA            423
ARB 1          278
ARB 3          213
              ... 
$2,370,968       1
$36,571,428      1
$28,071,428      1
$10,015,872      1
$2,962,963       1
Name: count, Length: 134, dtype: int64
2025-01-21 23:20:20,276 - INFO - 
Salary cleaning summary:
2025-01-21 23:20:20,298 - INFO - original_rows: 3821
2025-01-21 23:20:20,299 - INFO - cleaned_rows: 3806
2025-01-21 23:20:20,300 - INFO - valid_salary_rows: 2043
2025-01-21 23:20:20,301 - INFO - min_salary: 250,000.00
2025-01-21 23:20:20,302 - INFO - max_salary: 51,875,000.00
2025-01-21 23:20:20,303 - INFO - mean_salary: 10,248,524.89



Sample of cleaned salary data:
      Player Name    Year Team     Payroll   Status
0  Corbin Carroll  2023.0  ari   1625000.0      NaN
1  Corbin Carroll  2024.0  ari   3625000.0  Pre-Arb
2  Corbin Carroll  2025.0  ari   5625000.0  Pre-Arb
3  Corbin Carroll  2026.0  ari  10625000.0    ARB 1
4  Corbin Carroll  2027.0  ari  12625000.0    ARB 2

Data validation:
Null values:
Player Name       0
Year              0
Team              0
Payroll        1763
Status          407
dtype: int64


def integrate_historical_data(sp_data: pd.DataFrame,
                            rp_data: pd.DataFrame,
                            batter_data: pd.DataFrame) -> tuple:
    """Integrate historical stats with prediction data."""
    
    # Load historical data
    batting_history = pd.read_csv('../data/mlb_batting_data_2000_2024.csv')
    pitching_history = pd.read_csv('../data/mlb_pitching_data_2000_2024.csv')
    
    # Drop rows missing Year/Season
    batting_history = batting_history.dropna(subset=['Season'])
    pitching_history = pitching_history.dropna(subset=['Season'])
    
    # Filter to prediction players and standardize columns
    batter_ids = set(batter_data['IDfg'].unique())
    sp_ids = set(sp_data['IDfg'].unique())
    rp_ids = set(rp_data['IDfg'].unique())
    
    # Process batting data
    batting_cols = list(set(batter_data.columns) & set(batting_history.columns))
    batting_history = (batting_history[batting_history['IDfg'].isin(batter_ids)]
                      .rename(columns={'Season': 'Year'})
                      [batting_cols])
    
    # Process pitching data
    pitching_cols = list(set(sp_data.columns) & set(pitching_history.columns))
    sp_history = (pitching_history[pitching_history['IDfg'].isin(sp_ids)]
                 .rename(columns={'Season': 'Year'})
                 [pitching_cols])
    rp_history = (pitching_history[pitching_history['IDfg'].isin(rp_ids)]
                 .rename(columns={'Season': 'Year'})
                 [pitching_cols])
    
    # Combine data
    batter_data = pd.concat([batter_data, batting_history], ignore_index=True)
    sp_data = pd.concat([sp_data, sp_history], ignore_index=True)
    rp_data = pd.concat([rp_data, rp_history], ignore_index=True)
    
    # Sort and deduplicate
    for df in [batter_data, sp_data, rp_data]:
        df.sort_values(['IDfg', 'Year'], inplace=True)
        df.drop_duplicates(subset=['IDfg', 'Year'], keep='first', inplace=True)
        df.dropna(subset=['Year'], inplace=True)
    
    return batter_data, sp_data, rp_data

# Update data with historical stats
batter_data, sp_data, rp_data = integrate_historical_data(sp_data, rp_data, batter_data)

In [6]:
print("SP columns:", sp_data.columns.tolist())
print("RP columns:", rp_data.columns.tolist())
print("Batter columns:", batter_data.columns.tolist())

SP columns: ['Name', 'Season', 'Age', 'Role', 'IDfg', 'ERA', 'FIP', 'SIERA', 'K%', 'BB%', 'HR/9', 'SwStr%', 'Contact%', 'O-Swing%', 'Z-Contact%', 'F-Strike%', 'Zone%', 'CSW%', 'CStr%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'Soft%', 'Med%', 'Hard%', 'FBv', 'IP', 'GS', 'G', 'WAR', 'prediction_year', 'position_group', 'Position']
RP columns: ['Name', 'Season', 'Age', 'Role', 'IDfg', 'ERA', 'FIP', 'SIERA', 'K%', 'BB%', 'HR/9', 'SwStr%', 'Contact%', 'O-Swing%', 'Z-Contact%', 'F-Strike%', 'Zone%', 'CSW%', 'CStr%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'Soft%', 'Med%', 'Hard%', 'FBv', 'IP', 'GS', 'G', 'WAR', 'prediction_year', 'position_group', 'Position']
Batter columns: ['Name', 'Age', 'Year', 'IDfg', 'BB%', 'K%', 'BABIP', 'AVG', 'OBP', 'SLG', 'wOBA', 'wRC+', 'def_value', 'Position', 'BsR', 'wSB', 'UBR', 'wGDP', 'SB', 'CS', 'Off', 'Def', 'WAR', 'PA', 'G', 'HR', '2B', '3B', 'RBI', 'R', 'prediction_year', 'position_group']


In [7]:
"""
Player Reference and ID Integration with Enhanced Name Matching
Handles UTF-8 encoding and accent normalization
"""

import unidecode
from thefuzz import fuzz

def normalize_name(name: str) -> str:
    """Normalize player names by removing accents and standardizing format."""
    if pd.isna(name):
        return name
    return unidecode.unidecode(str(name)).upper().strip()

# First standardize pitcher dataframes
sp_data = sp_data.rename(columns={'Season': 'Year'})
rp_data = rp_data.rename(columns={'Season': 'Year'})

def create_player_reference(sp_df: pd.DataFrame, 
                          rp_df: pd.DataFrame, 
                          batter_df: pd.DataFrame) -> pd.DataFrame:
    """Create unified player reference with normalized names."""
    player_ref = pd.concat([
        sp_df[['Name', 'IDfg', 'position_group', 'Year']],
        rp_df[['Name', 'IDfg', 'position_group', 'Year']],
        batter_df[['Name', 'IDfg', 'position_group', 'Year']]
    ])
    
    player_ref['Name_Normalized'] = player_ref['Name'].apply(normalize_name)
    return player_ref

def merge_salary_with_ids(salary_df: pd.DataFrame, player_ref: pd.DataFrame) -> pd.DataFrame:
    """Merge salary data with player reference, maintaining year-specific matches."""
    
    # Normalize salary data names
    salary_df['Name_Normalized'] = salary_df['Player Name'].apply(normalize_name)
    
    # Merge on both name and year
    merged_df = player_ref.merge(
        salary_df[['Name_Normalized', 'Year', 'Team', 'Payroll', 'Status']],
        on=['Name_Normalized', 'Year'],
        how='left'
    )
    
    # Keep only rows that have either Payroll or Status
    valid_data = merged_df[merged_df['Payroll'].notna() | merged_df['Status'].notna()]
    
    # Log merge statistics
    logger.info(f"\nMerge Results:")
    logger.info(f"Total rows in player_ref: {len(player_ref)}")
    logger.info(f"Total rows in salary_data: {len(salary_df)}")
    logger.info(f"Matched rows: {len(valid_data)}")
    logger.info(f"Rows with payroll: {valid_data['Payroll'].notna().sum()}")
    logger.info(f"Rows with status: {valid_data['Status'].notna().sum()}")
    
    return valid_data.drop('Name_Normalized', axis=1)

try:
    player_ref = create_player_reference(sp_data, rp_data, batter_data)
    salary_data_with_id = merge_salary_with_ids(salary_data_clean, player_ref)
    
    # Display unmatched players
    unmatched = salary_data_with_id[salary_data_with_id['IDfg'].isna()]
    if not unmatched.empty:
        print("\nSample unmatched players:")
        print(unmatched['Player Name'].unique()[:10])
        
except Exception as e:
    logger.error(f"Error in ID integration: {str(e)}")
    raise

2025-01-21 23:20:21,176 - INFO - 
Merge Results:
2025-01-21 23:20:21,178 - INFO - Total rows in player_ref: 17004
2025-01-21 23:20:21,179 - INFO - Total rows in salary_data: 3806
2025-01-21 23:20:21,180 - INFO - Matched rows: 2566
2025-01-21 23:20:21,182 - INFO - Rows with payroll: 1284
2025-01-21 23:20:21,184 - INFO - Rows with status: 2297


In [8]:
"""
Contract Status Processing Module
- Determines Free Agency years for all players
- Handles arbitration progression
- Processes contract options and UFA designations
- Validates contract timelines
"""

"""
Contract Status Processing with IDfg
Determines FA years based on latest available status
"""

def normalize_contract_status(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize MLB player contract statuses into standardized format.
    
    Process:
    1. Sort by player and year
    2. Check for long-term contracts
    3. Process status patterns in priority order
    4. Handle special cases (Estimate, Arb Avoided)
    
    Args:
        df: DataFrame must contain:
            - IDfg: Player ID
            - Year: Contract year
            - Status: Raw contract status
            - Payroll: Salary information
            
    Returns:
        DataFrame: Original DataFrame with new 'Normalized_Status' column
    """
    result_df = df.copy()
    result_df = result_df.sort_values(['IDfg', 'Year'])
    
    def has_long_term_contract(group):
        """Check if player has signed contract years"""
        # First check: Any future years with payroll but no status
        future_signed = group[
            (group['Payroll'].notna()) & 
            (group['Status'].isna())
        ]
        if len(future_signed) > 0:
            return True
            
        # Second check: Signed years beyond arb
        arb_years = group[group['Status'].str.contains('ARB', na=False, case=True)]
        if len(arb_years) > 0:
            last_arb_year = arb_years['Year'].max()
            future_signed = group[
                (group['Year'] > last_arb_year) & 
                (group['Payroll'].notna()) & 
                (group['Status'].isna())
            ]
            return len(future_signed) > 0
            
        return False
    
    def get_next_year_status(group):
        """Look ahead one year to determine current status for 'Estimate'"""
        group = group.copy()
        group['Next_Status'] = group['Status'].shift(-1)
        group['Has_Long_Contract'] = has_long_term_contract(group)
        return group
    
    def _normalize_single_status(row):
        """Normalize individual status values."""
        if pd.isna(row['Status']):
            if pd.notna(row['Payroll']) and row.get('Has_Long_Contract', False):
                return 'Signed'
            return 'Free Agent'
            
        status = str(row['Status']).upper().strip()
        if status == '-' and pd.notna(row['Payroll']):
            return 'Signed'
        # Handle dollar amounts
        if status.startswith('$'):
            return 'Signed'
            
        # Handle options
        if 'PLAYER' in status:
            return 'Player Option'
        if 'CLUB' in status:
            return 'Team Option'
        if 'MUTUAL' in status:
            return 'Mutual Option'
        if 'VESTING' in status:
            return 'Vesting Option'
        if 'OPT-OUT' in status:
            return 'Opt-Out'
            
        # Handle 'Estimate' based on next year's status
        if status == 'ESTIMATE':
            next_status = str(row['Next_Status']).upper().strip() if pd.notna(row['Next_Status']) else ''
            if any(x in next_status for x in ['UFA', 'FA']):
                return 'Arb-3'
            if 'ARB 1' in next_status:
                return 'Pre-Arb'
            if 'ARB 2' in next_status:
                return 'Arb-1'
            if 'ARB 3' in next_status:
                return 'Arb-2'
            if 'ARB 4' in next_status:
                return 'Arb-3'
            return 'Pre-Arb'
            
        # Handle 'Arb Avoided' - keep as arb status
        if 'AVOIDED' in status or 'BYPASSED' in status:
            next_status = str(row['Next_Status']).upper().strip() if pd.notna(row['Next_Status']) else ''
            if 'ARB 2' in next_status:
                return 'Arb-1'
            if 'ARB 3' in next_status:
                return 'Arb-2'
            if 'ARB 4' in next_status:
                return 'Arb-3'
            if pd.isna(row['Next_Status']):
                return 'Signed'
            return 'Arb-1'  # Default if can't determine
            
        # Handle regular arbitration
        if 'ARB' in status:
            if 'S2' in status:
                return 'Arb-1 (Super 2)'
            if 'ARB 4' in status:
                return 'Arb-4'
            if 'ARB 3' in status:
                return 'Arb-3'
            if 'ARB 2' in status:
                return 'Arb-2'
            if 'ARB 1' in status:
                return 'Arb-1'
                
        # Handle pre-arbitration
        if 'PRE' in status and 'ARB' in status:
            return 'Pre-Arb'
            
        # Handle free agency
        if any(x in status for x in ['UFA', 'RFA', 'FA', 'FREE AGENT']):
            return 'Free Agent'
            
        return 'Unknown'
    
    # Process by player group
    result_df = result_df.groupby('IDfg', group_keys=False).apply(get_next_year_status)
    
    # Apply normalization
    result_df['Normalized_Status'] = result_df.apply(_normalize_single_status, axis=1)
    
    # Log distribution
    status_counts = result_df['Normalized_Status'].value_counts()
    logger.info("\nStatus distribution after normalization:")
    for status, count in status_counts.items():
        logger.info(f"{status}: {count}")
        
    return result_df.drop(['Next_Status', 'Has_Long_Contract'], axis=1)

try:
    contract_data = normalize_contract_status(salary_data_with_id)
    

    
except Exception as e:
    logger.error(f"Failed to process contract statuses: {str(e)}")
    raise

2025-01-21 23:20:23,105 - INFO - 
Status distribution after normalization:
2025-01-21 23:20:23,106 - INFO - Pre-Arb: 717
2025-01-21 23:20:23,107 - INFO - Signed: 505
2025-01-21 23:20:23,108 - INFO - Free Agent: 409
2025-01-21 23:20:23,109 - INFO - Arb-1: 302
2025-01-21 23:20:23,110 - INFO - Arb-3: 225
2025-01-21 23:20:23,112 - INFO - Arb-2: 164
2025-01-21 23:20:23,112 - INFO - Team Option: 92
2025-01-21 23:20:23,113 - INFO - Arb-4: 54
2025-01-21 23:20:23,114 - INFO - Arb-1 (Super 2): 40
2025-01-21 23:20:23,114 - INFO - Player Option: 18
2025-01-21 23:20:23,115 - INFO - Opt-Out: 18
2025-01-21 23:20:23,115 - INFO - Vesting Option: 11
2025-01-21 23:20:23,116 - INFO - Mutual Option: 10
2025-01-21 23:20:23,117 - INFO - Unknown: 1


In [9]:
def generate_contract_timeline(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generate complete contract timeline for each player.
    """
    result_df = df.copy()
    
    def process_player_timeline(group):
        player_rows = group.copy()
        
        # Check for Super 2
        is_super2 = any('Super 2' in str(status) for status in player_rows['Normalized_Status'])
        
        # Get latest year and status
        latest_year = player_rows['Year'].max()
        latest_status_series = player_rows.loc[player_rows['Year'] == latest_year, 'Normalized_Status']
        
        if len(latest_status_series) == 0:
            logger.warning(f"No status found for player {player_rows['IDfg'].iloc[0]} in year {latest_year}")
            return player_rows
            
        latest_status = latest_status_series.iloc[0]
        new_rows = []
        
        if latest_status == 'Free Agent':
            return player_rows
            
        # Modified status checking
        if latest_status.startswith('Arb-4'):
            new_rows.append({'Year': latest_year + 1, 'Normalized_Status': 'Free Agent'})
            
        elif latest_status.startswith('Arb-3'):
            if is_super2:
                new_rows.append({'Year': latest_year + 1, 'Normalized_Status': 'Arb-4'})
                new_rows.append({'Year': latest_year + 2, 'Normalized_Status': 'Free Agent'})
            else:
                new_rows.append({'Year': latest_year + 1, 'Normalized_Status': 'Free Agent'})
                
        elif latest_status.startswith('Arb-2'):
            new_rows.append({'Year': latest_year + 1, 'Normalized_Status': 'Arb-3'})
            if is_super2:
                new_rows.append({'Year': latest_year + 2, 'Normalized_Status': 'Arb-4'})
                new_rows.append({'Year': latest_year + 3, 'Normalized_Status': 'Free Agent'})
            else:
                new_rows.append({'Year': latest_year + 2, 'Normalized_Status': 'Free Agent'})
                
        elif latest_status.startswith('Arb-1'):
            arb2_status = 'Arb-2'
            new_rows.append({'Year': latest_year + 1, 'Normalized_Status': arb2_status})
            new_rows.append({'Year': latest_year + 2, 'Normalized_Status': 'Arb-3'})
            if is_super2:
                new_rows.append({'Year': latest_year + 3, 'Normalized_Status': 'Arb-4'})
                new_rows.append({'Year': latest_year + 4, 'Normalized_Status': 'Free Agent'})
            else:
                new_rows.append({'Year': latest_year + 3, 'Normalized_Status': 'Free Agent'})
                
        elif latest_status == 'Pre-Arb':
            pre_arb_years = len(player_rows[player_rows['Normalized_Status'] == 'Pre-Arb'])
            remaining_pre_arb = 3 - pre_arb_years
            
            current_year = latest_year
            for i in range(remaining_pre_arb):
                current_year += 1
                new_rows.append({'Year': current_year, 'Normalized_Status': 'Pre-Arb'})
            
            arb1_status = 'Arb-1 (Super 2)' if is_super2 else 'Arb-1'
            new_rows.append({'Year': current_year + 1, 'Normalized_Status': arb1_status})
            new_rows.append({'Year': current_year + 2, 'Normalized_Status': 'Arb-2'})
            new_rows.append({'Year': current_year + 3, 'Normalized_Status': 'Arb-3'})
            
            if is_super2:
                new_rows.append({'Year': current_year + 4, 'Normalized_Status': 'Arb-4'})
                new_rows.append({'Year': current_year + 5, 'Normalized_Status': 'Free Agent'})
            else:
                new_rows.append({'Year': current_year + 4, 'Normalized_Status': 'Free Agent'})
        
        # Add new rows to player timeline
        if new_rows:
            for row in new_rows:
                row.update({col: group.iloc[0][col] for col in group.columns 
                        if col not in ['Year', 'Normalized_Status', 'Status', 'Payroll']})
            return pd.concat([player_rows, pd.DataFrame(new_rows)], ignore_index=True)
        
        return player_rows
    
    # Process each player
    result_df = result_df.groupby('IDfg', group_keys=False).apply(process_player_timeline)
    
    return result_df.sort_values(['IDfg', 'Year']).reset_index(drop=True)

# Execute timeline generation
try:
    contract_timeline = generate_contract_timeline(contract_data)
    logger.info(f"Generated {len(contract_timeline)} timeline records")
except Exception as e:
    logger.error(f"Timeline generation failed: {str(e)}")
    raise

2025-01-21 23:20:24,794 - INFO - Generated 3794 timeline records


In [10]:
# Validate all players have FA year
try:
    # Group by IDfg and check if any player is missing FA status
    missing_fa = []
    for idfg, group in contract_timeline.groupby('IDfg'):
        if not any(group['Normalized_Status'] == 'Free Agent'):
            player_name = group['Name'].iloc[0]
            last_status = group.sort_values('Year')['Normalized_Status'].iloc[-1]
            missing_fa.append({
                'IDfg': idfg,
                'Player': player_name,
                'Last Status': last_status,
                'Last Year': group['Year'].max()
            })
    
    if missing_fa:
        print("\nPlayers missing Free Agent status:")
        missing_fa_df = pd.DataFrame(missing_fa)
        print(missing_fa_df.to_string())
        logger.warning(f"Found {len(missing_fa)} players missing FA status")
    else:
        logger.info("All players have Free Agent status")

except Exception as e:
    logger.error(f"FA validation failed: {str(e)}")
    raise




Players missing Free Agent status:
    IDfg            Player Last Status  Last Year
0  12791     Michael Kelly     Unknown       2025
1  15274     Mike Tauchman      Signed       2025
2  19600  Bryan De La Cruz      Signed       2025
3  20123         Juan Soto      Signed       2039


In [11]:
def extend_fa_timeline(timeline_df: pd.DataFrame) -> pd.DataFrame:
    """Extend timeline beyond first FA year through 2039."""
    
    # Find first FA year for each player
    fa_years = (timeline_df[timeline_df['Normalized_Status'] == 'Free Agent']
                .groupby('IDfg')['Year']
                .min()
                .reset_index())
    
    # Generate future FA rows
    future_rows = []
    for _, row in fa_years.iterrows():
        idfg = row['IDfg']
        start_year = int(row['Year']) + 1
        player_info = timeline_df[timeline_df['IDfg'] == idfg].iloc[0]
        
        for year in range(start_year, 2040):
            future_rows.append({
                'Name': player_info['Name'],
                'IDfg': idfg,
                'position_group': player_info['position_group'],
                'Year': year,
                'Team': np.nan,
                'Payroll': np.nan,
                'Status': np.nan,
                'Normalized_Status': 'Free Agent'
            })
    
    # Add new rows to timeline
    extended_timeline = pd.concat([
        timeline_df,
        pd.DataFrame(future_rows)
    ])
    
    # Sort and deduplicate
    extended_timeline = (extended_timeline
                       .sort_values(['IDfg', 'Year'])
                       .drop_duplicates(subset=['IDfg', 'Year'], keep='first'))
    
    return extended_timeline
extended_timeline=extend_fa_timeline(contract_timeline)

In [12]:
"""
WAR Value Calculation Module
Applies tiered WAR values and inflation adjustments
"""

# Constants
WAR_VALUE_TIERS = {
    'tier1': {'max': 2, 'value': 8_000_000},
    'tier2': {'max': 4, 'value': 9_000_000},
    'tier3': {'value': 10_000_000}
}
INFLATION_RATE = 0.04
BASE_YEAR = 2025

def calculate_inflation_multiplier(year: int) -> float:
    """Calculate inflation multiplier from base year."""
    return (1 + INFLATION_RATE) ** (year - BASE_YEAR)

def calculate_war_value(war: float, year: int) -> float:
    """
    Calculate WAR value using tiered system and inflation.
    
    Args:
        war (float): WAR value
        year (int): Year for inflation adjustment
    """
    if pd.isna(war) or war <= 0:
        return 0.0
        
    value = 0.0
    remaining_war = war
    
    # Tier 1: 0-2 WAR
    tier1_war = min(remaining_war, WAR_VALUE_TIERS['tier1']['max'])
    value += tier1_war * WAR_VALUE_TIERS['tier1']['value']
    remaining_war -= tier1_war
    
    if remaining_war <= 0:
        return value * calculate_inflation_multiplier(year)
        
    # Tier 2: 2-4 WAR
    tier2_war = min(remaining_war, WAR_VALUE_TIERS['tier2']['max'] - WAR_VALUE_TIERS['tier1']['max'])
    value += tier2_war * WAR_VALUE_TIERS['tier2']['value']
    remaining_war -= tier2_war
    
    if remaining_war <= 0:
        return value * calculate_inflation_multiplier(year)
        
    # Tier 3: 4+ WAR
    value += remaining_war * WAR_VALUE_TIERS['tier3']['value']
    
    return value * calculate_inflation_multiplier(year)

try:
    # Join predictions with timeline
    timeline_with_war = extended_timeline.merge(
        player_predictions[['IDfg', 'prediction_year', 'WAR']],
        left_on=['IDfg', 'Year'],
        right_on=['IDfg', 'prediction_year'],
        how='left'
    )
    
    # Calculate WAR values
    timeline_with_war['Base_Value'] = timeline_with_war.apply(
        lambda x: calculate_war_value(x['WAR'], x['Year']), 
        axis=1
    )
    
    # Clean up and validate
    timeline_with_war = timeline_with_war.drop('prediction_year', axis=1)
    
    logger.info(f"Processed {len(timeline_with_war)} rows")
    logger.info(f"Average WAR value: ${timeline_with_war['Base_Value'].mean():,.2f}")
    
except Exception as e:
    logger.error(f"Failed to calculate WAR values: {str(e)}")
    raise

2025-01-21 23:20:25,786 - INFO - Processed 11493 rows
2025-01-21 23:20:25,787 - INFO - Average WAR value: $8,731,306.66


In [13]:
"""
Contract Value Calculator
Determines player contract values based on:
1. Existing payroll data (if available)
2. Contract status (Pre-ARB, ARB1-3, FA)
3. WAR-based market value
"""

MIN_SALARY = {
    'Pre-Arb': 720000,
    'Arb-1': 1000000,
    'Arb-1 (Super 2)': 1200000,  # Higher floor for Super 2
    'Arb-2': 2500000,
    'Arb-3': 4000000,
    'Arb-4': 5000000
}
ARB_PERCENT = {
    'Arb-1': 0.25,
    'Arb-1 (Super 2)': 0.25,
    'Arb-2': 0.33,
    'Arb-3': 0.50,
    'Arb-4': 0.65
}

def calculate_contract_value(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate contract values ensuring arbitration values never decrease."""
    
    # Create copy to avoid modifying original
    result = df.copy()
    
    # Sort by player and year for tracking
    result = result.sort_values(['IDfg', 'Year'])
    
    # Initialize contract value column
    result['contract_value'] = np.nan
    
    # Process each player separately
    for player_id in result['IDfg'].unique():
        player_mask = result['IDfg'] == player_id
        player_data = result[player_mask].copy()
        
        prev_value = 0
        for idx, row in player_data.iterrows():
            current_value = row['Base_Value']
            status = row['Normalized_Status']

            # Handle existing payroll data first
            if pd.notna(row['Payroll']):
                contract_value = float(row['Payroll'])
            
            # Pre-Arb cases
            elif status == 'Pre-Arb':
                contract_value = max(MIN_SALARY['Pre-Arb'], prev_value)
            
            # Handle different arbitration years
            elif status in ARB_PERCENT:
                min_salary = MIN_SALARY.get(status, MIN_SALARY['Arb-1'])
                arb_pct = ARB_PERCENT[status]
                contract_value = max(
                    min_salary,
                    current_value * arb_pct,
                    prev_value
                )
            
            # Free Agent or other status
            else:
                contract_value = None
            
            # Update the value in the result dataframe
            result.loc[idx, 'contract_value'] = contract_value
            
            # Update previous value if we have a valid contract value
            if pd.notna(contract_value):
                prev_value = contract_value
    
    # Calculate average (excluding NaN values)
    valid_contracts = result['contract_value'].dropna()
    if len(valid_contracts) > 0:
        logger.info(f"Processed {len(result)} rows")
        logger.info(f"Average contract value: ${valid_contracts.mean():,.2f}")
    
    return result

try:
    # Calculate contract values
    timeline_with_values = calculate_contract_value(timeline_with_war)
    
    # Validate results
    valid_contracts = timeline_with_values['contract_value'].notna().sum()
    logger.info(f"Processed {len(timeline_with_values)} rows")
    logger.info(f"Contract values calculated: {valid_contracts}")
    
except Exception as e:
    logger.error(f"Failed to calculate contract values: {str(e)}")
    raise

2025-01-21 23:20:28,621 - INFO - Processed 11493 rows
2025-01-21 23:20:28,622 - INFO - Average contract value: $7,285,880.64
2025-01-21 23:20:28,624 - INFO - Processed 11493 rows
2025-01-21 23:20:28,625 - INFO - Contract values calculated: 3032


In [14]:
"""
Calculate Surplus Value
Surplus = Base Value - Contract Value
Only calculated for rows with existing contract values
"""

try:
    # Verify contract_value exists
    if 'contract_value' not in timeline_with_values.columns:
        raise ValueError("contract_value column not found in dataframe")
        
    # Calculate surplus value only where contract_value exists
    timeline_with_values['surplus_value'] = np.where(
        timeline_with_values['contract_value'].notna(),
        timeline_with_values['Base_Value'] - timeline_with_values['contract_value'],
        np.nan
    )
    
    # Validate results
    valid_surplus = timeline_with_values['surplus_value'].notna().sum()
    avg_surplus = timeline_with_values['surplus_value'].mean()
    
    logger.info(f"Calculated {valid_surplus} surplus values")
    logger.info(f"Average surplus value: ${avg_surplus:,.2f}")
    
except Exception as e:
    logger.error(f"Failed to calculate surplus values: {str(e)}")
    raise

2025-01-21 23:20:28,656 - INFO - Calculated 3032 surplus values
2025-01-21 23:20:28,657 - INFO - Average surplus value: $5,237,738.84


In [15]:
"""
Integrate Historical Data (2002-2024)
Add historical stats for players in prediction set
"""
HISTORICAL_WAR_VALUE = {
    2002: 4800000,  # backfilled with 2005 value
    2003: 4800000,  # backfilled with 2005 value
    2004: 4800000,  # backfilled with 2005 value
    2005: 4800000,
    2006: 5200000,  # interpolated
    2007: 5700000,
    2008: 6200000,
    2009: 6400000,
    2010: 6000000,
    2011: 7500000,
    2012: 6500000,
    2013: 7400000,
    2014: 7600000,
    2015: 8000000,
    2016: 8000000,
    2017: 7900000,
    2018: 8000000,
    2019: 8100000,
    2020: 7900000,
    2021: 8100000,
    2022: 8200000,
    2023: 8100000,
    2024: 8200000  # current value
}

# Default value for future years
WAR_VALUE = 8200000

def get_war_value(year: int) -> float:
    """Get WAR value for specific year, default to current WAR_VALUE if not found."""
    return HISTORICAL_WAR_VALUE.get(year, WAR_VALUE)

def integrate_historical_stats(timeline_df: pd.DataFrame) -> pd.DataFrame:
    """Add historical stats (2002-2024) for prediction players."""
    
    # Load historical data
    batting_history = pd.read_csv('../data/mlb_batting_data_2000_2024.csv')
    pitching_history = pd.read_csv('../data/mlb_pitching_data_2000_2024.csv')
    
    # Get current players info
    current_players = (timeline_df[['IDfg', 'Name', 'position_group']]
                      .drop_duplicates(subset=['IDfg']))
    
    # Format batting data
    batter_cols = ['IDfg', 'Season', 'Name', 'Team', 'WAR', 'BB%', 'K%', 'AVG', 
                   'OBP', 'SLG', 'OPS', 'wOBA', 'wRC+', 'Off', 'BsR', 'Def', 'Age', 'HR', '2B', '3B', 'R', 'RBI', 'SB', 'CS']
    batting_history = (batting_history[batting_history['IDfg'].isin(current_players['IDfg'])]
                      [batter_cols]
                      .rename(columns={'Season': 'Year', 'WAR': 'WAR_batter', 
                                     'BB%': 'BB%_bat', 'K%': 'K%_bat'}))
    
    # Format pitching data
    pitcher_cols = ['IDfg', 'Season', 'Name', 'Team', 'WAR', 'ERA', 'FIP', 'SIERA',
                    'K%', 'BB%', 'Age']
    pitching_history = (pitching_history[pitching_history['IDfg'].isin(current_players['IDfg'])]
                       [pitcher_cols]
                       .rename(columns={'Season': 'Year', 'WAR': 'WAR_pitcher',
                                      'K%': 'K%_pit', 'BB%': 'BB%_pit'}))
    
    # Merge batting and pitching data
    historical = (batting_history.merge(pitching_history, 
                                      on=['IDfg', 'Year', 'Name', 'Team', 'Age'],
                                      how='outer'))
    
    # Add position info from current data
    historical = historical.merge(current_players[['IDfg', 'position_group']], 
                                on='IDfg')
    
    # Fill NaN WAR values with 0
    historical['WAR_batter'] = historical['WAR_batter'].fillna(0)
    historical['WAR_pitcher'] = historical['WAR_pitcher'].fillna(0)
    
    # Calculate total WAR
    historical['WAR'] = historical['WAR_batter'] + historical['WAR_pitcher']
    
    # Add status columns
    historical['Status'] = 'NA'
    historical['Normalized_Status'] = 'NA'
    historical['Payroll'] = np.nan
    
    # Calculate base value
    historical['Base_Value'] = historical.apply(
        lambda x: x['WAR'] * get_war_value(int(x['Year'])), axis=1
    )
    historical['Contract_Value'] = np.nan
    historical['surplus_value'] = np.nan
    
    # Combine with timeline
    complete_timeline = pd.concat([timeline_df, historical])
    
    # Sort and remove duplicates
    complete_timeline = (complete_timeline
                       .sort_values(['IDfg', 'Year'])
                       .drop_duplicates(subset=['IDfg', 'Year']))
    
    return complete_timeline

try:
    timeline_with_history = integrate_historical_stats(timeline_with_values)
    logger.info(f"Added historical records. New shape: {timeline_with_history.shape}")
    
except Exception as e:
    logger.error(f"Failed to integrate historical data: {str(e)}")
    raise

2025-01-21 23:20:31,132 - INFO - Added historical records. New shape: (14872, 39)


In [16]:
def integrate_player_statistics(value_data, batter_data, sp_data, rp_data):
    """Integrate stats with combined positions for two-way players"""
    
    # Split data
    historical_data = value_data[value_data['Year'] < 2025].copy()
    prediction_data = value_data[value_data['Year'] >= 2025].copy()
    
    # Clean prediction data - keep only essential columns
    essential_cols = ['Name', 'IDfg', 'position_group', 'Year', 'Team', 
                     'Payroll', 'Status', 'Normalized_Status', 'WAR', 'Base_Value',
                     'contract_value', 'surplus_value']
    prediction_data = prediction_data[essential_cols].copy()
    
    
    # Find two-way players
    batter_ids = set(batter_data['IDfg'].unique())
    pitcher_ids = set(sp_data['IDfg'].unique()) | set(rp_data['IDfg'].unique())
    two_way_players = batter_ids.intersection(pitcher_ids)
    print(f"Found {len(two_way_players)} two-way players")
    
    # Add two-way flag
    prediction_data['Two_Way'] = prediction_data['IDfg'].isin(two_way_players)
    
    # Merge batter stats
    batter_stats = (batter_data[['IDfg', 'prediction_year', 'WAR', 'Position'] + 
                   [col for col in HITTER_COLUMNS if col not in ['Name', 'IDfg', 'WAR', 'Position']]]\
                   .rename(columns={
                       'prediction_year': 'Year',
                       'BB%': 'BB%_bat',
                       'K%': 'K%_bat',
                       'Age': 'Age_bat',
                       'WAR': 'WAR_batter',
                       'Position': 'Position_batter'
                   }))
    
    # Merge pitcher stats
    pitcher_stats = (pd.concat([
        sp_data[['IDfg', 'prediction_year', 'WAR', 'Position'] + 
                [col for col in PITCHER_COLUMNS if col not in ['Name', 'IDfg', 'WAR', 'Position']]],
        rp_data[['IDfg', 'prediction_year', 'WAR', 'Position'] + 
                [col for col in PITCHER_COLUMNS if col not in ['Name', 'IDfg', 'WAR', 'Position']]]
    ])\
    .rename(columns={
        'prediction_year': 'Year',
        'BB%': 'BB%_pit',
        'K%': 'K%_pit',
        'Age': 'Age_pit',
        'WAR': 'WAR_pitcher',
        'Position': 'Position_pitcher'
    })\
    .drop_duplicates(subset=['IDfg', 'Year']))
    
    # Merge stats
    prediction_data = prediction_data.merge(batter_stats, on=['IDfg', 'Year'], how='left')
    prediction_data = prediction_data.merge(pitcher_stats, on=['IDfg', 'Year'], how='left')
    
    # Handle positions and WAR for two-way players
    mask = prediction_data['Two_Way']
    
    # Combine positions
    prediction_data.loc[mask, 'Position'] = prediction_data.loc[mask].apply(
        lambda x: f"{x['Position_pitcher']}/{x['Position_batter']}" if pd.notna(x['Position_pitcher']) else x['Position_batter'],
        axis=1
    )
    
    # Single position for non-two-way players
    prediction_data.loc[~mask, 'Position'] = prediction_data.loc[~mask, 'Position_batter'].fillna(prediction_data.loc[~mask, 'Position_pitcher'])
    
    # Handle WAR
    prediction_data.loc[mask, 'WAR'] = (
        prediction_data.loc[mask, 'WAR_batter'].fillna(0) + 
        prediction_data.loc[mask, 'WAR_pitcher'].fillna(0)
    )
    prediction_data.loc[~mask, 'WAR'] = prediction_data.loc[~mask, 'WAR_batter'].fillna(prediction_data.loc[~mask, 'WAR_pitcher'])
    
    # Clean up columns
    prediction_data = prediction_data.drop(['Position_batter', 'Position_pitcher'], axis=1)
    prediction_data['Age'] = prediction_data['Age_bat'].fillna(prediction_data['Age_pit'])
    prediction_data = prediction_data.drop(['Age_bat', 'Age_pit'], axis=1)
    
    # Combine and sort
    result = pd.concat([historical_data, prediction_data])
    return result.sort_values(['IDfg', 'Year'])

# Execute
try:
    export_data = integrate_player_statistics(
        timeline_with_history,
        batter_data,
        sp_data, 
        rp_data
    )
    print(f"Records processed: {len(export_data)}")
    print(f"Columns: {export_data.columns.tolist()}")
    
except Exception as e:
    logger.error(f"Error: {str(e)}")
    raise

Found 1 two-way players
Records processed: 14872
Columns: ['Name', 'IDfg', 'position_group', 'Year', 'Team', 'Payroll', 'Status', 'Normalized_Status', 'WAR', 'Base_Value', 'contract_value', 'surplus_value', 'WAR_batter', 'BB%_bat', 'K%_bat', 'AVG', 'OBP', 'SLG', 'OPS', 'wOBA', 'wRC+', 'Off', 'BsR', 'Def', 'Age', 'HR', '2B', '3B', 'R', 'RBI', 'SB', 'CS', 'WAR_pitcher', 'ERA', 'FIP', 'SIERA', 'K%_pit', 'BB%_pit', 'Contract_Value', 'Two_Way', 'Position']


In [17]:
#replace status with normalized status
export_data['Status']=export_data['Normalized_Status']
export_data=export_data.drop('Normalized_Status',axis=1)
export_data=export_data.drop('Contract_Value',axis=1)
export_data=export_data.drop('Payroll',axis=1)

In [18]:
def analyze_contract_options(df: pd.DataFrame) -> pd.DataFrame:
    """Add FA year and probable FA year analysis."""
    result = df.copy()
    
    # Find base FA year
    fa_years = (result[result['Status'] == 'Free Agent']
                .groupby('IDfg')['Year']
                .min()
                .reset_index()
                .rename(columns={'Year': 'FA_Year'}))
    
    result = result.merge(fa_years, on='IDfg', how='left')
    result['probable_fa_year'] = result['FA_Year']
    
    # Find players with any type of option
    option_types = ['Player Option', 'Team Option', 'Mutual Option', 'Vesting Option', 'Opt-Out']
    
    # Set earliest_fa_year to option year if exists, otherwise FA_Year
    option_years = (result[result['Status'].isin(option_types)]
                   .groupby('IDfg')['Year']
                   .min()
                   .reset_index()
                   .rename(columns={'Year': 'option_year'}))
    
    result['earliest_fa_year'] = result['FA_Year']
    result = result.merge(option_years, on='IDfg', how='left')
    result.loc[result['option_year'].notna(), 'earliest_fa_year'] = result.loc[result['option_year'].notna(), 'option_year']
    
    # Process each option type
    for player_id in result[result['Status'].isin(option_types)]['IDfg'].unique():
        player_data = result[result['IDfg'] == player_id].sort_values('Year')
        option_status = player_data[player_data['Status'].isin(option_types)]['Status'].iloc[0]
        option_year = player_data[player_data['Status'].isin(option_types)]['Year'].min()
        fa_year = player_data['FA_Year'].iloc[0]
        
        # Calculate surplus sum from option year to FA year
        surplus_sum = player_data[
            (player_data['Year'] >= option_year) & 
            (player_data['Year'] < fa_year)
        ]['surplus_value'].sum()
        
        # Apply option-specific logic
        if option_status in ['Player Option', 'Opt-Out']:  # Treating Opt-Out like Player Option
            if surplus_sum > 0:  # Player opts out if positive surplus
                result.loc[result['IDfg'] == player_id, 'probable_fa_year'] = option_year
        elif option_status == 'Team Option':
            if surplus_sum < 0:  # Team declines if negative surplus
                result.loc[result['IDfg'] == player_id, 'probable_fa_year'] = option_year
        else:  # Other option types (Mutual, Vesting)
            if surplus_sum < 0:  # Option declined if negative surplus
                result.loc[result['IDfg'] == player_id, 'probable_fa_year'] = option_year
    
    # Clean up temporary column
    result = result.drop('option_year', axis=1, errors='ignore')
    
    return result

try:
    export_data = analyze_contract_options(export_data)
    
    # Verify results
    option_examples = export_data[
        export_data['FA_Year'] != export_data['probable_fa_year']
    ][['Name', 'Year', 'Status', 'surplus_value', 'FA_Year', 'probable_fa_year']].head()
    
    print("\nExample players with adjusted FA years:")
    print(option_examples)
    
except Exception as e:
    logger.error(f"Failed to analyze options: {str(e)}")
    raise


Example players with adjusted FA years:
                 Name  Year Status  surplus_value  FA_Year  probable_fa_year
86  Giancarlo Stanton  2010     NA            NaN   2029.0            2028.0
87  Giancarlo Stanton  2011     NA            NaN   2029.0            2028.0
88  Giancarlo Stanton  2012     NA            NaN   2029.0            2028.0
89  Giancarlo Stanton  2013     NA            NaN   2029.0            2028.0
90  Giancarlo Stanton  2014     NA            NaN   2029.0            2028.0


In [19]:
# Round and handle negative values, preserving NaN
columns_to_process = ['HR', '2B', '3B', 'RBI', 'R', 'SB', 'CS']

for col in columns_to_process:
    # Handle negative values first, preserve NaN
    export_data[col] = export_data[col].apply(lambda x: max(x, 0) if pd.notna(x) else x)
    # Round values, preserve NaN
    export_data[col] = export_data[col].apply(lambda x: round(x) if pd.notna(x) else x)


In [20]:
#add ops, only for columns where obp and slg are not null
export_data['OPS']=np.where(export_data['OBP'].notna() & export_data['SLG'].notna(),export_data['OBP']+export_data['SLG'],np.nan)

In [21]:
"""
Value Export Module
Exports yearly player valuations sorted by team and WAR
"""

def export_value_data(df: pd.DataFrame, output_dir: Path) -> None:
    """Export sorted value data by year."""
    logger.info("Starting value data export")
    
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # Define column groups
    base_cols = [
        'Player Name', 'Team', 'Status', 'Position', 'Age', 'WAR',
        'Base_Value', 'Contract_Value', 'Surplus_Value', 'IDfg', 'Year', 'FA_Year', 'Probable_FA_Year', 'Earliest_FA_Year',
    ]
    
    hitting_cols = [
        'BB%_bat', 'K%_bat', 'AVG', 'OBP', 'SLG', 'OPS',
        'wOBA', 'wRC+', 'Off', 'BsR', 'Def', 'WAR_batter', 'HR', '2B', '3B', 'SB', 'CS', 'R', 'RBI'
    ]
    
    pitching_cols = [
        'ERA', 'FIP', 'SIERA', 'K%_pit', 'BB%_pit', 'WAR_pitcher'
    ]
    
    export_cols = base_cols + hitting_cols + pitching_cols
    
    try:
        # Create copy for export
        export_df = df.copy()
        
        # Rename Name column
        export_df = export_df.rename(columns={'Name': 'Player_Name'})
        
        # Sort data
        export_df = export_df.sort_values(['Year', 'Team', 'WAR'], 
                                        ascending=[True, True, False])
        
        # Round numeric columns
        numeric_cols = ['Base_Value', 'Contract_Value', 'surplus_value', 'WAR']
        for col in numeric_cols:
            if col in export_df.columns:
                export_df[col] = export_df[col].round(2)
        
        # Export to single file
        output_file = output_dir / 'player_values_complete.csv'
        export_df.to_csv(output_file, index=False, na_rep='')
        
        logger.info(f"Exported {len(export_df)} records to {output_file}")
        
        # Print status distribution
        print("\nStatus Distribution:")
        print(export_df.groupby(['Year', 'Status']).size().unstack(fill_value=0))
        
    except Exception as e:
        logger.error(f"Export process failed: {str(e)}")
        raise

try:
   
    # Execute export
    export_value_data(export_data, OUTPUT_DIR)
    
    # Print status distribution
    print("\nStatus Distribution:")
    print(export_data.groupby(['Year', 'Status']).size().unstack(fill_value=0))
    
except Exception as e:
    logger.error(f"Export process failed: {str(e)}")
    raise

2025-01-21 23:20:32,010 - INFO - Starting value data export


2025-01-21 23:20:32,591 - INFO - Exported 14872 records to c:\Users\User\Desktop\LSTMLB\data\generated\value_by_year\player_values_complete.csv



Status Distribution:
Status  Arb-1  Arb-1 (Super 2)  Arb-2  Arb-3  Arb-4  Free Agent  \
Year                                                              
2008        0                0      0      0      0           0   
2009        0                0      0      0      0           0   
2010        0                0      0      0      0           0   
2011        0                0      0      0      0           0   
2012        0                0      0      0      0           0   
2013        0                0      0      0      0           0   
2014        0                0      0      0      0           0   
2015        0                0      0      0      0           0   
2016        0                0      0      0      0           0   
2017        0                0      0      0      0           0   
2018        0                0      0      0      0           0   
2019        0                0      0      0      0           0   
2020        0                0      0   