In [39]:
import pandas as pd
import numpy as np
import glob
import os
from pathlib import Path

In [40]:


def validate_files_exist(pattern, years):
    """Validate all required files exist"""
    missing_files = []
    for year in years:
        files = glob.glob(f'..\\data\\generated\\{pattern}_{year}.csv')
        if not files:
            missing_files.append(f"{pattern}_{year}.csv")
    if missing_files:
        raise FileNotFoundError(f"Missing files: {missing_files}")

def load_prediction_files(pattern, years=range(2025, 2040)):
    """Load and combine prediction CSVs"""
    validate_files_exist(pattern, years)
    dfs = []
    
    for year in years:
        file = glob.glob(f'..\\data\\generated\\{pattern}_{year}.csv')[0]
        df = pd.read_csv(file)
        
        # Validate required columns
        required_cols = ['Name', 'IDfg', 'Age', 'WAR']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing columns in {file}: {missing_cols}")
            
        df['prediction_year'] = year
        dfs.append(df)
    
    combined_df = pd.concat(dfs, ignore_index=True)
    return combined_df

# Main data loading
try:
    sp_data = load_prediction_files('SP_Predictions')
    rp_data = load_prediction_files('RP_Predictions')
    batter_data = load_prediction_files('Batter_Predictions')
    salary_data = pd.read_csv('..\\data\\SPORTRAC_MLB_SALARY_DATA.csv')
    
    print(f"Loaded {len(sp_data)} SP predictions")
    print(f"Loaded {len(rp_data)} RP predictions")
    print(f"Loaded {len(batter_data)} batter predictions")
    print(f"Loaded {len(salary_data)} salary records")
    
except Exception as e:
    print(f"Error loading data: {str(e)}")

Loaded 2400 SP predictions
Loaded 4230 RP predictions
Loaded 5520 batter predictions
Loaded 3821 salary records


Group Positions, and merge data

In [41]:
# Add position grouping
sp_data['position_group'] = 'SP'
rp_data['position_group'] = 'RP'
batter_data['position_group'] = 'POS'

# Combine all prediction data
player_predictions = pd.concat([
    sp_data[['Name', 'IDfg', 'position_group', 'Age', 'prediction_year', 'WAR']],
    rp_data[['Name', 'IDfg', 'position_group', 'Age', 'prediction_year', 'WAR']],
    batter_data[['Name', 'IDfg', 'position_group', 'Age', 'prediction_year', 'WAR']]
], ignore_index=True)

# Print summary stats
print("\nPlayer Predictions Summary:")
print(player_predictions.groupby(['position_group', 'prediction_year'])['WAR'].agg(['count', 'mean']))

# Display first few rows of salary data to see structure
print("\nSalary Data Sample:")
print(salary_data.head())


Player Predictions Summary:
                                count      mean
position_group prediction_year                 
POS            2025               367  1.176600
               2026               367  0.689211
               2027               367  0.136267
               2028               367 -0.433730
               2029               367 -0.932532
               2030               367 -1.366023
               2031               367 -1.719106
               2032               367 -1.991308
               2033               367 -2.199249
               2034               367 -2.357100
               2035               367 -2.474679
               2036               367 -2.558432
               2037               367 -2.614623
               2038               367 -2.649120
               2039               367 -2.666546
RP             2025               282  0.707092
               2026               282  0.583688
               2027               282  0.509929
           

In [42]:
# Check predictions for duplicate names in same year
duplicate_check = player_predictions.groupby(['Name', 'prediction_year']).size().reset_index(name='count')
duplicates = duplicate_check[duplicate_check['count'] > 1]

print("Players with same name in same year:")
print(duplicates)

# Check salary data for duplicate names in same year
salary_duplicates = salary_data.groupby(['Player Name', 'Year']).size().reset_index(name='count')
salary_dupes = salary_duplicates[salary_duplicates['count'] > 1]

print("\nSalary records with same name in same year:")
print(salary_dupes)

Players with same name in same year:
              Name  prediction_year  count
7365   Luis Garcia             2025      2
7366   Luis Garcia             2026      2
7367   Luis Garcia             2027      2
7368   Luis Garcia             2028      2
7369   Luis Garcia             2029      2
7370   Luis Garcia             2030      2
7371   Luis Garcia             2031      2
7372   Luis Garcia             2032      2
7373   Luis Garcia             2033      2
7374   Luis Garcia             2034      2
7375   Luis Garcia             2035      2
7376   Luis Garcia             2036      2
7377   Luis Garcia             2037      2
7378   Luis Garcia             2038      2
7379   Luis Garcia             2039      2
11640   Will Smith             2025      2
11641   Will Smith             2026      2
11642   Will Smith             2027      2
11643   Will Smith             2028      2
11644   Will Smith             2029      2
11645   Will Smith             2030      2
11646   Will Smit

In [43]:
def identify_duplicates():
    # Combine prediction data
    all_predictions = pd.concat([
        sp_data[['Name', 'IDfg', 'position_group', 'WAR', 'prediction_year']],
        rp_data[['Name', 'IDfg', 'position_group', 'WAR', 'prediction_year']],
        batter_data[['Name', 'IDfg', 'position_group', 'WAR', 'prediction_year']]
    ])
    
    # Find duplicates
    duplicates = (
        all_predictions.groupby(['Name', 'prediction_year'])
        .filter(lambda x: len(x) > 1)
        .sort_values(['Name', 'prediction_year'])
    )
    
    # Save duplicates for review
    duplicates.to_csv('../data/generated/duplicate_players.csv', index=False)
    
    print("\nDuplicate Players Found:")
    for name in duplicates['Name'].unique():
        print(f"\nName: {name}")
        print(duplicates[duplicates['Name'] == name][['IDfg', 'position_group', 'prediction_year', 'WAR']])
    
    return duplicates

duplicates = identify_duplicates()
# After manual review, create mapping
name_to_idfg_mapping = {
    ('Luis Garcia', 'hou'): 6984,  # Example mapping
    ('Luis Garcia', 'was'): 20391,
    ('Will Smith', 'atl'): 6984,
    ('Will Smith', 'lad'): 19197
}


Duplicate Players Found:

Name: Luis Garcia
       IDfg position_group  prediction_year       WAR
145    6984             RP             2025  0.600000
85    20391            POS             2025  1.991848
405    6984             RP             2026  0.600000
439   20391            POS             2026  1.794612
660    6984             RP             2027  0.500000
791   20391            POS             2027  1.473929
953    6984             RP             2028  0.400000
1151  20391            POS             2028  1.106073
1219   6984             RP             2029  0.400000
1508  20391            POS             2029  0.780648
1503   6984             RP             2030  0.400000
1868  20391            POS             2030  0.404145
1791   6984             RP             2031  0.400000
2227  20391            POS             2031 -0.028566
2077   6984             RP             2032  0.300000
2590  20391            POS             2032 -0.492576
2364   6984             RP           

In [44]:
def clean_salary_data(df):
    """Clean salary data removing special characters and handling percentages"""
    df = df.copy()
    
    # Remove non-player rows
    df = df[~df['Player Name'].str.contains('OPT-OUT|UFA', na=False)]
    
    # Convert Year to numeric, dropping non-year rows
    df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
    df = df.dropna(subset=['Year'])
    
    # Clean salary columns
    for col in ['Payroll', 'Luxury Tax']:
        # Handle percentage values
        mask_pct = df[col].str.contains('%', na=False)
        df.loc[mask_pct, col] = np.nan
        
        # Clean monetary values
        mask_money = ~mask_pct
        df.loc[mask_money, col] = (df.loc[mask_money, col]
            .replace(['-', ''], np.nan)
            .str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False))
        
        # Convert to float
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Select and rename columns
    clean_df = df[['Player Name', 'Year', 'Team', 'Payroll']].copy()
    clean_df = clean_df.rename(columns={'Payroll': 'Salary'})
    
    return clean_df

# Test cleaning
salary_data_clean = clean_salary_data(salary_data)
print("\nSample of cleaned salary data:")
print(salary_data_clean.head())

# Print summary stats
print("\nCleaning summary:")
print(f"Original rows: {len(salary_data)}")
print(f"Cleaned rows: {len(salary_data_clean)}")
print(f"Rows with valid salary: {salary_data_clean['Salary'].notna().sum()}")


Sample of cleaned salary data:
      Player Name    Year Team      Salary
0  Corbin Carroll  2023.0  ari   1625000.0
1  Corbin Carroll  2024.0  ari   3625000.0
2  Corbin Carroll  2025.0  ari   5625000.0
3  Corbin Carroll  2026.0  ari  10625000.0
4  Corbin Carroll  2027.0  ari  12625000.0

Cleaning summary:
Original rows: 3821
Cleaned rows: 3806
Rows with valid salary: 2043


In [45]:
def create_player_reference():
    """Create reference table of players with IDfg"""
    player_ref = pd.concat([
        sp_data[['Name', 'IDfg', 'position_group']],
        rp_data[['Name', 'IDfg', 'position_group']],
        batter_data[['Name', 'IDfg', 'position_group']]
    ]).drop_duplicates()
    
    return player_ref

# Create reference
player_ref = create_player_reference()

# Join with salary data
salary_with_id = salary_data.merge(
    player_ref[['Name', 'IDfg']], 
    left_on='Player Name',
    right_on='Name',
    how='left'
)

print("Players missing IDfg:", salary_with_id['IDfg'].isna().sum())
print("\nSample of unmatched players:")
print(salary_with_id[salary_with_id['IDfg'].isna()]['Player Name'].unique()[:10])

Players missing IDfg: 1601

Sample of unmatched players:
['Kyle Nelson' 'Tim Tawa' 'Jordan Lawlar' 'Joe Elbis' 'Cristian Mena'
 'Jorge Barrosa' 'Yilber Diaz' 'Adrian Del Castillo' 'Tommy Henry'
 'Blake Walston']


In [None]:
def get_years_to_fa(status):
    """Calculate years until FA based on status"""
    if not status or pd.isna(status):
        return None
        
    status = status.upper()
    if 'ARB4' in status or 'ARB3' in status:
        return 1
    elif 'ARB2' in status:
        return 2
    elif 'ARB1' in status:
        return 3
    elif 'PRE-ARB' in status:
        return 4
    return None

def get_option_years(player_data):
    """Get all option years for player"""
    option_years = player_data[
        player_data['Status'].str.contains('OPT-OUT|PLAYER|CLUB|UFA', 
                                         na=False, 
                                         case=False)
    ].sort_values('Year')
    return option_years

def process_contract_status(salary_data):
    """Process all player contracts and return FA years"""
    fa_years = {}
    
    # Only process rows with valid IDfg
    valid_data = salary_data[salary_data['IDfg'].notna()].copy()
    valid_data['Year'] = pd.to_numeric(valid_data['Year'], errors='coerce')
    valid_data = valid_data.dropna(subset=['Year'])
    
    for idfg, player_data in valid_data.groupby('IDfg'):
        # Get latest year's status
        latest = player_data.sort_values('Year', ascending=False).iloc[0]
        current_year = latest['Year']
        status = str(latest['Status']).upper() if pd.notna(latest['Status']) else ''
        
        # Immediate FA (UFA, OPT-OUT, PLAYER)
        if any(x in status for x in ['UFA', 'OPT-OUT', 'PLAYER']):
            fa_years[idfg] = current_year
            continue
            
        # Next year FA (CLUB, VESTING, ARB 3, ARB 4)
        if any(x in status for x in ['CLUB', 'VESTING', 'ARB 3', 'ARB 4']):
            fa_years[idfg] = current_year + 1
            continue
            
        # ARB progression
        if 'ARB 2' in status:
            fa_years[idfg] = current_year + 2
        elif 'ARB 1' in status:
            fa_years[idfg] = current_year + 3
        elif 'PRE-ARB' in status:
            fa_years[idfg] = current_year + 4
    
    return fa_years

# Add FA years to salary data
fa_years = process_contract_status(salary_with_id)
salary_with_id['FA_Year'] = salary_with_id['IDfg'].map(fa_years)

# Print summary and check missing
print("\nFA Year Summary:")
print(f"Total players with FA year: {len(fa_years)}")
print(f"Players missing FA year: {salary_with_id['FA_Year'].isna().sum()}")
print("\nSample of players missing FA year:")
print(salary_with_id[salary_with_id['FA_Year'].isna()][['Player Name', 'Year', 'Status']].head())


FA Year Summary:
Total players with FA year: 555
Players missing FA year: 1609

Sample of players missing FA year:
    Player Name  Year    Status
65  Kyle Nelson  2025  Estimate
66  Kyle Nelson  2026     ARB 2
67  Kyle Nelson  2027     ARB 3
68  Kyle Nelson  2028       UFA
72     Tim Tawa  2025  Estimate
