In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Optional: For fetching data easily
# from pybaseball import playerid_lookup, statcast_pitcher, statcast_batter, pitching_stats, batting_stats
# Optional: For more advanced modeling
# from sklearn.linear_model import LinearRegression

# Display options for Pandas
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

print("Libraries imported successfully!")

Libraries imported successfully!


In [4]:
# Block 2: Data Acquisition using pybaseball (Complete Workaround Version)

import pandas as pd
import numpy as np
import datetime
import time # Import time for potential delays

# Make sure pybaseball is imported (assuming it was done in Block 1)
try:
    from pybaseball import batting_stats, pitching_stats
    print("pybaseball imported successfully.")
except ImportError:
    print("pybaseball not found. Please install it: pip install pybaseball")
    # Handle missing library... You might want to raise an error or exit
    # raise ImportError("pybaseball library is required. Please install it.")
    # For now, create placeholders to avoid immediate crashes, but subsequent blocks will fail
    batting_data = pd.DataFrame()
    pitching_data = pd.DataFrame()

# --- User Inputs for Data Fetching Parameters ---
print("\n--- Input Historical Data Parameters ---")
# Determine default years based on current date
# As of April 23, 2025, the last completed season is 2024.
today = datetime.date.today()
# Simple check: If it's before March, assume last year wasn't completed for stat purposes yet
last_completed_season = today.year - 1
default_end_year = last_completed_season
default_start_year = default_end_year - 2 # Default to 3 years of history

while True:
    try:
        start_year_input = input(f"Start Year for Historical Data [Default: {default_start_year}]: ").strip()
        # Use default if input is empty
        start_year = int(start_year_input) if start_year_input else default_start_year
        # Basic validation
        if start_year > default_end_year:
             print(f"Error: Start year ({start_year}) cannot be after last completed season ({default_end_year}).")
             continue
        if start_year < 1871: # MLB starting year approx
             print(f"Error: Start year ({start_year}) seems too early.")
             continue
        break # Exit loop if input is valid
    except ValueError:
        print("Invalid input. Please enter a year (e.g., 2022).")

while True:
    try:
        end_year_input = input(f"End Year for Historical Data [Default: {default_end_year}]: ").strip()
        # Use default if input is empty
        end_year = int(end_year_input) if end_year_input else default_end_year
        # Basic validation
        if end_year < start_year:
            print(f"Error: End year ({end_year}) cannot be before start year ({start_year}).")
            continue
        # Recommend using last completed season if user enters a future year
        if end_year > default_end_year:
             print(f"Warning: End year ({end_year}) is after last completed season ({default_end_year}).")
             print(f"         Fetching data up to {end_year}, but current season data might be incomplete/unstable.")
             # Or force it:
             # print(f"Adjusting end year to last completed season: {default_end_year}")
             # end_year = default_end_year
        break # Exit loop if input is valid
    except ValueError:
        print("Invalid input. Please enter a year (e.g., 2024).")

default_pa_qual = 100 # Default PA qualifier
while True:
    try:
        batter_qual_input = input(f"Minimum Plate Appearances (PA) for Batters (applied AFTER fetch) [Default: {default_pa_qual}]: ").strip()
        batter_qualifier = int(batter_qual_input) if batter_qual_input else default_pa_qual
        if batter_qualifier < 0: raise ValueError("Qualifier must be non-negative.") # Allow 0
        break
    except ValueError:
        print("Invalid input. Please enter a non-negative integer.")

default_ip_qual = 10.0 # Default IP qualifier (using 10.0 based on previous troubleshooting)
while True:
    try:
        pitcher_qual_input = input(f"Minimum Innings Pitched (IP) for Pitchers (applied AFTER fetch) [Default: {default_ip_qual}]: ").strip()
        pitcher_qualifier = float(pitcher_qual_input) if pitcher_qual_input else default_ip_qual
        if pitcher_qualifier < 0: raise ValueError("Qualifier must be non-negative.") # Allow 0
        break
    except ValueError:
        print("Invalid input. Please enter a non-negative number (e.g., 10.0 or 0).")

print("\n--- Data Fetching Parameters Confirmed ---")
print(f"Historical Data Range: {start_year}-{end_year}")
print(f"Qualifiers (applied post-fetch): {batter_qualifier} PA / {pitcher_qualifier:.1f} IP")
print("----------------------------------------")
# End of User Inputs for Block 2


# --- Fetch Batting Data Year-by-Year and Concatenate ---
all_batting_data_list = []
print(f"\nFetching FanGraphs Batting Stats year-by-year from {start_year} to {end_year}...")
# Check if pybaseball was imported successfully before trying to use it
if 'batting_stats' in locals() or 'batting_stats' in globals():
    for year in range(start_year, end_year + 1):
        print(f"Attempting to fetch batting data for {year}...")
        try:
            # Fetch with default qualifier (qual=None), NO verbose argument
            yearly_bat_data = batting_stats(year, qual=None, split_seasons=True)

            if yearly_bat_data is not None and not yearly_bat_data.empty:
                print(f"  -> SUCCESS: Fetched {len(yearly_bat_data)} rows for {year}.")
                # Rename IDfg if necessary
                if 'IDfg' in yearly_bat_data.columns and 'playerid' not in yearly_bat_data.columns:
                     yearly_bat_data = yearly_bat_data.rename(columns={'IDfg': 'playerid'})
                # Ensure Season column is present and correct type
                if 'Season' not in yearly_bat_data.columns:
                    yearly_bat_data['Season'] = year
                yearly_bat_data['Season'] = yearly_bat_data['Season'].astype(int)
                all_batting_data_list.append(yearly_bat_data)
            else:
                print(f"  -> No batting data found for {year} (empty DataFrame returned).")

        except Exception as e:
            print(f"  -> ERROR fetching batting data for {year}: {e}")

        # Optional: Add a small delay to be kind to FanGraphs servers
        time.sleep(2) # Wait 2 seconds between requests

    # Combine all years for batting
    print(f"\nCollected {len(all_batting_data_list)} yearly DataFrames for batting.")
    if all_batting_data_list:
        try:
            batting_data = pd.concat(all_batting_data_list, ignore_index=True)
            print(f"Successfully combined batting data. Shape before filtering: {batting_data.shape}")

            # Apply Qualifier Filter in Pandas
            if 'PA' in batting_data.columns:
                batting_data['PA'] = pd.to_numeric(batting_data['PA'], errors='coerce')
                original_rows = len(batting_data)
                print(f"Batting data rows before PA filter: {original_rows}")
                batting_data_filtered = batting_data[batting_data['PA'] >= batter_qualifier].copy()
                print(f"Batting data rows AFTER PA >= {batter_qualifier} filter: {len(batting_data_filtered)}")
                batting_data = batting_data_filtered
            else:
                print("Warning: 'PA' column not found in batting data, cannot apply PA qualifier filter.")

        except Exception as concat_err:
             print(f"ERROR during pd.concat or filtering for batting data: {concat_err}")
             batting_data = pd.DataFrame() # Ensure it's empty on error
    else:
        print("No yearly batting data DataFrames were collected.")
        batting_data = pd.DataFrame()

else:
    print("Skipping batting data fetch because pybaseball could not be imported.")
    batting_data = pd.DataFrame()


# --- Fetch Pitching Data Year-by-Year and Concatenate ---
all_pitching_data_list = []
print(f"\nFetching FanGraphs Pitching Stats year-by-year from {start_year} to {end_year}...")
# Check if pybaseball was imported successfully
if 'pitching_stats' in locals() or 'pitching_stats' in globals():
    for year in range(start_year, end_year + 1):
        print(f"Attempting to fetch pitching data for {year}...")
        try:
            # Fetch with default qualifier (qual=None), NO verbose argument
            yearly_pitch_data = pitching_stats(year, qual=None, split_seasons=True)

            if yearly_pitch_data is not None and not yearly_pitch_data.empty:
                print(f"  -> SUCCESS: Fetched {len(yearly_pitch_data)} rows for {year}.")
                # Rename IDfg if necessary
                if 'IDfg' in yearly_pitch_data.columns and 'playerid' not in yearly_pitch_data.columns:
                     yearly_pitch_data = yearly_pitch_data.rename(columns={'IDfg': 'playerid'})
                # Ensure Season column is present and correct type
                if 'Season' not in yearly_pitch_data.columns:
                    yearly_pitch_data['Season'] = year
                yearly_pitch_data['Season'] = yearly_pitch_data['Season'].astype(int)
                all_pitching_data_list.append(yearly_pitch_data)
            else:
                 print(f"  -> No pitching data found for {year} (empty DataFrame returned).")

        except Exception as e:
            # Print the specific error for the failing year
            print(f"  -> ERROR fetching pitching data for {year}: {e}")

        time.sleep(2) # Keep the delay

    # Combine all years for pitching
    print(f"\nCollected {len(all_pitching_data_list)} yearly DataFrames for pitching.")
    if all_pitching_data_list:
        # Debug: Check content before concat
        # for i, df in enumerate(all_pitching_data_list):
        #     print(f"  DataFrame {i} (Year {df['Season'].iloc[0]}): {df.shape[0]} rows")
        try:
            pitching_data = pd.concat(all_pitching_data_list, ignore_index=True)
            print(f"Successfully combined pitching data. Shape before filtering: {pitching_data.shape}")

            # Apply Qualifier Filter in Pandas & Debug
            if 'IP' in pitching_data.columns:
                # Convert IP, handling potential non-numeric values introduced before concat if any
                pitching_data['IP'] = pd.to_numeric(pitching_data['IP'], errors='coerce')
                pitching_data.dropna(subset=['IP'], inplace=True) # Drop rows where IP couldn't be converted

                original_rows = len(pitching_data)
                print(f"Pitching data rows before IP filter: {original_rows}")
                # Filter using the user-defined float qualifier
                pitching_data_filtered = pitching_data[pitching_data['IP'] >= pitcher_qualifier].copy()
                print(f"Pitching data rows AFTER IP >= {pitcher_qualifier:.1f} filter: {len(pitching_data_filtered)}")
                pitching_data = pitching_data_filtered # Assign back
            else:
                 print("Warning: 'IP' column not found in pitching data, cannot apply IP qualifier filter.")

        except Exception as concat_err:
            print(f"ERROR during pd.concat or filtering for pitching data: {concat_err}")
            pitching_data = pd.DataFrame() # Ensure it's empty on error

    else:
        print("\nNo yearly pitching data DataFrames were collected.")
        pitching_data = pd.DataFrame()

else:
    print("Skipping pitching data fetch because pybaseball could not be imported.")
    pitching_data = pd.DataFrame()


# --- Final Data Check and Basic Preparation ---
print("\n--- Data Fetching Block Complete ---")
data_fetched = False
if 'batting_data' in locals() and not batting_data.empty:
    print(f"Final Batting Data Shape: {batting_data.shape}")
    # Basic Cleaning (Ensure Age is numeric, playerid)
    if 'Age' in batting_data.columns:
        batting_data['Age'] = pd.to_numeric(batting_data['Age'], errors='coerce')
        batting_data.dropna(subset=['Age'], inplace=True)
        batting_data['Age'] = batting_data['Age'].astype(int)
    if 'playerid' in batting_data.columns:
         batting_data['playerid'] = pd.to_numeric(batting_data['playerid'], errors='ignore')
    data_fetched = True
else:
    print("Final Batting Data is empty.")
    batting_data = pd.DataFrame() # Ensure it exists as empty DF

if 'pitching_data' in locals() and not pitching_data.empty:
    print(f"Final Pitching Data Shape: {pitching_data.shape}")
     # Basic Cleaning (Ensure Age is numeric, playerid)
    if 'Age' in pitching_data.columns:
        pitching_data['Age'] = pd.to_numeric(pitching_data['Age'], errors='coerce')
        pitching_data.dropna(subset=['Age'], inplace=True)
        pitching_data['Age'] = pitching_data['Age'].astype(int)
    if 'playerid' in pitching_data.columns:
         pitching_data['playerid'] = pd.to_numeric(pitching_data['playerid'], errors='ignore')
    data_fetched = True
else:
    print("Final Pitching Data is empty.")
    pitching_data = pd.DataFrame() # Ensure it exists as empty DF

if not data_fetched:
    print("\nWarning: Both batting and pitching data are empty after fetch/filter process. Subsequent blocks will likely fail.")
else:
    print("\nBasic data cleaning complete.")
print("----------------------------------")

# DataFrames 'batting_data' and 'pitching_data' are now ready for Block 3

pybaseball imported successfully.

--- Input Historical Data Parameters ---


Start Year for Historical Data [Default: 2022]:  2018
End Year for Historical Data [Default: 2024]:  2024
Minimum Plate Appearances (PA) for Batters (applied AFTER fetch) [Default: 100]:  300
Minimum Innings Pitched (IP) for Pitchers (applied AFTER fetch) [Default: 10.0]:  10



--- Data Fetching Parameters Confirmed ---
Historical Data Range: 2018-2024
Qualifiers (applied post-fetch): 300 PA / 10.0 IP
----------------------------------------

Fetching FanGraphs Batting Stats year-by-year from 2018 to 2024...
Attempting to fetch batting data for 2018...
  -> SUCCESS: Fetched 141 rows for 2018.
Attempting to fetch batting data for 2019...
  -> SUCCESS: Fetched 135 rows for 2019.
Attempting to fetch batting data for 2020...
  -> SUCCESS: Fetched 142 rows for 2020.
Attempting to fetch batting data for 2021...
  -> SUCCESS: Fetched 132 rows for 2021.
Attempting to fetch batting data for 2022...
  -> SUCCESS: Fetched 130 rows for 2022.
Attempting to fetch batting data for 2023...
  -> SUCCESS: Fetched 134 rows for 2023.
Attempting to fetch batting data for 2024...
  -> SUCCESS: Fetched 129 rows for 2024.

Collected 7 yearly DataFrames for batting.
Successfully combined batting data. Shape before filtering: (943, 320)
Batting data rows before PA filter: 943
Batting

In [48]:
# Block 1a: Player Selection (Interactive)

import pandas as pd
from pybaseball import playerid_lookup

# Ensure pandas displays the table nicely
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)


target_player_id = None # Initialize variable

while target_player_id is None: # Loop until a valid player ID is selected
    print("\nEnter player name to project:")
    # Strip whitespace from inputs
    first_name = input("First Name: ").strip()
    last_name = input("Last Name: ").strip()

    if not first_name or not last_name:
        print("Error: Both first and last name are required.")
        continue # Ask for name again

    try:
        print(f"\nLooking up player ID for '{first_name} {last_name}'...")
        # Use fuzzy=True for potentially better matching with slight misspellings
        player_ids = playerid_lookup(last_name, first_name, fuzzy=True)

        # Filter out players who haven't played in MLB (optional, but cleans up list)
        # FanGraphs data usually only exists for players with MLB time
        player_ids = player_ids[player_ids['mlb_played_last'].notna()].copy()


        if player_ids.empty:
            print(f"Error: No MLB player found matching '{first_name} {last_name}'. Please check spelling and try again.")
            continue # Ask for name again

        elif len(player_ids) == 1:
            # Only one match found
            player_row = player_ids.iloc[0]
            target_player_id = int(player_row['key_fangraphs']) # FanGraphs ID
            print(f"\nFound Player: {player_row['name_first']} {player_row['name_last']} "
                  f"(Played: {int(player_row['mlb_played_first'] or 0)}-{int(player_row['mlb_played_last'] or 0)})")
            print(f"Using FanGraphs ID: {target_player_id}")
            # Optional confirmation step:
            # confirm = input("Is this the correct player? (y/n): ").lower()
            # if confirm != 'y':
            #    target_player_id = None # Reset to loop and ask again
            #    print("Okay, let's try again.")
            #    continue

        else:
            # Multiple players found - display table and ask for selection
            print("\nMultiple players found. Please select the correct one by entering their FanGraphs ID:")

            # Select and rename columns for display
            display_cols = ['name_last', 'name_first', 'key_fangraphs', 'mlb_played_first', 'mlb_played_last']
            player_ids_display = player_ids[display_cols].copy()
            player_ids_display.rename(columns={
                'name_last': 'Last Name',
                'name_first': 'First Name',
                'key_fangraphs': 'FanGraphsID',
                'mlb_played_first': 'First MLB Yr',
                'mlb_played_last': 'Last MLB Yr'
                }, inplace=True)

            # Fill NaN years for display clarity
            player_ids_display[['First MLB Yr', 'Last MLB Yr']] = player_ids_display[['First MLB Yr', 'Last MLB Yr']].fillna(0).astype(int)

            # Print the table without the pandas index
            print(player_ids_display.to_string(index=False))

            # Loop to get valid ID selection from user
            while True:
                try:
                    chosen_id_input = input(f"\nEnter the FanGraphsID for the correct '{first_name} {last_name}': ").strip()
                    chosen_id = int(chosen_id_input)

                    # Verify the chosen ID is in the results list
                    if chosen_id in player_ids['key_fangraphs'].values:
                        target_player_id = chosen_id
                        selected_player_name = player_ids[player_ids['key_fangraphs'] == chosen_id][['name_first', 'name_last']].iloc[0]
                        print(f"Selected: {selected_player_name['name_first']} {selected_player_name['name_last']} (FanGraphs ID: {target_player_id})")
                        break # Valid ID chosen, exit inner loop
                    else:
                        print("Error: Entered ID not found in the list above. Please try again.")

                except ValueError:
                    print("Invalid input. Please enter the numeric FanGraphs ID from the table.")
                except KeyboardInterrupt:
                     print("\nSelection cancelled.")
                     # Decide if you want to exit or retry outer loop here
                     target_player_id = None # Ensure outer loop continues if needed
                     break # Exit inner loop on Ctrl+C

            if target_player_id is None: # If selection was cancelled
                continue # Go back to asking for name


    except Exception as e:
        print(f"An error occurred during player lookup: {e}")
        print("This might be a network issue or a problem with pybaseball/lookup service.")
        print("Please try again later or check your connection.")
        # Depending on the error, you might want to stop execution or retry

# --- End of Player Selection ---
if target_player_id:
    print(f"\nProceeding with analysis for Player ID: {target_player_id}")
else:
    print("\nNo player selected. Cannot proceed.")
    # Handle this case - maybe raise an error or exit
    # raise ValueError("Player selection failed.")


Enter player name to project:


First Name:  steven kwan
Last Name:  kwan



Looking up player ID for 'steven kwan kwan'...
No identically matched names found! Returning the 5 most similar names.

Multiple players found. Please select the correct one by entering their FanGraphs ID:
An error occurred during player lookup: invalid literal for int() with base 10: ''
This might be a network issue or a problem with pybaseball/lookup service.
Please try again later or check your connection.

Enter player name to project:


First Name:  steven
Last Name:  kwan



Looking up player ID for 'steven kwan'...

Found Player: steven kwan (Played: 2022-2025)
Using FanGraphs ID: 24610

Proceeding with analysis for Player ID: 24610


In [52]:
# Block 1b: Get Player Arbitration Status

# Ensure target_player_id was set in Block 1a
if 'target_player_id' not in locals() or target_player_id is None:
    print("Cannot determine player status because target_player_id is not set. Please run Block 1a first.")
    # Handle error - maybe raise one or set a default that will clearly fail later
    player_start_status = "Unknown"
else:
    print(f"\n--- Specify Player Status for First Projection Year ---")
    print(f"What is the status of Player ID {target_player_id} for the upcoming season (the first year of your projection)?")

    valid_statuses = {
        '1': 'Pre-Arb', # Not yet arbitration eligible
        '2': 'Arb 1',   # First year of arbitration
        '3': 'Arb 2',   # Second year of arbitration
        '4': 'Arb 3',   # Third year of arbitration
        '5': 'Arb 4 (S2)',# Fourth year of arbitration (Super Two only)
        '6': 'FA'       # Free Agent
    }

    print("Options:")
    for key, value in valid_statuses.items():
        print(f"  {key}: {value}")

    while True:
        status_choice = input("Enter the number corresponding to the player's status: ").strip()
        if status_choice in valid_statuses:
            player_start_status = valid_statuses[status_choice]
            print(f"Selected starting status: {player_start_status}")
            break
        else:
            print("Invalid choice. Please enter a number from the list.")

# This variable 'player_start_status' will be used in Block 6


--- Specify Player Status for First Projection Year ---
What is the status of Player ID 24610 for the upcoming season (the first year of your projection)?
Options:
  1: Pre-Arb
  2: Arb 1
  3: Arb 2
  4: Arb 3
  5: Arb 4 (S2)
  6: FA


Enter the number corresponding to the player's status:  1


Selected starting status: Pre-Arb


In [54]:
# Block 1c: Get Market Rate ($/WAR) Inputs

print("\n--- Input Market Rate Assumptions ---")
print("Enter the estimated market rate ($ Millions per 1 WAR) for free agents.")
print("(Based on recent trends, pitcher WAR often commands a premium)")

# Set defaults based on recent analysis (e.g., ~$8.5M for hitters, ~$9.5M-$10M for pitchers)
default_hitter_rate = 8.5
default_pitcher_rate = 9.5 # Adjust this based on latest market observations

while True:
    try:
        hitter_rate_input = input(f"Market Rate ($M/WAR) for HITTERS [Default: {default_hitter_rate}]: ").strip()
        market_rate_hitter = float(hitter_rate_input) if hitter_rate_input else default_hitter_rate
        if market_rate_hitter <= 0: raise ValueError("Rate must be positive.")
        break
    except ValueError:
        print("Invalid input. Please enter a positive number (e.g., 8.5).")

while True:
    try:
        pitcher_rate_input = input(f"Market Rate ($M/WAR) for PITCHERS [Default: {default_pitcher_rate}]: ").strip()
        market_rate_pitcher = float(pitcher_rate_input) if pitcher_rate_input else default_pitcher_rate
        if market_rate_pitcher <= 0: raise ValueError("Rate must be positive.")
        break
    except ValueError:
        print("Invalid input. Please enter a positive number (e.g., 9.5).")

print("\n--- Market Rates Confirmed ---")
print(f"Hitter Market Rate: ${market_rate_hitter:.1f} M / WAR")
print(f"Pitcher Market Rate: ${market_rate_pitcher:.1f} M / WAR")
print("----------------------------")

# These variables 'market_rate_hitter' and 'market_rate_pitcher' will be used in Block 6


--- Input Market Rate Assumptions ---
Enter the estimated market rate ($ Millions per 1 WAR) for free agents.
(Based on recent trends, pitcher WAR often commands a premium)


Market Rate ($M/WAR) for HITTERS [Default: 8.5]:  8.5
Market Rate ($M/WAR) for PITCHERS [Default: 9.5]:  9.5



--- Market Rates Confirmed ---
Hitter Market Rate: $8.5 M / WAR
Pitcher Market Rate: $9.5 M / WAR
----------------------------


In [56]:
# Block 3: Establish Baseline Performance

# --- Function Definition (keep calculate_baseline as is) ---
def calculate_baseline(player_stats, stat_cols, weight_recent=0.5, weight_middle=0.3, weight_old=0.2):
    """Calculates weighted baseline stats for a player."""
    # ...(function code remains the same)...
    player_stats = player_stats.sort_values(by='Season', ascending=False)
    baseline = {}
    total_pa_ip = 0 # Use PA for hitters, IP for pitchers

    # Determine if hitter (PA) or pitcher (IP)
    pa_ip_col = 'PA' if 'PA' in player_stats.columns else 'IP'

    if len(player_stats) == 0:
        return None # No historical data

    # --- Add check for required columns early ---
    missing_cols = [col for col in stat_cols if col not in player_stats.columns]
    if missing_cols:
         print(f"Error inside calculate_baseline: Missing columns in input data: {missing_cols}")
         # Decide how to handle: return None, raise error, etc.
         return None
    if pa_ip_col not in player_stats.columns:
        print(f"Error inside calculate_baseline: Missing '{pa_ip_col}' column.")
        return None
    if 'Name' not in player_stats.columns or 'Age' not in player_stats.columns or 'playerid' not in player_stats.columns:
         print("Error inside calculate_baseline: Missing Name/Age/playerid columns.")
         return None
    # --- End check ---


    if len(player_stats) == 1:
        weights = {0: 1.0}
        total_pa_ip = player_stats.iloc[0][pa_ip_col]
    elif len(player_stats) == 2:
        weights = {0: 0.6, 1: 0.4}
        total_pa_ip = player_stats.iloc[0][pa_ip_col] * 0.6 + player_stats.iloc[1][pa_ip_col] * 0.4
    else:
        weights = {0: weight_recent, 1: weight_middle, 2: weight_old}
        total_pa_ip = (player_stats.iloc[0][pa_ip_col] * weight_recent +
                       player_stats.iloc[1][pa_ip_col] * weight_middle +
                       player_stats.iloc[2][pa_ip_col] * weight_old)

    for stat in stat_cols:
        weighted_sum = 0
        total_weight = 0
        for i, weight in weights.items():
             if i < len(player_stats):
                 # Check for non-numeric data before multiplying
                 stat_value = player_stats.iloc[i][stat]
                 if pd.api.types.is_numeric_dtype(stat_value):
                     weighted_sum += stat_value * weight
                     total_weight += weight
                 else:
                      # Handle non-numeric data - skip, log, etc.
                      # print(f"Warning: Non-numeric value '{stat_value}' found for stat '{stat}' in row {i}. Skipping.")
                      pass # Simple skip for now
        baseline[stat] = weighted_sum / total_weight if total_weight > 0 else 0

    baseline[f'baseline_{pa_ip_col}'] = total_pa_ip
    baseline['Name'] = player_stats.iloc[0]['Name']
    baseline['last_season_age'] = player_stats.iloc[0]['Age']
    baseline['playerid'] = player_stats.iloc[0]['playerid']

    return baseline


# --- Example Usage (Modified Logic) ---

# Make sure target_player_id is defined from Block 1a (Player Selection)
if 'target_player_id' not in locals() and 'target_player_id' not in globals():
     print("Error: target_player_id is not defined. Please run Block 1a (Player Selection) first.")
     player_baseline = None # Prevent further errors
else:
     print(f"Attempting baseline calculation for Player ID: {target_player_id}")

     # --- Add Debugging: Print Columns & Shapes ---
     print("\n--- Checking DataFrames before baseline calculation ---")
     if 'batting_data' in locals() and isinstance(batting_data, pd.DataFrame):
         print(f"Batting Data Shape: {batting_data.shape}")
     else:
         print("Batting Data variable does not exist or is not a DataFrame.")
         batting_data = pd.DataFrame() # Ensure it exists as empty DF

     # --- End Debugging ---

     # Define stats needed for baseline (use actual columns from your data)
     # Ensure these columns exist in the fetched DataFrames! Check the printout above.
     hitter_baseline_stats = ['wRC+', 'WAR', 'PA', 'Age', 'Season', 'Name', 'playerid'] # Added essential cols for function
     pitcher_baseline_stats = ['FIP-', 'WAR', 'IP', 'Age', 'Season', 'Name', 'playerid'] # Added essential cols for function

     # Default to None
     player_baseline = None

     # --- Modified Check: Check DataFrame not empty AND contains playerid column BEFORE filtering ---
     if not batting_data.empty and 'playerid' in batting_data.columns and target_player_id in batting_data['playerid'].unique():
         player_history = batting_data[batting_data['playerid'] == target_player_id].copy()
         print(f"Found {len(player_history)} season(s) for player {target_player_id} in batting data.")
         # Verify required columns exist *in player_history*
         missing_cols = [col for col in hitter_baseline_stats if col not in player_history.columns]
         if missing_cols:
              print(f"Error: Missing required baseline columns in player_history (batting): {missing_cols}")
         else:
              player_baseline = calculate_baseline(player_history, hitter_baseline_stats)
              if player_baseline:
                   print("\nCalculated Hitter Baseline:")
                   print(player_baseline)
              else:
                   print("\nBaseline calculation failed for hitter (check function errors).")


     elif not pitching_data.empty and 'playerid' in pitching_data.columns and target_player_id in pitching_data['playerid'].unique():
         player_history_pitching = pitching_data[pitching_data['playerid'] == target_player_id].copy()
         print(f"Found {len(player_history_pitching)} season(s) for player {target_player_id} in pitching data.")
         # Verify required columns exist *in player_history_pitching*
         missing_cols = [col for col in pitcher_baseline_stats if col not in player_history_pitching.columns]
         if missing_cols:
              print(f"Error: Missing required baseline columns in player_history_pitching: {missing_cols}")
         else:
              player_baseline = calculate_baseline(player_history_pitching, pitcher_baseline_stats)
              if player_baseline:
                  print("\nCalculated Pitcher Baseline:")
                  print(player_baseline)
              else:
                  print("\nBaseline calculation failed for pitcher (check function errors).")

     # --- This else block now executes if the player ID is not found in EITHER non-empty dataframe ---
     else:
         print(f"\nPlayer ID {target_player_id} not found in non-empty batting or pitching data after filtering in Block 2.")
         print("Possible reasons: Player didn't meet qualifiers, data unavailable, ID issue, or DataFrames are empty.")
         # player_baseline remains None

# --- ADD THIS AT THE END of the "Example Usage" section in Block 3 ---
# (Should be after the main if/elif/else that calls calculate_baseline)

# Determine if player is hitter or pitcher based on successful baseline calculation
if 'player_baseline' in locals() and player_baseline is not None:
    # Check if baseline includes PA key (hitter) or IP key (pitcher)
    if 'baseline_PA' in player_baseline:
        is_hitter_flag = True
        print(f"\nPlayer identified as: Hitter") # Added newline for clarity
    elif 'baseline_IP' in player_baseline:
        is_hitter_flag = False
        print(f"\nPlayer identified as: Pitcher") # Added newline for clarity
    else:
        is_hitter_flag = None # Should not happen if baseline calculated correctly
        print("\nWarning: Could not determine player type (Hitter/Pitcher) from baseline keys.")
else:
    is_hitter_flag = None # Flag is unknown if baseline failed
    print("\nCould not determine player type because baseline calculation failed or player_baseline is None.")

# --- End of addition to Block 3 ---

# Check final result
if player_baseline is None:
     print("\nPlayer baseline could not be calculated.")

Attempting baseline calculation for Player ID: 24610

--- Checking DataFrames before baseline calculation ---
Batting Data Shape: (801, 320)
Found 3 season(s) for player 24610 in batting data.

Calculated Hitter Baseline:
{'wRC+': 120.2, 'WAR': 3.9499999999999997, 'PA': 613.0, 'Age': 25.3, 'Season': 2023.3000000000002, 'Name': 'Steven Kwan', 'playerid': 24610, 'baseline_PA': 613.0, 'last_season_age': 26}

Player identified as: Hitter


In [58]:
# Block 5: Project Future Seasons (Using FanGraphs Avg WAR Curve)

import pandas as pd
import numpy as np # Ensure numpy is imported

# --- Define Aging Curve Coefficients (from FanGraphs Article) ---
beta_0 = -15.46
beta_1 = 1.20
beta_2 = -0.02

# --- Define Standard Full Season Playing Time (for scaling WAR rate) ---
# These are typical values, can be adjusted
FULL_SEASON_PA = 600
FULL_SEASON_IP = 180 # Example for SP, might differ for RP heavy profile

# --- Keep Existing Helper Functions & Simple Curves ---
# (Keep get_aging_factor, hitter_aging_curve, pitcher_aging_curve_fip, playing_time_aging defined as before)
# We'll still project wRC+/FIP- with the simple curves for display purposes,
# but the WAR calculation will use the new formula.

def get_aging_factor(curve, age):
    """Safely gets aging factor, using closest available if age is out of range."""
    if age in curve:
        return curve[age]
    elif age < min(curve.keys()):
        return curve[min(curve.keys())]
    else:
        return curve[max(curve.keys())]

# Generic Hitter Aging (e.g., for wRC+) - Keep for display
hitter_aging_curve = { 20: 1.02, 21: 1.03, 22: 1.04, 23: 1.04, 24: 1.03, 25: 1.02, 26: 1.01, 27: 1.00, 28: 1.00, 29: 0.99, 30: 0.98, 31: 0.96, 32: 0.94, 33: 0.92, 34: 0.89, 35: 0.86, 36: 0.83, 37: 0.80, 38: 0.77, 39: 0.74, 40: 0.70 }
# Generic Pitcher Aging (e.g., for FIP-) - Keep for display
pitcher_aging_curve_fip = { 20: 0.98, 21: 0.97, 22: 0.96, 23: 0.97, 24: 0.98, 25: 0.99, 26: 1.00, 27: 1.00, 28: 1.01, 29: 1.02, 30: 1.03, 31: 1.05, 32: 1.07, 33: 1.09, 34: 1.12, 35: 1.15, 36: 1.18, 37: 1.21, 38: 1.24, 39: 1.27, 40: 1.30 }
# Generic Playing Time Aging
playing_time_aging = { age: 1.00 for age in range(20, 29) }
playing_time_aging.update({ 29: 0.99, 30: 0.98, 31: 0.97, 32: 0.96, 33: 0.94, 34: 0.92, 35: 0.90, 36: 0.87, 37: 0.84, 38: 0.80, 39: 0.76, 40: 0.72 })


# --- Updated project_player function ---
def project_player(baseline, num_years, is_hitter=True):
    """Projects player performance using FanGraphs AVG WAR curve and simple PT/Rate aging."""
    projections = []
    if baseline is None: # Add check for None baseline
        print("Error: Baseline data is None, cannot project.")
        return pd.DataFrame(projections)

    last_age = baseline.get('last_season_age') # Use .get for safety
    if last_age is None:
        print("Error: 'last_season_age' not found in baseline.")
        return pd.DataFrame(projections)

    # Determine player type and baseline stats/playing time
    if is_hitter:
        pa_ip_col = 'PA'
        baseline_rate_stat = 'wRC+'
        rate_aging_curve = hitter_aging_curve
        full_season_denominator = FULL_SEASON_PA
    else:
        pa_ip_col = 'IP'
        baseline_rate_stat = 'FIP-'
        rate_aging_curve = pitcher_aging_curve_fip
        full_season_denominator = FULL_SEASON_IP

    # Get baseline values safely using .get() with defaults
    baseline_rate_value = baseline.get(baseline_rate_stat, 100 if is_hitter else 100) # Default to average
    projected_pa_ip = baseline.get(f'baseline_{pa_ip_col}', 0) # Get baseline playing time

    if projected_pa_ip <= 0:
        print(f"Warning: Baseline {pa_ip_col} is {projected_pa_ip}. WAR projections will be zero.")
        # Avoid division by zero later if baseline PT is zero
        projected_pa_ip = 0


    print(f"Projecting {num_years} years forward from Age {last_age}...")

    for i in range(1, num_years + 1):
        current_age = last_age + i
        proj = {'Year': last_age + i, 'Age': current_age} # Year relative to last data year? Or calendar year? Revisit if needed.

        # 1. Project Rate Stat (wRC+ or FIP-) using SIMPLE aging curve (for display only)
        age_factor_rate = get_aging_factor(rate_aging_curve, current_age)
        proj[f'proj_{baseline_rate_stat}'] = baseline_rate_value * age_factor_rate
        if not is_hitter: # Floor for FIP-
            proj[f'proj_{baseline_rate_stat}'] = max(50, proj[f'proj_{baseline_rate_stat}'])

        # 2. Project Playing Time using SIMPLE aging curve
        age_factor_pt = get_aging_factor(playing_time_aging, current_age)
        # Apply cumulatively
        projected_pa_ip *= age_factor_pt
        # Ensure playing time doesn't go below zero
        projected_pa_ip = max(0, projected_pa_ip)
        proj[f'proj_{pa_ip_col}'] = round(projected_pa_ip, 1)

        # --- 3. Project WAR using FanGraphs Average Curve Formula ---
        # Calculate the average WAR rate for this age
        avg_war_at_age = beta_0 + (beta_1 * current_age) + (beta_2 * (current_age**2))

        # Scale the average WAR rate by projected playing time relative to a full season
        # Avoid division by zero if denominator is 0
        if full_season_denominator > 0:
            playing_time_ratio = proj[f'proj_{pa_ip_col}'] / full_season_denominator
        else:
            playing_time_ratio = 0

        projected_war = avg_war_at_age * playing_time_ratio
        # We don't floor WAR at 0, as players can have negative WAR
        proj['proj_WAR'] = round(projected_war, 1)
        # --- End New WAR Calculation ---

        projections.append(proj)

    return pd.DataFrame(projections)


# --- Example Usage (as before, but ensure `projection_years` is defined) ---
# Make sure projection_years is defined (e.g., from user input or set default)
if 'projection_years' not in locals():
    print("Projection years not set, using default of 6.")
    projection_years = 6

if 'player_baseline' in locals() and player_baseline is not None:
     # Ensure is_hitter_flag was set correctly in Block 3
    if 'is_hitter_flag' in locals() and is_hitter_flag is not None:
        player_projections = project_player(player_baseline, projection_years, is_hitter=is_hitter_flag)
        if not player_projections.empty:
             print(f"\n{projection_years}-Year Projection for {player_baseline.get('Name', 'Player')}:")
             print(player_projections)
        else:
             print("\nProjection could not be generated.")
             player_projections = pd.DataFrame() # Ensure it's an empty DF
    else:
        print("Cannot create projection without knowing if player is hitter or pitcher (is_hitter_flag missing).")
        player_projections = pd.DataFrame()
else:
    print("Cannot create projection without baseline.")
    player_projections = pd.DataFrame() # Ensure it's an empty DF

Projecting 15 years forward from Age 26...

15-Year Projection for Steven Kwan:
    Year  Age  proj_wRC+  proj_PA  proj_WAR
0     27   27    120.200    613.0       2.4
1     28   28    120.200    613.0       2.5
2     29   29    118.998    606.9       2.5
3     30   30    117.796    594.7       2.5
4     31   31    115.392    576.9       2.4
..   ...  ...        ...      ...       ...
10    37   37     96.160    315.0       0.8
11    38   38     92.554    252.0       0.5
12    39   39     88.948    191.5       0.3
13    40   40     84.140    137.9       0.1
14    41   41     84.140     99.3       0.0

[15 rows x 5 columns]


In [60]:
def project_player(baseline, num_years, is_hitter=True):
    """Projects player performance for future years."""
    projections = []
    last_age = baseline['last_season_age']

    # Use appropriate baseline stat (wRC+ or FIP-) and aging curve
    baseline_rate_stat = 'wRC+' if is_hitter else 'FIP-'
    rate_aging_curve = hitter_aging_curve if is_hitter else pitcher_aging_curve_fip

    # Use baseline PA or IP for playing time projection start
    pa_ip_col = 'PA' if is_hitter else 'IP'
    projected_pa_ip = baseline[f'baseline_{pa_ip_col}'] # Start with baseline average

    # Use baseline WAR for projecting future WAR (simplistic approach)
    # A better approach projects components (Batting, Field, BaseRun) then calculates WAR
    baseline_war_rate = baseline['WAR'] / baseline[f'baseline_{pa_ip_col}'] if baseline[f'baseline_{pa_ip_col}'] > 0 else 0

    for i in range(1, num_years + 1):
        current_age = last_age + i
        proj = {'Year': baseline['last_season_age'] + i, 'Age': current_age} # Year relative to last data year

        # 1. Project Rate Stat (wRC+ or FIP-)
        age_factor_rate = get_aging_factor(rate_aging_curve, current_age)
        proj[f'proj_{baseline_rate_stat}'] = baseline[baseline_rate_stat] * age_factor_rate
        # Ensure FIP- doesn't go below a reasonable floor (e.g., 50)
        if not is_hitter:
            proj[f'proj_{baseline_rate_stat}'] = max(50, proj[f'proj_{baseline_rate_stat}'])

        # 2. Project Playing Time
        age_factor_pt = get_aging_factor(playing_time_aging, current_age)
        # Apply aging factor cumulatively or just to baseline estimate
        # Simple: Apply to previous year's projection
        projected_pa_ip *= age_factor_pt
        proj[f'proj_{pa_ip_col}'] = round(projected_pa_ip, 1)

        # 3. Project WAR (Simplified Example: Scale baseline WAR/PA or WAR/IP by aging * projected PT)
        # THIS IS A MAJOR SIMPLIFICATION - ideally you calculate WAR from projected components
        # Apply a generic aging factor to the WAR rate itself (similar to rate stat aging)
        generic_war_age_factor = get_aging_factor(hitter_aging_curve, current_age) # Assume similar aging for WAR
        projected_war = (baseline_war_rate * generic_war_age_factor) * proj[f'proj_{pa_ip_col}']
        proj['proj_WAR'] = round(projected_war, 1)

        projections.append(proj)

    return pd.DataFrame(projections)

# --- Example Usage ---
projection_years = 8 # Project 6 years of team control/contract
if player_baseline:
    is_hitter_flag = 'wRC+' in player_baseline # Check if baseline contains hitter stat
    player_projections = project_player(player_baseline, projection_years, is_hitter=is_hitter_flag)
    print(f"\n{projection_years}-Year Projection for {player_baseline['Name']}:")
    print(player_projections)
else:
    print("Cannot create projection without baseline.")
    player_projections = pd.DataFrame()


8-Year Projection for Steven Kwan:
   Year  Age  proj_wRC+  proj_PA  proj_WAR
0    27   27    120.200    613.0       4.0
1    28   28    120.200    613.0       4.0
2    29   29    118.998    606.9       3.9
3    30   30    117.796    594.7       3.8
4    31   31    115.392    576.9       3.6
5    32   32    112.988    553.8       3.4
6    33   33    110.584    520.6       3.1
7    34   34    106.978    478.9       2.7


In [64]:
# Block 6: Financial Analysis (Applying Average Flat Salary for FA Years)

import numpy as np
import pandas as pd
import re # Make sure re is imported

print("\n--- Entering Block 6 ---")

# --- Variable Check ---
# (Keep the same 'Simplified Variable Check' from the previous version)
required_vars = ['player_projections', 'player_baseline', 'player_start_status',
                 'market_rate_hitter', 'market_rate_pitcher',
                 'target_player_id', 'is_hitter_flag']
proceed = True
missing_or_invalid = []
print("--- Running Simplified Variable Check ---")
for var_name in required_vars:
    # ... (Variable check logic remains the same) ...
    print(f"Checking variable: '{var_name}'...")
    var_value = globals().get(var_name)
    if var_value is None and var_name != 'is_hitter_flag': proceed = False; missing_or_invalid.append(f"{var_name} (Missing/None)"); print(f"  -> FAIL: Missing/None")
    elif var_name == 'is_hitter_flag' and var_value is None: proceed = False; missing_or_invalid.append(f"{var_name} (Missing/None)"); print(f"  -> FAIL: Missing/None")
    elif isinstance(var_value, pd.DataFrame) and var_value.empty: proceed = False; missing_or_invalid.append(f"{var_name} (Empty DataFrame)"); print(f"  -> FAIL: Empty DataFrame")
    elif isinstance(var_value, dict) and not var_value: proceed = False; missing_or_invalid.append(f"{var_name} (Empty Dict)"); print(f"  -> FAIL: Empty Dict")
    else: print(f"  -> OK: Variable '{var_name}' exists and seems valid.")
print("---------------------------------------")

if not proceed:
    print(f"\nSkipping financial analysis due to issues with prerequisite variables: {', '.join(missing_or_invalid)}")
    if 'player_projections' not in globals() or not isinstance(globals().get('player_projections'), pd.DataFrame): player_projections = pd.DataFrame()
else:
    if player_projections.empty or not isinstance(player_baseline, dict) or not player_baseline:
         print("\nSkipping financial analysis due to empty projections or missing/empty baseline data.")
    else:
         print("\n--- Calculating Financial Summary with Projected Salaries (Flat FA Years) ---")
         # --- Define Arbitration Salary Logic & Minimum ---
         arb_pct_yr1 = 0.25; arb_pct_yr2 = 0.40; arb_pct_yr3 = 0.60; arb_pct_yr4 = 0.90
         arb_percentages = {1: arb_pct_yr1, 2: arb_pct_yr2, 3: arb_pct_yr3, 4: arb_pct_yr4}
         mlb_minimum = 0.78 # Approx 2025 minimum in $M

         # --- Determine Position-Specific Market Rate ---
         current_market_rate = market_rate_hitter if is_hitter_flag else market_rate_pitcher
         print(f"Using {'Hitter' if is_hitter_flag else 'Pitcher'} Market Rate: ${current_market_rate:.1f}M / WAR")

         # --- Determine FA Years and Calculate Average FA Value ---
         fa_year_indices = []
         temp_status = player_start_status
         total_fa_proj_war = 0
         num_fa_years = 0

         for index in player_projections.index:
             is_fa = False
             if temp_status == "FA":
                 is_fa = True
             elif temp_status.startswith("Arb"):
                 try:
                     arb_year_str = temp_status.split()[-1].replace('(S2)','')
                     arb_year = int(arb_year_str)
                     max_arb_years = 4 if "(S2)" in temp_status else 3
                     if arb_year < max_arb_years:
                         temp_status = f"Arb {arb_year + 1}"
                         if "(S2)" in temp_status: temp_status += " (S2)"
                     else:
                         temp_status = "FA"
                         is_fa = True # Becomes FA *next* year, so this year is last Arb
                 except:
                      temp_status = "FA" # Default to FA on error
                      is_fa = True
             elif temp_status == "Pre-Arb":
                 temp_status = "Arb 1" # Simplified transition

             if is_fa:
                 fa_year_indices.append(index)
                 total_fa_proj_war += player_projections.loc[index, 'proj_WAR']
                 num_fa_years += 1

         # Calculate the average flat salary for FA years
         avg_fa_proj_war = total_fa_proj_war / num_fa_years if num_fa_years > 0 else 0
         avg_fa_salary_m = max(mlb_minimum, avg_fa_proj_war * current_market_rate)
         avg_fa_salary_m = round(avg_fa_salary_m, 2)

         if num_fa_years > 0:
              print(f"Calculated average projected FA WAR: {avg_fa_proj_war:.1f} over {num_fa_years} years.")
              print(f"Applying flat average salary of ${avg_fa_salary_m:.2f}M for projected FA years...")
         # --- End FA Average Calculation ---


         # --- Calculate Estimated Salaries Based on Status (Applying Flat FA Salary) ---
         calculated_salaries = []
         current_status = player_start_status # Reset status for main calculation loop

         for index, row in player_projections.iterrows():
             proj_war = row['proj_WAR']
             projected_fa_value_m = max(0, proj_war * current_market_rate) # Still need this for Arb calc base
             estimated_salary = mlb_minimum
             next_status = current_status

             # Determine salary based on current_status
             if current_status == "FA":
                 estimated_salary = avg_fa_salary_m # Assign pre-calculated average FA salary
                 next_status = "FA"
             elif current_status == "Pre-Arb":
                 estimated_salary = mlb_minimum
                 next_status = "Arb 1"
             elif current_status.startswith("Arb"):
                 try:
                     arb_year_match = re.search(r'\d+', current_status)
                     if arb_year_match:
                         arb_year = int(arb_year_match.group())
                         percentage = arb_percentages.get(arb_year, 1.0)
                         estimated_salary = max(mlb_minimum, projected_fa_value_m * percentage)
                         max_arb_years = 4 if "(S2)" in current_status else 3
                         if arb_year < max_arb_years:
                             next_status = f"Arb {arb_year + 1}"
                             if "(S2)" in current_status: next_status += " (S2)"
                         else:
                             next_status = "FA" # Transition to FA *next* year
                     else:
                         raise ValueError("Could not find arbitration year number")
                 except (ValueError, IndexError) as e:
                      print(f"Warning: Could not parse Arb year from status '{current_status}'. Error: {e}. Assuming FA calculation.")
                      estimated_salary = avg_fa_salary_m # Default to avg FA salary on error
                      next_status = "FA"
             else: # Handle Unknown
                 print(f"Warning: Unknown player status '{current_status}'. Using league minimum.")
                 estimated_salary = mlb_minimum
                 next_status = "Unknown"

             calculated_salaries.append(round(estimated_salary, 2))
             current_status = next_status # Update status for the *next* iteration

         player_projections['Est_Salary_M'] = calculated_salaries

         # --- Calculate Financial Summary (Rest is the same) ---
         total_proj_war = player_projections['proj_WAR'].sum()
         total_salary_m = player_projections['Est_Salary_M'].sum()
         projected_dollar_per_war_cost = (total_salary_m * 1_000_000) / total_proj_war if total_proj_war > 0 else np.inf
         player_projections['Proj_Value_M'] = player_projections['proj_WAR'] * current_market_rate
         player_projections['Surplus_Value_M'] = player_projections['Proj_Value_M'] - player_projections['Est_Salary_M']
         total_surplus_value_m = player_projections['Surplus_Value_M'].sum()

         # --- Display Results ---
         # (Display logic remains the same)
         player_name_display = player_baseline.get('Name', f"ID {target_player_id}") if player_baseline else f"ID {target_player_id}"
         projection_years_display = len(player_projections)
         print(f"\nFinancial Summary for {player_name_display} ({projection_years_display} Years - Starting Status: {player_start_status}):")
         display_df = player_projections[['Year', 'Age', 'proj_WAR', 'Est_Salary_M', 'Proj_Value_M', 'Surplus_Value_M']].copy()
         for col in ['proj_WAR']: display_df[col] = display_df[col].map('{:,.1f}'.format)
         for col in ['Est_Salary_M', 'Proj_Value_M', 'Surplus_Value_M']: display_df[col] = display_df[col].map('${:,.2f}M'.format)
         print(display_df.to_string(index=False))
         # ... (rest of print statements) ...
         print(f"\nTotal Projected WAR: {total_proj_war:.1f}")
         print(f"Total Estimated Salary: ${total_salary_m:.2f} M")
         if np.isfinite(projected_dollar_per_war_cost): print(f"Projected Salary Cost per WAR: ${projected_dollar_per_war_cost / 1_000_000:.2f} M per WAR")
         else: print("Projected Salary Cost per WAR: N/A (Projected WAR is zero or negative)")
         print(f"Total Estimated Surplus Value: ${total_surplus_value_m:.2f} M (based on ${current_market_rate:.1f}M/WAR for {'Hitter' if is_hitter_flag else 'Pitcher'})")

# Final messages if prerequisites were missing
# ... (rest of the block) ...


--- Entering Block 6 ---
--- Running Simplified Variable Check ---
Checking variable: 'player_projections'...
  -> OK: Variable 'player_projections' exists and seems valid.
Checking variable: 'player_baseline'...
  -> OK: Variable 'player_baseline' exists and seems valid.
Checking variable: 'player_start_status'...
  -> OK: Variable 'player_start_status' exists and seems valid.
Checking variable: 'market_rate_hitter'...
  -> OK: Variable 'market_rate_hitter' exists and seems valid.
Checking variable: 'market_rate_pitcher'...
  -> OK: Variable 'market_rate_pitcher' exists and seems valid.
Checking variable: 'target_player_id'...
  -> OK: Variable 'target_player_id' exists and seems valid.
Checking variable: 'is_hitter_flag'...
  -> OK: Variable 'is_hitter_flag' exists and seems valid.
---------------------------------------

--- Calculating Financial Summary with Projected Salaries (Flat FA Years) ---
Using Hitter Market Rate: $8.5M / WAR
Calculated average projected FA WAR: 3.3 over 5

In [None]:
# Block 7: Visualization (Using Matplotlib for Line Plot)

import matplotlib.pyplot as plt
import seaborn as sns # Still needed for bar plot potentially
import numpy as np
import pandas as pd
import warnings

if not player_projections.empty:
    # --- Replace ALL inf values in the DataFrame ---
    print("Checking for and replacing ALL infinite values in DataFrame before plotting...")
    inf_mask = np.isinf(player_projections.select_dtypes(include=[np.number]))
    if inf_mask.any().any():
         print(f"Found infinite values in columns: {inf_mask.any()[inf_mask.any()].index.tolist()}")
         player_projections = player_projections.replace([np.inf, -np.inf], np.nan)
         print("Replaced infinite values with NaN.")
    else:
         print("No infinite values found in numeric columns.")

    # --- Plotting Code ---
    print("Generating plots...")
    plt.figure(figsize=(12, 6))

    # --- Plot Projected WAR vs Age using Matplotlib ---
    plt.subplot(1, 2, 1)
    try:
        # Ensure we drop NaNs for plotting
        plot_data_line = player_projections.dropna(subset=['Age', 'proj_WAR'])
        if not plot_data_line.empty:
            # Use plt.plot instead of sns.lineplot
            plt.plot(plot_data_line['Age'], plot_data_line['proj_WAR'], marker='o', linestyle='-')
            # Set title and labels using plt
            plt.title(f"{player_baseline.get('Name', 'Player')} - Projected WAR by Age")
            plt.xlabel("Age")
            plt.ylabel("Projected WAR")
            plt.grid(True)
        else:
            print("No valid data points to plot for WAR vs Age after handling inf/NaN.")
            plt.title("No WAR data to plot")
            plt.grid(True)
    except Exception as plot_err:
        print(f"Error during line plot generation: {plot_err}")
    # --- End Matplotlib Line Plot ---


    # Plot Projected Value vs Salary (Keep using plt.bar)
    plt.subplot(1, 2, 2)
    plot_data_bar = player_projections.dropna(subset=['Age', 'Proj_Value_M', 'Est_Salary_M'])
    if not plot_data_bar.empty:
        index = plot_data_bar['Age']
        market_rate = locals().get('market_rate_per_war', 8.5)
        plt.bar(index - 0.35/2, plot_data_bar['Proj_Value_M'], 0.35, label=f'Projected Value (${market_rate:.1f}M/WAR)', color='skyblue')
        plt.bar(index + 0.35/2, plot_data_bar['Est_Salary_M'], 0.35, label='Estimated Salary', color='salmon')
        plt.xlabel("Age")
        plt.ylabel("Value / Salary ($ Millions)")
        plt.title("Projected Value vs. Estimated Salary")
        plt.xticks(index)
        plt.legend()
        plt.grid(True, axis='y')
    else:
        print("No valid data points to plot for Value vs Salary after handling inf/NaN.")
        plt.title("No Value/Salary data to plot")
        plt.grid(True)

    plt.tight_layout()
    plt.show()

else:
    print("Skipping visualization as player_projections DataFrame is empty.")