<a href="https://colab.research.google.com/github/nickklos10/SerieA_Machine_Learning_Predictions_2025/blob/main/2025_Final_ML_SerieA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate, Dropout, Lambda
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import shap
import os
from functools import partial

In [None]:
# Record the library versions in the cell output so results can be tied to an environment.
print("Using TensorFlow version: " + tf.__version__)
print("Using SHAP version: " + shap.__version__)

Using TensorFlow version: 2.18.0
Using SHAP version: 0.47.1


# --- Configuration Constants ---


In [None]:
# Files and Paths
# NOTE(review): DATA_FILEPATH is an absolute path; the '/content' prefix looks like the
# Colab runtime this notebook links to — adjust when running elsewhere.
DATA_FILEPATH = '/content/final_merged_data_with_transfers.csv'
PREDICTIONS_OUTPUT_FILE = 'final_2025_predictions_refactored.csv'
SHAP_SUMMARY_PLOT_FILE = "shap_summary_plot.png"
SHAP_DEPENDENCE_PLOT_PREFIX = "shap_dependence_"

# Data Columns
TARGET_POINTS_COL = 'Pti'  # Season points (regression target of the points head)
TARGET_RESULTS_COLS = ['Vit', 'Par', 'Sco']  # Wins, draws, losses — order matters downstream
GIO_COL = 'Gio' # Games Played
CATEGORICAL_COLS = ['Team', 'Coach']  # Label-encoded and fed to embedding layers
# Features that additionally get a log1p-transformed '<name>_log' companion column.
LOG_TRANSFORM_COLS = [
    'Average Market Value', 'Total Market Value', 'Market Value IN Players',
    'Market Value OUT Players', 'Fees Players IN', 'Fees Players OUT', 'Net_Spent'
]
# Raw numeric input features (the '_log' companions are appended during preprocessing).
BASE_INPUT_FEATURES = [
    'Squad Size', 'Average Age', 'Foreigners',
    'Average Market Value', 'Total Market Value', 'Players In', 'Players Out',
    'Average Age IN players', 'Average Age OUT players',
    'Market Value IN Players', 'Market Value OUT Players',
    'Fees Players IN', 'Fees Players OUT', 'Net_Spent'
]

# Modeling Parameters
UNKNOWN_TOKEN_ID = 0 # Reserve index 0 for unknown categories
RANDOM_SEED = 42
LAST_COMPLETED_YEAR = 2023
PREDICTION_YEAR_OFFSET = 2 # Predict for last_completed_year + 2 (e.g., 2025 if last is 2023)
FULL_SEASON_GAMES = 38 # Assumed games in the prediction year season
IMPUTATION_STRATEGY = 'median'  # Passed to sklearn's SimpleImputer(strategy=...)

# Neural Network Hyperparameters
TEAM_EMBEDDING_DIM = 10 # Output width of the team embedding
COACH_EMBEDDING_DIM = 8  # Output width of the coach embedding
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.001 # Standard Adam default, can be tuned
LOSS_WEIGHTS = {'points_output': 1.0, 'results_output': 1.0} # Keys must match the model's output layer names

# Training Parameters
EPOCHS = 150 # Upper bound on epochs; EarlyStopping is expected to stop training sooner
BATCH_SIZE = 32
EARLY_STOPPING_PATIENCE = 15 # Epochs without val_loss improvement before stopping

# Post-Processing Parameters
ADJUST_OUTCOMES_SEARCH_WINDOW = 5 # Window size (+/-) for searching W/D in adjust_outcomes

# SHAP Parameters
SHAP_BACKGROUND_SAMPLES = 20  # Training rows sampled as the explainer background
SHAP_EXPLAIN_SAMPLES = 10  # Validation rows to explain

# --- Utility Functions ---


In [None]:
def set_seeds(seed_value):
    """Seeds every random number generator this notebook can reach at runtime.

    Seeds Python's built-in ``random`` module, NumPy's legacy global generator
    and TensorFlow's global generator so repeated runs start from the same
    state.

    Args:
        seed_value (int): Seed applied to all three generators.

    Note:
        ``PYTHONHASHSEED`` only takes effect when set before the interpreter
        starts, so it is intentionally not set here.
    """
    # Local import: the notebook's import cell does not import `random`.
    import random

    random.seed(seed_value)
    np.random.seed(seed_value)
    tf.random.set_seed(seed_value)
    print(f"Random seeds set to {seed_value}")

def create_output_directory(dir_name="output_plots"):
    """Ensures ``dir_name`` exists as a directory and returns it.

    Args:
        dir_name (str): Directory path to create (parents are created too).

    Returns:
        str: ``dir_name`` unchanged, for convenient chaining.

    Raises:
        FileExistsError: If ``dir_name`` already exists but is not a directory.
    """
    already_present = os.path.isdir(dir_name)
    # exist_ok=True removes the check-then-create race and still raises when the
    # path exists as a regular file instead of silently returning a bad path.
    os.makedirs(dir_name, exist_ok=True)
    if not already_present:
        print(f"Created directory: {dir_name}")
    return dir_name

# --- Data Loading and Preprocessing ---

In [None]:
def load_data(filepath):
    """Reads the CSV at ``filepath`` and strips whitespace from its column names.

    Returns:
        pandas.DataFrame | None: The loaded frame, or ``None`` (after printing
        an error message) when the file does not exist.
    """
    try:
        loaded = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: Data file not found at {filepath}")
        return None

    # Header cells may carry stray leading/trailing spaces.
    loaded.columns = loaded.columns.str.strip()
    print(f"Data loaded successfully from {filepath}. Shape: {loaded.shape}")
    return loaded

def preprocess_data(df, target_points_col, target_results_cols, input_features_cols,
                      log_features_cols, gio_col, categorical_cols,
                      imputation_strategy='median'):
    """Applies log transforms, numeric imputation and 1-based categorical encoding.

    Args:
        df (pandas.DataFrame): Raw input data; it is copied, not modified.
        target_points_col, target_results_cols, gio_col: Accepted for interface
            compatibility; not used by this function.
        input_features_cols (list[str]): Base numeric feature columns.
        log_features_cols (list[str]): Columns that are clipped at 0 and get an
            additional ``'<name>_log'`` column holding ``log1p`` of the value.
        categorical_cols (list[str]): Columns to label-encode into ``'<name>_Label'``.
        imputation_strategy (str): Strategy passed to ``SimpleImputer``.

    Returns:
        tuple: ``(df_processed, encoders, final_numeric_features, imputers,
        categorical_label_cols)`` where ``encoders`` maps each categorical column
        to its fitted ``LabelEncoder`` and ``imputers`` is ``{'numeric': imputer}``.

    Notes:
        Labels start at 1; index 0 is left free for categories that are unseen at
        prediction time.
    """
    df_processed = df.copy()
    final_numeric_features = list(input_features_cols)

    # 1. Log transforms: keep the (clipped) raw column and add a '<name>_log' companion.
    print("Applying Log Transforms...")
    for feature in log_features_cols:
        if feature not in df_processed.columns:
            print(f"Warning: Log feature '{feature}' not found in DataFrame.")
            continue
        # Negative values (e.g. a negative net spend) are clipped so log1p is defined.
        df_processed[feature] = df_processed[feature].clip(lower=0)
        log_feature_name = f'{feature}_log'
        df_processed[log_feature_name] = np.log1p(df_processed[feature])
        if log_feature_name not in final_numeric_features:
            final_numeric_features.append(log_feature_name)

    # 2. Imputation of every numeric feature (raw + log companions).
    # NOTE(review): the imputer is fitted on every row of `df`; if some of these rows
    # are later used for validation, their statistics leak into training — confirm
    # whether that is acceptable.
    print(f"Applying {imputation_strategy} imputation to numeric features...")
    imputer = SimpleImputer(strategy=imputation_strategy)
    df_processed[final_numeric_features] = imputer.fit_transform(
        df_processed[final_numeric_features]
    )
    imputers = {'numeric': imputer}

    # 3. Categorical encoding. The encoder is fitted on exactly the values it then
    # transforms, so every value is known and simply shifted by +1 (0 stays reserved
    # for unknowns). The fitted encoder's classes_ is never mutated.
    encoders = {}
    print("Applying Label Encoding (starting labels from 1)...")
    categorical_label_cols = []
    for col in categorical_cols:
        values_as_str = df_processed[col].astype(str)

        le = LabelEncoder()
        le.fit(values_as_str.unique())
        encoders[col] = le

        label_col = f'{col}_Label'
        categorical_label_cols.append(label_col)
        df_processed[label_col] = le.transform(values_as_str) + 1

        print(f"Encoded '{col}' into '{label_col}'. Vocab size (incl. 0 for unknown): {len(le.classes_) + 1}")

    return df_processed, encoders, final_numeric_features, imputers, categorical_label_cols

# Make sure this is the definition being used:
def prepare_scaled_data_for_training(df_processed, train_indices, val_indices, numeric_features,
                                     gio_col, target_points_col, target_results_cols,
                                     categorical_label_cols):
    """Builds scaled train/validation inputs and targets for the multi-task model.

    Rows are selected with ``df_processed.loc`` using the given index labels.
    Both scalers are fitted on the training rows only and then applied to the
    validation rows. Imputation is expected to have happened already.

    Returns:
        tuple: ``(X_train_list, y_train_dict, X_val_list, y_val_dict_scaled, scalers)``.
        Each ``X_*_list`` is ``[numeric_scaled, team_labels, coach_labels, games]``;
        each ``y_*`` dict has keys ``'points_output'`` (scaled) and
        ``'results_output'`` (unscaled); ``scalers`` has keys ``'input'`` and ``'points'``.
    """
    train_rows = df_processed.loc[train_indices]
    val_rows = df_processed.loc[val_indices]

    # Numeric inputs: statistics come from the training rows only.
    print("Scaling numeric input features...")
    scaler_input = StandardScaler()
    train_numeric = scaler_input.fit_transform(train_rows[numeric_features])
    val_numeric = scaler_input.transform(val_rows[numeric_features])

    # Points target: standardised the same way so the loss is scale-free.
    print("Scaling points target variable...")
    scaler_points = StandardScaler()
    train_points = scaler_points.fit_transform(train_rows[[target_points_col]])
    val_points = scaler_points.transform(val_rows[[target_points_col]])

    def _games_column(rows):
        # Games played as a float32 column vector, shape (n_rows, 1).
        return rows[gio_col].values.reshape(-1, 1).astype(np.float32)

    def _results_matrix(rows):
        # Unscaled outcome counts as float32.
        return rows[target_results_cols].values.astype(np.float32)

    def _label_arrays(rows):
        return {name: rows[name].values for name in categorical_label_cols}

    train_games, val_games = _games_column(train_rows), _games_column(val_rows)
    train_results, val_results = _results_matrix(train_rows), _results_matrix(val_rows)
    train_labels, val_labels = _label_arrays(train_rows), _label_arrays(val_rows)

    # Input order must match the model's inputs: numeric, team, coach, games.
    X_train_list = [train_numeric, train_labels['Team_Label'], train_labels['Coach_Label'], train_games]
    X_val_list = [val_numeric, val_labels['Team_Label'], val_labels['Coach_Label'], val_games]

    y_train_dict = {'points_output': train_points, 'results_output': train_results}
    y_val_dict_scaled = {'points_output': val_points, 'results_output': val_results}

    scalers = {'input': scaler_input, 'points': scaler_points}

    print("Data scaling and preparation complete.")
    return X_train_list, y_train_dict, X_val_list, y_val_dict_scaled, scalers

# --- Model Building ---


In [None]:
def build_multitask_model(num_numeric_features, num_teams, num_coaches,
                          team_embedding_dim, coach_embedding_dim,
                          dropout_rate, learning_rate, loss_weights): # No unknown_token_id argument: index 0 is always the unknown slot
    """Builds and compiles the Keras multi-task model (embeddings use mask_zero=False).

    The model takes four inputs, in this order: numeric features, team label,
    coach label and games played. It produces two outputs: ``points_output``
    (a single linear unit) and ``results_output`` (softmax probabilities over
    three outcomes multiplied by the games-played input).

    Args:
        num_numeric_features: Width of the numeric input vector.
        num_teams: Number of known team classes (embedding gets one extra row for index 0).
        num_coaches: Number of known coach classes (embedding gets one extra row for index 0).
        team_embedding_dim: Output dimension of the team embedding.
        coach_embedding_dim: Output dimension of the coach embedding.
        dropout_rate: Rate used by both Dropout layers.
        learning_rate: Learning rate of the Adam optimizer.
        loss_weights: Passed to ``model.compile``; keyed by output layer name.

    Returns:
        The compiled ``tf.keras`` Model (its summary is printed as a side effect).
    """

    numeric_input = Input(shape=(num_numeric_features,), name='numeric_input')
    team_input = Input(shape=(1,), name='team_input', dtype='int32')
    coach_input = Input(shape=(1,), name='coach_input', dtype='int32')
    gio_input = Input(shape=(1,), name='gio_input', dtype='float32')

    # input_dim = vocab_size + 1 (index 0 for unknown, 1 to N for known categories)
    team_vocab_size = num_teams + 1
    coach_vocab_size = num_coaches + 1

    team_embedding_layer = Embedding(input_dim=team_vocab_size, output_dim=team_embedding_dim,
                                     name='team_embedding', mask_zero=False) # Index 0 is a regular (unknown) row, not a mask
    team_embedding = team_embedding_layer(team_input)
    team_embedding = Flatten()(team_embedding)

    coach_embedding_layer = Embedding(input_dim=coach_vocab_size, output_dim=coach_embedding_dim,
                                      name='coach_embedding', mask_zero=False) # Index 0 is a regular (unknown) row, not a mask
    coach_embedding = coach_embedding_layer(coach_input)
    coach_embedding = Flatten()(coach_embedding)

    # Shared trunk: numeric features concatenated with both flattened embeddings.
    # gio_input is deliberately NOT part of the trunk; it only scales the results head.
    shared = Concatenate()([numeric_input, team_embedding, coach_embedding])
    x = Dense(128, activation='relu')(shared)
    x = Dropout(dropout_rate)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(dropout_rate)(x)

    # Branch 1: Points prediction
    points_output = Dense(1, activation='linear', name='points_output')(x)

    # Branch 2: Match outcomes prediction (Vit, Par, Sco)
    # Softmax shares sum to 1, so multiplying by games played yields three
    # continuous counts that sum to the games-played input.
    results_logits = Dense(3, activation='linear')(x)
    results_probs = tf.keras.layers.Activation('softmax', name='results_probs')(results_logits)
    # NOTE(review): a Lambda layer wrapping a Python lambda may complicate saving and
    # reloading the model — confirm if serialization is needed.
    results_output = Lambda(lambda inputs: inputs[0] * inputs[1], name='results_output')([results_probs, gio_input])

    model = Model(inputs=[numeric_input, team_input, coach_input, gio_input],
                  outputs=[points_output, results_output])

    # Both heads use mean squared error; loss_weights balances them.
    losses = { 'points_output': 'mse', 'results_output': 'mse' }
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=losses, loss_weights=loss_weights)
    print("Model compiled successfully (mask_zero=False).")
    model.summary()
    return model

# --- Model Training ---

In [None]:
def train_model(model, X_train_list, y_train_dict, X_val_list, y_val_dict,
                epochs, batch_size, patience):
    """Fits ``model`` with early stopping on the validation loss.

    Training stops once ``val_loss`` has not improved for ``patience`` epochs,
    and the best weights seen are restored.

    Returns:
        tuple: ``(model, history)`` — the same model object and the object
        returned by ``model.fit``.
    """
    print("Starting model training...")

    stopper = EarlyStopping(
        monitor='val_loss',
        patience=patience,
        restore_best_weights=True,
        verbose=1,
    )
    fit_options = {
        'epochs': epochs,
        'batch_size': batch_size,
        'validation_data': (X_val_list, y_val_dict),
        'callbacks': [stopper],
        'verbose': 1,
    }
    history = model.fit(X_train_list, y_train_dict, **fit_options)

    print("Model training finished.")
    return model, history

# --- Post-Processing ---

In [None]:
def adjust_outcomes(P, w_pred, d_pred, total_games, search_window):
    """
    Converts continuous W/D predictions into integer (wins, draws, losses).

    The result satisfies ``3*wins + draws == round(P)`` and
    ``wins + draws + losses == total_games`` whenever such a combination exists,
    and is the valid combination closest (L1 distance on wins and draws) to a
    continuous estimate derived from the predicted win/draw ratio.

    Search order:
      1. Win counts within ``search_window`` of the continuous estimate.
      2. If none is valid, every feasible win count (so the answer still follows
         the model's ratio instead of jumping straight to "maximum wins").
      3. If the points total is unreachable in ``total_games`` games, a
         max-wins fallback that caps at ``total_games`` wins.

    Args:
        P (float): Predicted points (original scale). Negative values are treated as 0.
        w_pred (float): Predicted wins (continuous, scaled by total_games).
        d_pred (float): Predicted draws (continuous, scaled by total_games).
        total_games (int): Total games in the season.
        search_window (int): Range (+/-) around initial win estimate to search first.

    Returns:
        tuple: (wins, draws, losses) as non-negative integers.
    """
    target_points = max(0, int(round(P)))  # Points cannot be negative

    # --- Continuous (cont_w, cont_d) estimate with 3*cont_w + cont_d = target_points ---
    predicted_sum = w_pred + d_pred
    if predicted_sum <= 1e-6:
        # Model predicts (almost) no wins or draws, so no ratio is available:
        # assume all draws if that fits in the season, otherwise all wins.
        if target_points <= total_games:
            cont_w, cont_d = 0.0, float(target_points)
        else:
            cont_w, cont_d = float(target_points) / 3.0, 0.0
    else:
        # r = share of wins among non-lost games. With X = cont_w + cont_d:
        # P = 3*r*X + (1-r)*X = X*(2r + 1)  =>  X = P / (2r + 1)
        r = w_pred / predicted_sum
        denominator = 2 * r + 1
        if abs(denominator) < 1e-6:  # Only possible with negative inputs
            X = float(total_games)
        else:
            X = target_points / denominator
        cont_w = r * X
        cont_d = (1 - r) * X

    def _closest_valid(win_candidates):
        """Returns the valid (w, d, l) nearest the continuous estimate, or None."""
        best, best_error = None, float('inf')
        for w in win_candidates:
            d = target_points - 3 * w  # Draws needed to hit the points total
            if d < 0:
                continue
            l = total_games - (w + d)
            if l < 0:  # w + d would exceed the season length
                continue
            error = abs(w - cont_w) + abs(d - cont_d)
            if error < best_error:  # Strict '<': ties keep the smaller win count
                best, best_error = (w, d, l), error
        return best

    # 1. Windowed search around the continuous estimate.
    centre = int(round(cont_w))
    best_tuple = _closest_valid(
        range(max(0, centre - search_window), centre + search_window + 1)
    )

    # 2. Window missed every valid solution: scan the whole feasible range.
    #    d >= 0  =>  w <= target_points // 3
    #    l >= 0  =>  w >= ceil((target_points - total_games) / 2)
    if best_tuple is None:
        min_wins = max(0, int(-(-(target_points - total_games) // 2)))
        max_wins = target_points // 3
        best_tuple = _closest_valid(range(min_wins, max_wins + 1))

    # 3. Points total not reachable at all: favour wins, then trim to fit the season.
    if best_tuple is None:
        w_fallback = target_points // 3
        d_fallback = target_points % 3
        l_fallback = total_games - (w_fallback + d_fallback)
        if l_fallback < 0:
            d_fallback += l_fallback  # Drop the draws that do not fit
            l_fallback = 0
            if d_fallback < 0:  # Even wins alone exceed the season length
                w_fallback = total_games
                d_fallback = 0
                l_fallback = 0
        best_tuple = (w_fallback, d_fallback, l_fallback)

    # Defensive normalisation: non-negative counts that sum to total_games.
    final_w, final_d, final_l = (max(0, count) for count in best_tuple)
    if final_w + final_d + final_l != total_games:
        # Losses absorb any mismatch first (they do not affect points).
        final_l = max(0, total_games - (final_w + final_d))
        if final_w + final_d > total_games:
            final_d = max(0, final_d - ((final_w + final_d) - total_games))
            if final_w + final_d > total_games:
                final_w = max(0, final_w - ((final_w + final_d) - total_games))
            final_l = 0

    return final_w, final_d, final_l

# --- Prediction ---


In [None]:
def predict_future(model, df_future_raw, encoders, imputers, scalers,
                   numeric_features_cols, log_features_cols, gio_col, categorical_cols,
                   full_season_games, target_points_col,
                   target_results_cols, adjust_search_window):
    """Applies the fitted preprocessing and model to future rows and builds the predicted table.

    Pipeline: log transforms -> numeric imputation -> label encoding (unknown
    categories map to 0, known ones to encoder index + 1) -> input scaling ->
    model prediction -> inverse-scaling of points -> integer W/D/L adjustment ->
    ranking by predicted points.

    Args:
        model: Trained model whose ``predict`` returns ``(points_scaled, results_continuous)``.
        df_future_raw (pandas.DataFrame): Rows to predict; copied, not modified.
        encoders (dict): Fitted label encoder per categorical column.
        imputers (dict): Must contain the fitted numeric imputer under ``'numeric'``.
        scalers (dict): Fitted scalers under ``'input'`` and ``'points'``.
        numeric_features_cols (list[str]): Numeric feature columns (incl. ``_log`` columns).
        log_features_cols (list[str]): Columns to clip at 0 and log1p-transform.
        gio_col: Accepted for interface compatibility; not used (games are set
            to ``full_season_games`` for every row).
        categorical_cols (list[str]): Columns to encode; must produce
            ``'Team_Label'`` and ``'Coach_Label'``, which the model inputs require.
        full_season_games (int): Games assumed for every predicted season.
        target_points_col (str): Column that receives the predicted points.
        target_results_cols (list[str]): Columns that receive wins, draws and
            losses, in that order.
        adjust_search_window (int): Search window forwarded to ``adjust_outcomes``.

    Returns:
        pandas.DataFrame: Predictions sorted by points (descending) with a fresh
        0..n-1 index, a ``'<points>_Rounded'`` column and a 1-based ``'Pos'`` column.
    """
    print("Starting prediction pipeline for future data...")
    df_pred_base = df_future_raw.copy()

    # 1. Log transforms (same clipping + log1p as used for the training data).
    print("Applying Log Transforms to prediction data...")
    for feature in log_features_cols:
        if feature in df_pred_base.columns:
            df_pred_base[feature] = df_pred_base[feature].clip(lower=0)
            df_pred_base[f'{feature}_log'] = np.log1p(df_pred_base[feature])
        else:
            print(f"Warning: Log feature '{feature}' not found in prediction data.")

    # 2. Impute numeric features with the already-fitted imputer.
    print("Applying imputation to prediction data...")
    df_pred_base[numeric_features_cols] = imputers['numeric'].transform(
        df_pred_base[numeric_features_cols]
    )

    # 3. Categorical encoding: known -> encoder index + 1, unseen -> 0.
    print("Applying Label Encoding to prediction data (mapping unknowns to 0)...")
    for col in categorical_cols:
        le = encoders[col]
        values_as_str = df_pred_base[col].astype(str)
        is_known = values_as_str.isin(le.classes_).to_numpy()

        labels = np.zeros(len(values_as_str), dtype=np.int64)  # 0 == unknown
        if is_known.any():
            # One vectorised transform instead of one call per row.
            labels[is_known] = le.transform(values_as_str[is_known]) + 1
        for item in values_as_str[~is_known]:
            print(f"Info: Unknown value '{item}' found in column '{col}'. Mapping to 0.")

        df_pred_base[f'{col}_Label'] = labels

    # 4. Scale the imputed numeric features with the scaler fitted on training data.
    print("Applying scaling to prediction data...")
    X_pred_numeric_scaled = scalers['input'].transform(df_pred_base[numeric_features_cols])

    # 5. Assemble inputs in the model's order: numeric, team, coach, games.
    X_pred_team = df_pred_base['Team_Label'].values
    X_pred_coach = df_pred_base['Coach_Label'].values
    X_pred_gio = np.full((len(df_pred_base), 1), full_season_games, dtype=np.float32)
    pred_inputs = [X_pred_numeric_scaled, X_pred_team, X_pred_coach, X_pred_gio]

    # 6. Predict.
    print("Making model predictions...")
    pred_points_scaled, pred_results_continuous = model.predict(pred_inputs)

    # 7. Bring points back to the original scale.
    pred_points_original = scalers['points'].inverse_transform(pred_points_scaled)
    df_predictions = df_pred_base.copy()
    df_predictions[target_points_col] = pred_points_original.flatten()

    # 8. Integer W/D/L consistent with the predicted points. Rows are paired with
    # model outputs by position, so duplicate index labels cannot mis-align them,
    # and the caller's search window is honoured.
    print("Adjusting predicted outcomes...")
    adjusted_outcomes = [
        adjust_outcomes(
            float(points),
            results_row[0],  # Predicted wins (continuous)
            results_row[1],  # Predicted draws (continuous)
            total_games=full_season_games,
            search_window=adjust_search_window,
        )
        for points, results_row in zip(
            df_predictions[target_points_col].to_numpy(), pred_results_continuous
        )
    ]
    outcome_array = np.asarray(adjusted_outcomes, dtype=int).reshape(-1, 3)
    for outcome_col, outcome_values in zip(target_results_cols, outcome_array.T):
        df_predictions[outcome_col] = outcome_values
    df_predictions[f'{target_points_col}_Rounded'] = df_predictions[target_points_col].round().astype(int)

    # 9. Rank by (unrounded) predicted points.
    print("Assigning final positions...")
    df_predictions = df_predictions.sort_values(by=target_points_col, ascending=False).reset_index(drop=True)
    df_predictions['Pos'] = range(1, len(df_predictions) + 1)

    print("Prediction pipeline complete.")
    return df_predictions

# --- Evaluation ---


In [None]:
def evaluate_model(model, X_val_list, y_val_dict_scaled, scaler_points, target_points_col, target_results_cols):
    """Prints validation metrics for both model heads and returns them.

    Points predictions and targets are inverse-transformed with ``scaler_points``
    before scoring. Results (outcome counts) are compared on their original,
    unscaled values against the model's continuous outputs.

    Args:
        model: Trained model whose ``predict`` returns ``(points_scaled, results_continuous)``.
        X_val_list: Validation inputs in the model's input order.
        y_val_dict_scaled (dict): Targets under ``'points_output'`` (scaled) and
            ``'results_output'`` (unscaled).
        scaler_points: Fitted scaler used for the points target.
        target_points_col (str): Points column name (used in the printed report).
        target_results_cols (list[str]): Outcome column names (used in the printed report).

    Returns:
        dict: ``points_mae``, ``points_rmse``, ``points_r2``, ``results_mae``
        and ``results_rmse`` as floats. The two results metrics are averaged
        over the outcome columns.
    """
    print("\n--- Model Evaluation on Validation Data ---")
    val_pred_points_scaled, val_pred_results_cont = model.predict(X_val_list)

    # Inverse transform points predictions and actuals
    val_pred_points_orig = scaler_points.inverse_transform(val_pred_points_scaled)
    y_val_points_orig = scaler_points.inverse_transform(y_val_dict_scaled['points_output'])
    y_val_results_orig = y_val_dict_scaled['results_output']  # These were not scaled

    # Points head
    mae_points = mean_absolute_error(y_val_points_orig, val_pred_points_orig)
    rmse_points = np.sqrt(mean_squared_error(y_val_points_orig, val_pred_points_orig))
    r2_points = r2_score(y_val_points_orig, val_pred_points_orig)
    print(f"Points Prediction ({target_points_col}):")
    print(f"  MAE:  {mae_points:.2f}")
    print(f"  RMSE: {rmse_points:.2f}")
    print(f"  R²:   {r2_points:.2f}")

    # Results head: one score per outcome column, then averaged.
    # (The RMSE is the mean of per-column RMSEs, not the RMSE of the pooled errors.)
    mae_per_outcome = mean_absolute_error(
        y_val_results_orig, val_pred_results_cont, multioutput='raw_values'
    )
    rmse_per_outcome = np.sqrt(
        mean_squared_error(y_val_results_orig, val_pred_results_cont, multioutput='raw_values')
    )
    print(f"\nResults Prediction ({', '.join(target_results_cols)}) - Continuous vs Actual Counts:")
    print(f"  MAE (Avg per outcome): {mae_per_outcome.mean():.2f}")
    print(f"  RMSE (Avg per outcome): {rmse_per_outcome.mean():.2f}")
    print("--------------------------------------------")

    return {
        'points_mae': float(mae_points),
        'points_rmse': float(rmse_points),
        'points_r2': float(r2_points),
        'results_mae': float(mae_per_outcome.mean()),
        'results_rmse': float(rmse_per_outcome.mean()),
    }

# --- Explainability (SHAP) ---

In [None]:
import tensorflow as tf
import shap
import numpy as np
import matplotlib.pyplot as plt
import os
from functools import partial # Keep this if using the partial approach

# --- Explainability (SHAP using KernelExplainer) ---

def _save_shap_plots(shap_values, X_explain_numeric, feature_names, output_dir):
    """
    Saves the SHAP bar summary plot and dependence plots for the top numeric features.

    Args:
        shap_values: 2-D array (n_samples, n_numeric_features) of SHAP values.
        X_explain_numeric: 2-D array of the numeric feature values that were explained
            (same shape as `shap_values`).
        feature_names: Sequence of names for the numeric feature columns.
        output_dir: Directory the PNG files are written to (created if missing).
    """
    os.makedirs(output_dir, exist_ok=True)

    # Summary Plot (Bar). The figure is created first and closed explicitly so it
    # can never leak into the notebook output as a stray/empty figure.
    summary_fig = plt.figure(figsize=(12, 8))
    try:
        shap.summary_plot(shap_values, X_explain_numeric, feature_names=feature_names, plot_type="bar", show=False)
        plt.title("SHAP Feature Importance for Points Prediction (Numeric Features)")
        plt.tight_layout()
        summary_path = os.path.join(output_dir, SHAP_SUMMARY_PLOT_FILE)
        summary_fig.savefig(summary_path)
    finally:
        plt.close(summary_fig)
    print(f"  Saved summary plot to: {summary_path}")

    # Dependence Plots for Top Numeric Features (ranked by mean |SHAP|)
    mean_abs_shap = np.abs(shap_values).mean(axis=0)
    num_top_features = min(3, len(feature_names))
    if num_top_features > 0 and len(mean_abs_shap) > 0:
        top_indices = np.argsort(mean_abs_shap)[-num_top_features:]
        print(f"  Generating dependence plots for top {num_top_features} numeric features...")
        for idx in top_indices:
            idx = int(idx)
            if idx >= len(feature_names):
                print(f"    Warning: Invalid index {idx} for dependence plot skipped.")
                continue
            # Pass our own axes: without `ax`, shap opens a *new* figure and the
            # one created here would stay open and render as an empty figure.
            dep_fig, dep_ax = plt.subplots(figsize=(10, 6))
            try:
                shap.dependence_plot(idx, shap_values, X_explain_numeric,
                                     feature_names=feature_names, ax=dep_ax, show=False)
                dep_fig.tight_layout()
                safe_feature_name = "".join(c if c.isalnum() else "_" for c in feature_names[idx])
                dep_path = os.path.join(output_dir, f"{SHAP_DEPENDENCE_PLOT_PREFIX}{safe_feature_name}.png")
                dep_fig.savefig(dep_path)
            finally:
                plt.close(dep_fig)
            print(f"    Saved dependence plot for '{feature_names[idx]}' to: {dep_path}")
    elif len(feature_names) == 0:
        print("  Skipping dependence plots as no feature names were provided.")
    else:
        print("  Skipping dependence plots as unable to determine top features.")


def explain_model_points(model, X_val_list, X_train_list, feature_names,
                         output_dir, num_explain_samples=SHAP_EXPLAIN_SAMPLES, num_background_samples=SHAP_BACKGROUND_SAMPLES):
    """
    Explains the points prediction using SHAP KernelExplainer.

    The model takes four inputs, while KernelExplainer works on a single 2-D matrix.
    All inputs are therefore stacked column-wise into one matrix
    ([numeric | team | coach | gio]) and a wrapper splits the columns again before
    calling the model. This keeps every perturbed row paired with its own
    team/coach/gio values. KernelExplainer only swaps in values taken from the
    background rows or from the explained row, so the integer ids stay valid.

    Assumes the model was trained WITHOUT mask_zero=True and that it returns
    two outputs, (points, results) -- only the points output is explained.

    Args:
        model: Keras model called as model([numeric, team, coach, gio], training=False).
        X_val_list: [X_numeric, X_team, X_coach, X_gio] for validation; rows to explain
            are sampled from here.
        X_train_list: Same layout for training data; background rows are sampled from here.
        feature_names: Names of the numeric feature columns (used for plots).
        output_dir: Directory where plots are saved.
        num_explain_samples: Maximum number of validation rows to explain.
        num_background_samples: Maximum number of training rows used as background.

    Returns:
        (shap_values, explainer): `shap_values` is an array of shape
        (n_explained, n_numeric_features) holding the attributions of the numeric
        features only, so it lines up with `feature_names`. `explainer` is the fitted
        KernelExplainer; it operates on the combined matrix described above.
        Returns (None, None) if the explanation could not be computed.
    """
    import traceback  # local import: only needed for error reporting in this function

    print("\n--- SHAP Explanation for Points Prediction (Using KernelExplainer) ---")

    def _as_columns(arr):
        # float64 keeps integer category ids exact when stored next to the floats.
        arr = np.asarray(arr, dtype=np.float64)
        return arr[:, np.newaxis] if arr.ndim == 1 else arr

    # Unpack (exactly four inputs expected) and normalise every input to 2-D.
    val_numeric, val_team, val_coach, val_gio = [_as_columns(arr) for arr in X_val_list]
    train_numeric, train_team, train_coach, train_gio = [_as_columns(arr) for arr in X_train_list]

    n_val, n_train = val_numeric.shape[0], train_numeric.shape[0]
    if n_val == 0 or n_train == 0:
        print(f"Skipping SHAP analysis: no data available (train rows: {n_train}, validation rows: {n_val}).")
        return None, None

    # Column layout of the combined matrix.
    widths = [val_numeric.shape[1], val_team.shape[1], val_coach.shape[1], val_gio.shape[1]]
    bounds = np.concatenate(([0], np.cumsum(widths)))
    numeric_cols, team_cols, coach_cols, gio_cols = [
        slice(int(bounds[k]), int(bounds[k + 1])) for k in range(4)
    ]
    X_val_all = np.hstack([val_numeric, val_team, val_coach, val_gio])
    X_train_all = np.hstack([train_numeric, train_team, train_coach, train_gio])

    # One local generator for both draws: reproducible without reseeding the
    # global NumPy RNG as a side effect.
    rng = np.random.default_rng(RANDOM_SEED)

    # --- Prepare Background Data Subset ---
    if n_train < num_background_samples:
        print(f"Warning: Background samples requested ({num_background_samples}) > available training samples ({n_train}). Using all available.")
        num_background_samples = n_train
    background_indices = rng.choice(n_train, size=num_background_samples, replace=False)
    X_background = X_train_all[background_indices]
    print(f"Background data prepared with {num_background_samples} samples.")

    # --- Prepare Explanation Data Subset ---
    if n_val < num_explain_samples:
        print(f"Warning: Explain samples requested ({num_explain_samples}) > available validation samples ({n_val}). Using all available.")
        num_explain_samples = n_val
        explain_indices = np.arange(num_explain_samples)
    else:
        explain_indices = rng.choice(n_val, size=num_explain_samples, replace=False)
    X_explain_all = X_val_all[explain_indices]
    X_explain_numeric = X_explain_all[:, numeric_cols].astype(np.float32)
    print(f"Explanation data prepared with {num_explain_samples} samples.")

    # --- Define the Prediction Function Wrapper for KernelExplainer ---
    predict_batch_size = 4096  # bounds memory when SHAP passes very large synthetic matrices

    def shap_predict_points_wrapper(X_combined):
        """Maps a combined (rows, all_features) matrix to a 1-D array of points predictions."""
        X_combined = np.asarray(X_combined, dtype=np.float64)
        if X_combined.ndim == 1:
            X_combined = X_combined.reshape(1, -1)
        predictions = np.empty(X_combined.shape[0], dtype=np.float64)
        for start in range(0, X_combined.shape[0], predict_batch_size):
            chunk = X_combined[start:start + predict_batch_size]
            model_inputs = [
                tf.constant(chunk[:, numeric_cols].astype(np.float32)),
                # ids were stored as floats in the combined matrix; round back to ints
                tf.constant(np.rint(chunk[:, team_cols]).astype(np.int32)),
                tf.constant(np.rint(chunk[:, coach_cols]).astype(np.int32)),
                tf.constant(chunk[:, gio_cols].astype(np.float32)),
            ]
            # Model outputs [points, results]; only points are explained here.
            pred_points_tensor, _ = model(model_inputs, training=False)
            predictions[start:start + chunk.shape[0]] = pred_points_tensor.numpy().reshape(-1)
        return predictions

    # --- Initialize and Run KernelExplainer ---
    print(f"Initializing KernelExplainer with {num_background_samples} background samples...")
    try:
        # Fail fast if the wrapper cannot produce a finite prediction.
        first_pred = shap_predict_points_wrapper(X_explain_all[[0]])
        if not np.isfinite(first_pred).all():
            print("ERROR: SHAP wrapper function produced NaN for the first sample. Aborting SHAP calculation.")
            return None, None  # Indicate failure

        # Note: KernelExplainer can be slow, especially for many samples or features.
        explainer = shap.KernelExplainer(shap_predict_points_wrapper, X_background)

        print(f"Calculating SHAP values for {num_explain_samples} validation samples...")
        # Link="identity" is default for regression-like outputs
        raw_shap_values = explainer.shap_values(X_explain_all, nsamples='auto')  # Use auto nsamples
        print("SHAP value calculation complete.")

    except Exception as e:
        print(f"ERROR during SHAP KernelExplainer initialization or calculation: {e}")
        print(traceback.format_exc())  # Print detailed traceback
        print("Skipping SHAP analysis due to error.")
        return None, None

    # --- Normalise the result to a 2-D array and keep the numeric columns ---
    shap_all = raw_shap_values
    if isinstance(shap_all, list) and len(shap_all) == 1:
        shap_all = shap_all[0]  # some SHAP versions wrap a single output in a list
    if isinstance(shap_all, np.ndarray) and shap_all.ndim == 3 and shap_all.shape[-1] == 1:
        shap_all = shap_all[..., 0]
    if not (isinstance(shap_all, np.ndarray) and shap_all.ndim == 2):
        print(f"Warning: SHAP values were computed but are not in the expected NumPy array format. Type: {type(raw_shap_values)}. Skipping plotting.")
        print("--------------------------------------------")
        return raw_shap_values, explainer
    shap_values = shap_all[:, numeric_cols]

    # --- Generate and Save Plots ---
    print("Generating SHAP plots...")
    try:
        _save_shap_plots(shap_values, X_explain_numeric, feature_names, output_dir)
    except Exception as e:
        # Plotting is best-effort: the SHAP values are still returned.
        print(f"Error generating SHAP plots: {e}")
        print(traceback.format_exc())

    print("--------------------------------------------")
    # Return the array of shap values (numeric features only) and the explainer
    return shap_values, explainer

# --- Main Execution ---

In [None]:
if __name__ == "__main__":
    # End-to-end pipeline: load -> preprocess -> split/scale -> build -> train ->
    # evaluate -> predict next season -> explain with SHAP.
    set_seeds(RANDOM_SEED)
    output_plot_dir = create_output_directory()

    # 1. Load Data
    df_raw = load_data(DATA_FILEPATH)

    if df_raw is not None:
        # 2. Preprocess Data (Log transforms, Imputation, Categorical Encoding starting from 1)
        # Note: Imputation happens inside preprocess_data before encoding is finalized
        df_processed, encoders, final_numeric_features, imputers, categorical_label_cols = preprocess_data(
            df_raw, TARGET_POINTS_COL, TARGET_RESULTS_COLS, BASE_INPUT_FEATURES,
            LOG_TRANSFORM_COLS, GIO_COL, CATEGORICAL_COLS, IMPUTATION_STRATEGY
        )

        # 3. Split and Scale Data for Training/Validation
        # The last completed season is held out for validation, so training must use
        # strictly earlier seasons. Including the validation season in the training
        # set leaks it into fitting and makes early stopping and the reported
        # metrics optimistic.
        train_indices = df_processed[df_processed['Year'] < LAST_COMPLETED_YEAR].index
        val_indices = df_processed[df_processed['Year'] == LAST_COMPLETED_YEAR].index
        print(f"Training samples: {len(train_indices)}, Validation samples: {len(val_indices)}")
        if len(train_indices) == 0 or len(val_indices) == 0:
            raise ValueError(
                "Empty train/validation split: need seasons before LAST_COMPLETED_YEAR "
                f"for training and rows for Year == {LAST_COMPLETED_YEAR} for validation."
            )

        # `imputers` is intentionally not passed here; it is only used later by
        # predict_future. prepare_scaled_data_for_training is expected to split and
        # scale the already-preprocessed frame.
        X_train_list, y_train_dict, X_val_list, y_val_dict_scaled, scalers = \
          prepare_scaled_data_for_training(
              df_processed, train_indices, val_indices, final_numeric_features,
              GIO_COL, TARGET_POINTS_COL, TARGET_RESULTS_COLS,
              categorical_label_cols
          )

        # 4. Build Model (without mask_zero)
        num_teams = len(encoders['Team'].classes_)
        num_coaches = len(encoders['Coach'].classes_)
        model = build_multitask_model(
            num_numeric_features=len(final_numeric_features),
            num_teams=num_teams,
            num_coaches=num_coaches,
            team_embedding_dim=TEAM_EMBEDDING_DIM,
            coach_embedding_dim=COACH_EMBEDDING_DIM,
            dropout_rate=DROPOUT_RATE,
            learning_rate=LEARNING_RATE,
            loss_weights=LOSS_WEIGHTS
        )

        # 5. Train Model (a freshly built model is always trained from scratch here)
        print("\n *** Retraining model required due to mask_zero change *** \n")
        model, history = train_model(
            model, X_train_list, y_train_dict, X_val_list, y_val_dict_scaled,
            epochs=EPOCHS, batch_size=BATCH_SIZE, patience=EARLY_STOPPING_PATIENCE
        )

        # 6. Evaluate Model
        evaluate_model(model, X_val_list, y_val_dict_scaled, scalers['points'],
                       TARGET_POINTS_COL, TARGET_RESULTS_COLS)

        # 7. Predict Future Season
        # Rows for the season after the last completed one serve as the prediction base.
        future_data_raw = df_raw[df_raw['Year'] == LAST_COMPLETED_YEAR + 1].copy()
        if not future_data_raw.empty:
             # One row per team (the last occurrence wins), relabelled with the prediction year.
             df_future_base = future_data_raw.drop_duplicates(subset=['Team'], keep='last').reset_index(drop=True)
             df_future_base['Year'] = LAST_COMPLETED_YEAR + PREDICTION_YEAR_OFFSET

             final_predictions = predict_future(
                 model, df_future_base, encoders, imputers, scalers,
                 final_numeric_features, LOG_TRANSFORM_COLS, GIO_COL, CATEGORICAL_COLS,
                 FULL_SEASON_GAMES, TARGET_POINTS_COL,
                 TARGET_RESULTS_COLS, ADJUST_OUTCOMES_SEARCH_WINDOW
             )
             # Display and save predictions
             print("\n--- Final Predictions for Year", LAST_COMPLETED_YEAR + PREDICTION_YEAR_OFFSET, "---")
             display_cols = ['Pos', 'Team', f'{TARGET_POINTS_COL}_Rounded'] + TARGET_RESULTS_COLS
             # Ensure columns exist before printing
             display_cols = [col for col in display_cols if col in final_predictions.columns]
             print(final_predictions[display_cols])
             try:
                 final_predictions.to_csv(PREDICTIONS_OUTPUT_FILE, index=False)
                 print(f"\nFinal predictions saved to '{PREDICTIONS_OUTPUT_FILE}'")
             except Exception as e:
                 # Saving is best-effort; predictions were already printed above.
                 print(f"\nError saving predictions to CSV: {e}")

        else:
             print("\nSkipping future prediction as no data found for Year", LAST_COMPLETED_YEAR + 1)


        # 8. Explain Model (SHAP)
        # Returns (None, None) when the explanation could not be computed.
        shap_values_list, explainer = explain_model_points(
            model, X_val_list, X_train_list,
            final_numeric_features, output_plot_dir,
            num_explain_samples=SHAP_EXPLAIN_SAMPLES,
            num_background_samples=SHAP_BACKGROUND_SAMPLES
        )

        print("\nAnalysis Complete.")

Random seeds set to 42
Data loaded successfully from /content/final_merged_data_with_transfers.csv. Shape: (302, 34)
Applying Log Transforms...
Applying median imputation to numeric features...
Applying Label Encoding (starting labels from 1)...
Encoded 'Team' into 'Team_Label'. Vocab size (incl. 0 for unknown): 21
Encoded 'Coach' into 'Coach_Label'. Vocab size (incl. 0 for unknown): 103
Training samples: 282, Validation samples: 17
Scaling numeric input features...
Scaling points target variable...
Data scaling and preparation complete.
Model compiled successfully (mask_zero=False).



 *** Retraining model required due to mask_zero change *** 

Starting model training...
Epoch 1/150
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 78ms/step - loss: 36.9493 - points_output_loss: 1.3320 - results_output_loss: 35.5896 - val_loss: 17.3554 - val_points_output_loss: 0.9359 - val_results_output_loss: 16.4195
Epoch 2/150
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 25.7468 - points_output_loss: 1.0349 - results_output_loss: 24.7149 - val_loss: 17.7738 - val_points_output_loss: 0.7696 - val_results_output_loss: 17.0042
Epoch 3/150
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 21.9421 - points_output_loss: 0.9095 - results_output_loss: 21.0237 - val_loss: 15.2881 - val_points_output_loss: 0.6940 - val_results_output_loss: 14.5941
Epoch 4/150
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 17.8644 - points_output_loss: 0.7847 - results_output_loss: 17.0741 - val_l

  0%|          | 0/10 [00:00<?, ?it/s]

SHAP value calculation complete.
Generating SHAP plots...


  shap.summary_plot(shap_values, X_explain_numeric, feature_names=feature_names, plot_type="bar", show=False)


  Saved summary plot to: output_plots/shap_summary_plot.png
  Generating dependence plots for top 3 numeric features...
    Saved dependence plot for 'Average Market Value' to: output_plots/shap_dependence_Average_Market_Value.png
    Saved dependence plot for 'Market Value OUT Players' to: output_plots/shap_dependence_Market_Value_OUT_Players.png
    Saved dependence plot for 'Average Market Value_log' to: output_plots/shap_dependence_Average_Market_Value_log.png
--------------------------------------------

Analysis Complete.


<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>