# Model Pruning and Regularization

Controlling model complexity through depth constraints, feature reduction, and regularization


In [None]:
#Libraries
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## Regularization and Structural Pruning Configuration


In [None]:
# Pruned and regularized XGBoost configuration, grouped by what each setting
# constrains.
xgb_reg = xgb.XGBRegressor(
    # Learning task: squared-error regression, monitored with RMSE.
    objective="reg:squarederror", eval_metric="rmse",
    # Boosting schedule: fewer trees, each contributing a small step.
    n_estimators=300, learning_rate=0.05,
    # Structural pruning: depth reduced from 6; min_child_weight is the minimum
    # summed hessian per child, which for squared error with unit sample
    # weights amounts to a minimum number of samples per leaf.
    max_depth=4, min_child_weight=5,
    # Stochastic sampling: each tree is grown on 80% of the rows and 70% of
    # the columns, which decorrelates the trees and lowers variance.
    subsample=0.8, colsample_bytree=0.7,
    # Penalties on leaf weights: L2 (lambda) and L1 (alpha).
    reg_lambda=1.0, reg_alpha=0.5,
    # Histogram-based split finding; fixed seed so the row/column sampling
    # is reproducible.
    tree_method="hist", random_state=2025,
)


## Evaluation Framework
To compare the pruned and regularized model variants consistently, a single evaluation function fits each pipeline on the training split and reports RMSE, MAE, and R² on both the training and test sets.


In [None]:
def evaluate_model(pipeline, X_train, X_test, y_train, y_test):
    """Fit ``pipeline`` on the training split and report regression metrics.

    The pipeline is fitted in place via ``pipeline.fit(X_train, y_train)``,
    then used to predict on both splits. RMSE, MAE and R² are printed for
    each split and returned to the caller.

    Args:
        pipeline: Object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        A dict with the keys ``rmse_train``, ``rmse_test``, ``mae_train``,
        ``mae_test``, ``r2_train`` and ``r2_test``.
    """

    def _score(features, target):
        # One predict call per split; all three metrics share the predictions.
        predicted = pipeline.predict(features)
        return (
            mean_squared_error(target, predicted) ** 0.5,  # RMSE = sqrt(MSE)
            mean_absolute_error(target, predicted),
            r2_score(target, predicted),
        )

    pipeline.fit(X_train, y_train)

    train_rmse, train_mae, train_r2 = _score(X_train, y_train)
    test_rmse, test_mae, test_r2 = _score(X_test, y_test)

    report_lines = (
        "=== TRAINING RESULTS ===",
        f"RMSE Train: {train_rmse:,.2f}",
        f"MAE Train:  {train_mae:,.2f}",
        f"R² Train:   {train_r2:.3f}",
        "",
        "=== TESTING RESULTS ===",
        f"RMSE Test: {test_rmse:,.2f}",
        f"MAE Test:  {test_mae:,.2f}",
        f"R² Test:   {test_r2:.3f}",
    )
    for line in report_lines:
        print(line)

    return {
        "rmse_train": train_rmse, "rmse_test": test_rmse,
        "mae_train": train_mae, "mae_test": test_mae,
        "r2_train": train_r2, "r2_test": test_r2,
    }


## Option A – Pruned and Regularized Baseline


In [None]:
# Option A estimator: the pruned/regularized hyperparameters, grouped by role.
xgb_reg_A = xgb.XGBRegressor(
    # Learning task and monitoring metric.
    objective="reg:squarederror", eval_metric="rmse",
    # Boosting schedule.
    n_estimators=300, learning_rate=0.05,
    # Tree-structure limits.
    max_depth=4, min_child_weight=5,
    # Row / column sampling per tree.
    subsample=0.8, colsample_bytree=0.7,
    # L2 / L1 penalties on leaf weights.
    reg_lambda=1.0, reg_alpha=0.5,
    # Split-finding algorithm and seed.
    tree_method="hist", random_state=2025,
)

# Chain the existing preprocessor (defined outside this cell) with the model
# so both are fitted together on the training split.
pipeline_A = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", xgb_reg_A),
    ]
)

print("===== OPTION A RESULTS =====")
results_A = evaluate_model(
    pipeline_A,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)


## Option B – Reduced Feature Set + Pruned Model


In [None]:
# Option B: same hyperparameters as Option A, trained on a reduced feature set.

# 1. Reduced feature set
reduced_features = [
    "PTS", "MP", "Age",
    "AST", "TRB",
    "FT", "FTA",
    "STL", "BLK",
    "G", "GS",
    "Year", "Team",
]

X_red = df_reg[reduced_features]
y_red = df_reg["Salary"]

# 2. Train/test split for Option B
# NOTE(review): Option A's split is created outside this cell. Confirm it used
# the same rows (same test_size / random_state on the same frame), otherwise
# the A-vs-B metrics are not directly comparable.
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(
    X_red, y_red, test_size=0.2, random_state=2025
)

# 3. NEW preprocessor for Option B (based ONLY on X_red)
# Columns are routed by dtype. A column whose dtype matches neither selector
# is not listed in any transformer and is therefore dropped by the
# ColumnTransformer's default ``remainder="drop"``.
numeric_features_B = X_red.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features_B = X_red.select_dtypes(include=["object", "category"]).columns.tolist()

# Numeric columns: median imputation, then standardization.
numeric_transformer_B = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: mode imputation, then one-hot encoding. Categories not
# seen during fit are ignored at transform time instead of raising.
categorical_transformer_B = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor_B = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer_B, numeric_features_B),
        ("cat", categorical_transformer_B, categorical_features_B),
    ]
)

# 4. Use the same tuned XGBoost params as Option A, but in a separate estimator
# bound to its own name. Previously this cell rebound ``xgb_reg_A``, so after
# running it that name no longer referred to the model inside ``pipeline_A``.
xgb_reg_B = xgb.XGBRegressor(
    objective="reg:squarederror",
    eval_metric="rmse",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.7,
    reg_lambda=1.0,
    reg_alpha=0.5,
    tree_method="hist",
    random_state=2025,
)

pipeline_B = Pipeline([
    ("preprocess", preprocessor_B),
    ("model", xgb_reg_B),
])

print("===== OPTION B RESULTS =====")
results_B = evaluate_model(pipeline_B, X_train_B, X_test_B, y_train_B, y_test_B)
