# Baseline Salary Prediction Model


## Feature Selection and Modeling Dataset Preparation

In [None]:
# Rows whose Year falls in 2019–2024 (both ends included) are used for
# training and feature selection.
target = "Salary"
df_fs = df.loc[df["Year"].between(2019, 2024)].copy()

# Drop the Player column and the target itself; Year and Team stay in as features.
y = df_fs[target]
X = df_fs.drop(columns=["Player", target])

## Correlation Analysis with Salary (Feature Relevance Assessment)


In [None]:
# Names of the 64-bit integer / float feature columns (kept as a pandas Index;
# later cells reuse it).
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns

# Correlate every numeric feature with Salary and list the most positively
# correlated features first.
corr = (
    df_fs[[*numeric_cols, "Salary"]]
    .corr()["Salary"]
    .sort_values(ascending=False)
)
corr


## Mutual Information Feature Relevance Analysis


In [None]:
from sklearn.feature_selection import mutual_info_regression
from sklearn.impute import SimpleImputer

# Work on a copy of the numeric feature columns only.
numeric_X = X[numeric_cols].copy()

# 1. Fill missing values with each column's median.
#    fit_transform returns a bare array, so the original row index is passed
#    back in explicitly; otherwise the frame would get a fresh 0..n-1 index and
#    no longer be label-aligned with `y`.
#    NOTE(review): if any numeric column is entirely missing, the imputer may
#    return fewer columns than `numeric_cols` — confirm no such column exists.
imputer = SimpleImputer(strategy="median")
numeric_X_imputed = pd.DataFrame(
    imputer.fit_transform(numeric_X),
    columns=numeric_cols,
    index=numeric_X.index,
)

# 2. Score each imputed feature against the target; the fixed random_state
#    keeps reruns reproducible.
mi_scores = mutual_info_regression(numeric_X_imputed, y, random_state=42)

# Highest-scoring features first.
mi = pd.Series(mi_scores, index=numeric_cols).sort_values(ascending=False)
mi


## Preprocessing Pipeline for the Feature-Importance Model


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import xgboost as xgb

# Treat both plain-object and pandas-categorical columns as categorical, the
# same selection the final preprocessing pipeline later in this notebook uses.
# A column matched by neither transformer below is not passed on to the model,
# so a `category`-typed column would otherwise vanish silently.
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# Scale the numeric columns and one-hot encode the categorical ones; categories
# not seen during fit are ignored at transform time instead of raising.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
])


## Exploratory Baseline Model (All Features): Definition and Fit


In [None]:
# Exploratory model: preprocessing followed by a gradient-boosted regressor,
# trained on every candidate feature.
baseline_model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", xgb.XGBRegressor(
        # learning task
        objective="reg:squarederror",
        eval_metric="rmse",
        tree_method="hist",
        # ensemble size and tree shape
        n_estimators=200,
        learning_rate=0.1,
        max_depth=4,
        # row / column subsampling
        subsample=0.9,
        colsample_bytree=0.9,
        # reproducibility
        random_state=42,
    )),
])

# Fit on all of X / y; no hold-out split is made in this cell.
baseline_model.fit(X, y)


## Feature Importance Analysis and Interpretation


In [None]:
# Pull the transformed feature names and the fitted regressor out of the pipeline.
feature_names = baseline_model.named_steps["preprocess"].get_feature_names_out()
model = baseline_model.named_steps["model"]

# One row per transformed feature, most important first.
feat_imp = pd.DataFrame(
    {"feature": feature_names, "importance": model.feature_importances_}
)
feat_imp = feat_imp.sort_values(by="importance", ascending=False)

# Show the top 25.
feat_imp.head(25)


## Final Feature Set Definition

In [None]:
# Hand-picked feature set used by the final model.
# NOTE(review): the abbreviations look like standard basketball box-score
# columns — confirm meanings against the dataset's column definitions.
final_features = [
    "PTS",
    "MP",
    "Age",
    "AST",
    "TOV",
    "2P",
    "2PA",
    "FGA",
    "FT",
    "FTA",
    "TRB",
    "DRB",
    "STL",
    "BLK",
    "G",
    "GS",
    # Non-box-score context columns
    "Year",
    "Team",
]


## Modeling Dataset Construction

In [None]:
# Modeling rows: Year between 2019 and 2024, both ends included.
target = "Salary"
df_reg = df.loc[df["Year"].between(2019, 2024)].copy()

# Target vector and the feature matrix restricted to the final feature set.
y = df_reg[target]
X = df_reg[final_features]


## Preprocessing Pipeline Construction


In [None]:
# Split the selected features by dtype so each group gets its own preprocessing.
categorical_features = list(X.select_dtypes(include=["object", "category"]).columns)
numeric_features = list(X.select_dtypes(include=["int64", "float64"]).columns)

# Numeric columns: median imputation, then standardisation.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: most-frequent imputation, then one-hot encoding that
# ignores categories not seen during fit.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its transformer.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])


## Baseline XGBoost Model Definition


In [None]:
# Gradient-boosted regressor for the final feature set.
xgb_reg = xgb.XGBRegressor(
    # learning task
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="hist",
    # ensemble size and tree shape
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    # row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # reproducibility
    random_state=2025,
)

# Preprocessing and model chained so both are fit and applied together.
xgb_pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("model", xgb_reg),
])


## Train–Test Split and Model Training


In [None]:
# Imported here so the cell runs on its own, as the other cells in this section
# import what they use.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
# NOTE(review): this is a random row-level split. If the same player appears in
# several years, that player's rows can land in both train and test — confirm
# this is acceptable for the baseline.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2025
)

# Fit preprocessing and model on the training portion only.
xgb_pipeline.fit(X_train, y_train)


## Baseline Model Evaluation


In [None]:
# Imported here so the cell runs on its own, as the other cells in this section
# import what they use.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict on the held-out rows.
y_pred = xgb_pipeline.predict(X_test)

# RMSE is taken as the square root of MSE so it is in the same units as Salary.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:,.2f}")
print(f"MAE: {mae:,.2f}")
print(f"R²: {r2:.3f}")
