In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Import cleaned batters data
batters = pd.read_csv('../data/cleaned/final_batters_df.csv')

In [3]:
# Inspect dataframe
batters.head().T

Unnamed: 0,0,1,2,3,4
row_id,baergca01_2003,barajro01_2003,belliro01_2003,blanche01_2003,butlebr02_2003
playerID,baergca01,barajro01,belliro01,blanche01,butlebr02
year,2003,2003,2003,2003,2003
position,2B,C,2B,C,2B
age,35,28,28,32,25
avg_salary_year,2573472.948005,2573472.948005,2573472.948005,2573472.948005,2573472.948005
free_agent_salary,1000000.0,500000.0,1100000.0,750000.0,
contract_length,1.0,1.0,1.0,1.0,1.0
AB,389,480,1100,686,553
R,48,40,172,61,85


In [4]:
# Convert columns to cateorical where appropriate
batters['won_mvp'] = batters['won_mvp'].astype('category')
batters['won_gold_glove'] = batters['won_gold_glove'].astype('category')
batters['won_cy_young'] = batters['won_cy_young'].astype('category')
batters['position'] = batters['position'].astype('category')
batters['won_silver_slugger'] = batters['won_silver_slugger'].astype('category')
batters['all_star'] = batters['all_star'].astype('category')

In [5]:
# Create features and targets
X = batters.drop(columns=["row_id", "playerID","year","free_agent_salary","ZR"])
y = batters["free_agent_salary"]

In [6]:
# Only select numerical features for PCA
X = X.select_dtypes(include=[np.number])

# now recombine with y and drop na values
data = pd.concat([X, y], axis=1).dropna()
X = data.drop(columns=["free_agent_salary"])
y = data["free_agent_salary"]

In [7]:
# Compute correlation matrix of feature variables
corr_X = X.corr()

fig_corr_X = px.imshow(
    corr_X,
    text_auto=True,
    color_continuous_scale="RdBu_r",
    aspect="auto",
    title="Correlation Matrix of Original Variables"
)
fig_corr_X.update_layout(
    width=900,
    height=900,
)
fig_corr_X.show()

In [8]:
pca_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA())
])

pca_pipe.fit(X)

0,1,2
,steps,"[('scaler', ...), ('pca', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_components,
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [9]:
pca_model = pca_pipe.named_steps["pca"]
explained_var = pca_model.explained_variance_ratio_
cum_explained_var = np.cumsum(explained_var)
n_components = len(explained_var)

ev_df = pd.DataFrame({
    "PC": np.arange(1, n_components + 1),
    "ExplainedVariance": explained_var,
    "CumulativeVariance": cum_explained_var
})


fig_scree = px.line(
    ev_df, x="PC", y="ExplainedVariance",
    markers=True,
    title="Scree Plot: Proportion of Variance Explained"
)
fig_scree.show()


In [10]:
X_pca_scores = pca_pipe.transform(X)
pc_cols = [f"PC{i}" for i in range(1, n_components + 1)]

scores_df = pd.DataFrame(X_pca_scores, columns=pc_cols)
scores_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26
0,-5.161674,-0.048653,0.023292,-1.210259,0.841005,-0.928488,0.054750,-0.395863,0.382713,-0.100783,...,0.085852,-0.161059,0.220591,-0.172296,-0.069369,-0.062890,0.029300,0.090756,0.065697,0.0
1,-4.530015,-0.960976,0.135548,1.155528,-0.238274,-1.416589,0.411443,-1.355713,-0.011970,-0.569491,...,-0.421537,0.160674,0.539016,0.276002,-0.038237,0.007089,0.036064,0.079808,-0.072624,0.0
2,-0.079378,1.960252,1.476712,-0.161592,-0.568365,-0.877616,0.314040,-0.793870,-1.000285,-0.663816,...,-0.266152,-0.523987,-0.091289,-0.228989,0.193296,0.176622,0.174685,0.061751,-0.104600,0.0
3,-3.123267,-0.494864,0.251344,1.472773,0.587163,-0.881514,-0.482107,-1.017549,-0.425566,0.050462,...,-0.130421,0.424255,0.353865,-0.041338,0.075203,0.053679,0.054287,-0.126667,-0.076679,0.0
4,-2.910070,0.066927,-1.107563,-0.649691,-0.148800,-1.211183,0.356609,-0.633040,-0.486139,-0.751434,...,0.021015,0.240642,0.437227,0.012308,-0.126451,-0.003150,-0.041123,0.056574,-0.003471,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,3.998258,0.461293,-0.037134,-0.668095,-0.733367,1.376773,-0.872809,0.181059,1.862939,0.532090,...,-0.362260,0.698171,-0.343184,-0.092599,0.262195,0.060482,0.014097,-0.182740,-0.077285,0.0
516,4.538626,-0.816778,-2.559508,-0.369909,-1.250761,1.746469,0.836267,-0.408855,0.690837,-0.000569,...,-0.253506,0.078593,-0.636843,-0.212367,-0.018792,0.183473,0.119857,0.124902,-0.156360,0.0
517,6.536282,-4.062709,-0.544028,0.810306,-3.751514,-1.608630,-1.015063,2.090463,0.598419,0.656079,...,-0.810575,-0.031625,-0.187481,-0.305781,-0.225897,0.167063,-0.035735,0.069257,-0.018113,0.0
518,4.974493,-1.551022,-2.831351,-0.125482,-2.480712,0.583468,0.840370,-0.098531,1.148681,0.923055,...,-0.152707,0.382096,0.248136,-0.138833,-0.298367,0.227410,-0.090045,-0.085582,0.032342,0.0


In [11]:
corr_PC = scores_df.corr()

fig_corr_PC = px.imshow(
    corr_PC,
    text_auto=True,
    color_continuous_scale="RdBu_r",
    aspect="auto",
    title="Correlation Matrix of Principal Components"
)
fig_corr_PC.update_layout(
    width=800,
    height=800,
)
fig_corr_PC.show()

In [12]:
loadings = pca_model.components_.T[:, :2]
loading_df = pd.DataFrame(
    loadings,
    index=X.columns,
    columns=["PC1_loading", "PC2_loading"]
)

In [13]:
arrow_scale = 3

fig = go.Figure()


fig.add_trace(go.Scatter(
    x=scores_df["PC1"],
    y=scores_df["PC2"],
    mode="markers",
    name="Batters",
    hoverinfo="text",
    #Add playerID from og dataframe
    hovertext=batters["playerID"]
))


for var_name, row in loading_df.iterrows():
    x_arrow = row["PC1_loading"] * arrow_scale
    y_arrow = row["PC2_loading"] * arrow_scale


    fig.add_trace(go.Scatter(
        x=[0, x_arrow],
        y=[0, y_arrow],
        mode="lines+markers+text",
        text=[None, var_name],
        textposition="top center",
        showlegend=False
    ))

fig.update_layout(
    title="PCA Biplot: PC1 vs PC2 with Variable Directions",
    xaxis_title="PC1",
    yaxis_title="PC2",
    xaxis=dict(zeroline=True),
    yaxis=dict(zeroline=True),
    width=800,
    height=700
)

fig.show()


In [14]:
# Now do PCA regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression


In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pcr_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=5)),
    ("linreg", LinearRegression())
])


pcr_pipe.fit(X_train, y_train)

0,1,2
,steps,"[('scaler', ...), ('pca', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_components,5
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [16]:
train_r2 = pcr_pipe.score(X_train, y_train)
test_r2  = pcr_pipe.score(X_test, y_test)


y_train_pred = pcr_pipe.predict(X_train)
y_test_pred  = pcr_pipe.predict(X_test)


In [17]:
train_rmse = np.sqrt(np.mean((y_train - y_train_pred)**2))
test_rmse  = np.sqrt(np.mean((y_test - y_test_pred)**2))

print(f"Using n_components = {4}")
print(f"Train R²  : {train_r2:.4f}")
print(f"Test  R²  : {test_r2:.4f}")
print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test  RMSE: {test_rmse:.4f}")

Using n_components = 4
Train R²  : 0.6019
Test  R²  : 0.5712
Train RMSE: 2813816.9157
Test  RMSE: 3275404.9754
