In [6]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

import plotly.express as px
import plotly.graph_objects as go

In [7]:
# Load the cleaned pitchers dataset from CSV into a DataFrame
pitchers_csv_path = '../data/cleaned/final_pitchers_df.csv'
pitchers = pd.read_csv(pitchers_csv_path)

In [8]:
# Preview the first rows, transposed so every column is listed as its own row
pitchers.head().transpose()

Unnamed: 0,0,1,2,3,4
row_id,abbotpa01_2003,almanar01_2003,almoned01_2003,alvarwi01_2003,batismi01_2003
playerID,abbotpa01,almanar01,almoned01,alvarwi01,batismi01
year,2003,2003,2003,2003,2003
position,P,P,P,P,P
age,36,31,27,33,32
avg_salary_year,2573472.948005,2573472.948005,2573472.948005,2573472.948005,2573472.948005
free_agent_salary,600000.0,500000.0,,1500000.0,4366666.666667
contract_length,1.0,1.0,1.0,1.0,3.0
W,19.0,9.0,0.0,8.0,29.0
L,9.0,9.0,0.0,5.0,26.0


In [9]:
# Convert columns to categorical where appropriate.
# The columns are cast one at a time, in the order listed.
categorical_columns = [
    'won_mvp',
    'won_gold_glove',
    'won_cy_young',
    'position',
    'won_silver_slugger',
    'all_star',
]
for column_name in categorical_columns:
    pitchers[column_name] = pitchers[column_name].astype('category')

In [10]:
# Separate the regression target from the candidate predictors.
# Identifier columns, the year, the target itself and "ZR" are excluded from X.
target_column = "free_agent_salary"
excluded_columns = ["row_id", "playerID", "year", target_column, "ZR"]

y = pitchers[target_column]
X = pitchers.drop(columns=excluded_columns)

In [11]:
# PCA is run on numeric predictors only, so keep just the numeric dtypes
numeric_features = X.select_dtypes(include=[np.number])

# Put the target back next to the predictors so that a row with a missing value
# in either is removed from both at once, then split them apart again.
data = pd.concat([numeric_features, y], axis=1).dropna()
y = data["free_agent_salary"]
X = data.drop(columns=["free_agent_salary"])

In [12]:
# Pairwise correlations between the original feature variables
corr_X = X.corr()

# Heatmap settings: annotate every cell and use a diverging colour scale
feature_heatmap_options = dict(
    title="Correlation Matrix of Original Variables",
    color_continuous_scale="RdBu_r",
    text_auto=True,
    aspect="auto",
)
fig_corr_X = px.imshow(corr_X, **feature_heatmap_options)
fig_corr_X.update_layout(width=900, height=900)
fig_corr_X.show()

In [13]:
# Exploratory PCA: standardise the features, then fit a PCA with default settings
exploratory_steps = [
    ("scaler", StandardScaler()),
    ("pca", PCA()),
]
pca_pipe = Pipeline(steps=exploratory_steps)

# Fitted on the full feature matrix; the fitted pipeline is the cell's displayed output
pca_pipe.fit(X)

0,1,2
,steps,"[('scaler', ...), ('pca', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_components,
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [14]:
# Read the per-component variance ratios off the fitted PCA step
pca_model = pca_pipe.named_steps["pca"]
explained_var = pca_model.explained_variance_ratio_
n_components = len(explained_var)
cum_explained_var = explained_var.cumsum()

# One row per principal component (numbered from 1) with its own and running share of variance
component_numbers = np.arange(1, n_components + 1)
ev_df = pd.DataFrame({
    "PC": component_numbers,
    "ExplainedVariance": explained_var,
    "CumulativeVariance": cum_explained_var,
})

# Scree plot of the individual (not cumulative) variance share per component
fig_scree = px.line(
    ev_df,
    x="PC",
    y="ExplainedVariance",
    title="Scree Plot: Proportion of Variance Explained",
    markers=True,
)
fig_scree.show()


In [15]:
# Project every observation onto the principal components (scale, then rotate)
X_pca_scores = pca_pipe.transform(X)

# Column labels PC1 .. PC<n_components>
pc_cols = ["PC" + str(k + 1) for k in range(n_components)]

scores_df = pd.DataFrame(data=X_pca_scores, columns=pc_cols)
scores_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC25,PC26,PC27,PC28,PC29,PC30,PC31,PC32,PC33,PC34
0,-0.183527,-2.610567,1.091091,-0.052451,0.374542,0.342862,-0.420874,-1.210842,-1.382709,0.698758,...,0.323018,-0.098629,-0.031295,0.036868,0.014323,0.000392,-0.020214,-0.002112,0.0,0.0
1,-2.971102,-1.107376,0.459027,-0.455624,0.503014,-1.008379,-0.343752,0.618994,-1.276714,-0.642932,...,-0.030201,0.260238,-0.064040,-0.177682,0.040614,0.024751,-0.007027,-0.006560,0.0,0.0
2,-2.708784,-1.744936,-0.555759,1.438307,0.515899,-0.460219,0.283979,-0.486337,-0.573164,-0.759667,...,-0.295427,-0.059126,-0.007600,0.056018,-0.090632,0.016363,0.001316,-0.006937,0.0,0.0
3,5.533602,1.010180,0.120646,-0.446615,-0.034753,-1.292224,-0.442821,-0.451887,-0.607168,0.162524,...,-0.042687,-0.021618,-0.037782,0.280803,0.026303,-0.090185,0.013443,-0.002480,0.0,0.0
4,10.572390,1.459457,-1.316189,1.288417,3.288547,-3.065688,2.236827,1.595672,1.141796,-0.667393,...,0.456560,0.448810,-0.032462,-0.099576,0.043838,-0.017086,-0.054279,-0.002905,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448,-0.798395,-0.487681,-2.383111,0.362603,-1.474171,0.910648,-0.700280,0.726837,-0.713260,0.130139,...,0.734031,0.172886,-0.050265,0.198410,0.024399,-0.072932,-0.010008,-0.000169,0.0,0.0
449,8.222262,1.228474,-3.368062,-1.322538,0.602355,0.552713,0.956151,-0.646540,0.435017,1.806122,...,-0.856106,-0.166246,0.016458,-0.053229,-0.079243,-0.052905,-0.012327,-0.003677,0.0,0.0
450,6.414393,1.001089,-4.094180,0.343200,1.695417,-0.354375,1.583658,0.162929,1.246125,0.935057,...,-0.089683,-0.060109,0.092635,0.116799,0.002185,0.039108,-0.010724,-0.010485,0.0,0.0
451,6.109944,2.568435,-3.542512,0.259515,-0.566150,-0.825792,-1.471578,-2.598573,3.470619,1.183786,...,-0.059248,0.455020,0.075669,0.208133,0.033284,-0.004114,-0.011808,0.000297,0.0,0.0


In [16]:
# Pairwise correlations between the principal-component scores
corr_PC = scores_df.corr()

# Same heatmap styling as used for the original variables, on a slightly smaller canvas
component_heatmap_options = dict(
    title="Correlation Matrix of Principal Components",
    color_continuous_scale="RdBu_r",
    text_auto=True,
    aspect="auto",
)
fig_corr_PC = px.imshow(corr_PC, **component_heatmap_options)
fig_corr_PC.update_layout(width=800, height=800)
fig_corr_PC.show()

In [17]:
# Loadings of each original feature on the first two principal components:
# take the first two component rows and transpose to (features x 2)
loadings = pca_model.components_[:2].T

loading_column_names = ["PC1_loading", "PC2_loading"]
loading_df = pd.DataFrame(loadings, index=X.columns, columns=loading_column_names)

In [18]:
# PCA biplot: one marker per pitcher in PC1/PC2 space, plus one ray per original
# feature drawn from the origin along that feature's (PC1, PC2) loading.
arrow_scale = 3  # multiplier applied to the loadings before drawing (visual only)

# scores_df holds one row per row of X, i.e. only the pitchers left after the
# dropna() step, in the same order. Plotly pairs labels with points by position,
# so the IDs must be selected through X.index; the full pitchers["playerID"]
# column still contains the dropped rows and would shift every later label.
player_ids = pitchers.loc[X.index, "playerID"].to_numpy()
assert len(player_ids) == len(scores_df), "playerID labels must match the PCA score rows"

fig = go.Figure()

# Observation scores, labelled (and hover-labelled) with the matching playerID
fig.add_trace(go.Scatter(
    x=scores_df["PC1"],
    y=scores_df["PC2"],
    mode="markers+text",
    textposition="top center",
    name="pitchers",
    hoverinfo="text",
    hovertext=player_ids,
    text=player_ids
))

# One line segment per feature, from the origin to its scaled loading,
# with the feature name written at the tip only
for var_name, row in loading_df.iterrows():
    x_arrow = row["PC1_loading"] * arrow_scale
    y_arrow = row["PC2_loading"] * arrow_scale

    fig.add_trace(go.Scatter(
        x=[0, x_arrow],
        y=[0, y_arrow],
        mode="lines+markers+text",
        text=[None, var_name],
        textposition="top center",
        showlegend=False
    ))

fig.update_layout(
    title="PCA Biplot: PC1 vs PC2 with Variable Directions",
    xaxis_title="PC1",
    yaxis_title="PC2",
    xaxis=dict(zeroline=True),
    yaxis=dict(zeroline=True),
    width=800,
    height=700
)

fig.show()


In [19]:
# Now do PCA regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression


In [20]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Principal component regression: standardise -> keep the first 4 PCs -> linear regression
pcr_steps = [
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=4)),
    ("linreg", LinearRegression()),
]
pcr_pipe = Pipeline(pcr_steps)

# Every step is fitted on the training split only; the fitted pipeline is the cell's output
pcr_pipe.fit(X_train, y_train)

0,1,2
,steps,"[('scaler', ...), ('pca', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_components,4
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [21]:
# Predictions from the fitted PCR pipeline on both splits
y_train_pred = pcr_pipe.predict(X_train)
y_test_pred = pcr_pipe.predict(X_test)

# Goodness of fit on both splits, via the pipeline's score method (reported below as R²)
train_r2 = pcr_pipe.score(X_train, y_train)
test_r2 = pcr_pipe.score(X_test, y_test)


In [22]:
# Root-mean-squared error of the PCR predictions on each split
# (same units as the target, free_agent_salary)
train_rmse = np.sqrt(np.mean((y_train - y_train_pred) ** 2))
test_rmse = np.sqrt(np.mean((y_test - y_test_pred) ** 2))

# Report the number of components actually used by the fitted PCA step instead of
# repeating the literal, so the summary cannot drift from the pipeline definition.
n_components_used = pcr_pipe.named_steps["pca"].n_components_

print(f"Using n_components = {n_components_used}")
print(f"Train R²  : {train_r2:.4f}")
print(f"Test  R²  : {test_r2:.4f}")
print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test  RMSE: {test_rmse:.4f}")

Using n_components = 4
Train R²  : 0.5913
Test  R²  : 0.5610
Train RMSE: 2957157.1132
Test  RMSE: 2489621.8656
