# Lecture 19 - Lab: PLS
## CMSE 381 - Fall 2022
## Oct 28, 2022



In [None]:
# Everyone's favorite standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import time

import seaborn as sns

# ML imports we've used previously
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA


# PLS on Hitters Data

# Loading in the data

Ok, here we go, let's play with a baseball data set again. Note this cleanup is all the same as the last lab. 

In [None]:
# Read the Hitters table, discard every row that has a missing value,
# and remove the 'Player' column.
df = (
    pd.read_csv('Hitters.csv')
    .dropna()
    .drop(columns='Player')
)
df.info()

# Indicator (dummy) columns for the three categorical variables.
dummies = pd.get_dummies(df[['League', 'Division', 'NewLeague']])

In [None]:
# Response (dependent) variable: the salary we want to predict.
y = df['Salary']

# Numeric predictors: remove the response (Salary) and the three categorical
# columns, which are represented by dummy variables instead.
X_ = df.drop(columns=['Salary', 'League', 'Division', 'NewLeague']).astype('float64')

# Define the feature set X: the numeric predictors plus one dummy column
# for each categorical variable.
selected_dummies = dummies[['League_N', 'Division_W', 'NewLeague_N']]
X = pd.concat([X_, selected_dummies], axis='columns')

X.info()

In [None]:
# Rescale every column to unit standard deviation.
# Note that the columns are only scaled here, not centered.
X_normalized = X.div(X.std(), axis='columns')
X_normalized.head()

# Partial Least Squares (PLS)

The command to do PLS in `scikit-learn` is `PLSRegression`. Below is a quick snippet that runs PLS on our dataset. 

In [None]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold

In [None]:
# Fit a 3-component PLS model on the full data set and evaluate it on the
# same rows it was trained on, so the value displayed is a *training* MSE.
pls = PLSRegression(n_components=3).fit(X_normalized, y)
yhat = pls.predict(X_normalized)
mean_squared_error(y_true=y, y_pred=yhat)

But like last time, we can also use the `cross_val_score` function to get the CV score easily. 

In [None]:
# 10-fold cross-validation of a fresh 3-component PLS model.
# With scoring='neg_mean_squared_error' each fold's score is the *negated*
# MSE, so the average displayed below is minus the cross-validated MSE.
pls = PLSRegression(n_components=3)
scores = cross_val_score(
    estimator=pls,
    X=X_normalized,
    y=y,
    scoring='neg_mean_squared_error',
    cv=10,
)
scores.mean()

&#9989; **<font color=red>Do this:</font>**  Like last time, your job is to test a PLS model for an increasing number of components used. I recommend using the `cross_val_score` with `scoring='neg_mean_squared_error'`. What number of components would you use? 

In [None]:
n = len(X_normalized)
mse = []

# Numbers of PLS components to try: 1, 2, ..., 19.
component_counts = np.arange(1, 20)

# Calculate MSE using CV for an increasing number of components, 
# adding one component at a time.
for i in component_counts: # i is the number of components to use each time
    # ====
    # Hint: with scoring='neg_mean_squared_error' the CV scores are negated
    # MSEs, so flip the sign before storing the value.
    score = 0 # Your code to figure out the score each time goes in here. 
    # ====

    mse.append(score)
    
# Plot results
# Pass the component counts as the x-values so that the point for i components
# is drawn at x = i. Plotting `mse` on its own would use the list index
# (0, 1, ..., 18) and shift every point one component to the left.
plt.plot(component_counts, mse, '-v')
plt.xlabel('Number of  components in regression')
plt.ylabel('MSE')
plt.title('Predicting Salary')
plt.xlim(xmin=-1);

&#9989; **<font color=red>Do this:</font>**  Below is my code from doing the PCR version on this data set from last class. Draw the two plots overlaid: the test MSE from doing PLS and that from doing PCR.  What do you notice? Which model would you pick with which number of components and why?

In [None]:
# Project the scaled features onto their principal components.
# NOTE: the PCA is fit once on all rows, before cross-validation, so the
# held-out fold of each split has already contributed to the component directions.
pca = PCA()
X_PCs = pca.fit_transform(X_normalized)

# 10-fold CV, with shuffle included. 
# You can just put in `cv=10` below, but this doesn't shuffle the data
n = len(X_normalized)
kf_10 = KFold( n_splits=10, shuffle=True, random_state=48864)

regr = LinearRegression()
msePCA = []

# Response as a plain 1-D NumPy array. `Series.ravel()` is deprecated in
# recent pandas releases; `to_numpy()` is the supported conversion.
y_array = y.to_numpy()

# Calculate MSE using CV for the 19 principal components, adding one component at a time.
for i in np.arange(1, 20):
    # Regress on the first i principal components; the scorer returns the
    # negated MSE, hence the factor of -1.
    score = -1*cross_val_score(regr, X_PCs[:,:i], y_array, cv=kf_10, scoring='neg_mean_squared_error').mean()
    msePCA.append(score)


In [None]:
# Your plot code here



-----
### Congratulations, we're done!
Written by Dr. Liz Munch, Michigan State University

<a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/">Creative Commons Attribution-NonCommercial 4.0 International License</a>.