# Biol 359  |  Cross-Validation
### Spring 2021, Week 9

<hr style="border:2px solid gray">


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn import datasets
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Read the cleaned microarray table (path is relative to the working directory)
dataset = pd.read_csv('microarray_data_clean2.csv')

# Columns are picked by position.
# NOTE(review): presumably column 0 holds the condition labels, column 3 the
# response (labelled 'viability [24hr]' in the plots below) and columns 4+ the
# predictors -- confirm against the CSV header.
conditions_data = dataset.iloc[:, 0]
output_data = dataset.iloc[:, 3]
input_data = dataset.iloc[:, 4:]

# Show the first ten rows as the cell output
dataset.head(10)


In [None]:
# Scatter each of the first 25 predictors against the response on a 5 x 5 grid
plt.style.use('ggplot')
fig = plt.figure(figsize=(20, 20))

n_rows, n_cols = 5, 5
shown_features = input_data.columns[:n_rows * n_cols]

# Panels are numbered from 1 (subplot convention); column positions from 0
for panel, feature_name in enumerate(shown_features, start=1):
    ax = fig.add_subplot(n_rows, n_cols, panel)
    ax.scatter(input_data.iloc[:, panel - 1], output_data)
    ax.set_xlabel(feature_name, size=16)
    ax.set_ylabel('viability [24hr]', size=16)

plt.show()

In [None]:
# Alias the predictors and the response under the X / Y names used below
X, Y = input_data, output_data


In [None]:
# Hold out 20% of the samples as a validation set.
# random_state is fixed so the split -- and every score computed from it in the
# cells below -- is the same on each Restart & Run All. Without it a new random
# split is drawn on every execution and results cannot be reproduced.
x_train, x_validation, y_train, y_validation = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

print("Shape of training data X = % s and Y = % s : "%(x_train.shape, y_train.shape)) 
print("Shape of validation data X = % s and Y = % s : "%(x_validation.shape, y_validation.shape))

### (1) Multiple Linear Regression (Ordinary Least Squares)

In [None]:
# Fit an ordinary least squares model on the training split
lreg = LinearRegression().fit(x_train, y_train)

# Predictions for the held-out validation split (also used by the parity plot)
lreg_y_pred = lreg.predict(x_validation)

# Model score on each split, reported as R2 (training) and Q2 (validation)
r2_training = lreg.score(x_train, y_train)
q2_validation = lreg.score(x_validation, y_validation)
print("Linear regression R2 (training data):", f"{r2_training:.3f}")
print("Linear regression Q2 (validation data):", f"{q2_validation:.3f}" "\n")

# Sum of squared residuals on each split, rounded to three decimals
training_residuals = lreg.predict(x_train) - y_train
sum_squared_error = round(np.sum(training_residuals ** 2), 3)
print("SSE on training data: ", sum_squared_error)

validation_residuals = lreg_y_pred - y_validation
sum_squared_error = round(np.sum(validation_residuals ** 2), 3)
print("SSE on validation data: ", sum_squared_error)
 

In [None]:
# Parity plot: predicted vs. true response for the validation and training splits.
# Everything is drawn on a dedicated figure/axes pair. Relying on a leftover
# global `ax` would apply set_box_aspect to an axes belonging to an earlier
# figure instead of this plot.
lreg_train_pred = lreg.predict(x_train)

fig, ax = plt.subplots()
ax.plot(lreg_y_pred, y_validation, '.', label='validation')
ax.plot(lreg_train_pred, y_train, '.', label='training')

# The diagonal spans the full range of all predicted and true values
plotted_values = np.concatenate([lreg_y_pred, y_validation, lreg_train_pred, y_train])
min_value, max_value = plotted_values.min(), plotted_values.max()
ax.plot([min_value, max_value], [min_value, max_value], '--', label="parity")

ax.set_xlabel('Predicted Values')
ax.set_ylabel('True Values')
ax.set_title('Parity Plot')
ax.set_box_aspect(1)  # square plot area so the parity line sits at 45 degrees
sns.despine(ax=ax)
ax.legend(loc='best')
plt.show()


In [None]:
# Tabulate each fitted coefficient next to the name of its predictor column
lreg_coefficient = pd.DataFrame({
    "Columns": x_train.columns,
    "Coefficient Estimate": lreg.coef_,
})

In [None]:
# Bar chart of the linear regression coefficients, one bar per predictor.
# The style is selected before the figure is created; calling it after the
# artists already exist cannot restyle them.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(7, 7))

color = ['midnightblue', 'navy', 'darkblue', 'mediumblue', 'blue',
         'darkslateblue', 'mediumslateblue', 'slateblue',
         'mediumpurple', 'blueviolet', 'darkviolet']

ax.bar(lreg_coefficient["Columns"], lreg_coefficient['Coefficient Estimate'], color=color)
ax.spines['bottom'].set_position('zero')  # draw the x-axis line at y = 0

# Label the figure so it can be told apart from the other models' charts
ax.set_ylabel('Coefficient Estimate')
ax.set_title('Linear regression coefficients')

plt.show()

### (2) Ridge Regression 

In [None]:
# Fit a Ridge regression model (penalty strength alpha = 0.2) on the training split
from sklearn.linear_model import Ridge

ridgeR = Ridge(alpha=0.2).fit(x_train, y_train)

# Predictions for the held-out validation split
ridge_y_pred = ridgeR.predict(x_validation)

# Model score on each split, reported as R2 (training) and Q2 (validation)
r2_training = ridgeR.score(x_train, y_train)
q2_validation = ridgeR.score(x_validation, y_validation)
print("Ridge regression R2 (training data):", f"{r2_training:.3f}")
print("Ridge regression Q2 (validation data):", f"{q2_validation:.3f}" "\n")

# Sum of squared residuals on each split, rounded to three decimals
training_residuals = ridgeR.predict(x_train) - y_train
sum_squared_error = round(np.sum(training_residuals ** 2), 3)
print("SSE on training data: ", sum_squared_error)

validation_residuals = ridge_y_pred - y_validation
sum_squared_error = round(np.sum(validation_residuals ** 2), 3)
print("SSE on validation data: ", sum_squared_error)
 


In [None]:
# Parity plot: Ridge predictions vs. true response for both splits.
# Everything is drawn on a dedicated figure/axes pair. Relying on a leftover
# global `ax` would apply set_box_aspect to an axes belonging to an earlier
# figure instead of this plot.
ridge_validation_pred = ridgeR.predict(x_validation)
ridge_train_pred = ridgeR.predict(x_train)

fig, ax = plt.subplots()
ax.plot(ridge_validation_pred, y_validation, '.', label='validation')
ax.plot(ridge_train_pred, y_train, '.', label='train')

# The diagonal spans the full range of all predicted and true values
plotted_values = np.concatenate(
    [ridge_validation_pred, y_validation, ridge_train_pred, y_train]
)
min_value, max_value = plotted_values.min(), plotted_values.max()
ax.plot([min_value, max_value], [min_value, max_value], '--', label="parity")

ax.set_xlabel('Predicted Values')
ax.set_ylabel('True Values')
ax.set_title('Parity Plot')
ax.set_box_aspect(1)  # square plot area so the parity line sits at 45 degrees
sns.despine(ax=ax)
ax.legend(loc='best')
plt.show()


In [None]:
# Tabulate each Ridge coefficient next to the name of its predictor column
ridge_coefficient = pd.DataFrame({
    "Columns": x_train.columns,
    "Coefficient Estimate": ridgeR.coef_,
})

In [None]:
# Bar chart of the Ridge coefficients, one bar per predictor.
# The style is selected before the figure is created; calling it after the
# artists already exist cannot restyle them.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(7, 7))

color = ['midnightblue', 'navy', 'darkblue', 'mediumblue', 'blue',
         'darkslateblue', 'mediumslateblue', 'slateblue',
         'mediumpurple', 'blueviolet', 'darkviolet']

ax.bar(ridge_coefficient["Columns"], ridge_coefficient['Coefficient Estimate'], color=color)
ax.spines['bottom'].set_position('zero')  # draw the x-axis line at y = 0

# Label the figure so it can be told apart from the other models' charts
ax.set_ylabel('Coefficient Estimate')
ax.set_title('Ridge regression coefficients')

plt.show()

### (3) LASSO Regression 

In [None]:
# Fit a LASSO regression model (penalty strength alpha = 0.2) on the training split
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.2).fit(x_train, y_train)

# Predictions for the held-out validation split
lasso_y_pred = lasso.predict(x_validation)

# Model score on each split, reported as R2 (training) and Q2 (validation)
r2_training = lasso.score(x_train, y_train)
q2_validation = lasso.score(x_validation, y_validation)
print("LASSO regression R2 (training data):", f"{r2_training:.3f}")
print("LASSO regression Q2 (validation data):", f"{q2_validation:.3f}" "\n")

# Sum of squared residuals on each split, rounded to three decimals
training_residuals = lasso.predict(x_train) - y_train
sum_squared_error = round(np.sum(training_residuals ** 2), 3)
print("SSE on training data: ", sum_squared_error)

validation_residuals = lasso_y_pred - y_validation
sum_squared_error = round(np.sum(validation_residuals ** 2), 3)
print("SSE on validation data: ", sum_squared_error)

In [None]:
# Parity plot: LASSO predictions vs. true response for both splits.
# Everything is drawn on a dedicated figure/axes pair. Relying on a leftover
# global `ax` would apply set_box_aspect to an axes belonging to an earlier
# figure instead of this plot.
lasso_validation_pred = lasso.predict(x_validation)
lasso_train_pred = lasso.predict(x_train)

fig, ax = plt.subplots()
ax.plot(lasso_validation_pred, y_validation, '.', label='validation')
ax.plot(lasso_train_pred, y_train, '.', label='train')

# The diagonal spans the full range of all predicted and true values
plotted_values = np.concatenate(
    [lasso_validation_pred, y_validation, lasso_train_pred, y_train]
)
min_value, max_value = plotted_values.min(), plotted_values.max()
ax.plot([min_value, max_value], [min_value, max_value], '--', label="parity")

ax.set_xlabel('Predicted Values')
ax.set_ylabel('True Values')
ax.set_title('Parity Plot')
ax.set_box_aspect(1)  # square plot area so the parity line sits at 45 degrees
sns.despine(ax=ax)
ax.legend(loc='best')
plt.show()


In [None]:
# Tabulate each LASSO coefficient next to the name of its predictor column
lasso_coeff = pd.DataFrame({
    "Columns": x_train.columns,
    "Coefficient Estimate": lasso.coef_,
})

In [None]:
# Bar chart of the LASSO coefficients, one bar per predictor.
# The style is selected before the figure is created; calling it after the
# artists already exist cannot restyle them.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(7, 7))

color = ['midnightblue', 'navy', 'darkblue', 'mediumblue', 'blue',
         'darkslateblue', 'mediumslateblue', 'slateblue',
         'mediumpurple', 'blueviolet', 'darkviolet']

ax.bar(lasso_coeff["Columns"], lasso_coeff['Coefficient Estimate'], color=color)
ax.spines['bottom'].set_position('zero')  # draw the x-axis line at y = 0

# Label the figure so it can be told apart from the other models' charts
ax.set_ylabel('Coefficient Estimate')
ax.set_title('LASSO regression coefficients')

plt.show()

### (4) Elastic Net Regression 

In [None]:
# Fit an Elastic Net model (penalty strength alpha = 0.2) on the training split
from sklearn.linear_model import ElasticNet

e_net = ElasticNet(alpha=0.2).fit(x_train, y_train)

# Predictions for the held-out validation split
elastic_y_pred = e_net.predict(x_validation)

# Model score on each split, reported as R2 (training) and Q2 (validation)
r2_training = e_net.score(x_train, y_train)
q2_validation = e_net.score(x_validation, y_validation)
print("Elastic net regression R2 (training data):", f"{r2_training:.3f}")
print("Elastic net regression Q2 (validation data):", f"{q2_validation:.3f}" "\n")

# Sum of squared residuals on each split, rounded to three decimals
training_residuals = e_net.predict(x_train) - y_train
sum_squared_error = round(np.sum(training_residuals ** 2), 3)
print("SSE on training data: ", sum_squared_error)

validation_residuals = elastic_y_pred - y_validation
sum_squared_error = round(np.sum(validation_residuals ** 2), 3)
print("SSE on validation data: ", sum_squared_error)

In [None]:
# Parity plot: Elastic Net predictions vs. true response for both splits.
# Everything is drawn on a dedicated figure/axes pair. Relying on a leftover
# global `ax` would apply set_box_aspect to an axes belonging to an earlier
# figure instead of this plot.
e_net_validation_pred = e_net.predict(x_validation)
e_net_train_pred = e_net.predict(x_train)

fig, ax = plt.subplots()
ax.plot(e_net_validation_pred, y_validation, '.', label='validation')
ax.plot(e_net_train_pred, y_train, '.', label='train')

# The diagonal spans the full range of all predicted and true values
plotted_values = np.concatenate(
    [e_net_validation_pred, y_validation, e_net_train_pred, y_train]
)
min_value, max_value = plotted_values.min(), plotted_values.max()
ax.plot([min_value, max_value], [min_value, max_value], '--', label="parity")

ax.set_xlabel('Predicted Values')
ax.set_ylabel('True Values')
ax.set_title('Parity Plot')
ax.set_box_aspect(1)  # square plot area so the parity line sits at 45 degrees
sns.despine(ax=ax)
ax.legend(loc='best')
plt.show()


In [None]:
# Tabulate each Elastic Net coefficient next to the name of its predictor column
e_net_coeff = pd.DataFrame({
    "Columns": x_train.columns,
    "Coefficient Estimate": e_net.coef_,
})

In [None]:
# Bar chart of the Elastic Net coefficients, one bar per predictor.
# The style is selected before the figure is created; calling it after the
# artists already exist cannot restyle them.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(7, 7))

color = ['midnightblue', 'navy', 'darkblue', 'mediumblue', 'blue',
         'darkslateblue', 'mediumslateblue', 'slateblue',
         'mediumpurple', 'blueviolet', 'darkviolet']

ax.bar(e_net_coeff["Columns"], e_net_coeff['Coefficient Estimate'], color=color)
ax.spines['bottom'].set_position('zero')  # draw the x-axis line at y = 0

# Label the figure so it can be told apart from the other models' charts
ax.set_ylabel('Coefficient Estimate')
ax.set_title('Elastic Net regression coefficients')

plt.show()

### (5) Compare model architectures

In [None]:
from sklearn.model_selection import cross_val_score

# Shared settings: number of cross-validation folds and the penalty strength
# used by all three regularised models. `k` is also read by the alpha-sweep
# cells further down.
k = 5
reg_alpha = 0.2

# For each model: score it with k-fold cross-validation on the full data set and
# report the mean fold score as Q2.
# NOTE(review): an integer `cv` is passed, so the fold assignment is whatever
# scikit-learn does by default for that -- if the rows of the CSV are grouped
# (e.g. by condition), confirm that this is the intended behaviour.
lm_lreg = LinearRegression()
scores_lreg = cross_val_score(lm_lreg, X, Y, cv=k)
mean_q2_lreg = np.mean(scores_lreg)
print("MLR k-fold cross-validation Q2:", f"{mean_q2_lreg:.3f}" "\n")

lm_ridge = linear_model.Ridge(alpha=reg_alpha)
scores_ridge = cross_val_score(lm_ridge, X, Y, cv=k)
mean_q2_ridge = np.mean(scores_ridge)
print("Ridge k-fold cross-validation Q2:", f"{mean_q2_ridge:.3f}" "\n")

lm_lasso = linear_model.Lasso(alpha=reg_alpha)
scores_lasso = cross_val_score(lm_lasso, X, Y, cv=k)
mean_q2_lasso = np.mean(scores_lasso)
print("LASSO k-fold cross-validation Q2:", f"{mean_q2_lasso:.3f}" "\n")

lm_en = linear_model.ElasticNet(alpha=reg_alpha)
scores_en = cross_val_score(lm_en, X, Y, cv=k)
mean_q2_en = np.mean(scores_en)
print("Elastic Net k-fold cross-validation Q2:", f"{mean_q2_en:.3f}" "\n")



### (6) Observe impact of regularization strength 

In [None]:
# Sweep the Ridge penalty strength and report the mean k-fold score per value.
# alpha = 0 means no penalty, i.e. ordinary least squares. scikit-learn advises
# against Ridge(alpha=0) for numerical reasons and recommends LinearRegression
# for that case, so the unpenalised baseline is fitted with it.
for alpha in [0, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.2, 0.4, 0.6, 0.8, 1, 2, 4, 6, 8]:
    lm_ridge = LinearRegression() if alpha == 0 else linear_model.Ridge(alpha=alpha)
    scores_ridge = cross_val_score(lm_ridge, X, Y, cv=k)
    mean_ridge = round(np.mean(scores_ridge), 2)
    print(f"Alpha = {alpha} results in average Ridge Q2 = {mean_ridge}")

In [None]:
# Sweep the LASSO penalty strength and report the mean k-fold score per value.
# alpha = 0 means no penalty, i.e. ordinary least squares. Lasso(alpha=0) does
# not converge well (scikit-learn recommends LinearRegression instead), and the
# warning is hidden because warnings are silenced at the top of this notebook.
# The unpenalised baseline is therefore fitted with LinearRegression.
for alpha in [0, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.2, 0.4, 0.6, 0.8, 1, 2, 4, 6, 8]:
    lm_lasso = LinearRegression() if alpha == 0 else linear_model.Lasso(alpha=alpha)
    scores_lasso = cross_val_score(lm_lasso, X, Y, cv=k)
    mean_lasso = round(np.mean(scores_lasso), 2)
    print(f"Alpha = {alpha} results in average LASSO Q2 = {mean_lasso}")

In [None]:
# Sweep the Elastic Net penalty strength and report the mean k-fold score per value.
# alpha = 0 means no penalty, i.e. ordinary least squares. ElasticNet(alpha=0)
# does not converge well (scikit-learn recommends LinearRegression instead), and
# the warning is hidden because warnings are silenced at the top of this
# notebook. The unpenalised baseline is therefore fitted with LinearRegression.
for alpha in [0, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.2, 0.4, 0.6, 0.8, 1, 2, 4, 6, 8]:
    lm_en = LinearRegression() if alpha == 0 else linear_model.ElasticNet(alpha=alpha)
    scores_en = cross_val_score(lm_en, X, Y, cv=k)
    mean_en = round(np.mean(scores_en), 2)
    print(f"Alpha = {alpha} results in average EN Q2 = {mean_en}")