# Biol 359  |  Cross-Validation
### Spring 2021, Week 9

<hr style="border:2px solid gray">


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Load diabetes dataset
# The returned object's .data, .target and .feature_names attributes are what the
# plotting and DataFrame cells further down read from.
diabetes_dataset = datasets.load_diabetes()
# Print the description that ships with the dataset, for reference.
print(diabetes_dataset.DESCR)

In [None]:
# Draw one scatter panel per feature: feature value (x) against the target (y)
plt.style.use('ggplot')
fig = plt.figure(figsize=(20, 20))

# Iterating over the transposed data matrix yields one feature column at a time,
# which is paired with the matching feature name.
feature_columns = zip(diabetes_dataset.feature_names, diabetes_dataset.data.T)
for index, (feature_name, feature_values) in enumerate(feature_columns):
    # Panels are placed on a fixed 4x4 grid; subplot positions are 1-based
    ax = fig.add_subplot(4, 4, index + 1)
    ax.scatter(feature_values, diabetes_dataset.target)
    ax.set_xlabel(feature_name, size=16)
    ax.set_ylabel('progression after 1yr', size=16)

plt.show()

In [None]:
# Assemble the feature matrix and the target into a single pandas DataFrame
diabetes_pd = pd.DataFrame(
    diabetes_dataset.data,
    columns=diabetes_dataset.feature_names,
)
diabetes_pd_target = np.asarray(diabetes_dataset.target)
diabetes_pd['progression'] = pd.Series(diabetes_pd_target)

# Model inputs: the nine columns listed here ('sex' is not among them).
# To select every column except the last one instead, use:
# X = diabetes_pd.iloc[:, :-1]
predictor_names = ['age', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
X = diabetes_pd.loc[:, predictor_names]

# Model output: the last column, i.e. the 'progression' column appended above
Y = diabetes_pd.iloc[:, -1]

# print(diabetes_pd.head())

print("Input data:")
print(X)

print("")
print("Output data:")
print(Y)

In [None]:
# Split data into training and validation set.
# 20% of the rows are held out for validation. The shuffle is seeded so that a
# "Restart Kernel -> Run All" always produces the same partition, and therefore
# the same scores, coefficients and plots in every cell that uses these splits.
SPLIT_SEED = 42
x_train, x_validation, y_train, y_validation = train_test_split(
    X, Y, test_size=0.20, random_state=SPLIT_SEED
)

print("Train data shape of X = % s and Y = % s : "%(x_train.shape, y_train.shape))
print("Validation data shape of X = % s and Y = % s : "%(x_validation.shape, y_validation.shape))

In [None]:
# Train multiple linear regression model on the training split
lreg = LinearRegression()
lreg.fit(x_train, y_train)

# Predict validation data
lreg_y_pred = lreg.predict(x_validation)

# .score() is reported here as the coefficient of determination, once on the data
# the model was fitted to and once on the held-out validation data.
print("Linear regression coefficient of determination of prediction for training data")
print(f"{lreg.score(x_train, y_train):.3f}")

print("Linear regression coefficient of determination of prediction for validation data")
print(f"{lreg.score(x_validation, y_validation):.3f}")

# Mean squared error of the validation predictions. The label says "validation"
# because this notebook only creates a train/validation split, not a test set.
# NOTE: the variable is a plain float; the name is kept for any cell that reads it.
mean_squared_error = np.mean((lreg_y_pred - y_validation)**2)
print("Mean squared Error on validation set : ", mean_squared_error)
 

In [None]:
# Predicted vs. measured progression for the linear regression model, drawn
# separately for the validation and training splits.
fig, ax = plt.subplots()
ax.plot(lreg_y_pred, y_validation, '.', label='validation')
ax.plot(lreg.predict(x_train), y_train, '.', label='train')

# Same fixed range on both axes so the two scales can be compared directly
ax.set_xlim([0, 400])
ax.set_ylim([0, 400])

ax.set_xlabel('predicted progression')
ax.set_ylabel('measured progression')
ax.set_title('Linear regression: predicted vs. measured')
ax.legend()
plt.show()

In [None]:
# Pair each input column name with the coefficient the linear model fitted for it
lreg_coefficient = pd.DataFrame({
    "Columns": x_train.columns,
    'Coefficient Estimate': lreg.coef_,
})
lreg_coefficient

In [None]:
# Plot the signed coefficient of every input feature for the linear regression model
# Select the style before the figure exists so it is in effect when the axes are created.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize =(7, 7))
color =['midnightblue', 'navy', 'darkblue', 'mediumblue', 'blue', 
        'darkslateblue','mediumslateblue', 'slateblue', 
        'mediumpurple', 'blueviolet', 'darkviolet']

ax.bar(lreg_coefficient["Columns"], lreg_coefficient['Coefficient Estimate'], color = color)

# Anchor the bottom spine at y = 0 so positive and negative bars extend from it
ax.spines['bottom'].set_position('zero')

# Label the chart so it can be told apart from the other models' coefficient plots
ax.set_title('Linear regression coefficients')
ax.set_ylabel('Coefficient Estimate')

plt.show()

In [None]:
# Train a Ridge regression model on the training split
from sklearn.linear_model import Ridge

ridgeR = Ridge(alpha = 1)
ridgeR.fit(x_train, y_train)

# Predict validation data
y_pred = ridgeR.predict(x_validation)

# R^2: .score() on the data the model was fitted to
ridge_r2 = ridgeR.score(x_train, y_train)
print(f"R^2 = {ridge_r2:.3f}")

# Q^2: the same statistic on the held-out validation data
ridge_q2 = ridgeR.score(x_validation, y_validation)
print(f"Q^2 = {ridge_q2:.3f}")

# Mean squared error of the validation predictions (this notebook has no test split)
mean_squared_error_ridge = np.mean((y_pred - y_validation)**2)
print("Mean squared error on validation set", mean_squared_error_ridge)


In [None]:
# Pair each input column name with the coefficient the Ridge model fitted for it
ridge_coefficient = pd.DataFrame({
    "Columns": x_train.columns,
    'Coefficient Estimate': ridgeR.coef_,
})
ridge_coefficient

In [None]:
# Plot the signed coefficient of every input feature for the Ridge model
# Select the style before the figure exists so it is in effect when the axes are created.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize =(7, 7))
color =['midnightblue', 'navy', 'darkblue', 'mediumblue', 'blue', 
        'darkslateblue','mediumslateblue', 'slateblue', 
        'mediumpurple', 'blueviolet', 'darkviolet']

ax.bar(ridge_coefficient["Columns"], ridge_coefficient['Coefficient Estimate'], color = color)

# Anchor the bottom spine at y = 0 so positive and negative bars extend from it
ax.spines['bottom'].set_position('zero')

# Label the chart so it can be told apart from the other models' coefficient plots
ax.set_title('Ridge regression coefficients')
ax.set_ylabel('Coefficient Estimate')

plt.show()

In [None]:
# Train a LASSO regression model on the training split
from sklearn.linear_model import Lasso

lasso = Lasso(alpha = 1)
lasso.fit(x_train, y_train)

# Predict validation data
y_pred1 = lasso.predict(x_validation)

# R^2: .score() on the data the model was fitted to
lasso_r2 = lasso.score(x_train, y_train)
print(f"R^2 = {lasso_r2:.3f}")

# Q^2: the same statistic on the held-out validation data
lasso_q2 = lasso.score(x_validation, y_validation)
print(f"Q^2 = {lasso_q2:.3f}")

# Mean squared error of the validation predictions (this notebook has no test split).
# Stored under its own name so the linear-regression value is not overwritten.
mean_squared_error_lasso = np.mean((y_pred1 - y_validation)**2)
print("Mean squared error on validation set", mean_squared_error_lasso)


In [None]:
# Pair each input column name with the coefficient the LASSO model fitted for it
lasso_coeff = pd.DataFrame({
    "Columns": x_train.columns,
    'Coefficient Estimate': lasso.coef_,
})

lasso_coeff

In [None]:
# Plot the signed coefficient of every input feature for the LASSO model
# Select the style before the figure exists so it is in effect when the axes are created.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize =(7, 7))
color =['midnightblue', 'navy', 'darkblue', 'mediumblue', 'blue', 
        'darkslateblue','mediumslateblue', 'slateblue', 
        'mediumpurple', 'blueviolet', 'darkviolet']

ax.bar(lasso_coeff["Columns"], lasso_coeff['Coefficient Estimate'], color = color)

# Anchor the bottom spine at y = 0 so positive and negative bars extend from it
ax.spines['bottom'].set_position('zero')

# Label the chart so it can be told apart from the other models' coefficient plots
ax.set_title('LASSO regression coefficients')
ax.set_ylabel('Coefficient Estimate')

plt.show()

In [None]:
# Train an Elastic Net model on the training split
from sklearn.linear_model import ElasticNet

e_net = ElasticNet(alpha = 1)
e_net.fit(x_train, y_train)

# Predict validation data
y_pred_elastic = e_net.predict(x_validation)

# R^2: .score() on the data the model was fitted to
e_net_r2 = e_net.score(x_train, y_train)
print(f"R^2 = {e_net_r2:.3f}")

# Q^2: the same statistic on the held-out validation data
e_net_q2 = e_net.score(x_validation, y_validation)
print(f"Q^2 = {e_net_q2:.3f}")

# Mean squared error of the validation predictions (this notebook has no test split).
# Stored under its own name so the linear-regression value is not overwritten.
mean_squared_error_elastic = np.mean((y_pred_elastic - y_validation)**2)
print("Mean Squared Error on validation set", mean_squared_error_elastic)
 


In [None]:
# Pair each input column name with the coefficient the Elastic Net model fitted for it
e_net_coeff = pd.DataFrame({
    "Columns": x_train.columns,
    'Coefficient Estimate': e_net.coef_,
})
e_net_coeff

In [None]:
# Plot the signed coefficient of every input feature for the Elastic Net model
# Select the style before the figure exists so it is in effect when the axes are created.
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize =(7, 7))
color =['midnightblue', 'navy', 'darkblue', 'mediumblue', 'blue', 
        'darkslateblue','mediumslateblue', 'slateblue', 
        'mediumpurple', 'blueviolet', 'darkviolet']

ax.bar(e_net_coeff["Columns"], e_net_coeff['Coefficient Estimate'], color = color)

# Anchor the bottom spine at y = 0 so positive and negative bars extend from it
ax.spines['bottom'].set_position('zero')

# Label the chart so it can be told apart from the other models' coefficient plots
ax.set_title('Elastic Net coefficients')
ax.set_ylabel('Coefficient Estimate')

plt.show()