# Biol 359  |  Cross-Validation
### Spring 2021, Week 9

<hr style="border:2px solid gray">


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns 
import sklearn as sk
import urllib.request
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

# Global plot styling for the notebook: default figure size and a white grid
# with a thick black axes border.
sns.set(rc={'figure.figsize':(11.7,8.27)}) 
sns.set_style("whitegrid",  {'axes.linewidth': 2, 'axes.edgecolor':'black'})

from sklearn.datasets import load_breast_cancer
# NOTE:
# `breast_raw.data`: Stores the raw data (breast feature data)
# `breast_raw.feature_names`: Stores the raw data feature labels
# `breast_raw.target`: Stores the tumor type. scikit-learn documents the
#     encoding as 0 = 'malignant', 1 = 'benign'; confirm by printing
#     `breast_raw.target_names`.
# `breast_raw.target_names`: Stores the tumor type labels ('malignant' or 'benign')
# `breast_raw.DESCR`: Description of the data
breast_raw = load_breast_cancer()

# Uncomment the following line to print a description of the data
# print(breast_raw.DESCR)

In [None]:
# Feature data set: wrap the raw measurement matrix in a DataFrame with one
# named column per feature, then preview the first rows.
features = pd.DataFrame(
    data=breast_raw.data,
    columns=breast_raw.feature_names,
)
features.head()

In [None]:
# Tumor label data set: a single-column DataFrame named 'tumor'.
# scikit-learn documents the integer encoding as 0 = 'malignant',
# 1 = 'benign' (the label names are stored in `breast_raw.target_names`).
tumor = pd.DataFrame({'tumor': breast_raw.target})
tumor.head()

In [None]:
# Concatenate the features and the tumor labels side by side into one data frame.
# To look at only the feature columns: breast.loc[:, breast.columns != 'tumor'].head()
# To look at only the label column:    breast.loc[:, breast.columns == 'tumor'].head()
breast = pd.concat(objs=[features, tumor], axis='columns')

# Summary statistics for every feature column
features.describe()

In [None]:
def polynomial_feature_example(x, y, regularization = None, reg_alpha=1, degrees=6):
    """
    Fit a polynomial regression of `y` on `x` (optionally regularized) and plot it.

    The single explanatory variable is expanded into polynomial terms
    x, x^2, ..., x^degrees, both the expanded features and the response are
    z-scored, and a linear model is fitted in the scaled space. The fitted
    curve is evaluated on 150 evenly spaced points between min(x) and max(x),
    converted back to the original units of `y`, and plotted with the
    unscaled polynomial equation as the title.

    Parameters
    ----------
    x : array-like
        Values of the explanatory variable. Flattened to one column, so any
        1-D sequence (NumPy array, pandas Series, list) works.
    y : array-like
        Values of the response variable, same length as `x`.
    regularization : {None, 'L1', 'L2'}
        None for ordinary least squares, 'L1' for LASSO, 'L2' for Ridge.
    reg_alpha : float
        Regularization strength passed as `alpha` to Lasso / Ridge.
        Ignored when `regularization` is None.
    degrees : int
        Highest polynomial degree to generate.

    Raises
    ------
    ValueError
        If `regularization` is not None, 'L1' or 'L2'.

    Notes
    -----
    Plotting is delegated to `plot_model`, which is not defined in the visible
    part of this notebook and must already exist when this function is called.
    """
    # Imported locally so the function does not depend on an earlier cell
    # having brought these scikit-learn names into scope.
    from sklearn import linear_model
    from sklearn.preprocessing import PolynomialFeatures, StandardScaler

    # Fail early with a clear message instead of a NameError further down.
    if regularization not in (None, 'L1', 'L2'):
        raise ValueError(
            "regularization must be None, 'L1' or 'L2', got {!r}".format(regularization)
        )

    # Column vectors; np.asarray lets pandas Series and lists through as well.
    x_col = np.asarray(x).reshape(-1, 1)
    y_col = np.asarray(y).reshape(-1, 1)

    poly_transform = PolynomialFeatures(degree=degrees, include_bias=False)
    x_poly = poly_transform.fit_transform(x_col)

    # Regularization techniques need to be scaled in order to work properly
    x_scaler = StandardScaler().fit(x_poly)
    y_scaler = StandardScaler().fit(y_col)
    x_poly_z = x_scaler.transform(x_poly)
    y_z = y_scaler.transform(y_col)

    # Code to perform the model fitting and parameter estimation.
    # Strings are compared with ==, not `is`: identity of equal strings is an
    # interpreter implementation detail.
    if regularization is None:
        # Least Squares problem
        plt.suptitle('Linear Regression', fontsize=20, fontweight='bold')
        lm_poly = linear_model.LinearRegression(fit_intercept=True)
    elif regularization == 'L1':
        # LASSO problem (max_iter must be an integer)
        plt.suptitle('LASSO', fontsize=20, fontweight='bold')
        lm_poly = linear_model.Lasso(alpha=reg_alpha, max_iter=10**8, fit_intercept=True)
    else:
        # Ridge problem (max_iter must be an integer)
        plt.suptitle('Ridge', fontsize=20, fontweight='bold')
        lm_poly = linear_model.Ridge(alpha=reg_alpha, max_iter=10**5, fit_intercept=True)
    lm_poly.fit(x_poly_z, y_z)

    # Evaluate the fitted curve on an evenly spaced grid. The polynomial
    # expansion was already fitted above, so only transform here.
    x_model = np.linspace(x_col.min(), x_col.max(), 150).reshape(-1, 1)
    x_model_transform = poly_transform.transform(x_model)
    x_model_transform_z = x_scaler.transform(x_model_transform)

    # Undo the z-scoring of the response to get predictions in original units.
    y_model = lm_poly.predict(x_model_transform_z)*y_scaler.scale_ + y_scaler.mean_

    #********************************************************************************
    # Coefficients from scaled model can be transformed back into original units
    # This code is outside the scope of this class and can be ignored.

    unscaled_coefficients = (lm_poly.coef_ * y_scaler.scale_ / x_scaler.scale_).flatten()

    # One LaTeX term per non-zero coefficient; the exponent is the 1-based
    # position of the coefficient in the polynomial expansion.
    poly_terms = [r'$({0:.3f})x ^ {{{1}}}$'.format(coef, i+1)
                  for i, coef in enumerate(unscaled_coefficients)
                  if coef != 0]

    unscaled_intercept = lm_poly.intercept_*y_scaler.scale_ + y_scaler.mean_ \
                            - np.sum(unscaled_coefficients*x_scaler.mean_)

    intercept_str = r'${0:.1f} + $'.format(unscaled_intercept[0])
    title =  intercept_str + r'$+$'.join(poly_terms)
    #********************************************************************************

    # Plot the data that was actually passed in, not the notebook globals.
    plot_model(x, y, x_model, y_model, title=title)
    

#### Define response variable.

In [None]:
# TODO: assign the response variable (the quantity to be predicted).
# It is passed as `y` to polynomial_feature_example, which calls
# `.reshape(-1, 1)` on it, so use a 1-D NumPy array
# (e.g. `breast['<column name>'].to_numpy()`).
y_data = 

#### Define explanatory variables.

In [None]:
# TODO: assign the explanatory variable.
# It is passed as `x` to polynomial_feature_example, which calls
# `.reshape(-1, 1)` on it, so use a 1-D NumPy array
# (e.g. `breast['<column name>'].to_numpy()`).
x_data = 

#### Define training and validation data set.

#### SLR. Identify a single variable to predict outcomes, calculate R2, calculate Q2.

#### MLR with Ordinary Least Squares optimization, calculate R2, calculate Q2.

In [None]:
# Ordinary least squares (no regularization) with a degree-1 polynomial,
# i.e. a straight-line fit of y_data on x_data.
polynomial_feature_example(
    x=x_data,
    y=y_data,
    degrees=1,
)

#### MLR with LASSO regularization, calculate R2, calculate Q2.

In [None]:
# LASSO (L1) fit with regularization strength 0.01; `degrees` is left at the
# function default of 6.
polynomial_feature_example(
    x=x_data,
    y=y_data,
    regularization='L1',
    reg_alpha=0.01,
)

#### MLR with Ridge regularization, calculate R2, calculate Q2.

In [None]:
# Ridge (L2) fit with regularization strength 0.01; `degrees` is left at the
# function default of 6.
polynomial_feature_example(
    x=x_data,
    y=y_data,
    regularization='L2',
    reg_alpha=0.01,
)

#### MLR with Elastic Net regularization, calculate R2, calculate Q2.