# Assignment 7: Regression and Regularization

In Assignment 7 we identify the principal components of a telemonitoring data set that tracks patients with Parkinson's disease.

### Preparation

Import required libraries:

In [50]:
# Import required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

Define global options:

In [21]:
# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

# Echo the value of every top-level expression in a cell rather than only the
# last one (IPython's default for ast_node_interactivity is "last_expr")
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Declare functions:

In [54]:
def scale(col):
    """Standardize values to zero mean and unit standard deviation (z-scores).

    Parameters
    ----------
    col : array-like
        Numeric values to standardize, e.g. a DataFrame column or NumPy array.

    Returns
    -------
    The z-scores ``(col - mean) / sd`` in the same container type that
    ``col - np.mean(col)`` produces. A constant input (standard deviation
    exactly zero) yields all zeros instead of the NaN values a 0/0 division
    would give.
    """
    mean_col = np.mean(col)
    # np.std defaults to ddof=0, i.e. the population standard deviation
    sd_col = np.std(col)
    centered = col - mean_col
    # A zero spread means every centered value is already exactly 0; return it
    # as-is rather than dividing by zero. Only the scalar case is guarded so
    # that inputs producing a per-column sd keep their original behaviour.
    if np.ndim(sd_col) == 0 and sd_col == 0:
        return centered
    return centered / sd_col

### Load the Data Set

We begin by loading the data set of patients with Parkinson's disease:

In [58]:
# Remote address of the Parkinson's telemonitoring file in the UCI repository
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.data'

# Fetch the file and parse it as CSV into a DataFrame
parkinsons_data = pd.read_csv(filepath_or_buffer=url)

# Show the loaded frame
parkinsons_data

Unnamed: 0,subject#,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,...,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,1,72,0,5.6431,28.199,34.398,0.00662,0.000034,0.00401,0.00317,...,0.230,0.01438,0.01309,0.01662,0.04314,0.014290,21.640,0.41888,0.54842,0.16006
1,1,72,0,12.6660,28.447,34.894,0.00300,0.000017,0.00132,0.00150,...,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.10810
2,1,72,0,19.6810,28.695,35.389,0.00481,0.000025,0.00205,0.00208,...,0.181,0.00734,0.00844,0.01458,0.02202,0.020220,23.047,0.46222,0.54405,0.21014
3,1,72,0,25.6470,28.905,35.810,0.00528,0.000027,0.00191,0.00264,...,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.48730,0.57794,0.33277
4,1,72,0,33.6420,29.187,36.375,0.00335,0.000020,0.00093,0.00130,...,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5870,42,61,0,142.7900,22.485,33.485,0.00406,0.000031,0.00167,0.00168,...,0.160,0.00973,0.01133,0.01549,0.02920,0.025137,22.369,0.64215,0.55314,0.21367
5871,42,61,0,149.8400,21.988,32.988,0.00297,0.000025,0.00119,0.00147,...,0.215,0.01052,0.01277,0.01904,0.03157,0.011927,22.886,0.52598,0.56518,0.12621
5872,42,61,0,156.8200,21.495,32.495,0.00349,0.000025,0.00152,0.00187,...,0.244,0.01371,0.01456,0.01877,0.04112,0.017701,25.065,0.47792,0.57888,0.14157
5873,42,61,0,163.7300,21.007,32.007,0.00281,0.000020,0.00128,0.00151,...,0.131,0.00693,0.00870,0.01307,0.02078,0.007984,24.422,0.56865,0.56327,0.14204


### Create the Feature Matrix

Create the feature matrix from the columns to examine. The *sex* column is kept under its original name and interpreted as an *is_female* indicator:

In [59]:
# Names of the columns that make up the feature matrix
select_cols = [
    'age', 'sex',
    'motor_UPDRS', 'total_UPDRS',
    # Jitter* columns
    'Jitter(%)', 'Jitter(Abs)', 'Jitter:RAP', 'Jitter:PPQ5', 'Jitter:DDP',
    # Shimmer* columns
    'Shimmer', 'Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'Shimmer:APQ11', 'Shimmer:DDA',
    # remaining columns
    'NHR', 'HNR', 'RPDE', 'DFA', 'PPE',
]

# Build the feature matrix from the selected columns. The 'sex' column keeps
# its original name; it is only *read* as an 'is_female' indicator.
X = parkinsons_data[select_cols]

# Show the unscaled features
X

Unnamed: 0,age,sex,motor_UPDRS,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,72,0,28.199,34.398,0.00662,0.000034,0.00401,0.00317,0.01204,0.02565,0.230,0.01438,0.01309,0.01662,0.04314,0.014290,21.640,0.41888,0.54842,0.16006
1,72,0,28.447,34.894,0.00300,0.000017,0.00132,0.00150,0.00395,0.02024,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.10810
2,72,0,28.695,35.389,0.00481,0.000025,0.00205,0.00208,0.00616,0.01675,0.181,0.00734,0.00844,0.01458,0.02202,0.020220,23.047,0.46222,0.54405,0.21014
3,72,0,28.905,35.810,0.00528,0.000027,0.00191,0.00264,0.00573,0.02309,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.48730,0.57794,0.33277
4,72,0,29.187,36.375,0.00335,0.000020,0.00093,0.00130,0.00278,0.01703,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5870,61,0,22.485,33.485,0.00406,0.000031,0.00167,0.00168,0.00500,0.01896,0.160,0.00973,0.01133,0.01549,0.02920,0.025137,22.369,0.64215,0.55314,0.21367
5871,61,0,21.988,32.988,0.00297,0.000025,0.00119,0.00147,0.00358,0.02315,0.215,0.01052,0.01277,0.01904,0.03157,0.011927,22.886,0.52598,0.56518,0.12621
5872,61,0,21.495,32.495,0.00349,0.000025,0.00152,0.00187,0.00456,0.02499,0.244,0.01371,0.01456,0.01877,0.04112,0.017701,25.065,0.47792,0.57888,0.14157
5873,61,0,21.007,32.007,0.00281,0.000020,0.00128,0.00151,0.00383,0.01484,0.131,0.00693,0.00870,0.01307,0.02078,0.007984,24.422,0.56865,0.56327,0.14204


### Scale the Feature Matrix

Scale features:

In [60]:
# Standardize every selected feature column with scale() so that features
# measured in different units can be compared directly. apply() hands scale()
# one column at a time and assembles the results into a new DataFrame, which
# avoids growing an initially empty frame column by column through .loc.
scaled_data = parkinsons_data.loc[:, select_cols].apply(scale)

# Sanity check: scaling must keep one row per observation and one column per feature
assert scaled_data.shape == (len(parkinsons_data), len(select_cols))

# Display scaled data
scaled_data

Unnamed: 0,age,sex,motor_UPDRS,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,0.815695,-0.682509,0.849197,0.502745,0.082905,-0.284242,0.327453,-0.028637,0.328505,-0.324594,-0.351642,-0.209709,-0.423356,-0.543466,-0.209704,-0.298721,-0.009205,-1.214066,-1.478500,-0.650658
1,0.815695,-0.682509,0.879706,0.549103,-0.560793,-0.756723,-0.533746,-0.476212,-0.534825,-0.534016,-0.573156,-0.545158,-0.565592,-0.529955,-0.545153,-0.351965,1.282650,-1.055119,-1.247880,-1.218585
2,0.815695,-0.682509,0.910216,0.595367,-0.238944,-0.539382,-0.300038,-0.320767,-0.298983,-0.669115,-0.564469,-0.741592,-0.702426,-0.645545,-0.741587,-0.199370,0.318711,-0.784860,-1.540139,-0.103280
3,0.815695,-0.682509,0.936051,0.634716,-0.155370,-0.485186,-0.344859,-0.170682,-0.344871,-0.423692,0.069668,-0.460540,-0.449763,-0.392849,-0.460787,-0.071754,0.644530,-0.536487,-1.062115,1.237075
4,0.815695,-0.682509,0.970743,0.687522,-0.498557,-0.663894,-0.658604,-0.529814,-0.659682,-0.658276,-0.586186,-0.783145,-0.651413,-0.464905,-0.783392,-0.343370,1.036305,-0.689195,-1.297953,-0.283954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5870,-0.431361,-0.682509,0.146246,0.417413,-0.372307,-0.358449,-0.421694,-0.427970,-0.422773,-0.583565,-0.655681,-0.561024,-0.528983,-0.600010,-0.560767,-0.116990,0.160696,0.997028,-1.411923,-0.064697
5871,-0.431361,-0.682509,0.085104,0.370962,-0.566128,-0.537436,-0.575365,-0.484252,-0.574309,-0.421369,-0.416793,-0.501338,-0.442561,-0.422372,-0.501081,-0.338311,0.281189,-0.153430,-1.242097,-1.020641
5872,-0.431361,-0.682509,0.024453,0.324884,-0.473663,-0.537158,-0.469717,-0.377049,-0.469728,-0.350143,-0.290834,-0.260328,-0.335135,-0.435882,-0.260576,-0.241573,0.789028,-0.629379,-1.048856,-0.852755
5873,-0.431361,-0.682509,-0.035582,0.279274,-0.594579,-0.658336,-0.546552,-0.473532,-0.547630,-0.743051,-0.781639,-0.772568,-0.686822,-0.721104,-0.772815,-0.404372,0.639170,0.269141,-1.269038,-0.847618


### Create and Plot the Principal Components

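Compute the covariance matrix of the scaled features, $C = \frac{1}{n} X^\top X$, where $X$ is the scaled feature (design) matrix with $n$ rows. Because every column of $X$ has been standardized, $C$ is also the correlation matrix of the original features: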

In [61]:
# Covariance of the scaled features: (1/n) * X^T X, with n the number of rows
C = (1.0 / len(scaled_data)) * np.dot(scaled_data.T, scaled_data)
np.round(C, 2)

array([[ 1.  , -0.04,  0.27,  0.31,  0.02,  0.04,  0.01,  0.01,  0.01,  0.1 ,  0.11,  0.1 ,  0.09,  0.14,  0.1 ,  0.01, -0.1 ,  0.09, -0.09,  0.12],
       [-0.04,  1.  , -0.03, -0.1 ,  0.05, -0.15,  0.08,  0.09,  0.08,  0.06,  0.06,  0.04,  0.06,  0.02,  0.04,  0.17, -0.  , -0.16, -0.17, -0.1 ],
       [ 0.27, -0.03,  1.  ,  0.95,  0.08,  0.05,  0.07,  0.08,  0.07,  0.1 ,  0.11,  0.08,  0.09,  0.14,  0.08,  0.07, -0.16,  0.13, -0.12,  0.16],
       [ 0.31, -0.1 ,  0.95,  1.  ,  0.07,  0.07,  0.06,  0.06,  0.06,  0.09,  0.1 ,  0.08,  0.08,  0.12,  0.08,  0.06, -0.16,  0.16, -0.11,  0.16],
       [ 0.02,  0.05,  0.08,  0.07,  1.  ,  0.87,  0.98,  0.97,  0.98,  0.71,  0.72,  0.66,  0.69,  0.65,  0.66,  0.83, -0.68,  0.43,  0.23,  0.72],
       [ 0.04, -0.15,  0.05,  0.07,  0.87,  1.  ,  0.84,  0.79,  0.84,  0.65,  0.66,  0.62,  0.62,  0.59,  0.62,  0.7 , -0.71,  0.55,  0.35,  0.79],
       [ 0.01,  0.08,  0.07,  0.06,  0.98,  0.84,  1.  ,  0.95,  1.  ,  0.68,  0.69,  0.65,  0.66,  0.6 , 

### Linear Regression with Singular Value Decomposition (SVD)

TODO

### Report

Report on: the number of components before and after dimensionality reduction (choose a smaller number of principal components and explain how you chose them). Interpret the adjusted R-squared.