Part 5: Write a program to perform K-fold cross-validation for the three models in #4

Resources: 
1. https://www.statology.org/k-fold-cross-validation-in-python/
2. https://www.pluralsight.com/guides/linear-lasso-ridge-regression-scikit-learn



In [9]:
#Lasso + K-fold 
from matplotlib import test
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import Lasso

#loading data, splitting data 
gas_data = pd.read_csv('/Users/maddie/Projects/CPSC_483/assignmnet_1/Data1.csv') 
target_column = ['Idx'] 
predictors = list(set(list(gas_data.columns))-set(target_column))
gas_data[predictors] = gas_data[predictors]/gas_data[predictors].max()
gas_data.describe()

X = gas_data[predictors].values
y = gas_data[target_column].values

#setting training and testing data size 
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.20, random_state=40)

#Lasso regression function 
model_lasso = Lasso(alpha=0.05)
model_lasso.fit(train_x, train_y) 
pred_train_lasso= model_lasso.predict(train_x)
print('Training RMSE: ' ,  np.sqrt(mean_squared_error(train_y,pred_train_lasso)))
print('Training R^2 score:' , r2_score(train_y, pred_train_lasso))

pred_test_lasso= model_lasso.predict(test_x)
print('Testing RMSE:' , np.sqrt(mean_squared_error(test_y,pred_test_lasso))) 
print('Test R^2 score:',r2_score(test_y, pred_test_lasso))

#k-fold analysis
cv = KFold(n_splits=10, random_state=1, shuffle=True)
model = Lasso()
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

#view mean absolute error
print ('Mean absolute score via k-fold validation: ', np.mean(np.absolute(scores)))



Training RMSE:  0.2516098043115337
Training R^2 score: 0.0
Testing RMSE: 0.25266705313781657
Test R^2 score: -0.00011179930744065203
Mean absolute score via k-fold validation:  0.2049063935895618


In [12]:
#Ridge + K-fold
from matplotlib import test
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import Ridge

#loading data, splitting data 
gas_data = pd.read_csv('/Users/maddie/Projects/CPSC_483/assignmnet_1/Data1.csv') 
target_column = ['Idx'] 
predictors = list(set(list(gas_data.columns))-set(target_column))
gas_data[predictors] = gas_data[predictors]/gas_data[predictors].max()
gas_data.describe()

X = gas_data[predictors].values
y = gas_data[target_column].values

#setting training and testing data size
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.20, random_state=40)

#Ridge regression function 
rr = Ridge(alpha=0.01)
rr.fit(train_x, train_y) 
pred_train_rr= rr.predict(train_x)
print('Training RMSE: ' , np.sqrt(mean_squared_error(train_y,pred_train_rr))) 
print('Training R^2 score:' , r2_score(train_y, pred_train_rr))

pred_test_rr= rr.predict(test_x)
print('Testing RMSE:', np.sqrt(mean_squared_error(test_y,pred_test_rr))) 
print('Test R^2 score:', r2_score(test_y, pred_test_rr))

#k-fold analysis
cv = KFold(n_splits=10, random_state=1, shuffle=True)
model = Ridge()
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

#view mean absolute error
print ('Mean absolute score via k-fold validation: ', np.mean(np.absolute(scores)))

Training RMSE:  0.1362642586779505
Training R^2 score: 0.7067022064901793
Testing RMSE: 0.13653883403073622
Test R^2 score: 0.7079456356939777
Mean absolute score via k-fold validation:  0.10946677822792561


In [13]:
#Elastic Net + K-fold
from matplotlib import test
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import ElasticNet

#loading data, splitting data 
gas_data = pd.read_csv('/Users/maddie/Projects/CPSC_483/assignmnet_1/Data1.csv') 
target_column = ['Idx'] 
predictors = list(set(list(gas_data.columns))-set(target_column))
gas_data[predictors] = gas_data[predictors]/gas_data[predictors].max()
gas_data.describe()

X = gas_data[predictors].values
y = gas_data[target_column].values

#setting training and testing data size
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.20, random_state=40)

#Elastic Net Function
model_enet = ElasticNet(alpha = 0.05)
model_enet.fit(train_x, train_y) 
pred_train_enet= model_enet.predict(train_x)
print('Training RMSE: ', np.sqrt(mean_squared_error(train_y,pred_train_enet)))
print('Training R^2 score:' , r2_score(train_y, pred_train_enet))

pred_test_enet= model_enet.predict(test_x)
print('Testing RMSE:', np.sqrt(mean_squared_error(test_y,pred_test_enet)))
print('Test R^2 score:', r2_score(test_y, pred_test_enet))

#k-fold analysis
cv = KFold(n_splits=10, random_state=1, shuffle=True)
model = ElasticNet()
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

#view mean absolute error
print ('Mean absolute score via k-fold validation: ', np.mean(np.absolute(scores)))

Training RMSE:  0.2516098043115337
Training R^2 score: 0.0
Testing RMSE: 0.25266705313781657
Test R^2 score: -0.00011179930744065203
Mean absolute score via k-fold validation:  0.2049063935895618
