# Data Mining II Final Project (LOOCV Version)

# 1. In this project, I look to look at the effect health measures (depression, happiness, physical health, and health) plays towards the out-of-sample predictive accuracy of several models.
# 2. Furthermore, I use kernel PCA on the four health measures as a 3rd option-- in addition to the other two models.

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

# Creating Train/Test Sets for All Measures, No Health Measures, and PCA

In [2]:
combined_data = pd.read_csv("combined_data3.csv")
combined_data = combined_data.iloc[:, 1:]

In [3]:
from sklearn.model_selection import train_test_split

#Extract the features and target variable
X = combined_data.drop(columns=['Vote16'])
y = combined_data['Vote16']

np.random.seed(1337)

#Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1337)

#For the non-health measure classifications
X_train_cut = X_train[["Marital", "Age", "Educ", "Sex", "Race", "Income", "WrkGovt", "Party"]]
X_cut = X[["Marital", "Age", "Educ", "Sex", "Race", "Income", "WrkGovt", "Party"]]

In [4]:
import pandas as pd
from sklearn.decomposition import KernelPCA
import matplotlib.pyplot as plt

pca_variables = combined_data[['Depression', 'Happy', 'Physhlth', 'Health']]

kpca = KernelPCA(kernel = 'rbf', gamma = 0.1)
transformed_data = kpca.fit_transform(pca_variables)

#Create transformed DF from PCA
pca_variables = pd.DataFrame(transformed_data)

pca_variables = pd.concat([combined_data[["Vote16", "Marital", "Age", "Educ", "Sex", "Race", "Income", "WrkGovt", "Party"]], pca_variables], axis=1)
X_pca = pca_variables.drop(columns = ['Vote16'])
y_pca = pca_variables['Vote16']

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y_pca, test_size=0.10, random_state=1337)

# SVM Classification

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

np.random.seed(1337)

#All Measures

#Define parameter grid for tuning
param_grid = {'C': [10**i for i in range(-1, 3)], 'gamma': [0.1, 1, 5]}

#Instantiate SVM classifier
svm = SVC(kernel = 'rbf')

#Perform grid search cross-validation
grid_search = GridSearchCV(svm, param_grid, cv = 20)
grid_search.fit(X_train, y_train)

#Select Best Model Found
best_model = grid_search.best_estimator_

#Define cross-validation method to use
cv = LeaveOneOut()

#use LOOCV to evaluate model
scores = cross_val_score(best_model, X, y, scoring='accuracy',
                         cv=cv, n_jobs=-1)

accuracy = scores.mean()

#Print accuracy
print("All Measures Accuracy:", accuracy)

All Measures Accuracy: 0.7453151618398637


In [6]:
#Without Health Measures 

np.random.seed(1337)

#Define parameter grid for tuning
param_grid = {'C': [10**i for i in range(-1, 3)], 'gamma': [0.1, 1, 5]}

#Instantiate SVM classifier
svm = SVC(kernel = 'rbf')

#Perform grid search cross-validation
grid_search = GridSearchCV(svm, param_grid, cv = 20)
grid_search.fit(X_train_cut, y_train)

#Select Best Model Found
best_model = grid_search.best_estimator_

#Define cross-validation method to use
cv = LeaveOneOut()

#use LOOCV to evaluate model
scores = cross_val_score(best_model, X_cut, y, scoring='accuracy',
                         cv=cv, n_jobs=-1)

accuracy = scores.mean()

#Print accuracy
print("No Health Measures Accuracy:", accuracy)

No Health Measures Accuracy: 0.7534071550255537


In [7]:
#All Measures With RBF Kernel PCA

np.random.seed(1337)

#Define parameter grid for tuning
param_grid = {'C': [10**i for i in range(-1, 3)], 'gamma': [0.1, 1, 5]}

#Instantiate SVM classifier
svm = SVC(kernel = 'rbf')

#Perform grid search cross-validation
grid_search = GridSearchCV(svm, param_grid, cv = 20)
grid_search.fit(X_train_pca, y_train_pca)

#Select Best Model Found
best_model = grid_search.best_estimator_

#Define cross-validation method to use
cv = LeaveOneOut()

#use LOOCV to evaluate model
scores = cross_val_score(best_model, X_pca, y_pca, scoring='accuracy',
                         cv=cv, n_jobs=-1)

accuracy = scores.mean()

#Print accuracy
print("PCA Measure Accuracy:", accuracy)

PCA Measure Accuracy: 0.7512776831345827


# XGBoost

In [8]:
import xgboost as xgb

np.random.seed(1337)

#With All Measures

#Set Search Params
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300]
}

#Instantiate XGB
xgb = xgb.XGBClassifier()

#Perform grid search
grid_search = GridSearchCV(estimator = xgb, param_grid = param_grid, cv = 20, scoring = 'accuracy')
grid_search.fit(X_train, y_train)

#Select Best Model Found
best_model = grid_search.best_estimator_

#Define cross-validation method to use
cv = LeaveOneOut()

#use LOOCV to evaluate model
scores = cross_val_score(best_model, X, y, scoring='accuracy',
                         cv=cv, n_jobs=-1)

accuracy = scores.mean()

#Print accuracy
print("All Measures Accuracy:", accuracy)

All Measures Accuracy: 0.7632027257240205


In [9]:

np.random.seed(1337)

#Without Health Measures

#Set Search Params
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300]
}

#Instantiate XGB
xgb = xgb.XGBClassifier()

#Perform grid search
grid_search = GridSearchCV(estimator = xgb, param_grid = param_grid, cv = 20, scoring = 'accuracy')
grid_search.fit(X_train_cut, y_train)

#Select Best Model Found
best_model = grid_search.best_estimator_

#Define cross-validation method to use
cv = LeaveOneOut()

#use LOOCV to evaluate model
scores = cross_val_score(best_model, X_cut, y, scoring='accuracy',
                         cv=cv, n_jobs=-1)

accuracy = scores.mean()

#Print accuracy
print("No Health Measures Accuracy:", accuracy)

No Health Measures Accuracy: 0.762350936967632


In [10]:

np.random.seed(1337)

#All Measures with Linear Kernel PCA

#Set Search Params
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300]
}

#Instantiate XGB
xgb = xgb.XGBClassifier()

#Perform grid search
grid_search = GridSearchCV(estimator = xgb, param_grid = param_grid, cv = 20, scoring = 'accuracy')
grid_search.fit(X_train_pca, y_train_pca)

#Select Best Model Found
best_model = grid_search.best_estimator_

#Define cross-validation method to use
cv = LeaveOneOut()

#use LOOCV to evaluate model
scores = cross_val_score(best_model, X_pca, y_pca, scoring='accuracy',
                         cv=cv, n_jobs=-1)

accuracy = scores.mean()

#Print accuracy
print("PCA Measure Accuracy:", accuracy)

PCA Measure Accuracy: 0.7546848381601363


# Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(1337)

#All Measures

#Search Params
param_grid = {
    'max_depth': [3, 4, 5],
    'n_estimators': [100, 200, 300]
}

#Instantiate Random Forest
rf = RandomForestClassifier()

#Perform grid search
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 20, scoring = 'accuracy')
grid_search.fit(X_train, y_train)

#Select Best Model Found
best_model = grid_search.best_estimator_

#Define cross-validation method to use
cv = LeaveOneOut()

#use LOOCV to evaluate model
scores = cross_val_score(best_model, X, y, scoring='accuracy',
                         cv=cv, n_jobs=-1)

accuracy = scores.mean()

#Print accuracy
print("All Measures Accuracy:", accuracy)

All Measures Accuracy: 0.7453151618398637


In [12]:
np.random.seed(1337)

#No Health Measures

#Search Params
param_grid = {
    'max_depth': [3, 4, 5],
    'n_estimators': [100, 200, 300]
}

#Instantiate Random Forest
rf = RandomForestClassifier()

#Perform grid search
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 20, scoring = 'accuracy')
grid_search.fit(X_train_cut, y_train)

#Select Best Model Found
best_model = grid_search.best_estimator_

#Define cross-validation method to use
cv = LeaveOneOut()

#use LOOCV to evaluate model
scores = cross_val_score(best_model, X_cut, y, scoring='accuracy',
                         cv=cv, n_jobs=-1)

accuracy = scores.mean()

#Print accuracy
print("No Health Measures Accuracy:", accuracy)

No Health Measures Accuracy: 0.7487223168654173


In [13]:
np.random.seed(1337)

#All Measures with Linear Kernel PCA

#Search Params
param_grid = {
    'max_depth': [3, 4, 5],
    'n_estimators': [100, 200, 300]
}

#Instantiate Random Forest
rf = RandomForestClassifier()

#Perform grid search
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 20, scoring = 'accuracy')
grid_search.fit(X_train_pca, y_train_pca)

#Select Best Model Found
best_model = grid_search.best_estimator_

#Define cross-validation method to use
cv = LeaveOneOut()

#use LOOCV to evaluate model
scores = cross_val_score(best_model, X_pca, y_pca, scoring='accuracy',
                         cv=cv, n_jobs=-1)

accuracy = scores.mean()

#Print accuracy
print("PCA Measure Accuracy:", accuracy)

PCA Measure Accuracy: 0.6631175468483816


# Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

np.random.seed(1337)

#All Measures

#Search Params
param_grid = {
    'C': [0.1, 1.0, 10.0],  
    'penalty': ['l1', 'l2']
}

#Instantiate Logistic Regression
lr = LogisticRegression(max_iter = 1000)

#Perform grid search
grid_search = GridSearchCV(estimator = lr, param_grid = param_grid, cv = 20, scoring = 'accuracy')
grid_search.fit(X_train, y_train)

#Select Best Model Found
best_model = grid_search.best_estimator_

#Define cross-validation method to use
cv = LeaveOneOut()

#use LOOCV to evaluate model
scores = cross_val_score(best_model, X, y, scoring='accuracy',
                         cv=cv, n_jobs=-1)

accuracy = scores.mean()

#Print accuracy
print("All Measures Accuracy:", accuracy)

All Measures Accuracy: 0.7372231686541738


In [15]:
np.random.seed(1337)

#No Health Measures

#Search Params
param_grid = {
    'C': [0.1, 1.0, 10.0],  
    'penalty': ['l1', 'l2']
}

#Instantiate Logistic Regression
lr = LogisticRegression(max_iter = 1000)

#Perform grid search
grid_search = GridSearchCV(estimator = lr, param_grid = param_grid, cv = 20, scoring = 'accuracy')
grid_search.fit(X_train_cut, y_train)

#Select Best Model Found
best_model = grid_search.best_estimator_

#Define cross-validation method to use
cv = LeaveOneOut()

#use LOOCV to evaluate model
scores = cross_val_score(best_model, X_cut, y, scoring='accuracy',
                         cv=cv, n_jobs=-1)

accuracy = scores.mean()

#Print accuracy
print("No Health Measures Accuracy:", accuracy)

No Health Measures Accuracy: 0.7389267461669506


In [16]:
np.random.seed(1337)

#All Measures with Linear Kernel PCA

#Search Params
param_grid = {
    'C': [0.1, 1.0, 10.0],  
    'penalty': ['l1', 'l2']
}

#Instantiate Logistic Regression
lr = LogisticRegression(max_iter = 1000)

#Perform grid search
grid_search = GridSearchCV(estimator = lr, param_grid = param_grid, cv = 20, scoring = 'accuracy')
grid_search.fit(X_train_pca, y_train_pca)

#Select Best Model Found
best_model = grid_search.best_estimator_

#Define cross-validation method to use
cv = LeaveOneOut()

#use LOOCV to evaluate model
scores = cross_val_score(best_model, X_pca, y_pca, scoring='accuracy',
                         cv=cv, n_jobs=-1)

accuracy = scores.mean()

#Print accuracy
print("PCA Measures Accuracy:", accuracy)

PCA Measures Accuracy: 0.7367972742759795
