# Import 

In [None]:
# Data structure libraries
import pandas as pd
import numpy as np

# ML libraries
from sklearn import svm
from sklearn.model_selection import train_test_split

# Visualization libraries
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

# Load Dataset

In [None]:
# Read the processed dataset (path is relative to the current working directory)
sleep_data = pd.read_csv(filepath_or_buffer='./processed_data.csv')

# Data Preprocessing

In [1]:
def normalize_feature(feature_to_normalize, max_val, min_val):
    """Min-max scale one feature and return the scaled values as a list.

    Each value v is mapped to (v - min_val) / (max_val - min_val).

    Args:
        feature_to_normalize: The feature values. Either a flat sequence of
            numbers or a sequence of rows (e.g. the ``.values.tolist()`` of a
            single-column DataFrame), in which case the first element of
            each row is used.
        max_val: Value that is mapped to 1.
        min_val: Value that is mapped to 0.

    Returns:
        A list with one normalized value per input entry. When max_val equals
        min_val the feature carries no spread, so every entry is mapped to
        0.0 instead of dividing by zero.
    """
    value_range = max_val - min_val

    def first_scalar(entry):
        # Rows arrive as one-element lists; bare numbers pass through as-is.
        return entry[0] if hasattr(entry, '__len__') else entry

    if value_range == 0:
        # Constant feature: avoid 0/0 (NaN) and x/0 (inf) results.
        return [0.0 for _ in feature_to_normalize]

    return [(first_scalar(entry) - min_val) / value_range
            for entry in feature_to_normalize]

In [None]:
# Discard every column whose name starts with 'Unnamed'
is_unnamed = sleep_data.columns.str.contains('^Unnamed')
sleep_data = sleep_data.loc[:, ~is_unnamed]

In [None]:
# Discard every column whose name contains 'SEQN'
has_seqn = sleep_data.columns.str.contains('SEQN')
sleep_data = sleep_data.loc[:, ~has_seqn]

In [None]:
# Relabel the columns so that the last column is the label.
# NOTE: assigning to .columns only renames the columns positionally; it does
# not reorder any data, so this list must match the column order of the frame.
column_names = ['RIAGENDR', 'RIDAGEYR', 'SLQ040', 'SLQ050', 'PAQ605', 'PAQ620', 'SMQ040',
                'ALQ130', 'DIQ010', 'BMXBMI', 'physical_activity', 'smoking', 'alcohol', 'diabetes',
                'DR1IMAGN', 'DR1IIRON', 'DR1ISFAT', 'DR1IPFAT', 'DR1IMFAT',
                'DR2IMAGN', 'DR2IIRON', 'DR2ISFAT', 'DR2IPFAT', 'DR2IMFAT', 'sleep_disorder']
# The nested list is kept deliberately so each column is handed to
# normalize_feature as a list of one-element rows (assumed from how that helper
# indexes its input - confirm before flattening this to a plain list).
sleep_data.columns = [column_names]

# Normalize data
# Column-wise maxima and minima, in the same order as column_names
maxList = sleep_data.max()
minList = sleep_data.min()

# Dictionary mapping every column name to its list of normalized values (the
# label column is scaled too). The extremes are fetched with .iloc because they
# are looked up by position; plain integer indexing on a label-indexed Series
# is deprecated in pandas.
normalized_features = {
    name: normalize_feature(sleep_data[name].values.tolist(),
                            maxList.iloc[position],
                            minList.iloc[position])
    for position, name in enumerate(column_names)
}

In [None]:
# Turn the dictionary of normalized lists into a dataframe (one column per key)
norm_df = pd.DataFrame.from_dict(data=normalized_features, orient='columns')

In [None]:
# Discard the columns that may be troublesome. With LR, these particular columns
# produced 100% accuracy whatever the regularization strength, which may mean
# they hold too little variety and map almost 1:1 onto the label prediction.
troublesome_columns = ['RIAGENDR', 'SLQ040', 'SLQ050', 'PAQ605', 'PAQ620', 'SMQ040', 'ALQ130', 'DIQ010',
                       'physical_activity', 'diabetes']
norm_df = norm_df.drop(labels=troublesome_columns, axis='columns')

# Train - Validation - Test Split

In [None]:
# In the end, we get 3 groups:

# feature_train: target features of the training set
# label_train: target label of the training set

# feature_test: target features of the testing set
# label_test: target label of the testing set

# feature_val: target features of the validation set
# label_val: target label of the validation set

In [None]:
# The final column holds the label; every column before it is a feature
labels = norm_df.iloc[:, -1:]
features = norm_df.iloc[:, :-1]

# Hold out 30% of the rows as the validation set; the rest is the training set
feature_train, feature_val, label_train, label_val = train_test_split(
    features,
    labels,
    test_size=0.3,
)

# Hyperparameter Tuning

These are ones that I know. Please choose whichever you prefer.

Hyperparameters can include the kernel type:
    1. Linear Kernel
    2. Polynomial Kernel
    3. Radial Basis Function (RBF) Kernel
    
Can also include regularization term (C) : C is inversely proportional to the regularization strength. Lower values of C create a hyperplane that separates the classes by a larger margin, tolerating more misclassified training points. Too low and we will underfit. Higher values of C create a hyperplane that separates the classes by a smaller margin, fitting the training points more strictly. Too high and we will overfit.
    
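Can also include gamma term (g) : Lower values of g give each training point a far-reaching influence, so the hyperplane is calculated from many points and the decision boundary is smooth. Too low and we will underfit. Higher values of g restrict each training point's influence to its close neighbourhood, so the hyperplane is shaped by only a few nearby points. Too high and we will overfit.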

# SVM

In [None]:
# Carve a test set (30%) out of the training data. The remainder replaces the
# training set: fitting on the original feature_train would include every test
# row, so the test score would be measured on data the model has already seen
# (data leakage).
# NOTE: this cell rebinds feature_train / label_train, so re-run the
# train/validation split before executing it again.
feature_train_dump, feature_test, label_train_dump, label_test = train_test_split(feature_train, label_train, test_size=0.3)
feature_train, label_train = feature_train_dump, label_train_dump

This has helped me in the past: https://www.datacamp.com/tutorial/svm-classification-scikit-learn-python

# Interpret Results

# Driver (to run entire experiment over k iterations, if needed)