## Load Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Load Data

In [2]:
train = pd.read_csv('train_dataset.csv')
test = pd.read_csv('test_dataset.csv')

## Pre-Process data

In [3]:
train["Condition"] = train.CONDITION.map({"H":0, "D":1})

test["Condition"] = test.CONDITION.map({"H":0, "D":1})
train.drop("CONDITION", axis = 1,inplace = True)
test.drop("CONDITION", axis=1, inplace = True)

In [4]:
X_train = train.iloc[:,0:12]
y_train = train.iloc[:,12]
X_test= test.iloc[:,0:12]
y_test = test.iloc[:,12]

In [7]:
scaler = StandardScaler().fit(X_train)

In [8]:
X_train_scaled = scaler.transform(X_train)
print(X_train_scaled.mean(axis=0))

[-3.20786788e-17  6.41573576e-17 -6.41573576e-17  1.28314715e-16
  8.01966970e-17  8.01966970e-17 -1.28314715e-16 -1.60393394e-16
 -1.28314715e-16 -1.60393394e-16  0.00000000e+00 -1.44354055e-16]


In [9]:
scaler = StandardScaler().fit(X_test)

In [10]:
X_test_scaled = scaler.transform(X_test)

## Logistic Regression 

In [None]:
model_logreg = LogisticRegression() 
model_logreg.fit(X_train,y_train)


In [None]:
y_pred = model_logreg.predict(X_test)

In [None]:
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(model_logreg.score(X_test, y_test)))

## SVM 

In [37]:
#import SVC
from sklearn.svm import SVC
#import for Cross-Validation
from sklearn.model_selection import GridSearchCV

# parameters for linear SVM
parameters = {'C': [1, 10, 100]}

#run linear SVM
linear_SVM = SVC(kernel='linear')

#find best model uusing 5-fold CV 
#and train it using all the training data
best_of_kernels = []  ## in order to compare and select the best svm
# ADD CODE
###
clf = GridSearchCV(linear_SVM, parameters, cv = 5)
clf.fit(X_train_scaled, y = y_train)
GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10, 100], 'kernel': ['linear']})
###
best_of_kernels.append(clf)
print ('RESULTS FOR LINEAR KERNEL\n')
print(clf.cv_results_)  # is added

print("Best parameters set found:")
print(clf.best_params_)# ADD CODE

print("Score with best parameters:")
print(clf.best_score_)# ADD CODE

print("\nAll scores on the grid:")
# ADD CODE
all_scores=clf.cv_results_['mean_test_score']
print(all_scores)

# parameters for poly with degree 2 kernel
parameters = {'C': [1, 10, 100],'gamma':[0.01,0.1,1.]}

#run SVM with poly of degree 2 kernel
poly2_SVM = SVC(kernel='poly',degree=2)

# ADD CODE: DO THE SAME AS ABOVE FOR POLYNOMIAL KERNEL WITH DEGREE=2
###
clf = GridSearchCV(poly2_SVM, parameters, cv = 5)
clf.fit(X_train_scaled, y = y_train)
GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10, 100], 'kernel': ['poly'], 'degree': ['2']})
###
best_of_kernels.append(clf)
print ('\nRESULTS FOR POLY DEGREE=2 KERNEL\n')
print(clf.cv_results_)  # is added
print("Best parameters set found:")
print(clf.best_params_)# ADD CODE

print("Score with best parameters:")
print(clf.best_score_)# ADD CODE

print("\nAll scores on the grid:")
# ADD CODE
all_scores=clf.cv_results_['mean_test_score']
print(all_scores)
# parameters for rbf SVM
parameters = {'C': [1, 10, 100],'gamma':[0.01,0.1,1.]}

#run SVM with rbf kernel
rbf_SVM = SVC(kernel='rbf')
# ADD CODE: DO THE SAME AS ABOVE FOR RBF KERNEL
###
clf = GridSearchCV(rbf_SVM, parameters, cv = 5)
clf.fit(X_train_scaled, y = y_train)
GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10, 100], 'kernel': ['rbf'], 'gamma':[0.01,0.1,1.]})
###
best_of_kernels.append(clf)
print ('\nRESULTS FOR rbf KERNEL\n')
print(clf.cv_results_)  # is added
print("Best parameters set found:")
print(clf.best_params_)# ADD CODE

print("Score with best parameters:")
print(clf.best_score_)# ADD CODE

print("\nAll scores on the grid:")
# ADD CODE
all_scores=clf.cv_results_['mean_test_score']
print(all_scores)

RESULTS FOR LINEAR KERNEL

{'mean_fit_time': array([0.01265635, 0.04628639, 0.3438652 ]), 'std_fit_time': array([0.00228965, 0.00566234, 0.03685078]), 'mean_score_time': array([0.00111084, 0.00074863, 0.00082507]), 'std_score_time': array([1.87829746e-04, 8.56762397e-05, 7.65084589e-05]), 'param_C': masked_array(data=[1, 10, 100],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 1}, {'C': 10}, {'C': 100}], 'split0_test_score': array([0.74157303, 0.74719101, 0.74719101]), 'split1_test_score': array([0.76271186, 0.75706215, 0.75706215]), 'split2_test_score': array([0.72881356, 0.72881356, 0.72881356]), 'split3_test_score': array([0.72316384, 0.72316384, 0.72316384]), 'split4_test_score': array([0.82485876, 0.82485876, 0.82485876]), 'mean_test_score': array([0.75622421, 0.75621786, 0.75621786]), 'std_test_score': array([0.03691011, 0.03643442, 0.03643442]), 'rank_test_score': array([1, 2, 2], dtype=int32)}
Best parameters set foun

## TO DO 2
For the "best" SVM kernel and choice of parameters from above, train the model on the entire training set and measure the training error. Also make predictions on the test set and measure the test error. Print the training and the test error.

In [38]:
scores_of_best_kernels = [x.best_score_ for x in best_of_kernels]
index_of_best_svm = scores_of_best_kernels.index(max(scores_of_best_kernels))

In [39]:
#get training and test error for the best SVM model from CV
best_SVM = best_of_kernels[index_of_best_svm]# ADD CODE

# fit the model on the entire training set
# ADD CODE
best_SVM.fit(X_train_scaled, y_train)
prediction_svm = best_SVM.predict(X_test_scaled)
#get the training and test error
training_error = 1. - best_SVM.score(X_train_scaled,y_train)
test_error = 1. - best_SVM.score(X_test_scaled,y_test)

print ("Best SVM training error: %f" % training_error)
print ("Best SVM test error: %f" % test_error)

Best SVM training error: 0.136569
Best SVM test error: 0.130000


## Neural Networks 

In [19]:
ID = 2014431# COMPLETE
np.random.seed(ID)

In [35]:
#MLPclassifier requires in input the parameter hidden_layer_sizes, that is a tuple specifying the number of 
#neurons in the hidden layers; for example: (10,) means that there is only 1 hidden layer with 10 neurons; 
#(10,50) means that there are 2 hidden layers, the first with 10 neurons, the second with 50 neurons
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

#these are examples of possible architectures you can test, but feel free to use different architectures! 
hl_parameters = {'hidden_layer_sizes': [(10,), (50,),(100,), (10,10,)]}

mlp_cv = MLPClassifier(max_iter=300, alpha=1e-4, solver='sgd', tol=1e-4, learning_rate_init=.1,random_state=ID) #ADD YOUR CODE

clf = GridSearchCV(estimator=mlp_cv,param_grid=hl_parameters,cv=5)
clf.fit(X_train_scaled, y = y_train)
print ('RESULTS FOR NN\n')
print(clf.cv_results_)  # is added

print("Best parameters set found:")
print(clf.best_params_)#ADD YOUR CODE

print("Score with best parameters:")
print(clf.best_score_)#ADD YOUR CODE

print("\nAll scores on the grid:")
all_scores=clf.cv_results_['mean_test_score']
print(all_scores)#ADD YOUR CODE



RESULTS FOR NN

{'mean_fit_time': array([0.19818602, 0.38441582, 0.45415421, 0.2426095 ]), 'std_fit_time': array([0.04025968, 0.04414265, 0.0553612 , 0.01362541]), 'mean_score_time': array([0.00041838, 0.00047469, 0.00044127, 0.00028362]), 'std_score_time': array([1.45207164e-04, 1.63621444e-04, 1.19543028e-04, 3.82838179e-06]), 'param_hidden_layer_sizes': masked_array(data=[(10,), (50,), (100,), (10, 10)],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'hidden_layer_sizes': (10,)}, {'hidden_layer_sizes': (50,)}, {'hidden_layer_sizes': (100,)}, {'hidden_layer_sizes': (10, 10)}], 'split0_test_score': array([0.7247191 , 0.7247191 , 0.74157303, 0.73033708]), 'split1_test_score': array([0.78531073, 0.76836158, 0.7740113 , 0.76836158]), 'split2_test_score': array([0.74576271, 0.74576271, 0.76271186, 0.77966102]), 'split3_test_score': array([0.79096045, 0.81355932, 0.7740113 , 0.75141243]), 'split4_test_score': array([0.81355932,

In [36]:
#get training and test error for the best NN model from CV

mlp = MLPClassifier(hidden_layer_sizes = clf.best_params_['hidden_layer_sizes'], verbose=True, max_iter=300, alpha=1e-4, solver='sgd', tol=1e-4, learning_rate_init=.1, random_state=ID) #ADD YOUR CODE
clf = mlp.fit(X_train_scaled, y_train)
prediction_five_hundred = clf.predict(X_test_scaled)

training_error = 1. - clf.score(X_train_scaled, y_train)#ADD YOUR CODE

test_error = 1. - clf.score(X_test_scaled, y_test)#ADD YOUR CODE

print ('\nRESULTS FOR BEST NN\n')

print ("Best NN training error: %f" % training_error)
print ("Best NN test error: %f" % test_error)

Iteration 1, loss = 0.65559278
Iteration 2, loss = 0.53204306
Iteration 3, loss = 0.49089578
Iteration 4, loss = 0.47301493
Iteration 5, loss = 0.46468492
Iteration 6, loss = 0.44770081
Iteration 7, loss = 0.43966800
Iteration 8, loss = 0.43032922
Iteration 9, loss = 0.42331258
Iteration 10, loss = 0.41846399
Iteration 11, loss = 0.41359081
Iteration 12, loss = 0.40716853
Iteration 13, loss = 0.40510785
Iteration 14, loss = 0.40099481
Iteration 15, loss = 0.39753083
Iteration 16, loss = 0.39210447
Iteration 17, loss = 0.39122692
Iteration 18, loss = 0.38648561
Iteration 19, loss = 0.37895406
Iteration 20, loss = 0.37461914
Iteration 21, loss = 0.37115012
Iteration 22, loss = 0.36755299
Iteration 23, loss = 0.36395344
Iteration 24, loss = 0.36368359
Iteration 25, loss = 0.35912774
Iteration 26, loss = 0.35512296
Iteration 27, loss = 0.35675328
Iteration 28, loss = 0.34859674
Iteration 29, loss = 0.34685162
Iteration 30, loss = 0.34921406
Iteration 31, loss = 0.34764687
Iteration 32, los