# Iris Data Set

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC



## Data preprocessing

In [2]:

# Load the Iris data and turn the task into a binary one: the label is True
# for target values above 1.5 (i.e. the highest class, assuming integer
# targets 0/1/2 -- verify against load_iris) and False otherwise.
iris = datasets.load_iris()
X = iris.data
# Keep the label as a single column so it can be glued onto the features.
Y = (iris.target > 1.5)[:, np.newaxis]
# Features and label live in one array so a row shuffle keeps them aligned;
# the label ends up as the last column.
X_and_Y = np.concatenate([X, Y], axis=1)

## 80/20 Partition

In [3]:
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

# Iris 80/20 experiment: three random splits. On each split KNN, a linear SVM
# and a random forest are tuned by grid search on the training part, refit with
# the winning hyper-parameter, and scored on both train and test parts.
# The lists hold one accuracy per round and are averaged by the next cell.
KNN_test_acc = []
KNN_train_acc = []
SVM_test_acc = []
SVM_train_acc = []
random_test_acc = []
random_train_acc = []

for i in range(3):
    # Shuffle rows in place so each round sees a different random split.
    np.random.shuffle(X_and_Y)
    X = X_and_Y[:, 0:-1]
    Y = X_and_Y[:, -1]
    # First 80% of the shuffled rows train the models; the rest is held out.
    n_train = int(0.8*len(X))
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = Y[:n_train], Y[n_train:]

    print(str(i+1) +" Round")

    # --- KNN: pick n_neighbors by 10-fold CV on the training split ---
    k_range = list(range(1, 31))
    param_grid = dict(n_neighbors=k_range)
    knn = KNeighborsClassifier()
    clf_knn = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    clf_knn.fit(X_train, y_train)
    # Refit on the full training split with the selected k.
    optimal_knn = KNeighborsClassifier(n_neighbors=clf_knn.best_params_['n_neighbors'])
    optimal_knn.fit(X_train, y_train)
    test_pred = optimal_knn.predict(X_test)
    train_pred = optimal_knn.predict(X_train)
    # Accuracy = fraction of predictions equal to the true label.
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("KNN train accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(train_acc))
    print("KNN test accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(test_acc))
    KNN_test_acc.append(test_acc)
    KNN_train_acc.append(train_acc)

    # --- Linear SVM: pick C with GridSearchCV's default CV ---
    classifier = svm.SVC(kernel = 'linear')
    C_list     = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1] # Different C to try.
    parameters = {'C': C_list}
    # return_train_score expects a boolean, not the string 'true'.
    clf_svm = GridSearchCV(classifier, parameters, return_train_score = True)
    clf_svm.fit(X_train, y_train)
    optimal_svm = svm.SVC(kernel = 'linear', C = clf_svm.best_params_['C'] )
    optimal_svm.fit(X_train, y_train)
    test_pred = optimal_svm.predict(X_test)
    train_pred = optimal_svm.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("SVM train accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(train_acc))
    print("SVM test accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(test_acc))
    SVM_test_acc.append(test_acc)
    SVM_train_acc.append(train_acc)

    # --- Random forest: pick n_estimators by 3-fold CV ---
    forest = RandomForestClassifier()
    n_list = [1,25,50,100,150]
    parameters_random = {'n_estimators' : n_list}
    clf_random = GridSearchCV(forest, parameters_random, cv=3)
    clf_random.fit(X_train, y_train)
    # Build a forest with the selected size and fit *that* model, so the
    # reported n_estimators matches the estimator being scored.
    optimal_random = RandomForestClassifier(n_estimators = clf_random.best_params_['n_estimators'])
    optimal_random.fit(X_train, y_train)
    test_pred = optimal_random.predict(X_test)
    train_pred = optimal_random.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("random train accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(train_acc))
    print("random test accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(test_acc))
    random_test_acc.append(test_acc)
    random_train_acc.append(train_acc)

1 Round
KNN train accuracy with 7 is: 0.966666666667
KNN test accuracy with 7 is: 0.966666666667
SVM train accuracy with 0.1 is: 0.966666666667
SVM test accuracy with 0.1 is: 0.9
random train accuracy with 25 is: 1.0
random test accuracy with 25 is: 0.966666666667
2 Round
KNN train accuracy with 8 is: 0.975
KNN test accuracy with 8 is: 0.866666666667
SVM train accuracy with 0.1 is: 0.966666666667
SVM test accuracy with 0.1 is: 0.9
random train accuracy with 1 is: 1.0
random test accuracy with 1 is: 0.933333333333
3 Round
KNN train accuracy with 10 is: 0.966666666667
KNN test accuracy with 10 is: 1.0
SVM train accuracy with 0.1 is: 0.958333333333
SVM test accuracy with 0.1 is: 1.0
random train accuracy with 1 is: 1.0
random test accuracy with 1 is: 0.966666666667


In [4]:
# Print the mean of the per-round accuracies gathered by the 80/20 loop.
for heading, scores in (
    ("Average KNN Classifier 80/20 train accuracy:", KNN_train_acc),
    ("Average KNN Classifier 80/20 test accuracy:", KNN_test_acc),
    ("Average SVM Classifier 80/20 train accuracy:", SVM_train_acc),
    ("Average SVM Classifier 80/20 test accuracy:", SVM_test_acc),
    ("Average Random Forest Classifier 80/20 train accuracy:", random_train_acc),
    ("Average Random Forest Classifier 80/20 test accuracy:", random_test_acc),
):
    print(heading + str(sum(scores)/len(scores)))

Average KNN Classifier 80/20 train accuracy:0.969444444444
Average KNN Classifier 80/20 test accuracy:0.944444444444
Average SVM Classifier 80/20 train accuracy:0.963888888889
Average SVM Classifier 80/20 test accuracy:0.933333333333
Average Random Forest Classifier 80/20 train accuracy:1.0
Average Random Forest Classifier 80/20 test accuracy:0.955555555556


## 50/50 Partition

In [5]:
# Iris 50/50 experiment: three random splits. On each split KNN, a linear SVM
# and a random forest are tuned by grid search on the training half, refit with
# the winning hyper-parameter, and scored on both halves.
# The lists hold one accuracy per round and are averaged by the next cell.
KNN_test_acc = []
KNN_train_acc = []
SVM_test_acc = []
SVM_train_acc = []
random_test_acc = []
random_train_acc = []

for i in range(3):
    # Shuffle rows in place so each round sees a different random split.
    np.random.shuffle(X_and_Y)
    X = X_and_Y[:, 0:-1]
    Y = X_and_Y[:, -1]
    # First half of the shuffled rows trains the models; the rest is held out.
    n_train = int(0.5*len(X))
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = Y[:n_train], Y[n_train:]

    print(str(i+1) +" Round")

    # --- KNN: pick n_neighbors by 10-fold CV on the training split ---
    k_range = list(range(1, 31))
    param_grid = dict(n_neighbors=k_range)
    knn = KNeighborsClassifier()
    clf_knn = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    clf_knn.fit(X_train, y_train)
    # Refit on the full training split with the selected k.
    optimal_knn = KNeighborsClassifier(n_neighbors=clf_knn.best_params_['n_neighbors'])
    optimal_knn.fit(X_train, y_train)
    test_pred = optimal_knn.predict(X_test)
    train_pred = optimal_knn.predict(X_train)
    # Accuracy = fraction of predictions equal to the true label.
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("KNN train accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(train_acc))
    print("KNN test accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(test_acc))
    KNN_test_acc.append(test_acc)
    KNN_train_acc.append(train_acc)

    # --- Linear SVM: pick C with GridSearchCV's default CV ---
    classifier = svm.SVC(kernel = 'linear')
    C_list     = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1] # Different C to try.
    parameters = {'C': C_list}
    # return_train_score expects a boolean, not the string 'true'.
    clf_svm = GridSearchCV(classifier, parameters, return_train_score = True)
    clf_svm.fit(X_train, y_train)
    optimal_svm = svm.SVC(kernel = 'linear', C = clf_svm.best_params_['C'] )
    optimal_svm.fit(X_train, y_train)
    test_pred = optimal_svm.predict(X_test)
    train_pred = optimal_svm.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("SVM train accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(train_acc))
    print("SVM test accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(test_acc))
    SVM_test_acc.append(test_acc)
    SVM_train_acc.append(train_acc)

    # --- Random forest: pick n_estimators by 3-fold CV ---
    forest = RandomForestClassifier()
    n_list = [1,25,50,100,150]
    parameters_random = {'n_estimators' : n_list}
    clf_random = GridSearchCV(forest, parameters_random, cv=3)
    clf_random.fit(X_train, y_train)
    # Build a forest with the selected size and fit *that* model, so the
    # reported n_estimators matches the estimator being scored.
    optimal_random = RandomForestClassifier(n_estimators = clf_random.best_params_['n_estimators'])
    optimal_random.fit(X_train, y_train)
    test_pred = optimal_random.predict(X_test)
    train_pred = optimal_random.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("random train accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(train_acc))
    print("random test accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(test_acc))
    random_test_acc.append(test_acc)
    random_train_acc.append(train_acc)

1 Round
KNN train accuracy with 1 is: 1.0
KNN test accuracy with 1 is: 0.946666666667
SVM train accuracy with 0.1 is: 0.946666666667
SVM test accuracy with 0.1 is: 0.96
random train accuracy with 25 is: 1.0
random test accuracy with 25 is: 0.92
2 Round
KNN train accuracy with 16 is: 0.96
KNN test accuracy with 16 is: 0.933333333333
SVM train accuracy with 0.1 is: 0.96
SVM test accuracy with 0.1 is: 0.986666666667
random train accuracy with 25 is: 1.0
random test accuracy with 25 is: 0.906666666667
3 Round
KNN train accuracy with 1 is: 1.0
KNN test accuracy with 1 is: 0.973333333333
SVM train accuracy with 0.1 is: 0.933333333333
SVM test accuracy with 0.1 is: 0.946666666667
random train accuracy with 25 is: 0.986666666667
random test accuracy with 25 is: 0.946666666667


In [6]:
# Print the mean of the per-round accuracies gathered by the 50/50 loop.
for heading, scores in (
    ("Average KNN Classifier 50/50 train accuracy:", KNN_train_acc),
    ("Average KNN Classifier 50/50 test accuracy:", KNN_test_acc),
    ("Average SVM Classifier 50/50 train accuracy:", SVM_train_acc),
    ("Average SVM Classifier 50/50 test accuracy:", SVM_test_acc),
    ("Average Random Forest Classifier 50/50 train accuracy:", random_train_acc),
    ("Average Random Forest Classifier 50/50 test accuracy:", random_test_acc),
):
    print(heading + str(sum(scores)/len(scores)))

Average KNN Classifier 50/50 train accuracy:0.986666666667
Average KNN Classifier 50/50 test accuracy:0.951111111111
Average SVM Classifier 50/50 train accuracy:0.946666666667
Average SVM Classifier 50/50 test accuracy:0.964444444444
Average Random Forest Classifier 50/50 train accuracy:0.995555555556
Average Random Forest Classifier 50/50 test accuracy:0.924444444444


## 20/80 Partition

In [7]:
# Iris 20/80 experiment: three random splits with only 20% of the rows used
# for training. On each split KNN, a linear SVM and a random forest are tuned
# by grid search on the training part, refit with the winning hyper-parameter,
# and scored on both parts.
# The lists hold one accuracy per round and are averaged by the next cell.
KNN_test_acc = []
KNN_train_acc = []
SVM_test_acc = []
SVM_train_acc = []
random_test_acc = []
random_train_acc = []

for i in range(3):
    # Shuffle rows in place so each round sees a different random split.
    np.random.shuffle(X_and_Y)
    X = X_and_Y[:, 0:-1]
    Y = X_and_Y[:, -1]
    # First 20% of the shuffled rows trains the models; the rest is held out.
    n_train = int(0.2*len(X))
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = Y[:n_train], Y[n_train:]

    print(str(i+1) +" Round")

    # --- KNN: smaller k range and 3-fold CV than the larger training splits ---
    k_range = list(range(1, 10))
    param_grid = dict(n_neighbors=k_range)
    knn = KNeighborsClassifier()
    clf_knn = GridSearchCV(knn, param_grid, cv=3, scoring='accuracy')
    clf_knn.fit(X_train, y_train)
    # Refit on the full training split with the selected k.
    optimal_knn = KNeighborsClassifier(n_neighbors=clf_knn.best_params_['n_neighbors'])
    optimal_knn.fit(X_train, y_train)
    test_pred = optimal_knn.predict(X_test)
    train_pred = optimal_knn.predict(X_train)
    # Accuracy = fraction of predictions equal to the true label.
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("KNN train accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(train_acc))
    print("KNN test accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(test_acc))
    KNN_test_acc.append(test_acc)
    KNN_train_acc.append(train_acc)

    # --- Linear SVM: pick C with GridSearchCV's default CV ---
    classifier = svm.SVC(kernel = 'linear')
    C_list     = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1] # Different C to try.
    parameters = {'C': C_list}
    # return_train_score expects a boolean, not the string 'true'.
    clf_svm = GridSearchCV(classifier, parameters, return_train_score = True)
    clf_svm.fit(X_train, y_train)
    optimal_svm = svm.SVC(kernel = 'linear', C = clf_svm.best_params_['C'] )
    optimal_svm.fit(X_train, y_train)
    test_pred = optimal_svm.predict(X_test)
    train_pred = optimal_svm.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("SVM train accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(train_acc))
    print("SVM test accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(test_acc))
    SVM_test_acc.append(test_acc)
    SVM_train_acc.append(train_acc)

    # --- Random forest: pick n_estimators by 3-fold CV ---
    forest = RandomForestClassifier()
    n_list = [1,25,50,100,150]
    parameters_random = {'n_estimators' : n_list}
    clf_random = GridSearchCV(forest, parameters_random, cv=3)
    clf_random.fit(X_train, y_train)
    # Build a forest with the selected size and fit *that* model, so the
    # reported n_estimators matches the estimator being scored.
    optimal_random = RandomForestClassifier(n_estimators = clf_random.best_params_['n_estimators'])
    optimal_random.fit(X_train, y_train)
    test_pred = optimal_random.predict(X_test)
    train_pred = optimal_random.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("random train accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(train_acc))
    print("random test accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(test_acc))
    random_test_acc.append(test_acc)
    random_train_acc.append(train_acc)

1 Round
KNN train accuracy with 1 is: 1.0
KNN test accuracy with 1 is: 0.958333333333
SVM train accuracy with 0.1 is: 1.0
SVM test accuracy with 0.1 is: 0.95
random train accuracy with 25 is: 1.0
random test accuracy with 25 is: 0.941666666667
2 Round
KNN train accuracy with 1 is: 1.0
KNN test accuracy with 1 is: 0.85
SVM train accuracy with 0.1 is: 0.966666666667
SVM test accuracy with 0.1 is: 0.833333333333
random train accuracy with 1 is: 1.0
random test accuracy with 1 is: 0.875
3 Round
KNN train accuracy with 1 is: 1.0
KNN test accuracy with 1 is: 0.958333333333
SVM train accuracy with 1e-05 is: 0.7
SVM test accuracy with 1e-05 is: 0.658333333333
random train accuracy with 25 is: 1.0
random test accuracy with 25 is: 0.941666666667


In [8]:
# Print the mean of the per-round accuracies gathered by the 20/80 loop.
for heading, scores in (
    ("Average KNN Classifier 20/80 train accuracy:", KNN_train_acc),
    ("Average KNN Classifier 20/80 test accuracy:", KNN_test_acc),
    ("Average SVM Classifier 20/80 train accuracy:", SVM_train_acc),
    ("Average SVM Classifier 20/80 test accuracy:", SVM_test_acc),
    ("Average Random Forest Classifier 20/80 train accuracy:", random_train_acc),
    ("Average Random Forest Classifier 20/80 test accuracy:", random_test_acc),
):
    print(heading + str(sum(scores)/len(scores)))

Average KNN Classifier 20/80 train accuracy:1.0
Average KNN Classifier 20/80 test accuracy:0.922222222222
Average SVM Classifier 20/80 train accuracy:0.888888888889
Average SVM Classifier 20/80 test accuracy:0.813888888889
Average Random Forest Classifier 20/80 train accuracy:1.0
Average Random Forest Classifier 20/80 test accuracy:0.919444444444


# Adult Data Set

In [9]:
import pandas as pd
# Load the Adult CSV. The regex separator swallows whitespace around commas
# (hence the python engine) and '?' entries are parsed as missing values.
X_and_Y = pd.read_csv('adult.csv', sep=r'\s*,\s*',engine = 'python', na_values = '?')
# dropna() returns a new frame, so the result must be assigned back for the
# incomplete rows to actually be removed.
X_and_Y = X_and_Y.dropna()
# One-hot encode the non-numeric columns; drop_first=True removes one dummy
# per category to avoid redundant columns.
X_Y_chart = pd.get_dummies(X_and_Y, drop_first=True)
# Downstream cells treat the last column of this array as the label
# (presumably the income indicator -- verify against adult.csv).
X_and_Y = X_Y_chart.values


## 80/20

In [10]:
# Adult 80/20 experiment: three rounds, each on a random 2000-row subsample.
# On each round KNN, a linear SVM and a random forest are tuned by grid search
# on the training part, refit with the winning hyper-parameter, and scored on
# both parts. The lists hold one accuracy per round for the summary cell.
KNN_test_acc = []
KNN_train_acc = []
SVM_test_acc = []
SVM_train_acc = []
random_test_acc = []
random_train_acc = []

for i in range(3):
    # Shuffle the full data set in place, then work on its first 2000 rows.
    # The subsample is kept in a local name so X_and_Y is not permanently
    # truncated: later rounds and cells still draw from all rows, and the
    # cell can be re-run safely.
    np.random.shuffle(X_and_Y)
    sample = X_and_Y[0:2000]
    X = sample[:, 0:-1]
    Y = sample[:, -1]
    # First 80% of the subsample trains the models; the rest is held out.
    n_train = int(0.8*len(X))
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = Y[:n_train], Y[n_train:]

    print(str(i+1) +" Round")

    # --- KNN: pick n_neighbors by 3-fold CV on the training split ---
    k_range = list(range(1, 10))
    param_grid = dict(n_neighbors=k_range)
    knn = KNeighborsClassifier()
    clf_knn = GridSearchCV(knn, param_grid, cv=3, scoring='accuracy')
    clf_knn.fit(X_train, y_train)
    # Refit on the full training split with the selected k.
    optimal_knn = KNeighborsClassifier(n_neighbors=clf_knn.best_params_['n_neighbors'])
    optimal_knn.fit(X_train, y_train)
    test_pred = optimal_knn.predict(X_test)
    train_pred = optimal_knn.predict(X_train)
    # Accuracy = fraction of predictions equal to the true label.
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("KNN train accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(train_acc))
    print("KNN test accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(test_acc))
    KNN_test_acc.append(test_acc)
    KNN_train_acc.append(train_acc)

    # --- Linear SVM (LinearSVC): pick C by 3-fold CV ---
    # NOTE(review): features are not scaled here; the erratic SVM accuracies
    # in earlier runs suggest LinearSVC may not be converging -- confirm.
    classifier = svm.LinearSVC()
    C_list     = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1] # Different C to try.
    parameters = {'C': C_list}
    clf_svm = GridSearchCV(classifier, parameters, cv=3 )
    clf_svm.fit(X_train, y_train)
    optimal_svm = svm.LinearSVC(C = clf_svm.best_params_['C'] )
    optimal_svm.fit(X_train, y_train)
    test_pred = optimal_svm.predict(X_test)
    train_pred = optimal_svm.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("SVM train accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(train_acc))
    print("SVM test accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(test_acc))
    SVM_test_acc.append(test_acc)
    SVM_train_acc.append(train_acc)

    # --- Random forest: pick n_estimators by 3-fold CV ---
    forest = RandomForestClassifier()
    n_list = [1,25,50,100,150]
    parameters_random = {'n_estimators' : n_list}
    clf_random = GridSearchCV(forest, parameters_random, cv=3)
    clf_random.fit(X_train, y_train)
    # Build a forest with the selected size and fit *that* model, so the
    # reported n_estimators matches the estimator being scored.
    optimal_random = RandomForestClassifier(n_estimators = clf_random.best_params_['n_estimators'])
    optimal_random.fit(X_train, y_train)
    test_pred = optimal_random.predict(X_test)
    train_pred = optimal_random.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("random train accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(train_acc))
    print("random test accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(test_acc))
    random_test_acc.append(test_acc)
    random_train_acc.append(train_acc)

1 Round
KNN train accuracy with 6 is: 0.7925
KNN test accuracy with 6 is: 0.8125
SVM train accuracy with 0.1 is: 0.24875
SVM test accuracy with 0.1 is: 0.2025
random train accuracy with 150 is: 0.986875
random test accuracy with 150 is: 0.8575
2 Round
KNN train accuracy with 4 is: 0.81875
KNN test accuracy with 4 is: 0.7825
SVM train accuracy with 1e-05 is: 0.238125
SVM test accuracy with 1e-05 is: 0.225
random train accuracy with 25 is: 0.9875
random test accuracy with 25 is: 0.86
3 Round
KNN train accuracy with 6 is: 0.8025
KNN test accuracy with 6 is: 0.7975
SVM train accuracy with 1e-05 is: 0.78375
SVM test accuracy with 1e-05 is: 0.81
random train accuracy with 50 is: 0.989375
random test accuracy with 50 is: 0.83


In [11]:
# Print the mean of the per-round accuracies gathered by the 80/20 loop.
for heading, scores in (
    ("Average KNN Classifier 80/20 train accuracy:", KNN_train_acc),
    ("Average KNN Classifier 80/20 test accuracy:", KNN_test_acc),
    ("Average SVM Classifier 80/20 train accuracy:", SVM_train_acc),
    ("Average SVM Classifier 80/20 test accuracy:", SVM_test_acc),
    ("Average Random Forest Classifier 80/20 train accuracy:", random_train_acc),
    ("Average Random Forest Classifier 80/20 test accuracy:", random_test_acc),
):
    print(heading + str(sum(scores)/len(scores)))

Average KNN Classifier 80/20 train accuracy:0.804583333333
Average KNN Classifier 80/20 test accuracy:0.7975
Average SVM Classifier 80/20 train accuracy:0.423541666667
Average SVM Classifier 80/20 test accuracy:0.4125
Average Random Forest Classifier 80/20 train accuracy:0.987916666667
Average Random Forest Classifier 80/20 test accuracy:0.849166666667


## 50/50

In [12]:
# Adult 50/50 experiment: three rounds, each on a random 2000-row subsample.
# On each round KNN, a linear SVM and a random forest are tuned by grid search
# on the training half, refit with the winning hyper-parameter, and scored on
# both halves. The lists hold one accuracy per round for the summary cell.
KNN_test_acc = []
KNN_train_acc = []
SVM_test_acc = []
SVM_train_acc = []
random_test_acc = []
random_train_acc = []

for i in range(3):
    # Shuffle the full data set in place, then work on its first 2000 rows.
    # The subsample is kept in a local name so X_and_Y is not permanently
    # truncated: later rounds and cells still draw from all rows, and the
    # cell can be re-run safely.
    np.random.shuffle(X_and_Y)
    sample = X_and_Y[0:2000]
    X = sample[:, 0:-1]
    Y = sample[:, -1]
    # First half of the subsample trains the models; the rest is held out.
    n_train = int(0.5*len(X))
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = Y[:n_train], Y[n_train:]

    print(str(i+1) +" Round")

    # --- KNN: pick n_neighbors by 10-fold CV on the training split ---
    k_range = list(range(1, 31))
    param_grid = dict(n_neighbors=k_range)
    knn = KNeighborsClassifier()
    clf_knn = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    clf_knn.fit(X_train, y_train)
    # Refit on the full training split with the selected k.
    optimal_knn = KNeighborsClassifier(n_neighbors=clf_knn.best_params_['n_neighbors'])
    optimal_knn.fit(X_train, y_train)
    test_pred = optimal_knn.predict(X_test)
    train_pred = optimal_knn.predict(X_train)
    # Accuracy = fraction of predictions equal to the true label.
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("KNN train accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(train_acc))
    print("KNN test accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(test_acc))
    KNN_test_acc.append(test_acc)
    KNN_train_acc.append(train_acc)

    # --- Linear SVM (LinearSVC): pick C with GridSearchCV's default CV ---
    # NOTE(review): features are not scaled here; the erratic SVM accuracies
    # in earlier runs suggest LinearSVC may not be converging -- confirm.
    classifier = svm.LinearSVC()
    C_list     = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1] # Different C to try.
    parameters = {'C': C_list}
    # return_train_score expects a boolean, not the string 'true'.
    clf_svm = GridSearchCV(classifier, parameters, return_train_score = True)
    clf_svm.fit(X_train, y_train)
    optimal_svm = svm.LinearSVC(C = clf_svm.best_params_['C'] )
    optimal_svm.fit(X_train, y_train)
    test_pred = optimal_svm.predict(X_test)
    train_pred = optimal_svm.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("SVM train accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(train_acc))
    print("SVM test accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(test_acc))
    SVM_test_acc.append(test_acc)
    SVM_train_acc.append(train_acc)

    # --- Random forest: pick n_estimators by 3-fold CV ---
    forest = RandomForestClassifier()
    n_list = [1,25,50,100,150]
    parameters_random = {'n_estimators' : n_list}
    clf_random = GridSearchCV(forest, parameters_random, cv=3)
    clf_random.fit(X_train, y_train)
    # Build a forest with the selected size and fit *that* model, so the
    # reported n_estimators matches the estimator being scored.
    optimal_random = RandomForestClassifier(n_estimators = clf_random.best_params_['n_estimators'])
    optimal_random.fit(X_train, y_train)
    test_pred = optimal_random.predict(X_test)
    train_pred = optimal_random.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("random train accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(train_acc))
    print("random test accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(test_acc))
    random_test_acc.append(test_acc)
    random_train_acc.append(train_acc)

1 Round
KNN train accuracy with 8 is: 0.798
KNN test accuracy with 8 is: 0.775
SVM train accuracy with 0.0001 is: 0.224
SVM test accuracy with 0.0001 is: 0.247
random train accuracy with 150 is: 0.989
random test accuracy with 150 is: 0.838
2 Round
KNN train accuracy with 8 is: 0.796
KNN test accuracy with 8 is: 0.783
SVM train accuracy with 0.01 is: 0.232
SVM test accuracy with 0.01 is: 0.24
random train accuracy with 100 is: 0.989
random test accuracy with 100 is: 0.837
3 Round
KNN train accuracy with 6 is: 0.797
KNN test accuracy with 6 is: 0.781
SVM train accuracy with 0.001 is: 0.793
SVM test accuracy with 0.001 is: 0.785
random train accuracy with 25 is: 0.989
random test accuracy with 25 is: 0.835


In [13]:
# Print the mean of the per-round accuracies gathered by the 50/50 loop.
for heading, scores in (
    ("Average KNN Classifier 50/50 train accuracy:", KNN_train_acc),
    ("Average KNN Classifier 50/50 test accuracy:", KNN_test_acc),
    ("Average SVM Classifier 50/50 train accuracy:", SVM_train_acc),
    ("Average SVM Classifier 50/50 test accuracy:", SVM_test_acc),
    ("Average Random Forest Classifier 50/50 train accuracy:", random_train_acc),
    ("Average Random Forest Classifier 50/50 test accuracy:", random_test_acc),
):
    print(heading + str(sum(scores)/len(scores)))

Average KNN Classifier 50/50 train accuracy:0.797
Average KNN Classifier 50/50 test accuracy:0.779666666667
Average SVM Classifier 50/50 train accuracy:0.416333333333
Average SVM Classifier 50/50 test accuracy:0.424
Average Random Forest Classifier 50/50 train accuracy:0.989
Average Random Forest Classifier 50/50 test accuracy:0.836666666667


## 20/80

In [14]:
# Adult 20/80 experiment: three rounds, each on a random 2000-row subsample
# with only 20% of it used for training. On each round KNN, a linear SVM and a
# random forest are tuned by grid search on the training part, refit with the
# winning hyper-parameter, and scored on both parts.
# The lists hold one accuracy per round for the summary cell.
KNN_test_acc = []
KNN_train_acc = []
SVM_test_acc = []
SVM_train_acc = []
random_test_acc = []
random_train_acc = []

for i in range(3):
    # Shuffle the full data set in place, then work on its first 2000 rows.
    # The subsample is kept in a local name so X_and_Y is not permanently
    # truncated: later rounds and cells still draw from all rows, and the
    # cell can be re-run safely.
    np.random.shuffle(X_and_Y)
    sample = X_and_Y[0:2000]
    X = sample[:, 0:-1]
    Y = sample[:, -1]
    # First 20% of the subsample trains the models; the rest is held out.
    n_train = int(0.2*len(X))
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = Y[:n_train], Y[n_train:]

    print(str(i+1) +" Round")

    # --- KNN: pick n_neighbors by 3-fold CV on the training split ---
    k_range = list(range(1, 10))
    param_grid = dict(n_neighbors=k_range)
    knn = KNeighborsClassifier()
    clf_knn = GridSearchCV(knn, param_grid, cv=3, scoring='accuracy')
    clf_knn.fit(X_train, y_train)
    # Refit on the full training split with the selected k.
    optimal_knn = KNeighborsClassifier(n_neighbors=clf_knn.best_params_['n_neighbors'])
    optimal_knn.fit(X_train, y_train)
    test_pred = optimal_knn.predict(X_test)
    train_pred = optimal_knn.predict(X_train)
    # Accuracy = fraction of predictions equal to the true label.
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("KNN train accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(train_acc))
    print("KNN test accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(test_acc))
    KNN_test_acc.append(test_acc)
    KNN_train_acc.append(train_acc)

    # --- Linear SVM (LinearSVC): pick C with GridSearchCV's default CV ---
    # NOTE(review): features are not scaled here; the erratic SVM accuracies
    # in earlier runs suggest LinearSVC may not be converging -- confirm.
    classifier = svm.LinearSVC()
    C_list     = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1] # Different C to try.
    parameters = {'C': C_list}
    # return_train_score expects a boolean, not the string 'true'.
    clf_svm = GridSearchCV(classifier, parameters, return_train_score = True)
    clf_svm.fit(X_train, y_train)
    optimal_svm = svm.LinearSVC(C = clf_svm.best_params_['C'] )
    optimal_svm.fit(X_train, y_train)
    test_pred = optimal_svm.predict(X_test)
    train_pred = optimal_svm.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("SVM train accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(train_acc))
    print("SVM test accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(test_acc))
    SVM_test_acc.append(test_acc)
    SVM_train_acc.append(train_acc)

    # --- Random forest: pick n_estimators by 3-fold CV ---
    forest = RandomForestClassifier()
    n_list = [1,25,50,100,150]
    parameters_random = {'n_estimators' : n_list}
    clf_random = GridSearchCV(forest, parameters_random, cv=3)
    clf_random.fit(X_train, y_train)
    # Build a forest with the selected size and fit *that* model, so the
    # reported n_estimators matches the estimator being scored.
    optimal_random = RandomForestClassifier(n_estimators = clf_random.best_params_['n_estimators'])
    optimal_random.fit(X_train, y_train)
    test_pred = optimal_random.predict(X_test)
    train_pred = optimal_random.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("random train accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(train_acc))
    print("random test accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(test_acc))
    random_test_acc.append(test_acc)
    random_train_acc.append(train_acc)

1 Round
KNN train accuracy with 8 is: 0.7725
KNN test accuracy with 8 is: 0.764375
SVM train accuracy with 0.001 is: 0.2475
SVM test accuracy with 0.001 is: 0.233125
random train accuracy with 150 is: 0.995
random test accuracy with 150 is: 0.809375
2 Round
KNN train accuracy with 2 is: 0.845
KNN test accuracy with 2 is: 0.77125
SVM train accuracy with 1e-05 is: 0.24
SVM test accuracy with 1e-05 is: 0.235
random train accuracy with 50 is: 0.99
random test accuracy with 50 is: 0.849375
3 Round
KNN train accuracy with 6 is: 0.8025
KNN test accuracy with 6 is: 0.76375
SVM train accuracy with 0.001 is: 0.7925
SVM test accuracy with 0.001 is: 0.7725
random train accuracy with 100 is: 0.995
random test accuracy with 100 is: 0.82875


In [15]:
# Print the mean of the per-round accuracies gathered by the 20/80 loop.
for heading, scores in (
    ("Average KNN Classifier 20/80 train accuracy:", KNN_train_acc),
    ("Average KNN Classifier 20/80 test accuracy:", KNN_test_acc),
    ("Average SVM Classifier 20/80 train accuracy:", SVM_train_acc),
    ("Average SVM Classifier 20/80 test accuracy:", SVM_test_acc),
    ("Average Random Forest Classifier 20/80 train accuracy:", random_train_acc),
    ("Average Random Forest Classifier 20/80 test accuracy:", random_test_acc),
):
    print(heading + str(sum(scores)/len(scores)))

Average KNN Classifier 20/80 train accuracy:0.806666666667
Average KNN Classifier 20/80 test accuracy:0.766458333333
Average SVM Classifier 20/80 train accuracy:0.426666666667
Average SVM Classifier 20/80 test accuracy:0.413541666667
Average Random Forest Classifier 20/80 train accuracy:0.993333333333
Average Random Forest Classifier 20/80 test accuracy:0.829166666667


# Bank Data Set 

In [16]:
import pandas as pd
# Load the Bank CSV. The regex separator swallows whitespace around commas
# (hence the python engine) and '?' entries are parsed as missing values.
X_and_Y = pd.read_csv('bank.csv', sep=r'\s*,\s*',engine = 'python', na_values = '?')
# dropna() returns a new frame, so the result must be assigned back for the
# incomplete rows to actually be removed.
X_and_Y = X_and_Y.dropna()
# One-hot encode the non-numeric columns, keeping every dummy level.
X_Y_chart = pd.get_dummies(X_and_Y, drop_first=False)
# Remove the final dummy column so the new last column can serve as the label
# (presumably one level of the binary target -- verify against bank.csv).
# Reassigning instead of inplace=True keeps the cell safe to re-run.
X_Y_chart = X_Y_chart.drop(X_Y_chart.columns[len(X_Y_chart.columns)-1], axis=1)
X_and_Y = X_Y_chart.values

## 80/20

In [17]:
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

# Bank data, 80/20 train/test split: three rounds, each tuning and scoring
# KNN, a linear SVM and a random forest. Per-round accuracies are collected
# in the lists below (reset here so re-running the cell starts clean).
KNN_test_acc = []
KNN_train_acc = []
SVM_test_acc = []
SVM_train_acc = []
random_test_acc = []
random_train_acc = []

for i in range (0,3):
    # Shuffle the full data set in place and take the first 2000 rows as this
    # round's sample. The sample lives in its own name so that X_and_Y is not
    # permanently truncated: every round (and every later cell) can still draw
    # from all rows, and the cell can be re-run safely.
    np.random.shuffle(X_and_Y)
    X_and_Y_round = X_and_Y[0:2000]
    X = X_and_Y_round[:, 0:-1]
    Y = X_and_Y_round[:, -1]
    # split into train and test (first 80% of the sample is the training part)
    n_train = int(0.8*len(X))
    X_train = X[:n_train]
    X_test = X[n_train:]
    y_train = Y[:n_train]
    y_test = Y[n_train:]

    print(str(i+1) +" Round")

    # Using KNN: pick n_neighbors in 1..30 by 10-fold cross-validation.
    k_range = list(range(1, 31))
    param_grid = dict(n_neighbors=k_range)
    knn = KNeighborsClassifier()
    clf_knn = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    clf_knn.fit(X_train, y_train)
    # Train with best param
    optimal_knn = KNeighborsClassifier(clf_knn.best_params_['n_neighbors'])
    optimal_knn.fit(X_train, y_train)
    test_pred = optimal_knn.predict(X_test)
    train_pred = optimal_knn.predict(X_train)
    # Accuracy = fraction of predictions equal to the true label.
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("KNN train accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(train_acc))
    print("KNN test accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(test_acc))
    KNN_test_acc.append(test_acc)
    KNN_train_acc.append(train_acc)

    # Using SVM: pick the regularisation strength C by grid search.
    classifier = svm.LinearSVC()
    C_list     = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1] # Different C to try.
    parameters = {'C': C_list}
    # return_train_score expects a boolean, not the string 'true'.
    clf_svm = GridSearchCV(classifier, parameters, return_train_score = True)
    clf_svm.fit(X_train, y_train)
    optimal_svm = svm.LinearSVC(C = clf_svm.best_params_['C'] )
    optimal_svm.fit(X_train, y_train)
    test_pred = optimal_svm.predict(X_test)
    train_pred = optimal_svm.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("SVM train accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(train_acc))
    print("SVM test accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(test_acc))
    SVM_test_acc.append(test_acc)
    SVM_train_acc.append(train_acc)

    # Using Random Forest: pick n_estimators by 3-fold cross-validation.
    random = RandomForestClassifier()
    n_list = [1,25,50,100,150]
    parameters_random = {'n_estimators' : n_list}
    clf_random = GridSearchCV(random, parameters_random, cv=3)
    clf_random.fit(X_train, y_train)
    # Choose the best parameter and train again. The tuned estimator itself is
    # fitted here; fitting `random` instead would score a default-parameter
    # forest while reporting the tuned n_estimators.
    optimal_random = RandomForestClassifier(n_estimators = clf_random.best_params_['n_estimators'])
    optimal_random.fit(X_train, y_train)
    test_pred = optimal_random.predict(X_test)
    train_pred = optimal_random.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("random train accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(train_acc))
    print("random test accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(test_acc))
    random_test_acc.append(test_acc)
    random_train_acc.append(train_acc)

1 Round
KNN train accuracy with 6 is: 0.973125
KNN test accuracy with 6 is: 0.975
SVM train accuracy with 0.001 is: 0.96375
SVM test accuracy with 0.001 is: 0.9625
random train accuracy with 25 is: 0.99875
random test accuracy with 25 is: 0.985
2 Round
KNN train accuracy with 6 is: 0.973125
KNN test accuracy with 6 is: 0.975
SVM train accuracy with 1e-05 is: 0.973125
SVM test accuracy with 1e-05 is: 0.975
random train accuracy with 50 is: 0.99875
random test accuracy with 50 is: 0.9825
3 Round
KNN train accuracy with 6 is: 0.971875
KNN test accuracy with 6 is: 0.98
SVM train accuracy with 0.0001 is: 0.971875
SVM test accuracy with 0.0001 is: 0.98
random train accuracy with 25 is: 0.9975
random test accuracy with 25 is: 0.9825


In [18]:
# Mean train/test accuracy per classifier over the rounds recorded by the
# 80/20 experiment cell. Each row pairs the printed label with the list of
# per-round accuracies it summarises.
for _label, _scores in (
    ("Average KNN Classifier 80/20 train accuracy:", KNN_train_acc),
    ("Average KNN Classifier 80/20 test accuracy:", KNN_test_acc),
    ("Average SVM Classifier 80/20 train accuracy:", SVM_train_acc),
    ("Average SVM Classifier 80/20 test accuracy:", SVM_test_acc),
    ("Average Random Forest Classifier 80/20 train accuracy:", random_train_acc),
    ("Average Random Forest Classifier 80/20 test accuracy:", random_test_acc),
):
    print(_label + str(sum(_scores)/len(_scores)))

Average KNN Classifier 80/20 train accuracy:0.972708333333
Average KNN Classifier 80/20 test accuracy:0.976666666667
Average SVM Classifier 80/20 train accuracy:0.969583333333
Average SVM Classifier 80/20 test accuracy:0.9725
Average Random Forest Classifier 80/20 train accuracy:0.998333333333
Average Random Forest Classifier 80/20 test accuracy:0.983333333333


## 50/50

In [19]:
# Bank data, 50/50 train/test split: three rounds, each tuning and scoring
# KNN, a linear SVM and a random forest. Per-round accuracies are collected
# in the lists below (reset here so re-running the cell starts clean).
KNN_test_acc = []
KNN_train_acc = []
SVM_test_acc = []
SVM_train_acc = []
random_test_acc = []
random_train_acc = []

for i in range (0,3):
    # Shuffle the full data set in place and take the first 2000 rows as this
    # round's sample. The sample lives in its own name so that X_and_Y is not
    # permanently truncated: every round (and every later cell) can still draw
    # from all rows, and the cell can be re-run safely.
    np.random.shuffle(X_and_Y)
    X_and_Y_round = X_and_Y[0:2000]
    X = X_and_Y_round[:, 0:-1]
    Y = X_and_Y_round[:, -1]
    # split into train and test (first half of the sample is the training part)
    n_train = int(0.5*len(X))
    X_train = X[:n_train]
    X_test = X[n_train:]
    y_train = Y[:n_train]
    y_test = Y[n_train:]

    print(str(i+1) +" Round")

    # Using KNN: pick n_neighbors in 1..30 by 10-fold cross-validation.
    k_range = list(range(1, 31))
    param_grid = dict(n_neighbors=k_range)
    knn = KNeighborsClassifier()
    clf_knn = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    clf_knn.fit(X_train, y_train)
    # Train with best param
    optimal_knn = KNeighborsClassifier(clf_knn.best_params_['n_neighbors'])
    optimal_knn.fit(X_train, y_train)
    test_pred = optimal_knn.predict(X_test)
    train_pred = optimal_knn.predict(X_train)
    # Accuracy = fraction of predictions equal to the true label.
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("KNN train accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(train_acc))
    print("KNN test accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(test_acc))
    KNN_test_acc.append(test_acc)
    KNN_train_acc.append(train_acc)

    # Using SVM: pick the regularisation strength C by grid search.
    classifier = svm.LinearSVC()
    C_list     = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1] # Different C to try.
    parameters = {'C': C_list}
    # return_train_score expects a boolean, not the string 'true'.
    clf_svm = GridSearchCV(classifier, parameters, return_train_score = True)
    clf_svm.fit(X_train, y_train)
    optimal_svm = svm.LinearSVC(C = clf_svm.best_params_['C'] )
    optimal_svm.fit(X_train, y_train)
    test_pred = optimal_svm.predict(X_test)
    train_pred = optimal_svm.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("SVM train accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(train_acc))
    print("SVM test accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(test_acc))
    SVM_test_acc.append(test_acc)
    SVM_train_acc.append(train_acc)

    # Using Random Forest: pick n_estimators by 3-fold cross-validation.
    random = RandomForestClassifier()
    n_list = [1,25,50,100,150]
    parameters_random = {'n_estimators' : n_list}
    clf_random = GridSearchCV(random, parameters_random, cv=3)
    clf_random.fit(X_train, y_train)
    # Choose the best parameter and train again. The tuned estimator itself is
    # fitted here; fitting `random` instead would score a default-parameter
    # forest while reporting the tuned n_estimators.
    optimal_random = RandomForestClassifier(n_estimators = clf_random.best_params_['n_estimators'])
    optimal_random.fit(X_train, y_train)
    test_pred = optimal_random.predict(X_test)
    train_pred = optimal_random.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("random train accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(train_acc))
    print("random test accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(test_acc))
    random_test_acc.append(test_acc)
    random_train_acc.append(train_acc)

1 Round
KNN train accuracy with 2 is: 0.981
KNN test accuracy with 2 is: 0.973
SVM train accuracy with 1e-05 is: 0.975
SVM test accuracy with 1e-05 is: 0.972
random train accuracy with 25 is: 0.998
random test accuracy with 25 is: 0.978
2 Round
KNN train accuracy with 4 is: 0.974
KNN test accuracy with 4 is: 0.974
SVM train accuracy with 0.01 is: 0.97
SVM test accuracy with 0.01 is: 0.967
random train accuracy with 50 is: 0.996
random test accuracy with 50 is: 0.985
3 Round
KNN train accuracy with 2 is: 0.978
KNN test accuracy with 2 is: 0.97
SVM train accuracy with 0.0001 is: 0.972
SVM test accuracy with 0.0001 is: 0.972
random train accuracy with 25 is: 0.998
random test accuracy with 25 is: 0.973


In [20]:
# Mean train/test accuracy per classifier over the rounds recorded by the
# 50/50 experiment cell. Each row pairs the printed label with the list of
# per-round accuracies it summarises.
for _label, _scores in (
    ("Average KNN Classifier 50/50 train accuracy:", KNN_train_acc),
    ("Average KNN Classifier 50/50 test accuracy:", KNN_test_acc),
    ("Average SVM Classifier 50/50 train accuracy:", SVM_train_acc),
    ("Average SVM Classifier 50/50 test accuracy:", SVM_test_acc),
    ("Average Random Forest Classifier 50/50 train accuracy:", random_train_acc),
    ("Average Random Forest Classifier 50/50 test accuracy:", random_test_acc),
):
    print(_label + str(sum(_scores)/len(_scores)))

Average KNN Classifier 50/50 train accuracy:0.977666666667
Average KNN Classifier 50/50 test accuracy:0.972333333333
Average SVM Classifier 50/50 train accuracy:0.972333333333
Average SVM Classifier 50/50 test accuracy:0.970333333333
Average Random Forest Classifier 50/50 train accuracy:0.997333333333
Average Random Forest Classifier 50/50 test accuracy:0.978666666667


## 20/80

In [21]:
# Bank data, 20/80 train/test split: three rounds, each tuning and scoring
# KNN, a linear SVM and a random forest. Per-round accuracies are collected
# in the lists below (reset here so re-running the cell starts clean).
KNN_test_acc = []
KNN_train_acc = []
SVM_test_acc = []
SVM_train_acc = []
random_test_acc = []
random_train_acc = []

for i in range (0,3):
    # Shuffle the full data set in place and take the first 2000 rows as this
    # round's sample. The sample lives in its own name so that X_and_Y is not
    # permanently truncated: every round (and every later cell) can still draw
    # from all rows, and the cell can be re-run safely.
    np.random.shuffle(X_and_Y)
    X_and_Y_round = X_and_Y[0:2000]
    X = X_and_Y_round[:, 0:-1]
    Y = X_and_Y_round[:, -1]
    # split into train and test (first 20% of the sample is the training part)
    n_train = int(0.2*len(X))
    X_train = X[:n_train]
    X_test = X[n_train:]
    y_train = Y[:n_train]
    y_test = Y[n_train:]

    print(str(i+1) +" Round")

    # Using KNN: pick n_neighbors in 1..9 by 3-fold cross-validation
    # (a narrower grid and fewer folds than the other split cells use).
    k_range = list(range(1, 10))
    param_grid = dict(n_neighbors=k_range)
    knn = KNeighborsClassifier()
    clf_knn = GridSearchCV(knn, param_grid, cv=3, scoring='accuracy')
    clf_knn.fit(X_train, y_train)
    # Train with best param
    optimal_knn = KNeighborsClassifier(clf_knn.best_params_['n_neighbors'])
    optimal_knn.fit(X_train, y_train)
    test_pred = optimal_knn.predict(X_test)
    train_pred = optimal_knn.predict(X_train)
    # Accuracy = fraction of predictions equal to the true label.
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("KNN train accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(train_acc))
    print("KNN test accuracy with "+str(clf_knn.best_params_['n_neighbors'])+" is: " + str(test_acc))
    KNN_test_acc.append(test_acc)
    KNN_train_acc.append(train_acc)

    # Using SVM: pick the regularisation strength C by grid search.
    classifier = svm.LinearSVC()
    C_list     = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1] # Different C to try.
    parameters = {'C': C_list}
    # return_train_score expects a boolean, not the string 'true'.
    clf_svm = GridSearchCV(classifier, parameters, return_train_score = True)
    clf_svm.fit(X_train, y_train)
    optimal_svm = svm.LinearSVC(C = clf_svm.best_params_['C'] )
    optimal_svm.fit(X_train, y_train)
    test_pred = optimal_svm.predict(X_test)
    train_pred = optimal_svm.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("SVM train accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(train_acc))
    print("SVM test accuracy with "+str(clf_svm.best_params_['C'])+" is: " + str(test_acc))
    SVM_test_acc.append(test_acc)
    SVM_train_acc.append(train_acc)

    # Using Random Forest: pick n_estimators by 3-fold cross-validation.
    random = RandomForestClassifier()
    n_list = [1,25,50,100,150]
    parameters_random = {'n_estimators' : n_list}
    clf_random = GridSearchCV(random, parameters_random, cv=3)
    clf_random.fit(X_train, y_train)
    # Choose the best parameter and train again. The tuned estimator itself is
    # fitted here; fitting `random` instead would score a default-parameter
    # forest while reporting the tuned n_estimators.
    optimal_random = RandomForestClassifier(n_estimators = clf_random.best_params_['n_estimators'])
    optimal_random.fit(X_train, y_train)
    test_pred = optimal_random.predict(X_test)
    train_pred = optimal_random.predict(X_train)
    train_acc = np.mean(train_pred == y_train)
    test_acc = np.mean(test_pred == y_test)
    print("random train accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(train_acc))
    print("random test accuracy with "+str(clf_random.best_params_['n_estimators'])+" is: " + str(test_acc))
    random_test_acc.append(test_acc)
    random_train_acc.append(train_acc)

1 Round
KNN train accuracy with 2 is: 0.9825
KNN test accuracy with 2 is: 0.97125
SVM train accuracy with 1e-05 is: 0.9825
SVM test accuracy with 1e-05 is: 0.97125
random train accuracy with 25 is: 0.9975
random test accuracy with 25 is: 0.97375
2 Round
KNN train accuracy with 4 is: 0.975
KNN test accuracy with 4 is: 0.973125
SVM train accuracy with 0.0001 is: 0.975
SVM test accuracy with 0.0001 is: 0.973125
random train accuracy with 25 is: 0.9975
random test accuracy with 25 is: 0.97375
3 Round
KNN train accuracy with 2 is: 0.985
KNN test accuracy with 2 is: 0.97125
SVM train accuracy with 0.001 is: 0.9775
SVM test accuracy with 0.001 is: 0.97
random train accuracy with 25 is: 0.9975
random test accuracy with 25 is: 0.9725


In [22]:
# Mean train/test accuracy per classifier over the rounds recorded by the
# 20/80 experiment cell. Each row pairs the printed label with the list of
# per-round accuracies it summarises.
for _label, _scores in (
    ("Average KNN Classifier 20/80 train accuracy:", KNN_train_acc),
    ("Average KNN Classifier 20/80 test accuracy:", KNN_test_acc),
    ("Average SVM Classifier 20/80 train accuracy:", SVM_train_acc),
    ("Average SVM Classifier 20/80 test accuracy:", SVM_test_acc),
    ("Average Random Forest Classifier 20/80 train accuracy:", random_train_acc),
    ("Average Random Forest Classifier 20/80 test accuracy:", random_test_acc),
):
    print(_label + str(sum(_scores)/len(_scores)))

Average KNN Classifier 20/80 train accuracy:0.980833333333
Average KNN Classifier 20/80 test accuracy:0.971875
Average SVM Classifier 20/80 train accuracy:0.978333333333
Average SVM Classifier 20/80 test accuracy:0.971458333333
Average Random Forest Classifier 20/80 train accuracy:0.9975
Average Random Forest Classifier 20/80 test accuracy:0.973333333333
