# Ensemble Classifier Mode Data

# Import Libraries

In [1]:
import pandas as pd, numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from scipy import stats

In [2]:
# Load both CSV files, using each file's first column as the DataFrame index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('train_knn_norm.csv', 'test_knn_norm.csv')
)

In [3]:
# Sanity check: both frames must have the same number of columns.
df_train.shape[1] == df_test.shape[1]

True

In [4]:
# Row counts of the two frames.
print(df_train.shape[0], df_test.shape[0])

32561 16281


# Separate Data into X and y

In [5]:
# Every column except the last forms the feature matrix; the last column is
# taken as the label vector.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]

In [6]:
# Split directly from df_train so this cell is idempotent: feeding
# X_train / y_train back into themselves would shrink the training set a
# little more every time the cell is re-run.
# Note that X_test / y_test are a hold-out slice of df_train, not df_test.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.iloc[:, :-1],   # features: every column except the last
    df_train.iloc[:, -1],    # labels: the last column
    random_state=42,         # fixed seed so the split is reproducible
)

# KNN = 50

In [7]:
def knn_run(x_train, x_test, y_train, y_test, k):
    """Fit a k-nearest-neighbours classifier and score it on held-out data.

    Parameters
    ----------
    x_train, y_train
        Features and labels the classifier is fitted on.
    x_test, y_test
        Features and true labels used for scoring.
    k
        Passed to ``KNeighborsClassifier`` as ``n_neighbors``.

    Returns
    -------
    tuple
        ``(k, accuracy)`` where ``accuracy`` is ``accuracy_score`` of the
        predictions for ``x_test`` measured against ``y_test``.
    """
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    # accuracy_score's signature is (y_true, y_pred): ground truth goes first.
    y_accuracy = accuracy_score(y_test, y_pred)

    return (k, y_accuracy)

In [8]:
# Coarse sweep over the number of neighbours; each entry is (k, accuracy).
knn_accuracy = [
    knn_run(X_train, X_test, y_train, y_test, n_neighbors)
    for n_neighbors in (3, 5, 8, 10, 12, 15, 20, 25, 30, 35, 40, 45, 50)
]

In [9]:
knn_accuracy

[(3, 0.82766244933054911),
 (5, 0.83355853089301069),
 (8, 0.84240265323670316),
 (10, 0.8435081685296647),
 (12, 0.84645620931089549),
 (15, 0.84829873479916473),
 (20, 0.84743888957130575),
 (25, 0.849281415059575),
 (30, 0.85001842525488269),
 (35, 0.84903574499447243),
 (40, 0.84903574499447243),
 (45, 0.84964992015722884),
 (50, 0.85038693035253654)]

In [10]:
# Second sweep over larger neighbourhood sizes; each entry is (k, accuracy).
knn_accuracy_2 = [
    knn_run(X_train, X_test, y_train, y_test, n_neighbors)
    for n_neighbors in (60, 70, 80, 90, 100)
]

In [11]:
knn_accuracy_2

[(60, 0.84989559022233141),
 (70, 0.84915858002702371),
 (80, 0.84719321950620319),
 (90, 0.84768455963640832),
 (100, 0.84731605453875447)]

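Best k for KNN: k = 50 gives the highest accuracy of the values tried (0.8504); k = 30 is within 0.0004 of it (0.8500).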

# Random Forest = 80

In [12]:
# NOTE(review): default-configured forest. No cell visible here uses this
# object (rf_run constructs its own estimator) — confirm before removing.
rf = RandomForestClassifier()

In [13]:
def rf_run(x_train, x_test, y_train, y_test, k, random_state=42):
    """Fit a random forest with ``k`` trees and score it on held-out data.

    Parameters
    ----------
    x_train, y_train
        Features and labels the forest is fitted on.
    x_test, y_test
        Features and true labels used for scoring.
    k
        Passed to ``RandomForestClassifier`` as ``n_estimators``.
    random_state
        Seed forwarded to ``RandomForestClassifier``. Defaults to 42 so that
        repeated runs give the same accuracy; pass ``None`` for an unseeded
        forest.

    Returns
    -------
    tuple
        ``(k, accuracy)`` where ``accuracy`` is ``accuracy_score`` of the
        predictions for ``x_test`` measured against ``y_test``.
    """
    # Seed the forest: without a fixed random_state the fitted trees, and
    # therefore the reported accuracy, differ from one run to the next.
    rf = RandomForestClassifier(n_estimators=k, random_state=random_state)
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    # accuracy_score's signature is (y_true, y_pred): ground truth goes first.
    y_accuracy = accuracy_score(y_test, y_pred)

    return (k, y_accuracy)

In [14]:
# Sweep over the number of trees; each entry is (n_estimators, accuracy).
rf_accuracy = [
    rf_run(X_train, X_test, y_train, y_test, n_trees)
    for n_trees in (3, 5, 8, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100)
]

In [15]:
rf_accuracy

[(3, 0.83085616017688246),
 (5, 0.84510502395283138),
 (8, 0.84682471440854934),
 (10, 0.85198378577570322),
 (15, 0.85235229087335707),
 (20, 0.8556688367522417),
 (30, 0.85812553740326736),
 (40, 0.86070507308684441),
 (50, 0.85873971256602388),
 (60, 0.85984522785898543),
 (70, 0.85837120746837003),
 (80, 0.86119641321704954),
 (90, 0.86070507308684441),
 (100, 0.86045940302174184)]

# Logistic Regression C = 1

In [16]:
def lr_run(x_train, x_test, y_train, y_test, k):
    """Fit a logistic regression with ``C = k`` and score it on held-out data.

    Parameters
    ----------
    x_train, y_train
        Features and labels the model is fitted on.
    x_test, y_test
        Features and true labels used for scoring.
    k
        Passed to ``LogisticRegression`` as ``C``.

    Returns
    -------
    tuple
        ``(k, accuracy)`` where ``accuracy`` is ``accuracy_score`` of the
        predictions for ``x_test`` measured against ``y_test``.
    """
    lr = LogisticRegression(C=k).fit(x_train, y_train)
    y_pred = lr.predict(x_test)
    # accuracy_score's signature is (y_true, y_pred): ground truth goes first.
    y_accuracy = accuracy_score(y_test, y_pred)

    return (k, y_accuracy)

In [17]:
# Sweep over the C value passed to lr_run; each entry is (C, accuracy).
lr_accuracy = [
    lr_run(X_train, X_test, y_train, y_test, c_value)
    for c_value in (0.01, 0.1, 1, 10, 100)
]

In [18]:
lr_accuracy

[(0.01, 0.85419481636162631),
 (0.1, 0.85763419727306223),
 (1, 0.85787986733816479),
 (10, 0.85702002211030581),
 (100, 0.85665151701265196)]

# SVM rbf C = 10, gamma = 'auto'

parameters_svm = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]

### RBF gamma = 'auto' 

In [19]:
def svm_run(x_train, x_test, y_train, y_test, k):
    """Fit an RBF-kernel SVC with ``C = k`` and ``gamma='auto'`` and score it.

    Parameters
    ----------
    x_train, y_train
        Features and labels the classifier is fitted on.
    x_test, y_test
        Features and true labels used for scoring.
    k
        Passed to ``SVC`` as ``C``.

    Returns
    -------
    tuple
        ``(k, accuracy)`` where ``accuracy`` is ``accuracy_score`` of the
        predictions for ``x_test`` measured against ``y_test``.
    """
    # Spell out kernel and gamma instead of relying on SVC's defaults: the
    # default gamma differs between scikit-learn releases ('auto' in older
    # ones, 'scale' in newer ones), and this experiment is meant to use 'auto'.
    svm = SVC(C=k, kernel='rbf', gamma='auto').fit(x_train, y_train)
    y_pred = svm.predict(x_test)
    # accuracy_score's signature is (y_true, y_pred): ground truth goes first.
    y_accuracy = accuracy_score(y_test, y_pred)

    return (k, y_accuracy)

In [20]:
# SVC via svm_run with C = 1.
a = svm_run(x_train=X_train, x_test=X_test, y_train=y_train, y_test=y_test, k=1)

(1, 0.8571428571428571)

In [21]:
# SVC via svm_run with C = 10.
b = svm_run(x_train=X_train, x_test=X_test, y_train=y_train, y_test=y_test, k=10)

In [22]:
b

(10, 0.86181058837980595)

In [23]:
# SVC via svm_run with C = 100.
c = svm_run(x_train=X_train, x_test=X_test, y_train=y_train, y_test=y_test, k=100)

In [24]:
c

(100, 0.85947672276133158)

In [25]:
# SVC via svm_run with C = 1000.
d = svm_run(x_train=X_train, x_test=X_test, y_train=y_train, y_test=y_test, k=1000)

In [26]:
d

(1000, 0.85333497113376733)

### RBF 

In [27]:
def svm_rbf(x_train, x_test, y_train, y_test, k,l):
    """Fit an RBF-kernel SVC with explicit ``C`` and ``gamma`` and score it.

    Parameters
    ----------
    x_train, y_train
        Features and labels the classifier is fitted on.
    x_test, y_test
        Features and true labels used for scoring.
    k
        Passed to ``SVC`` as ``C``.
    l
        Passed to ``SVC`` as ``gamma``.

    Returns
    -------
    tuple
        ``(k, accuracy)`` where ``accuracy`` is ``accuracy_score`` of the
        predictions for ``x_test`` measured against ``y_test``. The gamma
        value ``l`` is not part of the returned tuple, so callers comparing
        several gammas must keep track of it themselves.
    """
    svm = SVC(C=k, gamma=l, kernel='rbf').fit(x_train, y_train)
    y_pred = svm.predict(x_test)
    # accuracy_score's signature is (y_true, y_pred): ground truth goes first.
    y_accuracy = accuracy_score(y_test, y_pred)

    return (k, y_accuracy)

In [28]:
# RBF-kernel SVC with C = 100, gamma = 0.001.
e = svm_rbf(X_train, X_test, y_train, y_test, k=100, l=0.001)

In [29]:
e

(100, 0.85628301191499812)

In [30]:
# RBF-kernel SVC with C = 100, gamma = 0.0001.
f = svm_rbf(X_train, X_test, y_train, y_test, k=100, l=0.0001)

In [31]:
f

(100, 0.85554600171969042)

In [32]:
# RBF-kernel SVC with C = 10, gamma = 0.001.
g = svm_rbf(X_train, X_test, y_train, y_test, k=10, l=0.001)

In [33]:
g

(10, 0.85652868198010068)

In [34]:
# RBF-kernel SVC with C = 10, gamma = 0.01.
h = svm_rbf(X_train, X_test, y_train, y_test, k=10, l=0.01)

In [35]:
h

(10, 0.86156491831470339)

In [36]:
# RBF-kernel SVC with C = 1, gamma = 0.001.
i = svm_rbf(X_train, X_test, y_train, y_test, k=1, l=0.001)

In [37]:
i

(1, 0.84940425009212628)

In [38]:
# RBF-kernel SVC with C = 1, gamma = 0.01.
j = svm_rbf(X_train, X_test, y_train, y_test, k=1, l=0.01)

In [39]:
j

(1, 0.85665151701265196)

# Naive Bayes

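Gaussian Naive Bayes has essentially no hyperparameters to tune here: the only adjustable inputs are the class priors (`priors`) and, in newer scikit-learn releases, the variance-smoothing term (`var_smoothing`).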