# Ensemble Classifier Mode Data

# Import Libraries

In [1]:
import pandas as pd, numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from scipy import stats

In [2]:
df_train = pd.read_csv('mode_train.csv', index_col = 0)
df_test = pd.read_csv('mode_test.csv', index_col = 0)

In [3]:
len(df_train.columns) == len(df_test.columns)

True

In [4]:
print(len(df_train), len(df_test))

32561 16281


# Separate Data into X and y

In [5]:
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:,-1]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state = 42)

# KNN

In [None]:
def knn_run(x_train, x_test, y_train, y_test, k):
    
    knn = KNeighborsClassifier(n_neighbors = k).fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    y_accuracy = accuracy_score(y_pred, y_test)
    
    return (k, y_accuracy)

In [None]:
knn_accuracy = []
for i in [3,5,8,10,12,15,20,25,30,35,40,45,50]:
    score = knn_run(X_train, X_test, y_train, y_test, i)
    knn_accuracy.append(score)

In [None]:
knn_accuracy

best k for knn = 30

# Random Forest

In [None]:
rf = RandomForestClassifier()

In [None]:
def rf_run(x_train, x_test, y_train, y_test, k):
    
    rf = RandomForestClassifier(n_estimators = k).fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    y_accuracy = accuracy_score(y_pred, y_test)
    
    return (k, y_accuracy)

In [None]:
rf_accuracy = []
for i in [3,5,8,10,15,20,30,40,50,60,70,80,90,100]:
    score = rf_run(X_train, X_test, y_train, y_test, i)
    rf_accuracy.append(score)

In [None]:
rf_accuracy

# Logistic Regression 

In [None]:
lr = LogisticRegressionClassifier()

In [None]:
parameters_lr = [{'C': [0.01,0.1,1,10,100]}]

In [None]:
clf = GridSearchCV(lr, parameters_lr, cv=5)

In [None]:
clf.fit(X_train, y_train)

In [None]:
clf.best_params_

In [None]:
clf.score(X_train, y_train)

# SVM

### Linear

In [11]:
def svm_run(x_train, x_test, y_train, y_test, k):
    
    svm = SVC(C = k).fit(x_train, y_train)
    y_pred = svm.predict(x_test)
    y_accuracy = accuracy_score(y_pred, y_test)
    
    return (k, y_accuracy)

In [12]:
a = svm_run(X_train, X_test, y_train, y_test, 1)

(1, 0.83515538631617736)

In [13]:
b = svm_run(X_train, X_test, y_train, y_test, 10)

(10, 0.83834909716251071)

In [14]:
c = svm_run(X_train, X_test, y_train, y_test, 100)

(100, 0.84006878761822867)

In [21]:
d = svm_run(X_train, X_test, y_train, y_test, 1000)

(1000, 0.83798059206485687)

# RBF

In [23]:
def svm_rbf(x_train, x_test, y_train, y_test, k,l):
    
    svm = SVC(C = k, gamma = l, kernel = 'rbf').fit(x_train, y_train)
    y_pred = svm.predict(x_test)
    y_accuracy = accuracy_score(y_pred, y_test)
    
    return (k, y_accuracy)

In [24]:
e = svm_rbf(X_train, X_test, y_train, y_test, 1000, 0.001)

(1000, 0.83650657167424147)

In [26]:
f = svm_rbf(X_train, X_test, y_train, y_test, 1000, 0.0001)

(1000, 0.8147647709126643)

In [28]:
g = svm_rbf(X_train, X_test, y_train, y_test, 100, 0.001)

(100, 0.82581992384227987)

In [30]:
h = svm_rbf(X_train, X_test, y_train, y_test, 100, 0.01)

(100, 0.83945461245547226)

In [None]:
parameters_svm = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]

# Naive Bayes

There are no adjustable parameters for naive bayes (other than prior)