# Ensemble Classifier Mode Data

# Import Libraries

In [1]:
import pandas as pd, numpy as np
from sklearn.ensemble import BaggingClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from scipy import stats

In [2]:
df_train = pd.read_csv('mode_train.csv', index_col = 0)
df_test = pd.read_csv('mode_test.csv', index_col = 0)

In [3]:
len(df_train.columns) == len(df_test.columns)

True

In [4]:
print(len(df_train), len(df_test))

32561 16281


# Separate Data into X and y

In [5]:
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:,-1]

In [6]:
X_test = df_test.iloc[:,:-1]
y_test = df_test.iloc[:,-1]

# Individual Algorithms

In [7]:
def rforest(X_train, y_train, X_test):

    rforest = RandomForestClassifier(n_estimators = 90)
    rforest.fit(X_train, y_train)
    y_pred = rforest.predict(X_test)
    
    return y_pred

In [8]:
def knn(X_train, y_train, X_test):
    
    knn = KNeighborsClassifier(n_neighbors = 30)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    return y_pred

In [9]:
def nb(X_train, y_train, X_test):
    
    nb = GaussianNB()
    nb.fit(X_train,y_train)
    y_pred = nb.predict(X_test)
    
    return y_pred


In [10]:
def lr(X_train, y_train, X_test):

    lr = LogisticRegression(C = 1)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    
    return y_pred

In [11]:
def svm(X_train, y_train, X_test):
    svm = SVC(C = 100)
    svm.fit(X_train,y_train)    
    y_pred = svm.predict(X_test)
    
    return y_pred

# Ensemble Function


In [12]:
def ensemble(X_train, y_train, X_test):
    
    r_y = rforest(X_train, y_train, X_test)
    k_y = knn(X_train, y_train, X_test)
    l_y = lr(X_train, y_train, X_test)
    n_y = nb(X_train, y_train, X_test)
    s_y = svm(X_train, y_train, X_test)
    
    
    final_y = []
    
    for i in range(len(X_test)):
        final_y.append(stats.mode([r_y[i], k_y[i], l_y[i], n_y[i], s_y[i]])[0][0])
        
    return final_y

# PCC

In [13]:
def pearson(x,y):
    
    sum_sq_x = 0
    sum_sq_y = 0 
    sum_coproduct = 0
    mean_x = 0
    mean_y = 0
    
    N = len(x)
    
    for i in range(N):
        
        sum_sq_x += x[i] * x[i]
        sum_sq_y += y[i] * y[i]
        sum_coproduct += x[i] * y[i]
        mean_x += x[i]
        mean_y += y[i]
        
    mean_x = mean_x / N
    mean_y = mean_y / N
    pop_sd_x = np.sqrt((sum_sq_x/N) - (mean_x * mean_x))
    pop_sd_y = np.sqrt((sum_sq_y / N) - (mean_y * mean_y))
    cov_x_y = (sum_coproduct / N) - (mean_x * mean_y)
    correlation = cov_x_y / (pop_sd_x * pop_sd_y)
    
    return correlation

In [14]:
def ppc_features(x,y):

    a = np.array(y)
    filt_feat = []

    for i in range(x.shape[1]):
        b = np.array(x.iloc[:,i])
        val = np.abs(pearson(a,b)) # absolute value of R
        filt_feat.append([val,i])

    filt_sort = sorted(filt_feat, reverse=True)
    
    rank = np.array(list(range(len(filt_sort))))
    rank = [x+1 for x in rank]
    feat_n = []
    r_score = []
    for i in range(len(filt_sort)):
        feat_n.append(filt_sort[i][1])
        r_score.append(filt_sort[i][0])
        
    Filter_Ranks = pd.DataFrame(data=np.column_stack((rank, feat_n, r_score)),columns=['Rank','Feature #','R_score'])
    Filter_Ranks["Feature #"] = Filter_Ranks["Feature #"].astype(int)
    Filter_Ranks["Rank"] = Filter_Ranks["Rank"].astype(int)
    
    return Filter_Ranks

In [15]:
filter_method = ppc_features(X_train, y_train)

In [16]:
filter_method.head()

Unnamed: 0,Rank,Feature #,R_score
0,1,20,0.444696
1,2,12,0.401035
2,3,22,0.31844
3,4,0,0.234037
4,5,4,0.229689


# Mode Filter Method 20

In [None]:
filter_20 = list(filter_method.iloc[:20, 1])

In [None]:
X_train.iloc[:, filter_20].head()

In [None]:
trial_run = ensemble(X_train.iloc[:, filter_20], y_train, X_test.iloc[:, filter_20])

In [None]:
for i in range(4):
    print(['tn', 'fp', 'fn', 'tp'][i],confusion_matrix(trial_run, y_test).ravel()[i])

In [None]:
accuracy_score(trial_run, y_test)

In [None]:
precision_score(trial_run, y_test)

In [None]:
recall_score(trial_run, y_test)

In [None]:
f1_score(trial_run, y_test)

# Mode Filter Method 40

In [17]:
filter_40 = list(filter_method.iloc[:40, 1])

In [18]:
X_train.iloc[:, filter_40].head()

Unnamed: 0,Married-civ-spouse,Husband,Never-married,age,hours_per_week,Own-child,Female,Male,Exec-managerial,Not-in-family,...,Married-spouse-absent,United-States,Priv-house-serv,Local-gov,Other,Self-emp-not-inc,Amer-Indian-Eskimo,Protective-serv,Tech-support,Sales
0,0,0,1,0.03067,-0.035429,0,0,1,0,1,...,0,1,0,0,0,0,0,0,0,0
1,1,1,0,0.837096,-2.222119,0,0,1,1,0,...,0,1,0,0,0,1,0,0,0,0
2,0,0,0,-0.042641,-0.035429,0,0,1,0,1,...,0,1,0,0,0,0,0,0,0,0
3,1,1,0,1.057031,-0.035429,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
4,1,0,0,-0.775756,-0.035429,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
trial_40 = ensemble(X_train.iloc[:, filter_40], y_train, X_test.iloc[:, filter_40])

In [20]:
for i in range(4):
    print(['tn', 'fp', 'fn', 'tp'][i],confusion_matrix(trial_40, y_test).ravel()[i])

tn 11410
fp 1702
fn 1025
tp 2144


In [21]:
accuracy_score(trial_40, y_test)

0.83250414593698174

In [22]:
precision_score(trial_40, y_test)

0.55746229849193962

In [23]:
recall_score(trial_40, y_test)

0.67655411801830234

In [24]:
f1_score(trial_40, y_test)

0.6112615823235924

# Mode Filter Method 60

In [25]:
filter_60 = list(filter_method.iloc[:60, 1])

In [26]:
X_train.iloc[:, filter_60].head()

Unnamed: 0,Married-civ-spouse,Husband,Never-married,age,hours_per_week,Own-child,Female,Male,Exec-managerial,Not-in-family,...,State-gov,Taiwan,Nicaragua,Jamaica,Haiti,Peru,Craft-repair,Philippines,Germany,France
0,0,0,1,0.03067,-0.035429,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,0
1,1,1,0,0.837096,-2.222119,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,-0.042641,-0.035429,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1.057031,-0.035429,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,-0.775756,-0.035429,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
trial_60 = ensemble(X_train.iloc[:, filter_60], y_train, X_test.iloc[:, filter_60])

In [28]:
for i in range(4):
    print(['tn', 'fp', 'fn', 'tp'][i],confusion_matrix(trial_60, y_test).ravel()[i])

tn 11412
fp 1683
fn 1023
tp 2163


In [29]:
accuracy_score(trial_60, y_test)

0.83379399299797308

In [30]:
precision_score(trial_60, y_test)

0.56240249609984394

In [31]:
recall_score(trial_60, y_test)

0.67890772128060262

In [32]:
f1_score(trial_60, y_test)

0.6151877133105802