# Experiments

## Ensembling two classifiers 


1. Normalizacja wsparć poszczególnych klasyfikatorów?


In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_breast_cancer, load_wine, load_digits, load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer


In [None]:
class Dataset:
    """Plain container pairing a feature matrix with its target vector.

    Exposes the two attributes, ``data`` and ``target``, that the rest of
    the notebook reads from whatever data set it loads.
    """

    def __init__(self, data, target):
        self.data, self.target = data, target

def load_mammographic():
    """Load the mammographic-masses data set with missing values imputed.

    Reads the header-less CSV ``datasets/mammographic_masses.data``, treats
    ``?`` entries as missing and fills them with the column median.

    Returns:
        Dataset: ``data`` holds the five feature columns (bi_rads, age,
        shape, margin, density); ``target`` holds the ``class`` column as
        integers.
    """
    columns = ['bi_rads', 'age', 'shape', 'margin', 'density', 'class']

    # Parse '?' as NaN while reading so every column is numeric from the
    # start, rather than loading strings and patching them afterwards.
    mammographic = pd.read_csv(
        'datasets/mammographic_masses.data', names=columns, na_values='?'
    )

    # The imputer is fitted on every column, 'class' included.
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    dataset = pd.DataFrame(imp.fit_transform(mammographic), columns=columns)

    y = dataset['class'].values.astype(int)
    X = dataset.drop('class', axis=1).values
    return Dataset(X, y)

def load_abalone():
    """Load the abalone data set with the ``sex`` column encoded as integers.

    Reads the header-less CSV ``datasets/abalone.data``. The ``sex`` codes
    are mapped M -> 0, F -> 1, I -> 2; the remaining feature columns are
    used as read.

    Returns:
        Dataset: ``data`` holds the eight feature columns; ``target`` holds
        the ``class`` column as integers.

    Raises:
        ValueError: if ``sex`` contains a value other than M, F or I (the
            unmapped entry becomes NaN and the integer cast rejects it).
    """
    columns = ['sex', 'len', 'dia', 'h', 'ww', 'sw', 'vw', 'shw', 'class']
    sex_codes = {'M': 0, 'F': 1, 'I': 2}

    dataset = pd.read_csv('datasets/abalone.data', names=columns)

    # Encode only the column that actually holds the letters, and cast it
    # explicitly so the feature matrix ends up with a numeric dtype.
    dataset['sex'] = dataset['sex'].map(sex_codes).astype(int)

    y = dataset['class'].values.astype(int)
    X = dataset.drop('class', axis=1).values
    return Dataset(X, y)

def load_vihno_verde():
    """Load the white-wine quality data set.

    Reads ``datasets/winequality-white.csv``, a semicolon-separated file
    whose first row is the header.

    Returns:
        Dataset: ``data`` holds every column except ``quality``; ``target``
        holds ``quality`` as integers.
    """
    wine = pd.read_csv('datasets/winequality-white.csv', sep=';', header=0)

    # pop() takes 'quality' out of the frame, leaving only feature columns.
    quality = wine.pop('quality')
    return Dataset(wine.values, quality.values.astype(int))


def load_balance():
    """Load the balance-scale data set.

    Reads the header-less CSV ``datasets/balance-scale.data`` and encodes
    the letters L, B and R as 0, 1 and 2.

    Returns:
        Dataset: ``data`` holds the four feature columns (lw, ld, rw, rd);
        ``target`` holds the ``class`` column as integers.
    """
    frame = pd.read_csv(
        'datasets/balance-scale.data',
        names=['class', 'lw', 'ld', 'rw', 'rd'],
    )

    # Whole-frame substitutions, applied one after another in this order.
    for letter, code in (('L', 0), ('B', 1), ('R', 2)):
        frame = frame.replace(letter, code)

    target = frame['class'].values.astype(int)
    features = frame.drop('class', axis=1).values
    return Dataset(features, target)

def load_yeast():
    """Load the yeast data set with class labels encoded as integers.

    Reads the whitespace-delimited, header-less file ``datasets/yeast.data``,
    discards the ``name`` column and maps the ten class labels to 0-9.

    Returns:
        Dataset: ``data`` holds the eight remaining feature columns;
        ``target`` holds the encoded ``class`` column.

    Raises:
        ValueError: if ``class`` contains a label missing from the mapping
            (the unmapped entry becomes NaN and the integer cast rejects it).
    """
    columns = ['name', 'xz', 'mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'class']
    class_codes = {
        'CYT': 0,
        'NUC': 1,
        'MIT': 2,
        'ME3': 3,
        'ME2': 4,
        'ME1': 5,
        'EXC': 6,
        'VAC': 7,
        'POX': 8,
        'ERL': 9,
    }

    # sep=r'\s+' is the supported spelling of the deprecated
    # delim_whitespace=True.
    dataset = pd.read_csv('datasets/yeast.data', names=columns, sep=r'\s+')
    dataset = dataset.drop('name', axis=1)

    # Only the label column holds these strings, so encode just that column.
    y = dataset['class'].map(class_codes).astype(int).values
    X = dataset.drop('class', axis=1).values
    return Dataset(X, y)

def load_biodeg():
    """Load the biodegradation data set.

    Reads the semicolon-separated, header-less file ``datasets/biodeg.csv``
    as 41 numbered feature columns followed by a ``class`` column, and
    encodes the labels RB -> 0 and NRB -> 1.

    Returns:
        Dataset: ``data`` holds the 41 feature columns; ``target`` holds the
        encoded ``class`` column as integers.
    """
    column_names = list(range(41)) + ['class']
    frame = pd.read_csv('datasets/biodeg.csv', names=column_names, sep=";")

    # Whole-frame substitutions, applied in this order.
    for label, code in (('RB', 0), ('NRB', 1)):
        frame = frame.replace(label, code)

    target = frame['class'].values.astype(int)
    features = frame.drop('class', axis=1).values
    return Dataset(features, target)

def load_tic_tac():
    """Load the tic-tac-toe endgame data set with all symbols encoded.

    Reads the header-less CSV ``datasets/tic-tac-toe.data`` as nine numbered
    board columns followed by a ``class`` column. Board symbols are mapped
    x -> 3, o -> 4, b -> 5 and the outcome negative -> 0, positive -> 1.

    Returns:
        Dataset: ``data`` is the integer-encoded board (nine columns);
        ``target`` is the encoded outcome.

    Raises:
        ValueError: if a board cell or outcome holds a value missing from
            its mapping (the unmapped entry becomes NaN and the integer cast
            rejects it).
    """
    columns = list(range(9)) + ['class']
    cell_codes = {'x': 3, 'o': 4, 'b': 5}
    outcome_codes = {'negative': 0, 'positive': 1}

    dataset = pd.read_csv('datasets/tic-tac-toe.data', names=columns)

    # Map each group of columns with its own table and cast explicitly, so
    # the resulting dtypes do not depend on pandas inferring them.
    y = dataset['class'].map(outcome_codes).astype(int).values
    board = dataset.drop('class', axis=1)
    X = board.apply(lambda column: column.map(cell_codes)).astype(int).values
    return Dataset(X, y)

def load_segmentation():
    """Load the image-segmentation data set.

    Reads the CSV ``datasets/segmentation.test`` as a ``class`` column
    followed by 19 numbered feature columns, and encodes the seven region
    labels as the integers 1-7.

    Returns:
        Dataset: ``data`` holds the 19 feature columns; ``target`` holds the
        encoded ``class`` column as integers.
    """
    column_names = ['class'] + list(range(19))
    frame = pd.read_csv('datasets/segmentation.test', names=column_names)

    # The position in this tuple (counting from 1) is the label's code.
    region_labels = (
        'BRICKFACE',
        'SKY',
        'FOLIAGE',
        'CEMENT',
        'WINDOW',
        'PATH',
        'GRASS',
    )
    for code, label in enumerate(region_labels, start=1):
        frame = frame.replace(label, code)

    target = frame['class'].values.astype(int)
    features = frame.drop('class', axis=1).values
    return Dataset(features, target)

def load_phishing_websites():
    """Load the phishing-websites data set.

    Reads the header-less CSV ``datasets/phishing-websites.csv`` as an
    ``id`` column, 30 numbered feature columns and a ``class`` column.
    Class value -1 is recoded to 0.

    Returns:
        Dataset: ``data`` holds the 30 numbered feature columns; ``target``
        holds the recoded ``class`` column as integers.
    """
    columns = ['id'] + [x for x in range(30)] + ['class']
    dataset = pd.read_csv('datasets/phishing-websites.csv', names=columns)

    dataset['class'] = dataset['class'].replace(-1, 0)

    y = dataset['class'].values.astype(int)
    # Drop 'id' together with the label: a row identifier is not a feature,
    # and leaving it in would let it dominate distance-based classifiers.
    # NOTE(review): assumes the first CSV column really is a row id, as its
    # name here says - confirm against the file.
    X = dataset.drop(['id', 'class'], axis=1).values
    return Dataset(X, y)

In [None]:
# Pick the data set used by every experiment below.
# Alternative tried earlier: dataset = load_breast_cancer()
dataset = load_phishing_websites()
X, y = dataset.data, dataset.target


In [None]:
class Ensemble(object):
    """Stacking ensemble: base classifiers feeding an MLP meta-classifier.

    The "supports" of a sample are the ``predict_proba`` outputs of all base
    classifiers placed side by side. The meta-classifier is trained either
    on the supports alone or on the supports followed by the raw features.

    A single meta-classifier is kept, and each ``fit_meta_classifier_*``
    call replaces it. Pair ``fit_meta_classifier_supports`` with
    ``predict_meta`` and ``fit_meta_classifier_features`` with
    ``predict_meta_features``.
    """

    def __init__(self, base_classifiers):
        """Store the base classifiers and create an unfitted meta-classifier.

        Args:
            base_classifiers: sequence of estimators providing ``fit`` and
                ``predict_proba``.
        """
        self.base_classifiers = base_classifiers
        self.meta_classifier = MLPClassifier(max_iter=1000)

    def fit_base_classifiers(self, X, y):
        """Fit every base classifier on the same ``X`` and ``y``."""
        for classifier in self.base_classifiers:
            classifier.fit(X, y)

    # The method was originally published under this misspelled name; keep
    # it so existing callers continue to work.
    fit_base_classfiers = fit_base_classifiers

    def fit_meta_classifier_supports(self, X, y):
        """Train a fresh meta-classifier on the supports of ``X``."""
        self.meta_classifier = MLPClassifier(max_iter=1000)
        self.meta_classifier.fit(self.get_supports(X), y)

    def fit_meta_classifier_features(self, X, y):
        """Train a fresh meta-classifier on supports plus raw features."""
        self.meta_classifier = MLPClassifier(max_iter=10000)
        self.meta_classifier.fit(self._supports_with_features(X), y)

    def predict_meta(self, X):
        """Predict with a meta-classifier that was trained on supports only."""
        return self.meta_classifier.predict(self.get_supports(X))

    def predict_meta_features(self, X):
        """Predict with a meta-classifier trained on supports plus features."""
        return self.meta_classifier.predict(self._supports_with_features(X))

    def get_supports(self, X):
        """Return the base classifiers' probabilities, joined column-wise.

        Args:
            X: samples to score; passed unchanged to each ``predict_proba``.

        Returns:
            numpy.ndarray: one row per sample, holding the ``predict_proba``
            output of every base classifier in order. With no base
            classifiers the result has shape ``(len(X), 0)``.
        """
        supports = [clf.predict_proba(X) for clf in self.base_classifiers]
        if not supports:
            return np.empty((len(X), 0))
        # One concatenate call replaces the incremental build-up, which
        # compared an ndarray against [] to detect the first iteration.
        return np.concatenate(supports, axis=1)

    def _supports_with_features(self, X):
        """Return the supports of ``X`` followed by the columns of ``X``."""
        return np.concatenate((self.get_supports(X), X), axis=1)

In [None]:
# Named base classifiers, shared by the stacking ensemble and the voting
# baseline. An RBF-kernel SVC (gamma='auto') and MultinomialNB were tried
# earlier and are currently left out.
voting_bc = [
    ('svm_lin', SVC(probability=True, kernel='linear')),
    ('knn3', KNeighborsClassifier(n_neighbors=3)),
    ('knn5', KNeighborsClassifier(n_neighbors=5)),
    ('mlp', MLPClassifier(max_iter=1000)),
]
bc_names = [name for name, _ in voting_bc]
estimators = [clf for _, clf in voting_bc]

ens = Ensemble(estimators)

# The misspelled name is kept on purpose: the evaluation cell refers to it.
voitng_clf = VotingClassifier(voting_bc)

In [None]:
# Cross-validated comparison of: the stacked ensemble trained on supports
# only, the stacked ensemble trained on supports plus raw features, the
# voting classifier, and each base classifier alone. `results` ends up with
# one row of accuracies per fold.
score_columns = ['no_features', 'features', 'voting', 'svm_lin', 'knn3', 'knn5', 'mlp']
fold_scores = []

# Seeded so the base/meta split below is reproducible between runs.
split_rng = np.random.RandomState(5)

kf = StratifiedKFold(n_splits=100, random_state=5, shuffle=True)
for train_index, test_index in kf.split(X, y):
    # Divide the training fold into two disjoint halves: one fits the base
    # classifiers, the other fits the meta-classifier, so the meta-classifier
    # learns from supports of samples the base classifiers did not train on.
    # The indices are shuffled first so the halves do not simply follow the
    # order in which the fold's indices arrive.
    shuffled = split_rng.permutation(train_index)
    train_base, train_ensemble = np.split(shuffled, [len(shuffled) // 2])

    X_train = X[train_base]
    y_train = y[train_base]
    ens.fit_base_classfiers(X_train, y_train)

    X_ens = X[train_ensemble]
    y_ens = y[train_ensemble]
    ens.fit_meta_classifier_supports(X_ens, y_ens)

    X_test = X[test_index]
    y_test = y[test_index]
    ens_score = accuracy_score(y_test, ens.predict_meta(X_test))

    # Refit the meta-classifier, this time on supports plus raw features.
    ens.fit_meta_classifier_features(X_ens, y_ens)
    ens_score_features = accuracy_score(y_test, ens.predict_meta_features(X_test))

    # The voting baseline is trained on the same samples as the base
    # classifiers.
    voitng_clf.fit(X_train, y_train)
    voting = accuracy_score(y_test, voitng_clf.predict(X_test))

    # Stand-alone accuracy of each base classifier, in the order
    # svm_lin, knn3, knn5, mlp.
    base_scores = [
        accuracy_score(y_test, classifier.predict(X_test))
        for classifier in ens.base_classifiers
    ]

    fold_scores.append([ens_score, ens_score_features, voting] + base_scores)

# Build the frame once instead of growing it row by row.
results = pd.DataFrame(fold_scores, columns=score_columns)

In [None]:
# Accuracy of each fitted base classifier on X_test / y_test, i.e. on
# whatever test fold the cross-validation loop assigned last.
for base_clf in ens.base_classifiers:
    fold_predictions = base_clf.predict(X_test)
    print(accuracy_score(fold_predictions, y_test))

In [None]:
# Show the full table of per-fold accuracies.
results

In [None]:
# Mean accuracy of each method across folds.
results.mean()

In [None]:
# Best single-fold accuracy of each method.
results.max()

In [None]:
# Worst single-fold accuracy of each method.
results.min()

In [None]:
# Median accuracy of each method across folds.
results.median()