# Experiments

## Ensembling two classifiers 


1. Normalizacja wsparć poszczególnych klasyfikatorów?


In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_breast_cancer, load_wine, load_digits, load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn import preprocessing


In [2]:
class Dataset(object):
    def __init__(self, data, target):
        self.data = data
        self.target = target

def load_mammographic(): 
    columns = ['bi_rads', 'age', 'shape', 'margin', 'density', 'class']

    mammographic = pd.read_csv('datasets/mammographic_masses.data', names=columns)
    mammographic = mammographic.replace('?', np.nan)
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    dataset = pd.DataFrame(imp.fit_transform(mammographic), columns=columns)

    y = dataset['class'].values.astype(int)
    X = dataset.drop('class', axis=1).values
    return Dataset(X, y)

def load_abalone(): 
    columns = ['sex', 'len', 'dia', 'h', 'ww', 'sw', 'vw', 'shw', 'class']

    dataset = pd.read_csv('datasets/abalone.data', names=columns)
    dataset = dataset.replace('M', 0)
    dataset = dataset.replace('F', 1)
    dataset = dataset.replace('I', 2)
#     mammographic = mammographic.replace('?', np.nan)
#     imp = SimpleImputer(missing_values=np.nan, strategy='median')
#     dataset = pd.DataFrame(imp.fit_transform(mammographic), columns=columns)

    y = dataset['class'].values.astype(int)
    X = dataset.drop('class', axis=1).values
    return Dataset(X, y)

def load_vihno_verde(): 

    dataset = pd.read_csv('datasets/winequality-white.csv', header=0, sep=';')
#     mammographic = mammographic.replace('?', np.nan)
#     imp = SimpleImputer(missing_values=np.nan, strategy='median')
#     dataset = pd.DataFrame(imp.fit_transform(mammographic), columns=columns)

    y = dataset['quality'].values.astype(int)
    X = dataset.drop('quality', axis=1).values
    return Dataset(X, y)


def load_balance(): 
    columns = ['class', 'lw', 'ld', 'rw', 'rd']
    dataset = pd.read_csv('datasets/balance-scale.data', names=columns)
    
    dataset = dataset.replace('L', 0)
    dataset = dataset.replace('B', 1)
    dataset = dataset.replace('R', 2)

    y = dataset['class'].values.astype(int)
    X = dataset.drop('class', axis=1).values
    return Dataset(X, y)

def load_yeast(): 
    columns = ['name','xz','mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'class']
    dataset = pd.read_csv('datasets/yeast.data', names=columns, delim_whitespace=True)
    dataset = dataset.drop('name', axis=1)
    
    dataset = dataset.replace('CYT', 0)
    dataset = dataset.replace('NUC', 1)
    dataset = dataset.replace('MIT', 2)
    dataset = dataset.replace('ME3', 3)
    dataset = dataset.replace('ME2', 4)
    dataset = dataset.replace('ME1', 5)
    dataset = dataset.replace('EXC', 6)
    dataset = dataset.replace('VAC', 7)
    dataset = dataset.replace('POX', 8)
    dataset = dataset.replace('ERL', 9)

    y = dataset['class'].values.astype(int)
    X = dataset.drop('class', axis=1).values
    return Dataset(X, y)

def load_biodeg(): 
    columns = [x for x in range(41)]
    columns.append('class')
    dataset = pd.read_csv('datasets/biodeg.csv', names=columns, sep=";")
    
    dataset = dataset.replace('RB', 0)
    dataset = dataset.replace('NRB', 1)

    y = dataset['class'].values.astype(int)
    X = dataset.drop('class', axis=1).values
    return Dataset(X, y)

def load_tic_tac(): 
    columns = [x for x in range(9)]
    columns.append('class')
    dataset = pd.read_csv('datasets/tic-tac-toe.data', names=columns)
    
    dataset = dataset.replace('x', 3)
    dataset = dataset.replace('o', 4)
    dataset = dataset.replace('b', 5)
    
    dataset = dataset.replace('negative', 0)
    dataset = dataset.replace('positive', 1)
    
    y = dataset['class'].values.astype(int)
    X = dataset.drop('class', axis=1).values
    return Dataset(X, y)

def load_segmentation(): 
    columns = ['class'] + [x for x in range(19)]
    dataset = pd.read_csv('datasets/segmentation.test', names=columns)
    
    dataset = dataset.replace('BRICKFACE', 1)
    dataset = dataset.replace('SKY', 2)
    dataset = dataset.replace('FOLIAGE', 3)
    dataset = dataset.replace('CEMENT', 4)
    dataset = dataset.replace('WINDOW', 5)
    dataset = dataset.replace('PATH', 6)
    dataset = dataset.replace('GRASS', 7)
    
    y = dataset['class'].values.astype(int)
    X = dataset.drop('class', axis=1).values
    return Dataset(X, y)

def load_phishing_websites(): 
    columns = ['id'] + [x for x in range(30)] + ['class']
    dataset = pd.read_csv('datasets/phishing-websites.csv', names=columns)
    dataset = dataset.sample(100)
    
    dataset['class'] = dataset['class'].replace(-1, 0)
    
    y = dataset['class'].values.astype(int)
    X = dataset.drop('class', axis=1).values
    return Dataset(X, y)

def load_indian_liver(): 
    columns = [x for x in range(10)] + ['class']
    dataset = pd.read_csv('datasets/indian-liver.csv', names=columns)
    dataset = dataset.replace('Male', 0)
    dataset = dataset.replace('Female', 1)

    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    dataset = pd.DataFrame(imp.fit_transform(dataset), columns=columns)
    
    y = dataset['class'].values.astype(int)
    X = dataset.drop('class', axis=1).values
    
    return Dataset(X, y)

def load_diabetic_retinppathy(): 
    dataset = pd.read_csv('datasets/diabetic-retinopathy.csv', header=0)
    dataset = dataset.drop('id', axis=1)
    
    y = dataset['Class'].values.astype(int)
    X = dataset.drop('Class', axis=1).values
    
    return Dataset(X, y)

def load_thoracic_surgery(): 
    encoder = preprocessing.OrdinalEncoder()
    le = preprocessing.LabelEncoder()
    dataset = pd.read_csv('datasets/thoracic-surgery.csv', header=0)
    dataset = dataset.drop('id', axis=1)
    
    y = le.fit_transform(dataset['Risk1Yr'].values)
    X = dataset.drop('Risk1Yr', axis=1).values
    X = encoder.fit_transform(X)
    
    return Dataset(X, y)

def load_house_votes_84(): 
    columns = ['class'] + [x for x in range(17)]
    encoder = preprocessing.OrdinalEncoder()
    le = preprocessing.LabelEncoder()
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    
    dataset = pd.read_csv('datasets/house-votes-84.data', names=columns)
    dataset = dataset.replace('?', np.nan)
    
    y = le.fit_transform(dataset['class'].values)
    
    X = dataset.drop('class', axis=1).values
    X = imp.fit_transform(X)
    X = encoder.fit_transform(X)
    
    return Dataset(X, y)

def load_australian():
    columns = [x for x in range(14)] + ['class']
    dataset = pd.read_csv('datasets/australian.dat', names=columns, sep=" ")
    
    y = dataset['class'].values
    X = dataset.drop('class', axis=1).values
    
    return Dataset(X, y)

def load_ecoli():
    encoder = preprocessing.OrdinalEncoder()
    le = preprocessing.LabelEncoder()
    columns = [x for x in range(8)] + ['class']
    dataset = pd.read_csv('datasets/ecoli.data', names=columns, delim_whitespace=True)
    y = le.fit_transform(dataset['class'].values)
    X = dataset.drop('class', axis=1).values
    X = encoder.fit_transform(X)
    
    return Dataset(X, y)

def load_ionoshpere():
    encoder = preprocessing.OrdinalEncoder()
    le = preprocessing.LabelEncoder()
    columns = [x for x in range(34)] + ['class']
    dataset = pd.read_csv('datasets/ionosphere.data', names=columns)
    y = le.fit_transform(dataset['class'].values)
    X = dataset.drop('class', axis=1).values
    
    return Dataset(X, y)

def load_glasss():
    encoder = preprocessing.OrdinalEncoder()
    le = preprocessing.LabelEncoder()
    columns = ['id'] + [x for x in range(9)] + ['class']
    dataset = pd.read_csv('datasets/glass.data', names=columns)
    dataset = dataset.drop('id', axis=1)
    
    y = le.fit_transform(dataset['class'].values)
    X = dataset.drop('class', axis=1).values
    
    return Dataset(X, y)

def load_haberman():
    encoder = preprocessing.OrdinalEncoder()
    le = preprocessing.LabelEncoder()
    columns = [x for x in range(3)] + ['class']
    dataset = pd.read_csv('datasets/haberman.data', names=columns)
    
    y = le.fit_transform(dataset['class'].values)
    X = dataset.drop('class', axis=1).values
    
    return Dataset(X, y)

def load_letter_recognition():
    encoder = preprocessing.OrdinalEncoder()
    le = preprocessing.LabelEncoder()
    columns = ['class'] + [x for x in range(16)]
    dataset = pd.read_csv('datasets/letter-recognition.data', names=columns)
    
    y = le.fit_transform(dataset['class'].values)
    X = dataset.drop('class', axis=1).values
    
    return Dataset(X, y)

def load_horse_colic():
    encoder = preprocessing.OrdinalEncoder()
    le = preprocessing.LabelEncoder()
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    
    columns = [x for x in range(23)] + ['class'] + [25, 26, 27]
    dataset = pd.read_csv('datasets/horse-colic.data', names=columns, delim_whitespace=True)
    dataset = dataset.replace('?', np.nan)
    
    y = le.fit_transform(dataset['class'].values)
    X = dataset.drop('class', axis=1).values
    X = imp.fit_transform(X)
    
    return Dataset(X, y)

def load_heart_statlog():
    encoder = preprocessing.OrdinalEncoder()
    le = preprocessing.LabelEncoder()
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    
    columns = [x for x in range(13)] + ['class']
    dataset = pd.read_csv('datasets/heart.data', names=columns, delim_whitespace=True)
    dataset = dataset.replace('?', np.nan)
    
    y = le.fit_transform(dataset['class'].values)
    X = dataset.drop('class', axis=1).values
    X = imp.fit_transform(X)
    
    return Dataset(X, y)

def load_hepatitis():
    encoder = preprocessing.OrdinalEncoder()
    le = preprocessing.LabelEncoder()
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    
    columns = ['class'] + [x for x in range(19)]
    dataset = pd.read_csv('datasets/hepatitis.data', names=columns)
    dataset = dataset.replace('?', np.nan)
    
    print(dataset)
    
    y = le.fit_transform(dataset['class'].values)
    X = dataset.drop('class', axis=1).values
    X = imp.fit_transform(X)
    
    return Dataset(X, y)

def load_connectionist_bench():
    encoder = preprocessing.OrdinalEncoder()
    le = preprocessing.LabelEncoder()
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    
    columns = [x for x in range(60)] + ['class']
    dataset = pd.read_csv('datasets/sonar.data', names=columns)
    dataset = dataset.replace('?', np.nan)
    
    print(dataset)
    
    y = le.fit_transform(dataset['class'].values)
    X = dataset.drop('class', axis=1).values
    X = imp.fit_transform(X)
    
    return Dataset(X, y)

def load_waveform():
    encoder = preprocessing.OrdinalEncoder()
    le = preprocessing.LabelEncoder()
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    
    columns = [x for x in range(40)] + ['class']
    dataset = pd.read_csv('datasets/waveform.data', names=columns)
    dataset = dataset.replace('?', np.nan)
    
    print(dataset)
    
    y = le.fit_transform(dataset['class'].values)
    X = dataset.drop('class', axis=1).values
    X = imp.fit_transform(X)
    
    return Dataset(X, y)

In [3]:
# dataset = load_breast_cancer()
dataset = load_waveform()
X = dataset.data
y = dataset.target

X

         0     1     2     3     4     5     6     7     8     9  ...      31  \
0    -0.23 -1.21  1.20  1.23 -0.10  0.12  2.49  1.19  1.34  0.58  ...   -0.86   
1     0.38  0.38 -0.31 -0.09  1.52  1.35  1.49  3.81  2.33  1.34  ...    1.28   
2    -0.69  1.00  1.08  1.48  2.44  3.39  3.09  4.08  5.48  3.61  ...    0.29   
3     0.40  0.68  0.27  1.39  1.03 -0.32 -1.23 -0.50  0.11  0.87  ...    0.43   
4    -0.81  1.59 -0.69  1.16  4.22  4.98  4.52  2.54  5.60  4.66  ...    0.62   
5     0.59  0.77 -0.61  1.00  1.80  2.08  2.16  3.59  4.08  3.63  ...    1.23   
6    -0.15  0.13  2.27  2.39  4.00  6.14  5.36  4.08  3.81  3.89  ...    0.25   
7    -0.30 -0.42  0.25 -0.61 -1.39 -0.60  1.71  4.01  2.96  5.81  ...   -0.00   
8    -1.45  2.71  3.04  3.21  4.26  5.01  6.24  5.09  3.95  4.84  ...   -1.18   
9     0.28  0.97 -1.01 -2.34 -1.89  0.54  0.05  2.05  2.38  3.66  ...   -0.12   
10   -1.09 -0.44  1.15  0.17  2.10  3.77  2.40  5.16  5.13  3.66  ...   -0.24   
11    0.50 -1.23 -0.09  0.31

array([[-0.23, -1.21,  1.2 , ...,  0.56, -0.53,  0.29],
       [ 0.38,  0.38, -0.31, ...,  0.91, -0.79,  0.22],
       [-0.69,  1.  ,  1.08, ..., -0.09, -1.33,  1.  ],
       ...,
       [ 0.64,  0.81, -0.38, ..., -0.38, -0.7 , -0.85],
       [ 0.18,  1.65,  1.91, ..., -0.63, -0.92,  0.63],
       [ 2.05, -1.99,  1.66, ...,  1.01,  0.06,  0.61]])

In [4]:
class Ensemble(object):
    
    def __init__(self, base_classifiers):
        self.base_classifiers = base_classifiers
        self.meta_classifier = MLPClassifier(max_iter=1000)
        
    def fit_base_classfiers(self, X, y):
        for classifier in self.base_classifiers:
            classifier.fit(X,y)
            
    def fit_meta_classifier_supports(self, X, y):
        supports = self.get_supports(X)
        self.meta_classifier = MLPClassifier(max_iter=1000)
        self.meta_classifier.fit(supports, y)
        
    def fit_meta_classifier_features(self, X, y, ):
        supports = self.get_supports(X)
        supports_with_features = np.concatenate((supports, X), axis=1)
        self.meta_classifier = MLPClassifier(max_iter=10000)
        self.meta_classifier.fit(supports_with_features, y)
        
    def predict_meta(self, X):
        supports = self.get_supports(X)
        return self.meta_classifier.predict(supports)

    def predict_meta_features(self, X):
        supports = self.get_supports(X)
        supports = np.concatenate((supports, X), axis=1)
        return self.meta_classifier.predict(supports)
        
    def get_supports(self, X):
        temp = []
        for classifier in self.base_classifiers:
            if temp == []:
                temp = classifier.predict_proba(X)
            else:
                temp = np.concatenate((temp, classifier.predict_proba(X)), axis=1)
        return temp

In [5]:
estimators = [
    SVC(probability=True, kernel='linear'),
#     SVC(probability=True, kernel='rbf', gamma='auto'),
    KNeighborsClassifier(n_neighbors=3),
    KNeighborsClassifier(n_neighbors=5),
    MLPClassifier(max_iter=1000),
#     MultinomialNB()
]
ens = Ensemble(estimators)

bc_names = ['svm_lin', 'knn3', 'knn5', 'mlp']
voting_bc = list(zip(bc_names, estimators))
voitng_clf = VotingClassifier(voting_bc)

In [6]:
results = pd.DataFrame(columns=['no_features', 'features', 'voting', 'svm_lin', 'knn3', 'knn5', 'mlp'])

kf = StratifiedKFold(n_splits=10, random_state=5, shuffle=True)
for train_index, test_index in kf.split(X, y):
    train_base, train_ensemble = np.split(train_index, [int(.5*len(train_index))])
    
    X_train = X[train_index]
    y_train = y[train_index]
    ens.fit_base_classfiers(X_train, y_train)
    
    X_ens = X[train_index]
    y_ens = y[train_index]
    ens.fit_meta_classifier_supports(X_ens, y_ens)
    
    X_test = X[test_index]
    y_test = y[test_index]
    ens_score = accuracy_score(ens.predict_meta(X_test), y_test)
    
    ens.fit_meta_classifier_features(X_ens, y_ens)
    ens_score_features = accuracy_score(ens.predict_meta_features(X_test), y_test)
    
    voitng_clf.fit(X_train, y_train)
    voting = accuracy_score(voitng_clf.predict(X_test), y_test)
    
    svm_lin = accuracy_score(ens.base_classifiers[0].predict(X_test), y_test)
#     svm_rbf = accuracy_score(ens.base_classifiers[1].predict(X_test), y_test)
    knn3 = accuracy_score(ens.base_classifiers[1].predict(X_test), y_test)
    knn5 = accuracy_score(ens.base_classifiers[2].predict(X_test), y_test)
    mlp = accuracy_score(ens.base_classifiers[3].predict(X_test), y_test)
#     nb = accuracy_score(ens.base_classifiers[5].predict(X_test), y_test)
    
    
    results.loc[len(results)] = [ens_score, ens_score_features, voting, svm_lin, knn3, knn5, mlp]







In [7]:
for cls in ens.base_classifiers:
    print(accuracy_score(cls.predict(X_test), y_test))

0.8537074148296593
0.7675350701402806
0.8136272545090181
0.8236472945891784


In [8]:
results

Unnamed: 0,no_features,features,voting,svm_lin,knn3,knn5,mlp
0,0.848606,0.850598,0.830677,0.878486,0.798805,0.810757,0.838645
1,0.830677,0.830677,0.816733,0.864542,0.790837,0.804781,0.830677
2,0.814371,0.812375,0.824351,0.872255,0.794411,0.806387,0.816367
3,0.822,0.822,0.834,0.878,0.798,0.812,0.83
4,0.838,0.842,0.824,0.86,0.782,0.782,0.836
5,0.831663,0.833667,0.849699,0.875752,0.799599,0.803607,0.827655
6,0.851703,0.849699,0.853707,0.865731,0.805611,0.817635,0.849699
7,0.817635,0.819639,0.835671,0.841683,0.763527,0.803607,0.817635
8,0.825651,0.825651,0.827655,0.851703,0.793587,0.817635,0.821643
9,0.831663,0.829659,0.829659,0.853707,0.767535,0.813627,0.823647


In [9]:
results.mean()

no_features    0.831197
features       0.831597
voting         0.832615
svm_lin        0.864186
knn3           0.789391
knn5           0.807204
mlp            0.829197
dtype: float64

In [10]:
results.max()

no_features    0.851703
features       0.850598
voting         0.853707
svm_lin        0.878486
knn3           0.805611
knn5           0.817635
mlp            0.849699
dtype: float64

In [11]:
results.min()

no_features    0.814371
features       0.812375
voting         0.816733
svm_lin        0.841683
knn3           0.763527
knn5           0.782000
mlp            0.816367
dtype: float64

In [12]:
results.median()

no_features    0.831170
features       0.830168
voting         0.830168
svm_lin        0.865137
knn3           0.793999
knn5           0.808572
mlp            0.828828
dtype: float64