In [1]:
import sys
sys.path.append('../')
from Modules import AdaBoost, KNN, DecisionTree, PolyLogisticRegression, LogisticRegression, MLP, RandomForest, SVM
import pandas as pd
import numpy as np
import time

In [2]:
"""
Importing dataset.
We can try different preprocessed datasets here (e.g. PCA, SVD, etc.).
Set DATASET to one of the keys of DATASET_FILES; nothing else needs editing.
"""
# Directory that holds every preprocessed CSV, relative to this notebook.
DATA_DIR = '../Data/'

# Dataset name -> (train file, test file). Keeping both files of a variant in
# one entry makes it impossible to load a train/test pair from different variants.
DATASET_FILES = {
    'original': ('adult_train_preprocessed.csv', 'adult_test_preprocessed.csv'),
    'pca_variance': ('train_PCA_variance.csv', 'test_PCA_variance.csv'),
    'pca_manual': ('train_PCA_manual.csv', 'test_PCA_manual.csv'),
    'factor_analysis': ('train_FA.csv', 'test_FA.csv'),
    'svd': ('train_SVD.csv', 'test_SVD.csv'),
}

# Which variant to load; 'original' reproduces the previous default.
DATASET = 'original'

# Fail early with a readable message instead of a bare KeyError on a typo.
if DATASET not in DATASET_FILES:
    raise ValueError(f'Unknown DATASET {DATASET!r}; choose one of {sorted(DATASET_FILES)}')

train_file, test_file = DATASET_FILES[DATASET]
train = pd.read_csv(DATA_DIR + train_file)
test = pd.read_csv(DATA_DIR + test_file)

In [3]:
"""
Instantiate one wrapper per classifier, all built from the same train/test frames.
"""
# Build the registry in a single literal; insertion order is the order in
# which the models are later optimised and reported.
models = {
    'ab': AdaBoost.AB_model(train, test, flatten=False),
    'dt': DecisionTree.DT_model(train, test, flatten=False),
    'knn': KNN.KNN_model(train, test, flatten=False),
    'lr': LogisticRegression.LR_model(train, test, flatten=False),
    'plr': PolyLogisticRegression.PLR_model(train, test, flatten=False),
    'mlp': MLP.MLP_model(train, test, flatten=False),
    'rf': RandomForest.RF_model(train, test, flatten=False),
    # SVM is the only wrapper that is also given prob=True.
    'svm': SVM.SVM_model(train, test, prob=True, flatten=False),
}  # all models

# Keep the short per-model names available for interactive use.
ab, dt, knn, lr, plr, mlp, rf, svm = (
    models[key] for key in ('ab', 'dt', 'knn', 'lr', 'plr', 'mlp', 'rf', 'svm')
)
print(models)

{'ab': <Modules.AdaBoost.AB_model object at 0x1a21636410>, 'dt': <Modules.DecisionTree.DT_model object at 0x1a215e6d10>, 'knn': <Modules.KNN.KNN_model object at 0x1a2160aa90>, 'lr': <Modules.LogisticRegression.LR_model object at 0x1a21636510>, 'plr': <Modules.PolyLogisticRegression.PLR_model object at 0x1a21636550>, 'mlp': <Modules.MLP.MLP_model object at 0x1a21636590>, 'rf': <Modules.RandomForest.RF_model object at 0x1a216365d0>, 'svm': <Modules.SVM.SVM_model object at 0x1a1fdba610>}


In [4]:
%%time

# Find the optimal models and print test scores.
# `params` maps a model key to the hyper-parameter grid passed to its
# optimize_model(); `results` maps the same key to the resulting test scores.
params = {}
results = {}

# Optimal params for default dataset.
# Every grid lists a single value per hyper-parameter.
# NOTE(review): presumably optimize_model() searches over these lists, so a
# single value pins the setting - confirm against the Modules package.
params['mlp'] = {'hidden_layer_sizes':[(64,32,16,8)],
                'learning_rate_init':[0.001],
                'activation':['tanh'],
                'alpha':[0.001],
                'batch_size':[256]}
params['lr'] = {"C":[0.8], "max_iter":[10000]} # default solver lbfgs only supports l2 penalties according to sklearn doc
params['plr'] = {"C":[0.6], "max_iter":[10000]} # default solver lbfgs only supports l2 penalties according to sklearn doc
params['rf']={'max_depth': [15],
              'criterion': ["entropy"],
              'min_samples_split': [2],
              'max_features': [10]}
params['svm'] = {"coef0": [0.0],
                 "kernel":['rbf']}
params['ab'] = {'n_estimators':[200], 'learning_rate':[1]}
params['knn'] = {'n_neighbors':[20] }
params['dt'] = {'criterion': ['entropy'], 'max_depth': [10], 'min_samples_split': [2], 'splitter': ['best']}

for k, model in models.items():
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    model.optimize_model(params[k])
    model.get_test_score()  # stores its result on model.test_score, read on the next line
    results[k] = model.test_score
    end = time.perf_counter()
    print('==============================================================================================================================')
    print(f'{k}: {results[k]}')
    print(f'best_param: {model.best_param}')
    print(f'{k} took {end-start} seconds')

ab: {'accuracy': 0.8606241699867198, 'precision': 0.7601559961000975, 'recall': 0.6321621621621621, 'f1_score': 0.6902759333038218}
best_param: {'learning_rate': 1, 'n_estimators': 200}
ab took 23.52608370780945 seconds
dt: {'accuracy': 0.8529880478087649, 'precision': 0.7686189443239335, 'recall': 0.5745945945945946, 'f1_score': 0.6575935663470462}
best_param: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 2, 'splitter': 'best'}
dt took 0.5689668655395508 seconds
knn: {'accuracy': 0.8387782204515273, 'precision': 0.7172131147540983, 'recall': 0.5675675675675675, 'f1_score': 0.6336753168376583}
best_param: {'n_neighbors': 20}
knn took 50.72565197944641 seconds
lr: {'accuracy': 0.8440239043824701, 'precision': 0.7252417472490831, 'recall': 0.5878378378378378, 'f1_score': 0.6493506493506495}
best_param: {'C': 0.8, 'max_iter': 10000}
lr took 2.2353689670562744 seconds
plr: {'accuracy': 0.8487383798140771, 'precision': 0.7210820895522388, 'recall': 0.6267567567567568, 'f1_s

In [5]:
"""
Turn the per-model accuracies into weights that sum to 1.
Any other score stored in `results` could be used the same way.
"""

# Accuracies in the insertion order of `results`, i.e. the model order printed below.
accuracy_list = [scores['accuracy'] for scores in results.values()]
# Each weight is the model's share of the total accuracy.
weights = np.divide(accuracy_list, np.sum(accuracy_list))
print('All models....')
print(f'The order of models are : {list(results)}')
print(f'Their accuracies are : {accuracy_list}')
print(f'Thus, the weights are : {weights}\n')

All models....
The order of models are : ['ab', 'dt', 'knn', 'lr', 'plr', 'mlp', 'rf', 'svm']
Their accuracies are : [0.8606241699867198, 0.8529880478087649, 0.8387782204515273, 0.8440239043824701, 0.8487383798140771, 0.8519920318725099, 0.8583665338645419, 0.850929614873838]
Thus, the weights are : [0.12644261 0.12532072 0.12323301 0.12400371 0.12469636 0.12517438
 0.12611092 0.12501829]



### Applying Ensemble method (VotingClassifier)

In [6]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

# (name, estimator) pairs built from the `best_model` stored on each wrapper.
best_estimators = [(name, wrapper.best_model) for name, wrapper in models.items()]

# The last column is used as the label; all preceding columns are the features.
X_train, y_train = train.values[:, :-1], train.values[:, -1]
X_test, y_test = test.values[:, :-1], test.values[:, -1].astype(int)

# Per-ensemble registries, keyed by ensemble name in later cells.
ensemble_models, ensemble_preds, ensemble_scores = {}, {}, {}

print(best_estimators)

[('ab', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
                   n_estimators=200, random_state=None)), ('dt', DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                     weights='uniform')), ('lr', LogisticRegression(C=0.8, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',


In [7]:
%%time

"""
Final ensemble model - soft voting with uniform weight.
Fits the voting ensemble on the training split, predicts the test split and
stores the model, its predictions and its scores under `model_name`.
"""

model_name = 'soft_uniform'
# perf_counter() is monotonic, so the measured duration cannot be skewed
# by system clock adjustments the way time.time() differences can.
start = time.perf_counter()
# weights=None leaves every estimator equally weighted.
voter = VotingClassifier(estimators=best_estimators,voting='soft',weights=None,n_jobs=-1)
ensemble_models[model_name] = voter.fit(X_train,y_train)  # fit() returns the fitted estimator itself
preds = voter.predict(X_test)
ensemble_preds[model_name] = preds
# Score the same predictions with every metric, in a fixed order.
ensemble_scores[model_name] = {
    metric: scorer(y_test, preds)
    for metric, scorer in (('accuracy', accuracy_score),
                           ('precision', precision_score),
                           ('recall', recall_score),
                           ('f1_score', f1_score))
}
end = time.perf_counter()
print('===============================================================================================================')
print(f'{model_name} result: {ensemble_scores[model_name]}')
print(f'{model_name} took {end-start} seconds')

soft_uniform result: {'accuracy': 0.8567065073041169, 'precision': 0.7695804195804196, 'recall': 0.5948648648648649, 'f1_score': 0.6710365853658536}
soft_uniform took 203.2453351020813 seconds
CPU times: user 20 s, sys: 551 ms, total: 20.6 s
Wall time: 3min 23s
