# Preprocessing

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

def preprocessing(dataset, _predictors, _class):
    
    if dataset == 'census':
        le = LabelEncoder()
        _predictors[:,1] = le.fit_transform(_predictors[:,1])
        _predictors[:,3] = le.fit_transform(_predictors[:,3])
        _predictors[:,5] = le.fit_transform(_predictors[:,5])
        _predictors[:,6] = le.fit_transform(_predictors[:,6])
        _predictors[:,7] = le.fit_transform(_predictors[:,7])
        _predictors[:,8] = le.fit_transform(_predictors[:,8])
        _predictors[:,9] = le.fit_transform(_predictors[:,9])
        _predictors[:,13] = le.fit_transform(_predictors[:,13])
        
        ct = ColumnTransformer([('one_hot_encoder', OneHotEncoder(categories='auto'), 
                                             [1,3,5,6,7,8,9,13])],   
                                             remainder='passthrough')
        
        _predictors = ct.fit_transform(_predictors).toarray()
        _class = le.fit_transform(_class)

        scaler = StandardScaler()
        _predictors = scaler.fit_transform(_predictors)
        
    if dataset == 'credit_data':
        scaler = StandardScaler()
        _predictors = scaler.fit_transform(_predictors)
        
    return _predictors, _class

def data_acquisition(dataset):
    base = pd.read_csv('../dataset/' + dataset + '.csv')
    
    if dataset == 'census':
        _predictors = base.iloc[:, 0:14].values
        _class = base.iloc[:, 14].values
    
    if dataset == 'credit_data':
        base.loc[base.age < 0, 'age'] = base['age'][base.age > 0].mean()
        base.fillna(base.mean(), inplace=True)
        
        _predictors = base.iloc[:, 1:4].values
        _class = base.iloc[:, 4].values
        
    return preprocessing(dataset, _predictors, _class)

# Dimensionality reduction

In [2]:
from sklearn import decomposition
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def dim_reduction(method, _predictors, _class=None, n_components=6, kernel=None):
    if method == 'PCA':
        pca = decomposition.PCA(n_components = n_components)
        return pca.fit_transform(_predictors)
    
    elif method == 'KernelPCA':
        kpca = KernlePCA(n_components = n_components, kernel = kernel)
        return kpca.fit_transform(_predictors)
    
    elif method == 'LDA':
        lda = LinearDiscriminantAnalysis(n_components = n_components)
        return lda.fit_transform(_predictors, _class)
    
    else:
        return _predictors

# Dataset split and train

In [3]:
from sklearn import model_selection
from sklearn.metrics import confusion_matrix, accuracy_score

In [4]:
def train(model, _predictors, _class, split='nomal', test_size=None, random_state=0):
    
    if split == 'normal':
        
        _predictors_train, _predictors_test, _class_train, _class_test = model_selection.train_test_split(_predictors, 
                                                                                          _class, 
                                                                                          test_size=test_size, 
                                                                                          random_state=random_state)
        model.fit(_predictors_train, _class_train)
        predictions = model.predict(_predictors_test)
        accuracy = accuracy_score(_class_test, predictions)
        matrix = confusion_matrix(_class_test, predictions)
        
        return accuracy #, matrix

    if split == 'cross_validation':
        accuracy = model_selection.cross_val_score(model, _predictors, _class, cv = 10)
        
        return accuracy.mean()
    
    if split == 'stratified_fold':
        kfold = model_selection.StratifiedKFold(n_splits = 10, shuffle = True, random_state = random_state)

        accuracy = []
        matrices = []

        for train, test in kfold.split(_predictors, np.zeros(shape=(_predictors.shape[0], 1))):

            model.fit(_predictors[train], _class[train])
            predictions = model.predict(_predictors[test])
            accuracy.append(accuracy_score(_class[test], predictions))
            matrices.append(confusion_matrix(_class[test], predictions))

        return np.asarray(accuracy).mean() #, np.mean(matrices, axis = 0)

# Models

In [5]:
_predictors, _class = data_acquisition('credit_data')

#### ZeroR

In [6]:
import collections
counter = collections.Counter(_class)
base_line = counter[0] / (counter[0] + counter[1])

#### Naive Bayes

In [7]:
from sklearn.naive_bayes import GaussianNB
Naive_Bayes = GaussianNB()
Naive_Bayes_accuracy = train(Naive_Bayes, _predictors, _class, split = 'cross_validation')

#### Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier
Decision_Tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
Decision_Tree_accuracy = train(Decision_Tree, _predictors, _class, split = 'cross_validation')

#### Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
Random_Forest = RandomForestClassifier(n_estimators = 40, criterion = 'entropy', random_state = 0)
Random_Forest_accuracy = train(Random_Forest, _predictors, _class, split = 'cross_validation')

#### KNN

In [10]:
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
KNN_accuracy = train(KNN, _predictors, _class, split = 'cross_validation')

#### Regressão Logística

In [11]:
from sklearn.linear_model import LogisticRegression
Logistic_Regression = LogisticRegression(random_state = 0)
Logistic_Regression_accuracy = train(Logistic_Regression, _predictors, _class, split = 'cross_validation')

#### SVM

In [12]:
from sklearn.svm import SVC
SVM = SVC(kernel = 'linear', random_state = 0)
SVM_accuracy = train(SVM, _predictors, _class, split = 'cross_validation')

#### Neural Networks

In [13]:
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(verbose = False, 
                    max_iter = 2000, 
                    tol = 0.000010, 
                    solver = 'adam', 
                    hidden_layer_sizes = (100),
                    activation = 'relu')
MLP_accuracy = train(MLP, _predictors, _class, split = 'normal')

# Results

In [14]:
RED = "\033[1;31m" 
RESET = '\033[0;0m'
NEGRITO = '\033[1m'
models = {'Base Line': base_line,'Naive Bayes': Naive_Bayes_accuracy, 'Decision Tree': Decision_Tree_accuracy,
          'Random Forest': Random_Forest_accuracy, 'KNN': KNN_accuracy, 'Logistic Regression': Logistic_Regression_accuracy,
          'SVM': SVM_accuracy, 'MLP': MLP_accuracy}

for model in models:
    print('{}{}{}: {}{:.4f}%{}'.format(NEGRITO,model,RESET,RED,models[model], RESET))

[1mBase Line[0;0m: [1;31m0.8585%[0;0m
[1mNaive Bayes[0;0m: [1;31m0.9240%[0;0m
[1mDecision Tree[0;0m: [1;31m0.9870%[0;0m
[1mRandom Forest[0;0m: [1;31m0.9865%[0;0m
[1mKNN[0;0m: [1;31m0.9800%[0;0m
[1mLogistic Regression[0;0m: [1;31m0.9480%[0;0m
[1mSVM[0;0m: [1;31m0.9475%[0;0m
[1mMLP[0;0m: [1;31m0.9960%[0;0m
