# Data transformation: Normalization/Standardization/Discretization

### Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, KBinsDiscretizer
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

%matplotlib inline

### Load data

In [3]:
# Read the training set; fields are ';'-separated.
salaryClassification = pd.read_csv('data/training.csv', sep=';')

# ' ?' is treated as the missing-value marker: turn it into NaN everywhere,
# then give the three categorical columns below an explicit placeholder label.
# (np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.)
salaryClassification = salaryClassification.replace(' ?', np.nan)
salaryClassification[' workclass'] = salaryClassification[' workclass'].fillna('Unknown')
salaryClassification[' occupation'] = salaryClassification[' occupation'].fillna('Other')
salaryClassification[' native-country'] = salaryClassification[' native-country'].fillna('Other')

# Collapse capital gain and loss into one net feature and drop the originals.
salaryClassification[' capital-diff'] = (
    salaryClassification[' capital-gain'] - salaryClassification[' capital-loss']
)
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
salaryClassification = salaryClassification.drop(columns=[' capital-gain', ' capital-loss'])

# NOTE(review): the test set is loaded raw; none of the cleaning above
# (missing-value handling, ' capital-diff') is applied to it in this cell.
test = pd.read_csv('data/test.csv', sep=';')


# Standardization

In [4]:
def standardScaling(X_train):
    """Standardize the numeric features with ``StandardScaler``.

    Only the columns 'age', ' hours-per-week', ' education-num' and
    ' capital-diff' are read from ``X_train``. A fresh scaler is fitted on
    them and the result of its ``fit_transform`` call is returned; every other
    column of ``X_train`` is left out of the result.
    """
    numeric_cols = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    return StandardScaler().fit_transform(X_train[numeric_cols])

def robustScaling(X_train):
    """Scale the numeric features with ``RobustScaler``.

    Selects 'age', ' hours-per-week', ' education-num' and ' capital-diff'
    from ``X_train``, fits a new ``RobustScaler`` on that selection and returns
    the output of ``fit_transform``. Other columns are not part of the result.
    """
    selected = X_train[['age', ' hours-per-week', ' education-num', ' capital-diff']]
    robust = RobustScaler()
    return robust.fit_transform(selected)


# Discretization

In [5]:
def discretize(X_train):
    """Bin the numeric features into 3 equal-width ordinal bins.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Must contain 'age', ' hours-per-week', ' education-num' and
        ' capital-diff'. Any other columns are carried through unchanged.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X_train`` in which the four columns above hold the values
        produced by ``KBinsDiscretizer(n_bins=3, encode='ordinal',
        strategy='uniform')``. The frame passed in is not modified.
    """
    ftd = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
    # Assign into a copy: writing into the argument would silently change the
    # caller's data (e.g. the shared training frame) and triggers pandas'
    # SettingWithCopyWarning when the argument is a column selection.
    binned = X_train.copy()
    binned[ftd] = discretizer.fit_transform(X_train[ftd])
    return binned

def discretize2(X_train, X_test):
    """Bin the numeric features of a train/test pair into 5 equal-width bins.

    The discretizer is fitted on the training columns only and the fitted bin
    edges are then applied to the test columns.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Both must contain 'age', ' hours-per-week', ' education-num' and
        ' capital-diff'. Any other columns are carried through unchanged.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` copies of the inputs with the four columns replaced
        by the discretizer output. The frames passed in are not modified.
    """
    ftd = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    # Copies keep the caller's frames intact and avoid SettingWithCopyWarning
    # when the arguments are column selections of a larger frame.
    train_binned = X_train.copy()
    test_binned = X_test.copy()
    # The original indexed with the undefined name `featuresToDiscretize`,
    # which raised NameError on every call.
    train_binned[ftd] = discretizer.fit_transform(X_train[ftd])
    test_binned[ftd] = discretizer.transform(X_test[ftd])
    return train_binned, test_binned

# Normalization

In [6]:
def normalize(X_train):
    """Apply ``Normalizer`` (default settings) to the numeric features.

    Selects 'age', ' hours-per-week', ' education-num' and ' capital-diff'
    from ``X_train`` and returns the output of ``Normalizer().fit_transform``
    on that selection. Other columns are not part of the result.

    NOTE(review): per the scikit-learn documentation, ``Normalizer`` rescales
    each sample (row) rather than each feature (column), unlike the scalers
    used elsewhere in this notebook. Confirm that is the intended comparison.
    """
    ftd = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    # The original also bound the result to an unused local named
    # `transformer`; that misleading alias is dropped.
    return Normalizer().fit_transform(X_train[ftd])

def normalize2(X_train, X_test):
    """Apply one ``Normalizer`` instance to a train/test pair.

    The columns 'age', ' hours-per-week', ' education-num' and ' capital-diff'
    are selected from each frame. ``fit_transform`` is called on the training
    selection and ``transform`` on the test selection, using the same
    ``Normalizer`` object.

    Returns the pair ``(transformed_train, transformed_test)``.
    """
    cols = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    norm = Normalizer()
    train_out = norm.fit_transform(X_train[cols])
    test_out = norm.transform(X_test[cols])
    return train_out, test_out

## Combined technique

In [7]:
def discretizeAndScale(X_train):
    """Discretize the numeric features, then robust-scale the bin indices.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Must contain 'age', ' hours-per-week', ' education-num' and
        ' capital-diff'; only those columns are used.

    Returns
    -------
    The value returned by ``robustScaling`` when applied to the output of
    ``discretize`` on the four selected columns.
    """
    ftd = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    # Pass an explicit copy of the selection: a helper that assigns into the
    # frame it receives would otherwise be writing into a column selection of
    # the caller's frame, which raises pandas' SettingWithCopyWarning.
    binned = discretize(X_train[ftd].copy())
    # robustScaling selects the same four columns itself, so no re-selection
    # is needed here.
    return robustScaling(binned)

def discretizeAndScale2(X_train, X_test):
    """Discretize then robust-scale the numeric features of a train/test pair.

    Both steps are fitted on the training columns only and then applied to the
    test columns, so no statistics are derived from the test data.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Both must contain 'age', ' hours-per-week', ' education-num' and
        ' capital-diff'; only those columns are used. Neither frame is
        modified.

    Returns
    -------
    tuple
        ``(train, test)``: the ``RobustScaler`` outputs for the discretized
        training and test columns.

    Notes
    -----
    The original body assigned the tuple returned by ``discretize2`` to a
    single name and then indexed it with a column list (TypeError), called
    ``robustScaling2``, which is not defined in the visible notebook, and
    returned ``X_test`` untransformed. Both steps are therefore performed
    inline here, with the same discretizer settings as ``discretize2``.
    NOTE(review): confirm this matches what ``robustScaling2`` was meant to do.
    """
    ftd = ['age', ' hours-per-week', ' education-num', ' capital-diff']

    # Step 1: 5 equal-width ordinal bins, edges learned from the training data.
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    train_binned = discretizer.fit_transform(X_train[ftd])
    test_binned = discretizer.transform(X_test[ftd])

    # Step 2: robust scaling of the bin indices, again fitted on train only.
    scaler = RobustScaler()
    train_scaled = scaler.fit_transform(train_binned)
    test_scaled = scaler.transform(test_binned)
    return train_scaled, test_scaled



# Evaluation

In [10]:
def evaluateTechnique(transformer, random_state=None):
    """Cross-validate a fixed set of classifiers on transformed training data.

    Parameters
    ----------
    transformer : callable
        Called with a copy of the module-level ``salaryClassification`` frame;
        its return value is used as the feature matrix (e.g.
        ``standardScaling``).
    random_state : int or None, optional
        Seed forwarded to the estimators that take a ``random_state``
        argument, so repeated runs can be compared. The default ``None``
        keeps the previous unseeded behaviour.

    Returns
    -------
    None
        One line per classifier is printed: mean accuracy, twice the standard
        deviation of accuracy across the 5 folds, and mean ROC AUC.
    """
    y_train = salaryClassification[' salary-classification']

    # Give the transformer its own copy so that transformers which assign into
    # their argument cannot alter the shared frame between evaluations.
    # NOTE(review): the copy still contains the target column; a transformer
    # that returns every column would leak the label into the features.
    X_train = transformer(salaryClassification.copy())

    # Name/estimator pairs are kept together so the two cannot drift apart.
    # The "SMV-linear" label is kept verbatim so output stays comparable with
    # earlier runs.
    models = [
        ("Logistic regression", LogisticRegression(random_state=random_state)),
        ("SGDClassifier", SGDClassifier(random_state=random_state)),
        ("KNearest Neighbors (5)", KNeighborsClassifier(n_neighbors=5)),
        ("SVM-rbf", SVC(random_state=random_state)),
        ("SMV-linear", LinearSVC(max_iter=10000, random_state=random_state)),
        ("Gaussian naive bayes", GaussianNB()),
        ("Decision Tree", DecisionTreeClassifier(random_state=random_state)),
        ("Multi-layer Perceptron", MLPClassifier(max_iter=10000, random_state=random_state)),
        ("AdaBoost", AdaBoostClassifier(random_state=random_state)),
        ("Random Forest", RandomForestClassifier(random_state=random_state)),
    ]

    for name, clf in models:
        # Multiple metrics are passed as a list: a set is not among the
        # documented types for `scoring` and is rejected by recent
        # scikit-learn parameter validation.
        scores = cross_validate(clf, X_train, y_train, cv=5, scoring=['accuracy', 'roc_auc'])
        print("Accuracy: %0.3f (+/- %0.3f) || AUROC %0.3f ->" % (scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2, scores['test_roc_auc'].mean()), name)

In [11]:
# Evaluate every classifier on the StandardScaler-transformed numeric features.
evaluateTechnique(standardScaling)

Accuracy: 0.806 (+/- 0.010) || AUROC 0.818 -> Logistic regression
Accuracy: 0.798 (+/- 0.004) || AUROC 0.781 -> SGDClassifier
Accuracy: 0.802 (+/- 0.004) || AUROC 0.785 -> KNearest Neighbors (5)
Accuracy: 0.822 (+/- 0.004) || AUROC 0.831 -> SVM-rbf
Accuracy: 0.805 (+/- 0.009) || AUROC 0.817 -> SMV-linear
Accuracy: 0.798 (+/- 0.006) || AUROC 0.836 -> Gaussian naive bayes
Accuracy: 0.814 (+/- 0.008) || AUROC 0.785 -> Decision Tree
Accuracy: 0.827 (+/- 0.005) || AUROC 0.854 -> Multi-layer Perceptron
Accuracy: 0.835 (+/- 0.003) || AUROC 0.861 -> AdaBoost
Accuracy: 0.820 (+/- 0.004) || AUROC 0.830 -> Random Forest
