# Data transformation: Normalization/Standardization/Discretization

### Import libraries

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, KBinsDiscretizer
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import OrdinalEncoder

from sklearn.exceptions import ConvergenceWarning

%matplotlib inline

### Load data

In [7]:
salaryClassification = pd.read_csv('data/training.csv', sep = ';')

#treat missing values
salaryClassification = salaryClassification.replace(' ?', np.NaN)
salaryClassification[' workclass'] = salaryClassification[' workclass'].replace(np.NaN, 'Unknown')
salaryClassification[' occupation'] = salaryClassification[' occupation'].replace(np.NaN, 'Other')
salaryClassification[' native-country'] = salaryClassification[' native-country'].replace(np.NaN, 'Other')


#convert 'workclass' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["workclass"] = oe.fit_transform(salaryClassification[[" workclass"]]).astype(int)
salaryClassification = salaryClassification.drop(' workclass', 1)

#fix column names
salaryClassification['fnlwgt'] = salaryClassification[' fnlwgt']
salaryClassification = salaryClassification.drop(' fnlwgt',1)

#convert 'education' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["education"] = oe.fit_transform(salaryClassification[[" education"]]).astype(int)
salaryClassification = salaryClassification.drop(' education', 1)

#fix column names
salaryClassification['education-num'] = salaryClassification[' education-num']
salaryClassification = salaryClassification.drop(' education-num',1)

#convert 'education' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["marital-status"] = oe.fit_transform(salaryClassification[[" marital-status"]]).astype(int)
salaryClassification = salaryClassification.drop(' marital-status', 1)

#convert 'occupation'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["occupation"] = oe.fit_transform(salaryClassification[[" occupation"]]).astype(int)
salaryClassification = salaryClassification.drop(' occupation', 1)

#convert 'relationship'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["relationship"] = oe.fit_transform(salaryClassification[[" relationship"]]).astype(int)
salaryClassification = salaryClassification.drop(' relationship', 1)

#convert 'race'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["race"] = oe.fit_transform(salaryClassification[[" race"]]).astype(int)
salaryClassification = salaryClassification.drop(' race', 1)

#convert 'sex'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["sex"] = oe.fit_transform(salaryClassification[[" sex"]]).astype(int)
salaryClassification = salaryClassification.drop(' sex', 1)

#join 2 columns(capital-gain and  capital-loss) in one
salaryClassification['capital-diff'] = salaryClassification[' capital-gain'] - salaryClassification[' capital-loss']
salaryClassification = salaryClassification.drop(' capital-gain', 1)
salaryClassification = salaryClassification.drop(' capital-loss', 1)

#fix column names
salaryClassification['hours-per-week'] = salaryClassification[' hours-per-week']
salaryClassification = salaryClassification.drop(' hours-per-week',1)


#convert 'native-country' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["native-county"] = oe.fit_transform(salaryClassification[[" native-country"]]).astype(int)
salaryClassification = salaryClassification.drop(' native-country', 1)


#convert salary-classification' from categorical to numeric
salaryClassification[' salary-classification'] = [x.replace(' <=50K', '0') for x in salaryClassification[' salary-classification']]
salaryClassification[' salary-classification'] = [x.replace(' >50K', '1') for x in salaryClassification[' salary-classification']]
salaryClassification[' salary-classification'] = salaryClassification[' salary-classification'].astype(int)

salaryClassification['salary-classification'] = salaryClassification[' salary-classification']
salaryClassification = salaryClassification.drop(' salary-classification',1)



## teste 

testData = pd.read_csv('data/test.csv', sep = ';')

#treat missing values
testData = testData.replace(' ?', np.NaN)
testData[' workclass'] = testData[' workclass'].replace(np.NaN, 'Unknown')
testData[' occupation'] = testData[' occupation'].replace(np.NaN, 'Other')
testData[' native-country'] = testData[' native-country'].replace(np.NaN, 'Other')


#convert 'workclass' from categorical to numeric
oe = OrdinalEncoder()
testData["workclass"] = oe.fit_transform(testData[[" workclass"]]).astype(int)
testData = testData.drop(' workclass', 1)

#fix column name
testData['fnlwgt'] = testData[' fnlwgt']
testData = testData.drop(' fnlwgt',1)

#convert 'education' from categorical to numeric
oe = OrdinalEncoder()
testData["education"] = oe.fit_transform(testData[[" education"]]).astype(int)
testData = testData.drop(' education', 1)

#fix column name
testData['education-num'] = testData[' education-num']
testData = testData.drop(' education-num',1)

#convert 'education' from categorical to numeric
oe = OrdinalEncoder()
testData["marital-status"] = oe.fit_transform(testData[[" marital-status"]]).astype(int)
testData = testData.drop(' marital-status', 1)

#convert 'occupation'n' from categorical to numeric
oe = OrdinalEncoder()
testData["occupation"] = oe.fit_transform(testData[[" occupation"]]).astype(int)
testData = testData.drop(' occupation', 1)

#convert 'relationship'n' from categorical to numeric
oe = OrdinalEncoder()
testData["relationship"] = oe.fit_transform(testData[[" relationship"]]).astype(int)
testData = testData.drop(' relationship', 1)

#convert 'race'n' from categorical to numeric
oe = OrdinalEncoder()
testData["race"] = oe.fit_transform(testData[[" race"]]).astype(int)
testData = testData.drop(' race', 1)

#convert 'sex'n' from categorical to numeric
oe = OrdinalEncoder()
testData["sex"] = oe.fit_transform(testData[[" sex"]]).astype(int)
testData = testData.drop(' sex', 1)

#join 2 columns(capital-gain and  capital-loss) in one
testData['capital-diff'] = testData[' capital-gain'] - testData[' capital-loss']
testData = testData.drop(' capital-gain', 1)
testData = testData.drop(' capital-loss', 1)

#fix column name
testData['hours-per-week'] = testData[' hours-per-week']
testData = testData.drop(' hours-per-week',1)


#convert 'native-country' from categorical to numeric
oe = OrdinalEncoder()
testData["native-county"] = oe.fit_transform(testData[[" native-country"]]).astype(int)
testData = testData.drop(' native-country', 1)


#convert salary-classification' from categorical to numeric
testData[' salary-classification'] = [x.replace(' <=50K', '0') for x in testData[' salary-classification']]
testData[' salary-classification'] = [x.replace(' >50K', '1') for x in testData[' salary-classification']]
testData[' salary-classification'] = testData[' salary-classification'].astype(int)

testData['salary-classification'] = testData[' salary-classification']
testData = testData.drop(' salary-classification',1)

# Standardization

In [1]:
def standardScaling(train):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(train)
    return train;


def robustScaling(train):
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform(train)
    return train;

# Discretization

In [4]:
def discretize(train):
    ftd = ['age', 'hours-per-week', 'education-num', 'capital-diff']
    discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
    train[ftd] = discretizer.fit_transform(train[ftd])
    return train;

# Normalization

In [5]:
def normalize(train):
    ftd = ['age', 'hours-per-week', 'education-num', 'capital-diff']
    train[ftd] = transformer = Normalizer().fit_transform(train[ftd])
    return train;

In [None]:
def discretizeAndScale(X_train):
    X_train = discretize(X_train)
    X_train = robustScaling(X_train)
    return X_train;

# Avaliação

In [9]:
#removed linear svm. 
def evaluateTechnique(transformer):
    X_train = salaryClassification
    y_train =  salaryClassification['salary-classification']

    X_train = transformer(X_train)
    classifiers = [
        LogisticRegression(max_iter=1000),
        SGDClassifier(),
        KNeighborsClassifier(n_neighbors=5),
        SVC(max_iter=10000),

        GaussianNB(),
        DecisionTreeClassifier(),
        MLPClassifier(max_iter=10000),
        AdaBoostClassifier(),
        RandomForestClassifier(),
    ]

    names = ["Logistic regression", "SGDClassifier",
             "KNearest Neighbors (5)", 
             "SVM-rbf", 
             "Gaussian naive bayes",
             "Decision Tree", 
             "Multi-layer Perceptron", 
             "AdaBoost", "Random Forest"]
            


    for name, clf in zip(names, classifiers):
        scores = cross_validate(clf, X_train, y_train, cv=5, scoring={'accuracy', 'roc_auc'})
        print("Accuracy: %0.3f (+/- %0.3f) || AUROC %0.3f ->" % (scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2, scores['test_roc_auc'].mean()), name)
        
    return;

In [10]:
evaluateTechnique(standardScaling)

Accuracy: 0.785 (+/- 0.016) || AUROC 0.683 -> Logistic regression
Accuracy: 0.670 (+/- 0.429) || AUROC 0.565 -> SGDClassifier
Accuracy: 0.777 (+/- 0.003) || AUROC 0.672 -> KNearest Neighbors (5)
Accuracy: 0.793 (+/- 0.002) || AUROC 0.564 -> SVM-rbf
Accuracy: 0.796 (+/- 0.006) || AUROC 0.846 -> Gaussian naive bayes
Accuracy: 1.000 (+/- 0.000) || AUROC 1.000 -> Decision Tree
Accuracy: 0.816 (+/- 0.043) || AUROC 0.772 -> Multi-layer Perceptron
Accuracy: 1.000 (+/- 0.000) || AUROC 1.000 -> AdaBoost
Accuracy: 1.000 (+/- 0.000) || AUROC 1.000 -> Random Forest


In [18]:
evaluateTechnique(robustScaling)

Accuracy: 0.759 (+/- 0.000) || AUROC 0.507 -> Logistic regression
Accuracy: 0.552 (+/- 0.508) || AUROC 0.508 -> SGDClassifier
Accuracy: 0.718 (+/- 0.007) || AUROC 0.562 -> KNearest Neighbors (5)
Accuracy: 0.759 (+/- 0.000) || AUROC 0.475 -> SVM-rbf
Accuracy: 0.759 (+/- 0.000) || AUROC 0.803 -> Gaussian naive bayes
Accuracy: 1.000 (+/- 0.000) || AUROC 1.000 -> Decision Tree
Accuracy: 0.657 (+/- 0.404) || AUROC 0.634 -> Multi-layer Perceptron
Accuracy: 1.000 (+/- 0.000) || AUROC 1.000 -> AdaBoost
Accuracy: 1.000 (+/- 0.000) || AUROC 1.000 -> Random Forest


In [19]:
evaluateTechnique(discretize)

Accuracy: 0.759 (+/- 0.000) || AUROC 0.507 -> Logistic regression
Accuracy: 0.759 (+/- 0.000) || AUROC 0.507 -> SGDClassifier
Accuracy: 0.718 (+/- 0.007) || AUROC 0.562 -> KNearest Neighbors (5)
Accuracy: 0.759 (+/- 0.000) || AUROC 0.482 -> SVM-rbf
Accuracy: 0.759 (+/- 0.000) || AUROC 0.805 -> Gaussian naive bayes
Accuracy: 1.000 (+/- 0.000) || AUROC 1.000 -> Decision Tree
Accuracy: 0.759 (+/- 0.000) || AUROC 0.528 -> Multi-layer Perceptron
Accuracy: 1.000 (+/- 0.000) || AUROC 1.000 -> AdaBoost
Accuracy: 1.000 (+/- 0.000) || AUROC 1.000 -> Random Forest


In [20]:
evaluateTechnique(normalize)

Accuracy: 0.759 (+/- 0.000) || AUROC 0.507 -> Logistic regression
Accuracy: 0.552 (+/- 0.508) || AUROC 0.508 -> SGDClassifier
Accuracy: 0.718 (+/- 0.007) || AUROC 0.562 -> KNearest Neighbors (5)
Accuracy: 0.759 (+/- 0.000) || AUROC 0.485 -> SVM-rbf
Accuracy: 0.759 (+/- 0.000) || AUROC 0.803 -> Gaussian naive bayes
Accuracy: 1.000 (+/- 0.000) || AUROC 1.000 -> Decision Tree
Accuracy: 0.764 (+/- 0.019) || AUROC 0.569 -> Multi-layer Perceptron
Accuracy: 1.000 (+/- 0.000) || AUROC 1.000 -> AdaBoost
Accuracy: 1.000 (+/- 0.000) || AUROC 1.000 -> Random Forest


# Avaliação com os conjunto de dados de teste

# Standardization

In [1]:
def standardScalingTest(train,test): 
    ftd = ['age', 'hours-per-week', 'education-num', 'capital-diff']
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(train[ftd] )
    scaled_test = scaler.transform(test[ftd])
    return scaled_data, scaled_test;

def robustScalingTest(train,test):
    ftd = ['age', 'hours-per-week', 'education-num', 'capital-diff']
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform(train[ftd])
    scaled_test = scaler.transform(test[ftd])
    return scaled_data, scaled_test;

# Discretization

In [2]:
def discretizeTest(train, test):
    ftd = ['age', 'hours-per-week', 'education-num', 'capital-diff']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    train[ftd] = discretizer.fit_transform(train[ftd])
    test[ftd] = discretizer.transform(test[ftd])
    return train, test;

# Normalization 

In [3]:
def normalizeTest(train,test):
    normalizer = Normalizer()
    train = normalizer.fit_transform(train)
    test = normalizer.transform(test)
    return train, test;

# Avaliação

In [8]:
def evaluateTechniqueTestData(transformer):
    
    X_train = salaryClassification
    y_train = salaryClassification['salary-classification']

    X_test = testData
    y_test = testData['salary-classification']
    
    
    X_train, X_test = transformer(X_train, X_test)

    
    classifiers = [
        LogisticRegression(max_iter=100),
        SGDClassifier(),
        KNeighborsClassifier(n_neighbors=5),
        SVC(),
        LinearSVC(max_iter=10000),
        GaussianNB(),
        DecisionTreeClassifier(),
        MLPClassifier(max_iter=10000),
        AdaBoostClassifier(),
        RandomForestClassifier(),
    ]

    names = [
             "Logistic regression", "SGDClassifier",
             "KNearest Neighbors (5)", 
             "SVM-rbf", "SMV-linear", 
             "Gaussian naive bayes",
             "Decision Tree", 
             "Multi-layer Perceptron", 
             "AdaBoost", "Random Forest"]


    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_test)
        evaluateModel(name, y_test, predicted)    
    return;  


def evaluateModel(name, y_test, predicted):
    print("Accuracy: %0.3f || AUROC %0.3f || (Accuracy, Precision) 0:( %0.3f, %0.3f)  1:( %0.3f, %0.3f) ->" 
              % (accuracy_score(y_test,predicted), roc_auc_score(y_test, predicted),
                recall_score(y_test,predicted,pos_label=0), precision_score(y_test,predicted,pos_label=0),
                recall_score(y_test,predicted,pos_label=1), precision_score(y_test,predicted,pos_label=1)), name)
    return;

In [9]:
evaluateTechniqueTestData(standardScalingTest)

Accuracy: 0.807 || AUROC 0.653 || (Accuracy, Precision) 0:( 0.945, 0.827)  1:( 0.361, 0.669) -> Logistic regression
Accuracy: 0.802 || AUROC 0.588 || (Accuracy, Precision) 0:( 0.993, 0.797)  1:( 0.182, 0.896) -> SGDClassifier
Accuracy: 0.806 || AUROC 0.684 || (Accuracy, Precision) 0:( 0.916, 0.844)  1:( 0.451, 0.625) -> KNearest Neighbors (5)
Accuracy: 0.822 || AUROC 0.663 || (Accuracy, Precision) 0:( 0.965, 0.830)  1:( 0.362, 0.762) -> SVM-rbf
Accuracy: 0.806 || AUROC 0.636 || (Accuracy, Precision) 0:( 0.958, 0.819)  1:( 0.314, 0.698) -> SMV-linear
Accuracy: 0.801 || AUROC 0.605 || (Accuracy, Precision) 0:( 0.977, 0.804)  1:( 0.232, 0.754) -> Gaussian naive bayes
Accuracy: 0.819 || AUROC 0.697 || (Accuracy, Precision) 0:( 0.928, 0.849)  1:( 0.465, 0.667) -> Decision Tree
Accuracy: 0.825 || AUROC 0.694 || (Accuracy, Precision) 0:( 0.943, 0.846)  1:( 0.444, 0.708) -> Multi-layer Perceptron
Accuracy: 0.834 || AUROC 0.687 || (Accuracy, Precision) 0:( 0.965, 0.841)  1:( 0.410, 0.784) -> Ad

In [10]:
evaluateTechniqueTestData(discretizeTest)

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.764 || AUROC 0.500 || (Accuracy, Precision) 0:( 1.000, 0.764)  1:( 0.000, 0.000) -> Logistic regression


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.764 || AUROC 0.500 || (Accuracy, Precision) 0:( 1.000, 0.764)  1:( 0.000, 0.000) -> SGDClassifier
Accuracy: 0.722 || AUROC 0.538 || (Accuracy, Precision) 0:( 0.887, 0.779)  1:( 0.188, 0.340) -> KNearest Neighbors (5)


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.764 || AUROC 0.500 || (Accuracy, Precision) 0:( 1.000, 0.764)  1:( 0.000, 0.000) -> SVM-rbf


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.764 || AUROC 0.500 || (Accuracy, Precision) 0:( 1.000, 0.764)  1:( 0.000, 0.000) -> SMV-linear
Accuracy: 0.764 || AUROC 0.500 || (Accuracy, Precision) 0:( 1.000, 0.764)  1:( 0.000, 0.000) -> Gaussian naive bayes
Accuracy: 1.000 || AUROC 1.000 || (Accuracy, Precision) 0:( 1.000, 1.000)  1:( 1.000, 1.000) -> Decision Tree
Accuracy: 0.780 || AUROC 0.540 || (Accuracy, Precision) 0:( 0.996, 0.778)  1:( 0.084, 0.866) -> Multi-layer Perceptron
Accuracy: 1.000 || AUROC 1.000 || (Accuracy, Precision) 0:( 1.000, 1.000)  1:( 1.000, 1.000) -> AdaBoost
Accuracy: 1.000 || AUROC 1.000 || (Accuracy, Precision) 0:( 1.000, 1.000)  1:( 1.000, 1.000) -> Random Forest


In [11]:
evaluateTechniqueTestData(normalizeTest)

Accuracy: 0.764 || AUROC 0.500 || (Accuracy, Precision) 0:( 1.000, 0.764)  1:( 0.000, 0.000) -> Logistic regression
Accuracy: 0.764 || AUROC 0.500 || (Accuracy, Precision) 0:( 1.000, 0.764)  1:( 0.000, 0.000) -> SGDClassifier


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.871 || AUROC 0.804 || (Accuracy, Precision) 0:( 0.931, 0.903)  1:( 0.678, 0.752) -> KNearest Neighbors (5)


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.764 || AUROC 0.500 || (Accuracy, Precision) 0:( 1.000, 0.764)  1:( 0.000, 0.000) -> SVM-rbf


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.764 || AUROC 0.500 || (Accuracy, Precision) 0:( 1.000, 0.764)  1:( 0.000, 0.000) -> SMV-linear
Accuracy: 1.000 || AUROC 1.000 || (Accuracy, Precision) 0:( 0.999, 1.000)  1:( 1.000, 0.998) -> Gaussian naive bayes
Accuracy: 1.000 || AUROC 1.000 || (Accuracy, Precision) 0:( 1.000, 1.000)  1:( 1.000, 1.000) -> Decision Tree


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.764 || AUROC 0.500 || (Accuracy, Precision) 0:( 1.000, 0.764)  1:( 0.000, 0.000) -> Multi-layer Perceptron
Accuracy: 1.000 || AUROC 1.000 || (Accuracy, Precision) 0:( 1.000, 1.000)  1:( 1.000, 1.000) -> AdaBoost
Accuracy: 1.000 || AUROC 1.000 || (Accuracy, Precision) 0:( 1.000, 1.000)  1:( 1.000, 1.000) -> Random Forest
