# Data transformation: Normalization/Standardization/Discretization

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, KBinsDiscretizer
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import OrdinalEncoder

from sklearn.exceptions import ConvergenceWarning

%matplotlib inline

### Load train dataset

In [2]:
salaryClassification = pd.read_csv('data/training.csv', sep = ';')

X_train = salaryClassification.drop(' salary-classification', 1)
y_train =  salaryClassification[' salary-classification']

#treat missing values
salaryClassification = salaryClassification.replace(' ?', np.NaN)
salaryClassification[' workclass'] = salaryClassification[' workclass'].replace(np.NaN, 'Unknown')
salaryClassification[' occupation'] = salaryClassification[' occupation'].replace(np.NaN, 'Other')
salaryClassification[' native-country'] = salaryClassification[' native-country'].replace(np.NaN, 'Other')


#convert 'workclass' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["workclass"] = oe.fit_transform(salaryClassification[[" workclass"]]).astype(int)
salaryClassification = salaryClassification.drop(' workclass', 1)

#fix column names
salaryClassification['fnlwgt'] = salaryClassification[' fnlwgt']
salaryClassification = salaryClassification.drop(' fnlwgt',1)

#convert 'education' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["education"] = oe.fit_transform(salaryClassification[[" education"]]).astype(int)
salaryClassification = salaryClassification.drop(' education', 1)

#fix column names
salaryClassification['education-num'] = salaryClassification[' education-num']
salaryClassification = salaryClassification.drop(' education-num',1)

#convert 'education' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["marital-status"] = oe.fit_transform(salaryClassification[[" marital-status"]]).astype(int)
salaryClassification = salaryClassification.drop(' marital-status', 1)

#convert 'occupation'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["occupation"] = oe.fit_transform(salaryClassification[[" occupation"]]).astype(int)
salaryClassification = salaryClassification.drop(' occupation', 1)

#convert 'relationship'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["relationship"] = oe.fit_transform(salaryClassification[[" relationship"]]).astype(int)
salaryClassification = salaryClassification.drop(' relationship', 1)

#convert 'race'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["race"] = oe.fit_transform(salaryClassification[[" race"]]).astype(int)
salaryClassification = salaryClassification.drop(' race', 1)

#convert 'sex'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["sex"] = oe.fit_transform(salaryClassification[[" sex"]]).astype(int)
salaryClassification = salaryClassification.drop(' sex', 1)

#join 2 columns(capital-gain and  capital-loss) in one
salaryClassification['capital-diff'] = salaryClassification[' capital-gain'] - salaryClassification[' capital-loss']
salaryClassification = salaryClassification.drop(' capital-gain', 1)
salaryClassification = salaryClassification.drop(' capital-loss', 1)

#fix column names
salaryClassification['hours-per-week'] = salaryClassification[' hours-per-week']
salaryClassification = salaryClassification.drop(' hours-per-week',1)


#convert 'native-country' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["native-county"] = oe.fit_transform(salaryClassification[[" native-country"]]).astype(int)
salaryClassification = salaryClassification.drop(' native-country', 1)


#convert salary-classification' from categorical to numeric
salaryClassification[' salary-classification'] = [x.replace(' <=50K', '0') for x in salaryClassification[' salary-classification']]
salaryClassification[' salary-classification'] = [x.replace(' >50K', '1') for x in salaryClassification[' salary-classification']]
salaryClassification[' salary-classification'] = salaryClassification[' salary-classification'].astype(int)

salaryClassification['salary-classification'] = salaryClassification[' salary-classification']
salaryClassification = salaryClassification.drop(' salary-classification',1)

### Load test dataset

In [3]:
testData = pd.read_csv('data/test.csv', sep = ';')

X_test = testData.drop(' salary-classification', 1)
y_test =  testData[' salary-classification']

#treat missing values
testData = testData.replace(' ?', np.NaN)
testData[' workclass'] = testData[' workclass'].replace(np.NaN, 'Unknown')
testData[' occupation'] = testData[' occupation'].replace(np.NaN, 'Other')
testData[' native-country'] = testData[' native-country'].replace(np.NaN, 'Other')


#convert 'workclass' from categorical to numeric
oe = OrdinalEncoder()
testData["workclass"] = oe.fit_transform(testData[[" workclass"]]).astype(int)
testData = testData.drop(' workclass', 1)

#fix column name
testData['fnlwgt'] = testData[' fnlwgt']
testData = testData.drop(' fnlwgt',1)

#convert 'education' from categorical to numeric
oe = OrdinalEncoder()
testData["education"] = oe.fit_transform(testData[[" education"]]).astype(int)
testData = testData.drop(' education', 1)

#fix column name
testData['education-num'] = testData[' education-num']
testData = testData.drop(' education-num',1)

#convert 'education' from categorical to numeric
oe = OrdinalEncoder()
testData["marital-status"] = oe.fit_transform(testData[[" marital-status"]]).astype(int)
testData = testData.drop(' marital-status', 1)

#convert 'occupation'n' from categorical to numeric
oe = OrdinalEncoder()
testData["occupation"] = oe.fit_transform(testData[[" occupation"]]).astype(int)
testData = testData.drop(' occupation', 1)

#convert 'relationship'n' from categorical to numeric
oe = OrdinalEncoder()
testData["relationship"] = oe.fit_transform(testData[[" relationship"]]).astype(int)
testData = testData.drop(' relationship', 1)

#convert 'race'n' from categorical to numeric
oe = OrdinalEncoder()
testData["race"] = oe.fit_transform(testData[[" race"]]).astype(int)
testData = testData.drop(' race', 1)

#convert 'sex'n' from categorical to numeric
oe = OrdinalEncoder()
testData["sex"] = oe.fit_transform(testData[[" sex"]]).astype(int)
testData = testData.drop(' sex', 1)

#join 2 columns(capital-gain and  capital-loss) in one
testData['capital-diff'] = testData[' capital-gain'] - testData[' capital-loss']
testData = testData.drop(' capital-gain', 1)
testData = testData.drop(' capital-loss', 1)

#fix column name
testData['hours-per-week'] = testData[' hours-per-week']
testData = testData.drop(' hours-per-week',1)


#convert 'native-country' from categorical to numeric
oe = OrdinalEncoder()
testData["native-county"] = oe.fit_transform(testData[[" native-country"]]).astype(int)
testData = testData.drop(' native-country', 1)


#convert salary-classification' from categorical to numeric
testData[' salary-classification'] = [x.replace(' <=50K', '0') for x in testData[' salary-classification']]
testData[' salary-classification'] = [x.replace(' >50K', '1') for x in testData[' salary-classification']]
testData[' salary-classification'] = testData[' salary-classification'].astype(int)

testData['salary-classification'] = testData[' salary-classification']
testData = testData.drop(' salary-classification',1)

# Standardization

In [3]:
def standardScaling(train):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(train)
    return train;


def robustScaling(train):
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform(train)
    return train;

# Discretization

In [4]:
def discretize(train):
    ftd = ['age', 'hours-per-week', 'education-num', 'capital-diff']
    discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
    train[ftd] = discretizer.fit_transform(train[ftd])
    return train;

# Normalization

In [5]:
def normalize(train):
    train = transformer = Normalizer().fit_transform(train)
    return train;

# Avaliação

In [10]:
#removed linear svm. 
def evaluateTechnique(transformer):
    X_train = salaryClassification.drop('salary-classification', 1)
    y_train =  salaryClassification['salary-classification']

    X_train = transformer(X_train)
    classifiers = [
        LogisticRegression(max_iter=1000),
        SGDClassifier(),
        KNeighborsClassifier(n_neighbors=5),
        SVC(max_iter=10000),

        GaussianNB(),
        DecisionTreeClassifier(),
        MLPClassifier(max_iter=10000),
        AdaBoostClassifier(),
        RandomForestClassifier(),
    ]

    names = ["Logistic regression", "SGDClassifier",
             "KNearest Neighbors (5)", 
             "SVM-rbf", 
             
             "Gaussian Naive Bayes",
             "Decision Tree", 
             "Multi-layer Perceptron", 
             "AdaBoost", "Random Forest"]
            
    for name, clf in zip(names, classifiers):
        scores = cross_validate(clf, X_train, y_train, cv=5, scoring={'accuracy', 'roc_auc'})
        print("Accuracy: %0.3f (+/- %0.3f) | AUROC %0.3f || " % (scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2, scores['test_roc_auc'].mean()), name)
        
    return;

In [11]:
evaluateTechnique(standardScaling)

Accuracy: 0.790 (+/- 0.017) | AUROC 0.683 ||  Logistic regression
Accuracy: 0.786 (+/- 0.007) | AUROC 0.576 ||  SGDClassifier
Accuracy: 0.777 (+/- 0.003) | AUROC 0.672 ||  KNearest Neighbors (5)
Accuracy: 0.793 (+/- 0.002) | AUROC 0.560 ||  SVM-rbf
Accuracy: 0.795 (+/- 0.006) | AUROC 0.833 ||  Gaussian Naive Bayes
Accuracy: 0.810 (+/- 0.006) | AUROC 0.745 ||  Decision Tree
Accuracy: 0.701 (+/- 0.342) | AUROC 0.658 ||  Multi-layer Perceptron
Accuracy: 0.860 (+/- 0.010) | AUROC 0.914 ||  AdaBoost
Accuracy: 0.856 (+/- 0.008) | AUROC 0.904 ||  Random Forest


In [12]:
evaluateTechnique(robustScaling)

Accuracy: 0.790 (+/- 0.017) | AUROC 0.683 ||  Logistic regression
Accuracy: 0.673 (+/- 0.433) | AUROC 0.572 ||  SGDClassifier
Accuracy: 0.777 (+/- 0.003) | AUROC 0.672 ||  KNearest Neighbors (5)
Accuracy: 0.793 (+/- 0.002) | AUROC 0.560 ||  SVM-rbf
Accuracy: 0.795 (+/- 0.006) | AUROC 0.833 ||  Gaussian Naive Bayes
Accuracy: 0.808 (+/- 0.007) | AUROC 0.744 ||  Decision Tree
Accuracy: 0.623 (+/- 0.439) | AUROC 0.643 ||  Multi-layer Perceptron
Accuracy: 0.860 (+/- 0.010) | AUROC 0.914 ||  AdaBoost
Accuracy: 0.856 (+/- 0.007) | AUROC 0.904 ||  Random Forest


In [13]:
evaluateTechnique(discretize)

Accuracy: 0.759 (+/- 0.000) | AUROC 0.507 ||  Logistic regression
Accuracy: 0.759 (+/- 0.000) | AUROC 0.507 ||  SGDClassifier
Accuracy: 0.717 (+/- 0.007) | AUROC 0.560 ||  KNearest Neighbors (5)
Accuracy: 0.759 (+/- 0.000) | AUROC 0.491 ||  SVM-rbf
Accuracy: 0.759 (+/- 0.000) | AUROC 0.724 ||  Gaussian Naive Bayes
Accuracy: 0.772 (+/- 0.008) | AUROC 0.691 ||  Decision Tree
Accuracy: 0.759 (+/- 0.001) | AUROC 0.544 ||  Multi-layer Perceptron
Accuracy: 0.834 (+/- 0.013) | AUROC 0.883 ||  AdaBoost
Accuracy: 0.806 (+/- 0.002) | AUROC 0.843 ||  Random Forest


In [14]:
evaluateTechnique(normalize)

Accuracy: 0.777 (+/- 0.001) | AUROC 0.625 ||  Logistic regression
Accuracy: 0.776 (+/- 0.001) | AUROC 0.625 ||  SGDClassifier
Accuracy: 0.789 (+/- 0.003) | AUROC 0.789 ||  KNearest Neighbors (5)
Accuracy: 0.785 (+/- 0.002) | AUROC 0.717 ||  SVM-rbf
Accuracy: 0.793 (+/- 0.003) | AUROC 0.773 ||  Gaussian Naive Bayes
Accuracy: 0.789 (+/- 0.010) | AUROC 0.718 ||  Decision Tree
Accuracy: 0.794 (+/- 0.005) | AUROC 0.735 ||  Multi-layer Perceptron
Accuracy: 0.809 (+/- 0.003) | AUROC 0.837 ||  AdaBoost
Accuracy: 0.840 (+/- 0.008) | AUROC 0.887 ||  Random Forest


# Avaliação com os conjunto de dados de teste

# Standardization

In [4]:
def standardScalingTest(train,test): 
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(train)
    scaled_test = scaler.transform(test)
    return scaled_data, scaled_test;

def robustScalingTest(train,test):
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform(train)
    scaled_test = scaler.transform(test)
    return scaled_data, scaled_test;

# Discretization

In [5]:
def discretizeTest(train, test):
    ftd = ['age', 'hours-per-week', 'education-num', 'capital-diff']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    train[ftd] = discretizer.fit_transform(train[ftd])
    test[ftd] = discretizer.transform(test[ftd])
    return train, test;

# Normalization 

In [6]:
def normalizeTest(train,test):
    normalizer = Normalizer()
    train = normalizer.fit_transform(train)
    test = normalizer.transform(test)
    return train, test;

# Avaliação

In [12]:
def evaluateTechniqueTestData(transformer):
    
    X_train = salaryClassification.drop('salary-classification', 1)
    y_train =  salaryClassification['salary-classification']

    X_test = testData.drop('salary-classification', 1)
    y_test = testData['salary-classification']
    
    
    X_train, X_test = transformer(X_train, X_test)

    
    classifiers = [
        LogisticRegression(max_iter=1000),
        SGDClassifier(),
        KNeighborsClassifier(n_neighbors=5),
        SVC(),
        GaussianNB(),
        DecisionTreeClassifier(),
        MLPClassifier(max_iter=10000),
        AdaBoostClassifier(),
        RandomForestClassifier(),
    ]

    names = [
             "Logistic regression", "SGDClassifier",
             "KNearest Neighbors (5)", 
             "SVM-rbf", 
             "Gaussian naive bayes",
             "Decision Tree", 
             "Multi-layer Perceptron", 
             "AdaBoost", "Random Forest"]


    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_test)
        evaluateModel(name, y_test, predicted)    
    return;  


def evaluateModel(name, y_test, predicted):
    print("Accuracy: %0.3f | AUROC %0.3f | (Accuracy, Precision) 0:( %0.3f, %0.3f)  1:( %0.3f, %0.3f) || " 
              % (accuracy_score(y_test,predicted), roc_auc_score(y_test, predicted),
                recall_score(y_test,predicted,pos_label=0), precision_score(y_test,predicted,pos_label=0),
                recall_score(y_test,predicted,pos_label=1), precision_score(y_test,predicted,pos_label=1)), name)
    return;

In [13]:
evaluateTechniqueTestData(standardScalingTest)

Accuracy: 0.819 | AUROC 0.684 | (Accuracy, Precision) 0:( 0.940, 0.842)  1:( 0.428, 0.689) ||  Logistic regression
Accuracy: 0.813 | AUROC 0.632 | (Accuracy, Precision) 0:( 0.975, 0.816)  1:( 0.289, 0.779) ||  SGDClassifier
Accuracy: 0.826 | AUROC 0.735 | (Accuracy, Precision) 0:( 0.907, 0.870)  1:( 0.563, 0.652) ||  KNearest Neighbors (5)
Accuracy: 0.846 | AUROC 0.735 | (Accuracy, Precision) 0:( 0.946, 0.865)  1:( 0.524, 0.749) ||  SVM-rbf
Accuracy: 0.825 | AUROC 0.669 | (Accuracy, Precision) 0:( 0.965, 0.833)  1:( 0.373, 0.765) ||  Gaussian naive bayes
Accuracy: 0.810 | AUROC 0.740 | (Accuracy, Precision) 0:( 0.872, 0.878)  1:( 0.607, 0.595) ||  Decision Tree
Accuracy: 0.851 | AUROC 0.767 | (Accuracy, Precision) 0:( 0.926, 0.884)  1:( 0.608, 0.718) ||  Multi-layer Perceptron
Accuracy: 0.861 | AUROC 0.769 | (Accuracy, Precision) 0:( 0.942, 0.883)  1:( 0.597, 0.761) ||  AdaBoost
Accuracy: 0.856 | AUROC 0.769 | (Accuracy, Precision) 0:( 0.933, 0.885)  1:( 0.606, 0.736) ||  Random Forest

In [14]:
evaluateTechniqueTestData(robustScalingTest)

Accuracy: 0.821 | AUROC 0.685 | (Accuracy, Precision) 0:( 0.942, 0.842)  1:( 0.429, 0.695) ||  Logistic regression
Accuracy: 0.801 | AUROC 0.660 | (Accuracy, Precision) 0:( 0.927, 0.832)  1:( 0.393, 0.625) ||  SGDClassifier
Accuracy: 0.846 | AUROC 0.767 | (Accuracy, Precision) 0:( 0.916, 0.886)  1:( 0.617, 0.695) ||  KNearest Neighbors (5)
Accuracy: 0.802 | AUROC 0.618 | (Accuracy, Precision) 0:( 0.968, 0.810)  1:( 0.269, 0.719) ||  SVM-rbf
Accuracy: 0.817 | AUROC 0.646 | (Accuracy, Precision) 0:( 0.970, 0.822)  1:( 0.323, 0.768) ||  Gaussian naive bayes
Accuracy: 0.810 | AUROC 0.740 | (Accuracy, Precision) 0:( 0.872, 0.878)  1:( 0.608, 0.596) ||  Decision Tree
Accuracy: 0.829 | AUROC 0.751 | (Accuracy, Precision) 0:( 0.899, 0.880)  1:( 0.603, 0.649) ||  Multi-layer Perceptron
Accuracy: 0.861 | AUROC 0.769 | (Accuracy, Precision) 0:( 0.942, 0.883)  1:( 0.597, 0.761) ||  AdaBoost
Accuracy: 0.853 | AUROC 0.766 | (Accuracy, Precision) 0:( 0.930, 0.883)  1:( 0.603, 0.727) ||  Random Forest

In [15]:
evaluateTechniqueTestData(discretizeTest)

Accuracy: 0.764 | AUROC 0.500 | (Accuracy, Precision) 0:( 1.000, 0.764)  1:( 0.000, 0.000) ||  Logistic regression


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.236 | AUROC 0.500 | (Accuracy, Precision) 0:( 0.000, 0.000)  1:( 1.000, 0.236) ||  SGDClassifier
Accuracy: 0.721 | AUROC 0.536 | (Accuracy, Precision) 0:( 0.886, 0.779)  1:( 0.186, 0.336) ||  KNearest Neighbors (5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.764 | AUROC 0.500 | (Accuracy, Precision) 0:( 1.000, 0.764)  1:( 0.000, 0.000) ||  SVM-rbf
Accuracy: 0.764 | AUROC 0.500 | (Accuracy, Precision) 0:( 1.000, 0.764)  1:( 0.000, 0.000) ||  Gaussian naive bayes
Accuracy: 0.782 | AUROC 0.705 | (Accuracy, Precision) 0:( 0.851, 0.862)  1:( 0.559, 0.537) ||  Decision Tree
Accuracy: 0.765 | AUROC 0.517 | (Accuracy, Precision) 0:( 0.987, 0.770)  1:( 0.047, 0.528) ||  Multi-layer Perceptron
Accuracy: 0.838 | AUROC 0.744 | (Accuracy, Precision) 0:( 0.922, 0.873)  1:( 0.566, 0.691) ||  AdaBoost
Accuracy: 0.815 | AUROC 0.724 | (Accuracy, Precision) 0:( 0.897, 0.866)  1:( 0.551, 0.622) ||  Random Forest


In [16]:
evaluateTechniqueTestData(normalizeTest)

Accuracy: 0.782 | AUROC 0.542 | (Accuracy, Precision) 0:( 0.996, 0.779)  1:( 0.088, 0.885) ||  Logistic regression
Accuracy: 0.779 | AUROC 0.534 | (Accuracy, Precision) 0:( 0.997, 0.776)  1:( 0.072, 0.890) ||  SGDClassifier
Accuracy: 0.789 | AUROC 0.695 | (Accuracy, Precision) 0:( 0.873, 0.854)  1:( 0.516, 0.557) ||  KNearest Neighbors (5)
Accuracy: 0.789 | AUROC 0.560 | (Accuracy, Precision) 0:( 0.994, 0.786)  1:( 0.126, 0.858) ||  SVM-rbf
Accuracy: 0.794 | AUROC 0.575 | (Accuracy, Precision) 0:( 0.990, 0.792)  1:( 0.160, 0.830) ||  Gaussian naive bayes
Accuracy: 0.790 | AUROC 0.721 | (Accuracy, Precision) 0:( 0.852, 0.871)  1:( 0.590, 0.552) ||  Decision Tree
Accuracy: 0.797 | AUROC 0.597 | (Accuracy, Precision) 0:( 0.977, 0.801)  1:( 0.217, 0.744) ||  Multi-layer Perceptron
Accuracy: 0.808 | AUROC 0.633 | (Accuracy, Precision) 0:( 0.965, 0.817)  1:( 0.302, 0.725) ||  AdaBoost
Accuracy: 0.844 | AUROC 0.749 | (Accuracy, Precision) 0:( 0.929, 0.874)  1:( 0.569, 0.712) ||  Random Forest