# Data transformation: Normalization/Standardization/Discretization

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, KBinsDiscretizer
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

%matplotlib inline

### Load data

In [69]:
salaryClassification = pd.read_csv('data/training.csv', sep = ';')

salaryClassification = salaryClassification.replace(' ?', np.NaN)
salaryClassification[' workclass'] = salaryClassification[' workclass'].replace(np.NaN, 'Unknown')
salaryClassification[' occupation'] = salaryClassification[' occupation'].replace(np.NaN, 'Other')
salaryClassification[' native-country'] = salaryClassification[' native-country'].replace(np.NaN, 'Other')
salaryClassification[' capital-diff'] = salaryClassification[' capital-gain'] - salaryClassification[' capital-loss']

salaryClassification = salaryClassification.drop(' capital-gain', 1)
salaryClassification = salaryClassification.drop(' capital-loss', 1)


salaryClassification[' salary-classification'] = [x.replace(' <=50K', '0') for x in salaryClassification[' salary-classification']]
salaryClassification[' salary-classification'] = [x.replace(' >50K', '1') for x in salaryClassification[' salary-classification']]
salaryClassification[' salary-classification'] = salaryClassification[' salary-classification'].astype(int)

test = pd.read_csv('data/test.csv', sep =';')

test[' capital-diff'] = test[' capital-gain'] - test[' capital-loss']

test = test.drop(' capital-gain', 1)
test = test.drop(' capital-loss', 1)

test[' salary-classification'] = [x.replace(' <=50K', '0') for x in test[' salary-classification']]
test[' salary-classification'] = [x.replace(' >50K', '1') for x in test[' salary-classification']]
test[' salary-classification'] = test[' salary-classification'].astype(int)


# Standardization

In [56]:
def standardScaling(train):
    ftd = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(train[ftd])
    return scaled_data;


def robustScaling(train):
    ftd = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform(train[ftd])
    return scaled_data;

# Discretization

In [57]:
def discretize(train):
    ftd = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
    train[ftd] = discretizer.fit_transform(train[ftd])
    return train;

# Normalization

In [58]:
def normalize(train):
    ftd = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    train = transformer = Normalizer().fit_transform(train[ftd])
    return train;

## Técnica combinada

In [59]:
def discretizeAndScale(train):
    ftd = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    train = discretize(train[ftd])
    train = robustScaling(train[ftd])
    return train;

# Avaliação

In [60]:
def evaluateTechnique(transformer):
    X_train = salaryClassification
    y_train =  salaryClassification[' salary-classification']

    X_train = transformer(X_train)
    
    classifiers = [
        LogisticRegression(),
        SGDClassifier(),
        KNeighborsClassifier(n_neighbors=5),
        SVC(),
        LinearSVC(max_iter=10000),
        GaussianNB(),
        DecisionTreeClassifier(),
        MLPClassifier(max_iter=10000),
        AdaBoostClassifier(),
        RandomForestClassifier(),
    ]

    names = ["Logistic regression", "SGDClassifier",
             "KNearest Neighbors (5)", 
             "SVM-rbf", "SMV-linear", 
             "Gaussian naive bayes",
             "Decision Tree", 
             "Multi-layer Perceptron", 
             "AdaBoost", "Random Forest"]
            


    for name, clf in zip(names, classifiers):
        scores = cross_validate(clf, X_train, y_train, cv=5, scoring={'accuracy', 'roc_auc'})
        print("Accuracy: %0.3f (+/- %0.3f) || AUROC %0.3f ->" % (scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2, scores['test_roc_auc'].mean()), name)
        
    return;

In [61]:
evaluateTechnique(standardScaling)

Accuracy: 0.806 (+/- 0.010) || AUROC 0.818 -> Logistic regression
Accuracy: 0.798 (+/- 0.005) || AUROC 0.799 -> SGDClassifier
Accuracy: 0.802 (+/- 0.004) || AUROC 0.785 -> KNearest Neighbors (5)
Accuracy: 0.822 (+/- 0.004) || AUROC 0.831 -> SVM-rbf
Accuracy: 0.805 (+/- 0.009) || AUROC 0.817 -> SMV-linear
Accuracy: 0.798 (+/- 0.006) || AUROC 0.836 -> Gaussian naive bayes
Accuracy: 0.813 (+/- 0.007) || AUROC 0.785 -> Decision Tree
Accuracy: 0.827 (+/- 0.005) || AUROC 0.854 -> Multi-layer Perceptron
Accuracy: 0.835 (+/- 0.003) || AUROC 0.861 -> AdaBoost
Accuracy: 0.821 (+/- 0.003) || AUROC 0.830 -> Random Forest


In [62]:
evaluateTechnique(robustScaling)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.806 (+/- 0.010) || AUROC 0.818 -> Logistic regression
Accuracy: 0.774 (+/- 0.016) || AUROC 0.713 -> SGDClassifier
Accuracy: 0.825 (+/- 0.004) || AUROC 0.814 -> KNearest Neighbors (5)
Accuracy: 0.802 (+/- 0.003) || AUROC 0.821 -> SVM-rbf




Accuracy: 0.784 (+/- 0.018) || AUROC 0.727 -> SMV-linear
Accuracy: 0.797 (+/- 0.006) || AUROC 0.834 -> Gaussian naive bayes
Accuracy: 0.813 (+/- 0.007) || AUROC 0.785 -> Decision Tree
Accuracy: 0.800 (+/- 0.026) || AUROC 0.794 -> Multi-layer Perceptron
Accuracy: 0.835 (+/- 0.003) || AUROC 0.861 -> AdaBoost
Accuracy: 0.820 (+/- 0.002) || AUROC 0.830 -> Random Forest


In [63]:
evaluateTechnique(discretize)

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1342, in fit
    X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype,
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validatio

Accuracy: nan (+/- nan) || AUROC nan -> Logistic regression
Accuracy: nan (+/- nan) || AUROC nan -> SGDClassifier
Accuracy: nan (+/- nan) || AUROC nan -> KNearest Neighbors (5)


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/neighbors/_base.py", line 1131, in fit
    X, y = self._validate_data(X, y, accept_sparse="csr",
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 598, in c

Accuracy: nan (+/- nan) || AUROC nan -> SVM-rbf
Accuracy: nan (+/- nan) || AUROC nan -> SMV-linear
Accuracy: nan (+/- nan) || AUROC nan -> Gaussian naive bayes


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/svm/_classes.py", line 227, in fit
    X, y = self._validate_data(X, y, accept_sparse='csr',
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 598, in check

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 890, in fit
    super().fit(
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 156, in fit
    X, y = self._validate_data(X, y,
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 429, in _validate_data
    X = check_array(X, **check_X_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 598, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/opt/anaconda3/lib/python3.8/site-packages/numpy/core/_asarray.py", line 83, in asarray
    return array(a, dtype, copy

Accuracy: nan (+/- nan) || AUROC nan -> Decision Tree
Accuracy: nan (+/- nan) || AUROC nan -> Multi-layer Perceptron
Accuracy: nan (+/- nan) || AUROC nan -> AdaBoost
Accuracy: nan (+/- nan) || AUROC nan -> Random Forest


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_weight_boosting.py", line 443, in fit
    return super().fit(X, y, sample_weight)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_weight_boosting.py", line 130, in fit
    sample_weight, estimator_weight, estimator_error = self._boost(
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_weight_boosting.py", line 503, in _boost
    return self._boost_real(iboost, X, y, sample_weight, random_state)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_weight_boosting.py", line 513, in _boost_real
    estimator.fit(X, y, sample_weight=sample_weight)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 890, in fit
    super().fit(
  File "

In [64]:
evaluateTechnique(normalize)

Accuracy: 0.780 (+/- 0.017) || AUROC 0.739 -> Logistic regression
Accuracy: 0.764 (+/- 0.001) || AUROC 0.740 -> SGDClassifier
Accuracy: 0.752 (+/- 0.048) || AUROC 0.686 -> KNearest Neighbors (5)
Accuracy: 0.784 (+/- 0.008) || AUROC 0.718 -> SVM-rbf
Accuracy: 0.764 (+/- 0.001) || AUROC 0.739 -> SMV-linear
Accuracy: 0.764 (+/- 0.001) || AUROC 0.736 -> Gaussian naive bayes
Accuracy: 0.785 (+/- 0.008) || AUROC 0.756 -> Decision Tree
Accuracy: 0.785 (+/- 0.009) || AUROC 0.755 -> Multi-layer Perceptron
Accuracy: 0.785 (+/- 0.008) || AUROC 0.756 -> AdaBoost
Accuracy: 0.785 (+/- 0.008) || AUROC 0.756 -> Random Forest


In [65]:
evaluateTechnique(discretizeAndScale)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[ftd] = discretizer.fit_transform(train[ftd])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.iloc._setitem_with_indexer((slice(None), indexer), value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_array(key, value)


Accuracy: 0.783 (+/- 0.006) || AUROC 0.757 -> Logistic regression
Accuracy: 0.764 (+/- 0.001) || AUROC 0.749 -> SGDClassifier
Accuracy: 0.772 (+/- 0.034) || AUROC 0.678 -> KNearest Neighbors (5)
Accuracy: 0.785 (+/- 0.007) || AUROC 0.675 -> SVM-rbf
Accuracy: 0.785 (+/- 0.007) || AUROC 0.757 -> SMV-linear
Accuracy: 0.764 (+/- 0.001) || AUROC 0.742 -> Gaussian naive bayes
Accuracy: 0.785 (+/- 0.007) || AUROC 0.760 -> Decision Tree
Accuracy: 0.785 (+/- 0.006) || AUROC 0.761 -> Multi-layer Perceptron
Accuracy: 0.785 (+/- 0.008) || AUROC 0.761 -> AdaBoost
Accuracy: 0.785 (+/- 0.007) || AUROC 0.761 -> Random Forest


# Avaliação com os conjunto de dados de teste

# Standardization

In [66]:
def standardScalingTest(train,test): 
    ftd = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(train[ftd] )
    scaled_test = scaler.transform(test[ftd])
    return scaled_data, scaled_test;

def robustScalingTest(train,test):
    ftd = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform(train[ftd])
    scaled_test = scaler.transform(test[ftd])
    return scaled_data, scaled_test;

# Discretization

# Normalization 

# Técnica combinada

# Avaliação

In [70]:
def evaluateTechniqueTestData(transformer):
    
    X_train = salaryClassification
    y_train = salaryClassification[' salary-classification']

    X_test = test
    y_test = test[' salary-classification']
    
    
    X_train, X_test = transformer(X_train, X_test)

    
    classifiers = [
        LogisticRegression(),
        SGDClassifier(),
        KNeighborsClassifier(n_neighbors=5),
        SVC(),
        LinearSVC(max_iter=10000),
        GaussianNB(),
        GaussianProcessClassifier(),
        DecisionTreeClassifier(),
        MLPClassifier(max_iter=10000),
        AdaBoostClassifier(),
        RandomForestClassifier(),
    ]

    names = [
             "Logistic regression", "SGDClassifier",
             "KNearest Neighbors (5)", 
             "SVM-rbf", "SMV-linear", 
             "Gaussian naive bayes",
             "Gaussian Process", 
             "Decision Tree", 
             "Multi-layer Perceptron", 
             "AdaBoost", "Random Forest"]


    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_test)
        evaluateModel(name, y_test, predicted)    
    return;  


def evaluateModel(name, y_test, predicted):
    print("Accuracy: %0.3f || AUROC %0.3f || (Accuracy, Precision) 0:( %0.3f, %0.3f)  1:( %0.3f, %0.3f) ->" 
              % (accuracy_score(y_test,predicted), roc_auc_score(y_test, predicted),
                recall_score(y_test,predicted,pos_label=0), precision_score(y_test,predicted,pos_label=0),
                recall_score(y_test,predicted,pos_label=1), precision_score(y_test,predicted,pos_label=1)), name)
    return;

In [None]:
evaluateTechniqueTestData(standardScalingTest)

Accuracy: 0.807 || AUROC 0.653 || (Accuracy, Precision) 0:( 0.945, 0.827)  1:( 0.361, 0.669) -> Logistic regression
Accuracy: 0.803 || AUROC 0.608 || (Accuracy, Precision) 0:( 0.978, 0.806)  1:( 0.238, 0.769) -> SGDClassifier
Accuracy: 0.806 || AUROC 0.684 || (Accuracy, Precision) 0:( 0.916, 0.844)  1:( 0.451, 0.625) -> KNearest Neighbors (5)
Accuracy: 0.822 || AUROC 0.663 || (Accuracy, Precision) 0:( 0.965, 0.830)  1:( 0.362, 0.762) -> SVM-rbf
Accuracy: 0.806 || AUROC 0.636 || (Accuracy, Precision) 0:( 0.958, 0.819)  1:( 0.314, 0.698) -> SMV-linear
Accuracy: 0.801 || AUROC 0.605 || (Accuracy, Precision) 0:( 0.977, 0.804)  1:( 0.232, 0.754) -> Gaussian naive bayes
