# Data transformation: Normalization/Standardization/Discretization

### Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, KBinsDiscretizer
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import OrdinalEncoder

%matplotlib inline

### Load data

In [24]:
salaryClassification = pd.read_csv('data/training.csv', sep = ';')

#treat missing values
salaryClassification = salaryClassification.replace(' ?', np.NaN)
salaryClassification[' workclass'] = salaryClassification[' workclass'].replace(np.NaN, 'Unknown')
salaryClassification[' occupation'] = salaryClassification[' occupation'].replace(np.NaN, 'Other')
salaryClassification[' native-country'] = salaryClassification[' native-country'].replace(np.NaN, 'Other')


#convert 'workclass' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["workclass"] = oe.fit_transform(salaryClassification[[" workclass"]]).astype(int)
salaryClassification = salaryClassification.drop(' workclass', 1)

#fix column names
salaryClassification['fnlwgt'] = salaryClassification[' fnlwgt']
salaryClassification = salaryClassification.drop(' fnlwgt',1)

#convert 'education' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["education"] = oe.fit_transform(salaryClassification[[" education"]]).astype(int)
salaryClassification = salaryClassification.drop(' education', 1)

#fix column names
salaryClassification['education-num'] = salaryClassification[' education-num']
salaryClassification = salaryClassification.drop(' education-num',1)

#convert 'education' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["marital-status"] = oe.fit_transform(salaryClassification[[" marital-status"]]).astype(int)
salaryClassification = salaryClassification.drop(' marital-status', 1)

#convert 'occupation'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["occupation"] = oe.fit_transform(salaryClassification[[" occupation"]]).astype(int)
salaryClassification = salaryClassification.drop(' occupation', 1)

#convert 'relationship'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["relationship"] = oe.fit_transform(salaryClassification[[" relationship"]]).astype(int)
salaryClassification = salaryClassification.drop(' relationship', 1)

#convert 'race'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["race"] = oe.fit_transform(salaryClassification[[" race"]]).astype(int)
salaryClassification = salaryClassification.drop(' race', 1)

#convert 'sex'n' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["sex"] = oe.fit_transform(salaryClassification[[" sex"]]).astype(int)
salaryClassification = salaryClassification.drop(' sex', 1)

#join 2 columns(capital-gain and  capital-loss) in one
salaryClassification['capital-diff'] = salaryClassification[' capital-gain'] - salaryClassification[' capital-loss']
salaryClassification = salaryClassification.drop(' capital-gain', 1)
salaryClassification = salaryClassification.drop(' capital-loss', 1)

#fix column names
salaryClassification['hours-per-week'] = salaryClassification[' hours-per-week']
salaryClassification = salaryClassification.drop(' hours-per-week',1)


#convert 'native-country' from categorical to numeric
oe = OrdinalEncoder()
salaryClassification["native-county"] = oe.fit_transform(salaryClassification[[" native-country"]]).astype(int)
salaryClassification = salaryClassification.drop(' native-country', 1)


#convert salary-classification' from categorical to numeric
salaryClassification[' salary-classification'] = [x.replace(' <=50K', '0') for x in salaryClassification[' salary-classification']]
salaryClassification[' salary-classification'] = [x.replace(' >50K', '1') for x in salaryClassification[' salary-classification']]
salaryClassification[' salary-classification'] = salaryClassification[' salary-classification'].astype(int)

salaryClassification['salary-classification'] = salaryClassification[' salary-classification']
salaryClassification = salaryClassification.drop(' salary-classification',1)


# Standardization

In [25]:
def standardScaling(train):
    ftd = ['age', 'hours-per-week', 'education-num', 'capital-diff']
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(train[ftd])
    return train;


def robustScaling(train):
    ftd = ['age', 'hours-per-week', 'education-num', 'capital-diff']
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform(train[ftd])
    return train;

# Discretization

In [26]:
def discretize(train):
    ftd = ['age', 'hours-per-week', 'education-num', 'capital-diff']
    discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
    train[ftd] = discretizer.fit_transform(train[ftd])
    return train;

# Normalization

In [31]:
def normalize(train):
    ftd = ['age', 'hours-per-week', 'education-num', 'capital-diff']
    train[ftd] = transformer = Normalizer().fit_transform(train[ftd])
    return train;

## Técnica combinada

In [28]:
def discretizeAndScale(train):
    ftd = ['age', 'hours-per-week', 'education-num', 'capital-diff']
    train = discretize(train[ftd])
    train = robustScaling(train[ftd])
    return train;

# Avaliação

In [29]:
def evaluateTechnique(transformer):
    X_train = salaryClassification
    y_train =  salaryClassification['salary-classification']

    X_train = transformer(X_train)
    classifiers = [
        LogisticRegression(),
        SGDClassifier(),
        KNeighborsClassifier(n_neighbors=5),
        SVC(),
        LinearSVC(max_iter=10000),
        GaussianNB(),
        DecisionTreeClassifier(),
        MLPClassifier(max_iter=10000),
        AdaBoostClassifier(),
        RandomForestClassifier(),
    ]

    names = ["Logistic regression", "SGDClassifier",
             "KNearest Neighbors (5)", 
             "SVM-rbf", "SMV-linear", 
             "Gaussian naive bayes",
             "Decision Tree", 
             "Multi-layer Perceptron", 
             "AdaBoost", "Random Forest"]
            


    for name, clf in zip(names, classifiers):
        scores = cross_validate(clf, X_train, y_train, cv=5, scoring={'accuracy', 'roc_auc'})
        print("Accuracy: %0.3f (+/- %0.3f) || AUROC %0.3f ->" % (scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2, scores['test_roc_auc'].mean()), name)
        
    return;

In [30]:
evaluateTechnique(standardScaling)

       age  workclass  fnlwgt  education  education-num  marital-status  \
0       39          6   77516          9             13               4   
1       50          5   83311          9             13               2   
2       38          3  215646         11              9               0   
3       53          3  234721          1              7               2   
4       28          3  338409          9             13               2   
...    ...        ...     ...        ...            ...             ...   
32556   27          3  257302          7             12               2   
32557   40          3  154374         11              9               2   
32558   58          3  151910         11              9               6   
32559   22          3  201490         11              9               4   
32560   52          4  287927         11              9               2   

       occupation  relationship  race  sex  capital-diff  hours-per-week  \
0               0      

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.783 (+/- 0.017) || AUROC 0.685 -> Logistic regression
Accuracy: 0.788 (+/- 0.007) || AUROC 0.576 -> SGDClassifier
Accuracy: 0.777 (+/- 0.003) || AUROC 0.672 -> KNearest Neighbors (5)
Accuracy: 0.793 (+/- 0.002) || AUROC 0.564 -> SVM-rbf




Accuracy: 0.782 (+/- 0.005) || AUROC 0.660 -> SMV-linear
Accuracy: 0.796 (+/- 0.006) || AUROC 0.846 -> Gaussian naive bayes
Accuracy: 1.000 (+/- 0.000) || AUROC 1.000 -> Decision Tree
Accuracy: 0.800 (+/- 0.022) || AUROC 0.757 -> Multi-layer Perceptron
Accuracy: 1.000 (+/- 0.000) || AUROC 1.000 -> AdaBoost
Accuracy: 1.000 (+/- 0.000) || AUROC 1.000 -> Random Forest


In [None]:
evaluateTechnique(robustScaling)

       age  workclass  fnlwgt  education  education-num  marital-status  \
0       39          6   77516          9             13               4   
1       50          5   83311          9             13               2   
2       38          3  215646         11              9               0   
3       53          3  234721          1              7               2   
4       28          3  338409          9             13               2   
...    ...        ...     ...        ...            ...             ...   
32556   27          3  257302          7             12               2   
32557   40          3  154374         11              9               2   
32558   58          3  151910         11              9               6   
32559   22          3  201490         11              9               4   
32560   52          4  287927         11              9               2   

       occupation  relationship  race  sex  capital-diff  hours-per-week  \
0               0      

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.783 (+/- 0.017) || AUROC 0.685 -> Logistic regression
Accuracy: 0.672 (+/- 0.430) || AUROC 0.567 -> SGDClassifier
Accuracy: 0.777 (+/- 0.003) || AUROC 0.672 -> KNearest Neighbors (5)
Accuracy: 0.793 (+/- 0.002) || AUROC 0.564 -> SVM-rbf




In [48]:
evaluateTechnique(discretize)

ola
       age          workclass   fnlwgt    education   education-num  \
0      0.0          State-gov    77516    Bachelors             2.0   
1      1.0   Self-emp-not-inc    83311    Bachelors             2.0   
2      0.0            Private   215646      HS-grad             1.0   
3      1.0            Private   234721         11th             1.0   
4      0.0            Private   338409    Bachelors             2.0   
...    ...                ...      ...          ...             ...   
32556  0.0            Private   257302   Assoc-acdm             2.0   
32557  0.0            Private   154374      HS-grad             1.0   
32558  1.0            Private   151910      HS-grad             1.0   
32559  0.0            Private   201490      HS-grad             1.0   
32560  1.0       Self-emp-inc   287927      HS-grad             1.0   

            marital-status          occupation    relationship    race  \
0            Never-married        Adm-clerical   Not-in-family   Whit

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1342, in fit
    X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype,
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validatio

Accuracy: nan (+/- nan) || AUROC nan -> SGDClassifier
Accuracy: nan (+/- nan) || AUROC nan -> KNearest Neighbors (5)
Accuracy: nan (+/- nan) || AUROC nan -> SVM-rbf


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/neighbors/_base.py", line 1131, in fit
    X, y = self._validate_data(X, y, accept_sparse="csr",
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 598, in c

Accuracy: nan (+/- nan) || AUROC nan -> SMV-linear
Accuracy: nan (+/- nan) || AUROC nan -> Gaussian naive bayes
Accuracy: nan (+/- nan) || AUROC nan -> Decision Tree


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/naive_bayes.py", line 210, in fit
    X, y = self._validate_data(X, y)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 598, in check_array
    array = np.

Accuracy: nan (+/- nan) || AUROC nan -> Multi-layer Perceptron
Accuracy: nan (+/- nan) || AUROC nan -> AdaBoost
Accuracy: nan (+/- nan) || AUROC nan -> Random Forest


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_weight_boosting.py", line 443, in fit
    return super().fit(X, y, sample_weight)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_weight_boosting.py", line 130, in fit
    sample_weight, estimator_weight, estimator_error = self._boost(
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_weight_boosting.py", line 503, in _boost
    return self._boost_real(iboost, X, y, sample_weight, random_state)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_weight_boosting.py", line 513, in _boost_real
    estimator.fit(X, y, sample_weight=sample_weight)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 890, in fit
    super().fit(
  File "

In [None]:
evaluateTechnique(normalize)

In [None]:
evaluateTechnique(discretizeAndScale)

# Avaliação com os conjunto de dados de teste

# Standardization

In [11]:
def standardScalingTest(train,test): 
    ftd = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(train[ftd] )
    scaled_test = scaler.transform(test[ftd])
    return scaled_data, scaled_test;

def robustScalingTest(train,test):
    ftd = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform(train[ftd])
    scaled_test = scaler.transform(test[ftd])
    return scaled_data, scaled_test;

# Discretization

In [28]:
def discretizeTest(train, test):
    ftd = ['age', ' hours-per-week', ' education-num', ' capital-diff']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    train[ftd] = discretizer.fit_transform(train[ftd])
    test[ftd] = discretizer.transform(test[ftd])
    return train, test;



# Normalization 

# Técnica combinada

# Avaliação

In [29]:
def evaluateTechniqueTestData(transformer):
    
    X_train = salaryClassification
    y_train = salaryClassification[' salary-classification']

    X_test = test
    y_test = test[' salary-classification']
    
    
    X_train, X_test = transformer(X_train, X_test)

    
    classifiers = [
        LogisticRegression(),
        SGDClassifier(),
        KNeighborsClassifier(n_neighbors=5),
        SVC(),
        LinearSVC(max_iter=10000),
        GaussianNB(),
        DecisionTreeClassifier(),
        MLPClassifier(max_iter=10000),
        AdaBoostClassifier(),
        RandomForestClassifier(),
    ]

    names = [
             "Logistic regression", "SGDClassifier",
             "KNearest Neighbors (5)", 
             "SVM-rbf", "SMV-linear", 
             "Gaussian naive bayes",
             "Decision Tree", 
             "Multi-layer Perceptron", 
             "AdaBoost", "Random Forest"]


    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_test)
        evaluateModel(name, y_test, predicted)    
    return;  


def evaluateModel(name, y_test, predicted):
    print("Accuracy: %0.3f || AUROC %0.3f || (Accuracy, Precision) 0:( %0.3f, %0.3f)  1:( %0.3f, %0.3f) ->" 
              % (accuracy_score(y_test,predicted), roc_auc_score(y_test, predicted),
                recall_score(y_test,predicted,pos_label=0), precision_score(y_test,predicted,pos_label=0),
                recall_score(y_test,predicted,pos_label=1), precision_score(y_test,predicted,pos_label=1)), name)
    return;

In [13]:
evaluateTechniqueTestData(standardScalingTest)

Accuracy: 0.807 || AUROC 0.653 || (Accuracy, Precision) 0:( 0.945, 0.827)  1:( 0.361, 0.669) -> Logistic regression
Accuracy: 0.802 || AUROC 0.592 || (Accuracy, Precision) 0:( 0.991, 0.799)  1:( 0.194, 0.865) -> SGDClassifier
Accuracy: 0.806 || AUROC 0.684 || (Accuracy, Precision) 0:( 0.916, 0.844)  1:( 0.451, 0.625) -> KNearest Neighbors (5)
Accuracy: 0.822 || AUROC 0.663 || (Accuracy, Precision) 0:( 0.965, 0.830)  1:( 0.362, 0.762) -> SVM-rbf
Accuracy: 0.806 || AUROC 0.636 || (Accuracy, Precision) 0:( 0.958, 0.819)  1:( 0.314, 0.698) -> SMV-linear
Accuracy: 0.801 || AUROC 0.605 || (Accuracy, Precision) 0:( 0.977, 0.804)  1:( 0.232, 0.754) -> Gaussian naive bayes
Accuracy: 0.819 || AUROC 0.697 || (Accuracy, Precision) 0:( 0.929, 0.849)  1:( 0.466, 0.669) -> Decision Tree
Accuracy: 0.826 || AUROC 0.690 || (Accuracy, Precision) 0:( 0.947, 0.844)  1:( 0.433, 0.717) -> Multi-layer Perceptron
Accuracy: 0.834 || AUROC 0.687 || (Accuracy, Precision) 0:( 0.965, 0.841)  1:( 0.410, 0.784) -> Ad

In [30]:
evaluateTechniqueTestData(discretizeTest)

TypeError: 'tuple' object is not callable