# Data transformation

#### Import libraries

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, KBinsDiscretizer
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
%matplotlib inline

#### Load training and test data

In [2]:
training = pd.read_csv("training.csv",sep=";")
test= pd.read_csv("test.csv",sep=";")


# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
# Encode labels in column 'species'. 
training[' workclass']= label_encoder.fit_transform(training[' workclass'])
training[' education']= label_encoder.fit_transform(training[' education'])
training[' marital-status']= label_encoder.fit_transform(training[' marital-status'])
training[' occupation']= label_encoder.fit_transform(training[' occupation'])
training[' relationship']= label_encoder.fit_transform(training[' relationship'])
training[' race']= label_encoder.fit_transform(training[' race'])
training[' sex']= label_encoder.fit_transform(training[' sex'])
training[' native-country']= label_encoder.fit_transform(training[' native-country'])
training[' salary-classification']= label_encoder.fit_transform(training[' salary-classification'])

#test

# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
# Encode labels in column 'species'. 
test[' workclass']= label_encoder.fit_transform(test[' workclass'])
test[' education']= label_encoder.fit_transform(test[' education'])
test[' marital-status']= label_encoder.fit_transform(test[' marital-status'])
test[' occupation']= label_encoder.fit_transform(test[' occupation'])
test[' relationship']= label_encoder.fit_transform(test[' relationship'])
test[' race']= label_encoder.fit_transform(test[' race'])
test[' sex']= label_encoder.fit_transform(test[' sex'])
test[' native-country']= label_encoder.fit_transform(test[' native-country'])
test[' salary-classification']= label_encoder.fit_transform(test[' salary-classification'])



data = training[['age', ' workclass', ' fnlwgt', ' education', ' education-num', ' marital-status', ' occupation', 
             ' relationship', ' race', ' sex', ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country']]
target = training[' salary-classification']


data_test = test[['age', ' workclass', ' fnlwgt', ' education', ' education-num', ' marital-status', ' occupation', 
             ' relationship', ' race', ' sex', ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country']]
target_test = test[' salary-classification']

### Data transformation techniques

#### Standardization

In [3]:
def standardScaling(X_train): 
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform( X_train )
    return scaled_data;

def robustScaling(X_train):
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform( X_train )
    return scaled_data

#### Discretization

In [4]:
def discretize(X_train):
    featuresToDiscretize = ['age', ' education-num', ' capital-gain', ' capital-loss']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    X_train.loc[:,featuresToDiscretize] = discretizer.fit_transform(X_train.loc[:,featuresToDiscretize])
    return X_train;

#### Normalize

In [5]:
def normalize(X_train):
    X_train = transformer = Normalizer().fit_transform(X_train)
    return X_train;

#### Técnica combinada

In [6]:
def discretizeAndScale(X_train):
    X_train = discretize(X_train)
    X_train = robustScaling(X_train)
    return X_train;

### Avaliação das técnicas

In [7]:
def evaluateTechnique(transformer):
    X_train = transformer(data)
    y_train = target
    
    classifiers = [
        LogisticRegression(max_iter=10000),
        KNeighborsClassifier(n_neighbors=2),
        SVC(),
        GaussianNB(),
    ]

    names = [
             "Logistic regression",
             "KNearest Neighbors (2)", 
             "SVM", 
             "Gaussian naive bayes"
        ]


    for name, clf in zip(names, classifiers):
        scores = cross_validate(clf, X_train, y_train, cv=5, scoring={'accuracy', 'roc_auc'})
        print("Accuracy: %0.3f (+/- %0.3f) || AUROC %0.3f ->" % (scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2, scores['test_roc_auc'].mean()), name)
        
    return;

In [8]:
evaluateTechnique(standardScaling)

Accuracy: 0.825 (+/- 0.002) || AUROC 0.854 -> Logistic regression
Accuracy: 0.815 (+/- 0.008) || AUROC 0.790 -> KNearest Neighbors (2)
Accuracy: 0.848 (+/- 0.008) || AUROC 0.892 -> SVM
Accuracy: 0.804 (+/- 0.010) || AUROC 0.857 -> Gaussian naive bayes


In [9]:
evaluateTechnique(robustScaling)

Accuracy: 0.826 (+/- 0.003) || AUROC 0.854 -> Logistic regression
Accuracy: 0.841 (+/- 0.006) || AUROC 0.817 -> KNearest Neighbors (2)
Accuracy: 0.802 (+/- 0.003) || AUROC 0.838 -> SVM
Accuracy: 0.799 (+/- 0.008) || AUROC 0.855 -> Gaussian naive bayes


In [10]:
evaluateTechnique(discretize)

Accuracy: 0.759 (+/- 0.000) || AUROC 0.507 -> Logistic regression
Accuracy: 0.741 (+/- 0.003) || AUROC 0.571 -> KNearest Neighbors (2)
Accuracy: 0.759 (+/- 0.000) || AUROC 0.499 -> SVM
Accuracy: 0.764 (+/- 0.004) || AUROC 0.757 -> Gaussian naive bayes


In [11]:
evaluateTechnique(normalize)

Accuracy: 0.759 (+/- 0.000) || AUROC 0.581 -> Logistic regression
Accuracy: 0.773 (+/- 0.007) || AUROC 0.713 -> KNearest Neighbors (2)
Accuracy: 0.759 (+/- 0.000) || AUROC 0.669 -> SVM
Accuracy: 0.771 (+/- 0.005) || AUROC 0.740 -> Gaussian naive bayes


In [12]:
evaluateTechnique(discretizeAndScale)

Accuracy: 0.808 (+/- 0.004) || AUROC 0.828 -> Logistic regression
Accuracy: 0.797 (+/- 0.006) || AUROC 0.757 -> KNearest Neighbors (2)
Accuracy: 0.816 (+/- 0.008) || AUROC 0.855 -> SVM
Accuracy: 0.775 (+/- 0.004) || AUROC 0.833 -> Gaussian naive bayes
