# Data transformation

#### Import libraries

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, KBinsDiscretizer
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.cluster import KMeans

#### Load training and test data

In [2]:
training = pd.read_csv("training.csv",sep=";")
test= pd.read_csv("test.csv",sep=";")


# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
# Encode labels in column 'species'. 
training[' workclass']= label_encoder.fit_transform(training[' workclass'])
training[' education']= label_encoder.fit_transform(training[' education'])
training[' marital-status']= label_encoder.fit_transform(training[' marital-status'])
training[' occupation']= label_encoder.fit_transform(training[' occupation'])
training[' relationship']= label_encoder.fit_transform(training[' relationship'])
training[' race']= label_encoder.fit_transform(training[' race'])
training[' sex']= label_encoder.fit_transform(training[' sex'])
training[' native-country']= label_encoder.fit_transform(training[' native-country'])
training[' salary-classification']= label_encoder.fit_transform(training[' salary-classification'])

#test

# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
# Encode labels in column 'species'. 
test[' workclass']= label_encoder.fit_transform(test[' workclass'])
test[' education']= label_encoder.fit_transform(test[' education'])
test[' marital-status']= label_encoder.fit_transform(test[' marital-status'])
test[' occupation']= label_encoder.fit_transform(test[' occupation'])
test[' relationship']= label_encoder.fit_transform(test[' relationship'])
test[' race']= label_encoder.fit_transform(test[' race'])
test[' sex']= label_encoder.fit_transform(test[' sex'])
test[' native-country']= label_encoder.fit_transform(test[' native-country'])
test[' salary-classification']= label_encoder.fit_transform(test[' salary-classification'])



data = training[['age', ' workclass', ' fnlwgt', ' education', ' education-num', ' marital-status', ' occupation', 
             ' relationship', ' race', ' sex', ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country']]
target = training[' salary-classification']


data_test = test[['age', ' workclass', ' fnlwgt', ' education', ' education-num', ' marital-status', ' occupation', 
             ' relationship', ' race', ' sex', ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country']]
target_test = test[' salary-classification']

### Data transformation techniques

#### Standardization

In [3]:
def standardScaling(X_train): 
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform( X_train )
    return scaled_data;

def standardScaling2(X_train, X_test): 
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform( X_train )
    scaled_test = scaler.transform( X_test )
    return scaled_data, scaled_test;


def robustScaling(X_train):
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform( X_train )
    return scaled_data;

def robustScaling2(X_train, X_test):
    scaler = RobustScaler()
    scaled_data = scaler.fit_transform( X_train )
    scaled_test = scaler.transform( X_test )
    return scaled_data, scaled_test;

#### Discretization

In [4]:
def discretize(X_train):
    featuresToDiscretize = [' age', ' education-num', ' capital-gain', ' capital-loss']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    X_train[featuresToDiscretize] = discretizer.fit_transform(X_train[featuresToDiscretize])
    return X_train;

def discretize2(X_train, X_test):
    featuresToDiscretize = ['age', ' education-num', 'capital-gain', 'capital-loss']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    X_train[featuresToDiscretize] = discretizer.fit_transform(X_train[featuresToDiscretize])
    X_test[featuresToDiscretize] = discretizer.transform(X_test[featuresToDiscretize])
    return X_train, X_test;

#### Normalize

In [6]:
def normalize(X_train):
    X_train = transformer = Normalizer().fit_transform(X_train)
    return X_train;

def normalize2(X_train, X_test):
    normalizer = Normalizer()
    X_train = normalizer.fit_transform(X_train)
    X_test = normalizer.transform(X_test)
    return X_train, X_test;

#### Técnica combinada

In [7]:
def discretizeAndScale(X_train):
    X_train = discretize(X_train)
    X_train = robustScaling(X_train)
    return X_train;

def discretizeAndScale2(X_train, X_test):
    X_train = discretize2(X_train, X_test)
    X_train = robustScaling2(X_train, X_test)
    return X_train, X_test;

### Avaliação das técnicas

In [11]:
def evaluateTechnique(transformer):
    X_train = transformer(data)
    
    classifiers = [
       # LogisticRegression(),

       # KNeighborsClassifier(n_neighbors=5),
       # SVC(),
       # GaussianNB(),


       # MLPClassifier(max_iter=10000),
      #  AdaBoostClassifier(),
      #  RandomForestClassifier(),
        
        LogisticRegression(),
        LinearRegression(),
        SVC(),
        KMeans(),
        KNeighborsClassifier(n_neighbors=29),
        GaussianNB(),
        
    ]

    names = [
             "Logistic regression", "Linear regression",
             "SMV", "KMeans", "KNearest Neighbors (29)"
             , "Gaussian naive bayes"]


    for name, clf in zip(names, classifiers):
        scores = cross_validate(clf, X_train, target, cv=5, scoring={'accuracy', 'roc_auc'})
        print("Accuracy: %0.3f (+/- %0.3f) || AUROC %0.3f ->" % (scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2, scores['test_roc_auc'].mean()), name)
        
    return;

In [12]:
evaluateTechnique(standardScaling)

Accuracy: 0.825 (+/- 0.002) || AUROC 0.854 -> Logistic regression


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [13]:
evaluateTechnique(robustScaling)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy: 0.822 (+/- 0.006) || AUROC 0.850 -> Logistic regression
Accuracy: 0.797 (+/- 0.007) || AUROC 0.763 -> SGDClassifier
Accuracy: 0.845 (+/- 0.010) || AUROC 0.873 -> KNearest Neighbors (5)
Accuracy: 0.802 (+/- 0.003) || AUROC 0.838 -> SVM-rbf


Traceback (most recent call last):
  File "c:\users\maria\appdata\local\programs\python\python38-32\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\maria\appdata\local\programs\python\python38-32\lib\site-packages\sklearn\gaussian_process\_gpc.py", line 657, in fit
    self.base_estimator_.fit(X, y)
  File "c:\users\maria\appdata\local\programs\python\python38-32\lib\site-packages\sklearn\gaussian_process\_gpc.py", line 237, in fit
    self.log_marginal_likelihood(self.kernel_.theta)
  File "c:\users\maria\appdata\local\programs\python\python38-32\lib\site-packages\sklearn\gaussian_process\_gpc.py", line 357, in log_marginal_likelihood
    K = kernel(self.X_train_)
  File "c:\users\maria\appdata\local\programs\python\python38-32\lib\site-packages\sklearn\gaussian_process\kernels.py", line 857, in __call__
    return self.k1(X, Y) * self.k2(X, Y)
  File "c:\users\maria\appdata\local

Accuracy: 0.795 (+/- 0.035) || AUROC 0.770 -> SMV-linear
Accuracy: 0.799 (+/- 0.008) || AUROC 0.855 -> Gaussian naive bayes
Accuracy: nan (+/- nan) || AUROC nan -> Gaussian Process
Accuracy: 0.808 (+/- 0.005) || AUROC 0.742 -> Decision Tree
Accuracy: 0.830 (+/- 0.026) || AUROC 0.843 -> Multi-layer Perceptron
Accuracy: 0.859 (+/- 0.010) || AUROC 0.914 -> AdaBoost
Accuracy: 0.858 (+/- 0.008) || AUROC 0.906 -> Random Forest


In [20]:
evaluateTechnique(discretize)

KeyError: "[' age'] not in index"