### Import Libraries

In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import  RobustScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, SelectPercentile, GenericUnivariateSelect
from sklearn.feature_selection import RFE, RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import f_classif, chi2
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
%matplotlib inline

### Load train and test dataset

In [102]:
# Load the training set; the first CSV column becomes the index.
AbsenteeismAtWork = pd.read_csv('data/train_data.csv', index_col=0)

# 'Work load Average/day ' (note the trailing space in the column name) uses
# ',' as a thousands separator. Casting to str first keeps this step working
# even when pandas has already parsed the column as numeric.
AbsenteeismAtWork['Work load Average/day '] = (
    AbsenteeismAtWork['Work load Average/day ']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(int)
)

# Split features from the target. `columns=` replaces the positional axis
# argument (`drop('Absent', 1)`), which pandas 2.x no longer accepts.
X_train = AbsenteeismAtWork.drop(columns='Absent')
y_train = AbsenteeismAtWork['Absent']
X_test = pd.read_csv('data/test_data.csv', index_col=0)
# NOTE(review): the test labels are read from the sample submission file --
# confirm that this file really holds ground-truth labels.
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)



### Funções auxiliares

In [154]:
def robustScaling2(X_train, X_test):
    """Scale both splits with a RobustScaler fitted on the training split only.

    Parameters:
        X_train: training feature matrix used to fit the scaler.
        X_test: test feature matrix, transformed with the training statistics.

    Returns:
        Tuple ``(scaled_train, scaled_test)``.
    """
    robust = RobustScaler().fit(X_train)
    return robust.transform(X_train), robust.transform(X_test)

def evaluateSelector(featureSelector, selectorName):
    """Apply a feature selector, then fit and score a fixed set of classifiers.

    The data is rebuilt on every call: training features/target come from the
    module-level ``AbsenteeismAtWork`` frame, and the test features/labels are
    re-read from the CSV files under ``data/``.

    Parameters:
        featureSelector: callable ``(X_train, y_train, X_test)`` returning
            ``(X_train_selected, X_test_selected)``.
        selectorName: label appended to each classifier name in the report.

    Returns:
        None. One line of metrics per classifier is printed via
        ``evaluateModel``.
    """
    # `columns=` replaces the positional axis argument, which pandas 2.x rejects.
    X_train = AbsenteeismAtWork.drop(columns='Absent')
    y_train = AbsenteeismAtWork['Absent']
    X_test = pd.read_csv('data/test_data.csv', index_col=0)
    y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

    # Feature selection
    X_train_selected, X_test_selected = featureSelector(X_train, y_train, X_test)

    # Standardize (scaler is fitted on the training split only)
    X_train_transformed, X_test_transformed = robustScaling2(X_train_selected, X_test_selected)

    # Each label is paired with its estimator so the two cannot drift out of
    # sync, as two parallel lists could.
    classifiers = [
        ("Logistic regression", LogisticRegression()),
        ("SGDClassifier", SGDClassifier()),
        ("KNearest Neighbors (5)", KNeighborsClassifier(n_neighbors=5)),
        ("SVM-rbf", SVC()),
        ("SMV-linear", LinearSVC(max_iter=10000)),
        ("Gaussian naive bayes", GaussianNB()),
        ("Gaussian Process", GaussianProcessClassifier()),
        ("Decision Tree", DecisionTreeClassifier()),
        ("Multi-layer Perceptron", MLPClassifier(max_iter=10000)),
        ("AdaBoost", AdaBoostClassifier()),
        ("Random Forest", RandomForestClassifier()),
    ]

    # Train and evaluate the models
    for name, clf in classifiers:
        clf.fit(X_train_transformed, y_train)
        predicted = clf.predict(X_test_transformed)
        evaluateModel(name + " com " + selectorName, y_test, predicted)
    return


def evaluateModelBasedSelector(featureSelector, model, name):
    """Apply a feature selector, then fit and score one given model.

    The data is rebuilt on every call: training features/target come from the
    module-level ``AbsenteeismAtWork`` frame, and the test features/labels are
    re-read from the CSV files under ``data/``.

    Parameters:
        featureSelector: callable ``(X_train, y_train, X_test)`` returning
            ``(X_train_selected, X_test_selected)``.
        model: unfitted estimator exposing ``fit`` and ``predict``; it is
            fitted in place.
        name: label printed next to the metrics.

    Returns:
        None. The metrics line is printed via ``evaluateModel``.
    """
    # `columns=` replaces the positional axis argument, which pandas 2.x rejects.
    X_train = AbsenteeismAtWork.drop(columns='Absent')
    y_train = AbsenteeismAtWork['Absent']
    X_test = pd.read_csv('data/test_data.csv', index_col=0)
    y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

    # Feature selection
    X_train_selected, X_test_selected = featureSelector(X_train, y_train, X_test)

    # Scale the selected features (scaler is fitted on the training split only)
    X_train_transformed, X_test_transformed = robustScaling2(X_train_selected, X_test_selected)

    # Train the model
    model.fit(X_train_transformed, y_train)

    # Predict on the test set
    predicted = model.predict(X_test_transformed)

    # Evaluate the model
    evaluateModel(name, y_test, predicted)
    return

def evaluateModel(name, y_test, predicted):
    """Print a one-line metric summary for a set of predictions.

    The line reports overall accuracy, AUROC, and per-class (recall, precision)
    for labels 0 and 1, followed by ``name``.

    Parameters:
        name: label printed at the end of the line.
        y_test: true labels; the per-class metrics use ``pos_label`` 0 and 1.
        predicted: predicted class labels. AUROC is computed from these hard
            labels, not from probabilities or decision scores.

    Returns:
        None.
    """
    accuracy = accuracy_score(y_test, predicted)
    auroc = roc_auc_score(y_test, predicted)
    recall_0 = recall_score(y_test, predicted, pos_label=0)
    precision_0 = precision_score(y_test, predicted, pos_label=0)
    recall_1 = recall_score(y_test, predicted, pos_label=1)
    precision_1 = precision_score(y_test, predicted, pos_label=1)

    # The per-class pair is (recall, precision); the label used to say
    # "(Accuracy, Precision)", which misdescribed the first number.
    print("Accuracy: %0.3f || AUROC %0.3f || (Recall, Precision) 0:( %0.3f, %0.3f)  1:( %0.3f, %0.3f) ->"
          % (accuracy, auroc, recall_0, precision_0, recall_1, precision_1), name)
    return


def getPickedFeatures(selector, data):
    """Return the names of the columns a fitted selector keeps.

    Parameters:
        selector: fitted object exposing ``get_support(indices=True)``.
        data: DataFrame whose columns are in the order the selector was
            fitted on.

    Returns:
        List of kept column names, in the order ``get_support`` reports them.
    """
    # The dropped-index set difference computed here previously was never used.
    selected_features_index = selector.get_support(indices=True)
    return list(data.columns[selected_features_index])


def getDroppedFeatures(selector, data):
    """Return the names of the columns a fitted selector discards.

    Parameters:
        selector: fitted object exposing ``get_support(indices=True)``.
        data: DataFrame whose columns are in the order the selector was
            fitted on.

    Returns:
        List of dropped column names, in ascending column position.
    """
    kept_positions = set(selector.get_support(indices=True))
    # Walk the positions in order rather than relying on set iteration order,
    # so the result is deterministic.
    dropped_features_index = [
        position for position in range(data.columns.size)
        if position not in kept_positions
    ]
    return list(data.columns[dropped_features_index])


def printFeatureSelection(selector, data):
    """Print which columns a fitted selector keeps and which it drops.

    Each feature is printed as an ``(index, name)`` tuple on its own
    tab-indented line, under a "kept" and a "dropped" heading.

    Parameters:
        selector: fitted object exposing ``get_support(indices=True)``.
        data: DataFrame whose columns are in the order the selector was
            fitted on.

    Returns:
        None.
    """
    # Cast to plain ints so the tuples print as "(0, 'a')" whatever integer
    # type get_support returns (numpy scalars can print as "np.int64(0)").
    selected_features_index = [int(i) for i in selector.get_support(indices=True)]
    kept_positions = set(selected_features_index)
    # Ordered walk instead of a set difference keeps the output deterministic.
    dropped_features_index = [
        position for position in range(data.columns.size)
        if position not in kept_positions
    ]

    print("Features mantidas:")
    for position in selected_features_index:
        print("\t" + str((position, data.columns[position])))

    print("Features eliminadas:")
    for position in dropped_features_index:
        print("\t" + str((position, data.columns[position])))
    return

# Feature Selection

### Funções auxiliares

#### VarianceThreshold

In [155]:
def selectVarianceThreshold(X_train, y_train, X_test):
    """Filter features with a default-configured VarianceThreshold.

    The selector is fitted on the training split and applied to both splits.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    variance_filter = VarianceThreshold()
    variance_filter.fit(X_train, y_train)
    return variance_filter.transform(X_train), variance_filter.transform(X_test)

#### SelectKBest

In [156]:
def selectKBest_f_classif(X_train, y_train, X_test):
    """Keep the 12 best features according to the ``f_classif`` score.

    The selector is fitted on the training split and applied to both splits.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    top_k = SelectKBest(f_classif, k=12)
    top_k.fit(X_train, y_train)
    return top_k.transform(X_train), top_k.transform(X_test)

def selectKBest_chi2(X_train, y_train, X_test):
    """Keep the 12 best features according to the ``chi2`` score.

    The selector is fitted on the training split and applied to both splits.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    top_k = SelectKBest(chi2, k=12)
    top_k.fit(X_train, y_train)
    return top_k.transform(X_train), top_k.transform(X_test)

#### SelectPercentile

In [157]:
def selectPercentile_f_classif(X_train, y_train, X_test):
    """Keep the top 10 percent of features ranked by the ``f_classif`` score.

    The selector is fitted on the training split and applied to both splits.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    top_percentile = SelectPercentile(f_classif, percentile=10)
    top_percentile.fit(X_train, y_train)
    return top_percentile.transform(X_train), top_percentile.transform(X_test)

def selectPercentile_chi2(X_train, y_train, X_test):
    """Keep the top 10 percent of features ranked by the ``chi2`` score.

    The selector is fitted on the training split and applied to both splits.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    top_percentile = SelectPercentile(chi2, percentile=10)
    top_percentile.fit(X_train, y_train)
    return top_percentile.transform(X_train), top_percentile.transform(X_test)

#### GenericUnivariateSelect

In [158]:
def selectGenericUnivariateSelect(X_train, y_train, X_test):
    """Select features with GenericUnivariateSelect in ``k_best`` mode (k=19).

    The selector scores features with ``f_classif``, is fitted on the training
    split and applied to both splits.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    # `mode` is passed by keyword: current scikit-learn releases make every
    # argument after `score_func` keyword-only.
    gus_selector = GenericUnivariateSelect(f_classif, mode='k_best', param=19)
    gus_selector.fit(X_train, y_train)
    X_train_selected = gus_selector.transform(X_train)
    X_test_selected = gus_selector.transform(X_test)

    return X_train_selected, X_test_selected

#### Recursive Feature Elimination

In [159]:
def rfeLogReg(X_train, y_train, X_test):
    """Recursive feature elimination down to 12 features with LogisticRegression.

    The selector is fitted on the training split and applied to both splits.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    # `n_features_to_select` must be passed by keyword in scikit-learn >= 1.0;
    # the positional form raises a TypeError there.
    rfe_log_selector = RFE(LogisticRegression(), n_features_to_select=12)
    rfe_log_selector.fit(X_train, y_train)
    X_train_selected = rfe_log_selector.transform(X_train)
    X_test_selected = rfe_log_selector.transform(X_test)

    return X_train_selected, X_test_selected


def rfeSVC(X_train, y_train, X_test):
    """Recursive feature elimination down to 12 features with a linear SVC.

    The selector is fitted on the training split and applied to both splits.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    # `n_features_to_select` must be passed by keyword in scikit-learn >= 1.0;
    # the positional form raises a TypeError there.
    rfe_svc_selector = RFE(SVC(kernel='linear'), n_features_to_select=12)
    rfe_svc_selector.fit(X_train, y_train)
    X_train_selected = rfe_svc_selector.transform(X_train)
    X_test_selected = rfe_svc_selector.transform(X_test)

    return X_train_selected, X_test_selected

#### Recursive Feature Elimination w/ Cross Validation

In [160]:
def rfeCvLogReg(X_train, y_train, X_test):
    """Cross-validated recursive feature elimination with LogisticRegression.

    The selector is fitted on the training split and applied to both splits.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    # RFECV's second positional parameter is `step`, so the former bare `12`
    # meant "remove 12 features per iteration", not "keep 12 features". It is
    # spelled out by keyword because current scikit-learn rejects it
    # positionally.
    # NOTE(review): if a floor on the number of features was intended, use
    # `min_features_to_select` instead -- confirm the intent.
    rfecv_log_selector = RFECV(LogisticRegression(), step=12)
    rfecv_log_selector.fit(X_train, y_train)
    X_train_selected = rfecv_log_selector.transform(X_train)
    X_test_selected = rfecv_log_selector.transform(X_test)

    return X_train_selected, X_test_selected

def rfeCvSVC(X_train, y_train, X_test):
    """Cross-validated recursive feature elimination with a linear SVC.

    The selector is fitted on the training split and applied to both splits.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    # RFECV's second positional parameter is `step`, so the former bare `12`
    # meant "remove 12 features per iteration", not "keep 12 features". It is
    # spelled out by keyword because current scikit-learn rejects it
    # positionally.
    # NOTE(review): if a floor on the number of features was intended, use
    # `min_features_to_select` instead -- confirm the intent.
    rfecv_svc_selector = RFECV(SVC(kernel='linear'), step=12)
    rfecv_svc_selector.fit(X_train, y_train)
    X_train_selected = rfecv_svc_selector.transform(X_train)
    X_test_selected = rfecv_svc_selector.transform(X_test)

    return X_train_selected, X_test_selected

#### SelectFromModel

In [161]:
def sfmLogReg(X_train, y_train, X_test):
    """Select features with SelectFromModel wrapping a LogisticRegression.

    The selector is fitted on the training split and applied to both splits.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    model_filter = SelectFromModel(estimator=LogisticRegression())
    model_filter.fit(X_train, y_train)
    return model_filter.transform(X_train), model_filter.transform(X_test)

#### SelectFromModel and LassoCV

In [162]:
def sfmLcvLogReg(X_train, y_train, X_test):
    """Select features with SelectFromModel wrapping LassoCV (threshold 0.25).

    Despite the "LogReg" in the name, the selecting estimator is ``LassoCV``.
    ``LassoCV`` must be imported from ``sklearn.linear_model`` at the top of
    the notebook; without that import this function raises NameError.

    The selector is fitted on the training split and applied to both splits.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    sfmlcv_logReg_selector = SelectFromModel(LassoCV(), threshold=0.25)
    sfmlcv_logReg_selector.fit(X_train, y_train)
    X_train_selected = sfmlcv_logReg_selector.transform(X_train)
    X_test_selected = sfmlcv_logReg_selector.transform(X_test)

    return X_train_selected, X_test_selected

#### L1-based

In [163]:
def sfmL1(X_train, y_train, X_test):
    """Select features from an L1-penalised LinearSVC (C=0.01).

    The SVC is fitted on the training split first and then handed to
    SelectFromModel as a prefit estimator; both splits are transformed.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    sparse_svc = LinearSVC(C=0.01, penalty="l1", dual=False)
    sparse_svc.fit(X_train, y_train)
    l1_filter = SelectFromModel(sparse_svc, prefit=True)
    return l1_filter.transform(X_train), l1_filter.transform(X_test)

#### Tree-based

In [164]:
def sfmTree(X_train, y_train, X_test):
    """Select features from a 50-tree ExtraTreesClassifier.

    The forest is fitted on the training split first and then handed to
    SelectFromModel as a prefit estimator; both splits are transformed.
    No random_state is set on the forest.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    forest = ExtraTreesClassifier(n_estimators=50)
    forest.fit(X_train, y_train)
    tree_filter = SelectFromModel(forest, prefit=True)
    return tree_filter.transform(X_train), tree_filter.transform(X_test)

#  Evaluate different models

### Creating and Training the Models

In [165]:
# Removing features with low variance: run every classifier in
# evaluateSelector on the features kept by a default VarianceThreshold.

evaluateSelector(selectVarianceThreshold, "VarianceThreshold")

Accuracy: 0.804 || AUROC 0.545 || (Accuracy, Precision) 0:( 0.136, 0.400)  1:( 0.954, 0.831) -> Logistic regression com VarianceThreshold
Accuracy: 0.792 || AUROC 0.546 || (Accuracy, Precision) 0:( 0.159, 0.350)  1:( 0.934, 0.832) -> SGDClassifier com VarianceThreshold
Accuracy: 0.804 || AUROC 0.501 || (Accuracy, Precision) 0:( 0.023, 0.200)  1:( 0.980, 0.817) -> KNearest Neighbors (5) com VarianceThreshold
Accuracy: 0.812 || AUROC 0.524 || (Accuracy, Precision) 0:( 0.068, 0.429)  1:( 0.980, 0.824) -> SVM-rbf com VarianceThreshold
Accuracy: 0.804 || AUROC 0.545 || (Accuracy, Precision) 0:( 0.136, 0.400)  1:( 0.954, 0.831) -> SMV-linear com VarianceThreshold
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> Gaussian naive bayes com VarianceThreshold
Accuracy: 0.796 || AUROC 0.514 || (Accuracy, Precision) 0:( 0.068, 0.273)  1:( 0.959, 0.821) -> Gaussian Process com VarianceThreshold
Accuracy: 0.738 || AUROC 0.566 || (Accuracy, Precision) 0:( 

In [None]:
# Univariate feature selection: SelectKBest (k=12), first scored with
# f_classif and then with chi2 (printed as "qui2").
print("Com f_classif\n\n")
evaluateSelector(selectKBest_f_classif, "Select k best f_classif")

print("Com qui2\n\n")
evaluateSelector(selectKBest_chi2, "Select k best qui2")

In [167]:
# Univariate selection keeping the top 10 percent of features by f_classif.
# NOTE(review): every classifier reports identical metrics in the recorded
# output -- presumably too few features survive; confirm the percentile.
print("Com f_classif\n\n")
evaluateSelector(selectPercentile_f_classif, "Select Percentile")



Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> Logistic regression com Select Percentile
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> SGDClassifier com Select Percentile
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> KNearest Neighbors (5) com Select Percentile
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> SVM-rbf com Select Percentile
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> SMV-linear com Select Percentile
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> Gaussian naive bayes com Select Percentile
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> Gaussian Process com Select Percentile
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 

In [168]:
# Univariate selection via GenericUnivariateSelect (k_best mode, 19 features).
evaluateSelector(selectGenericUnivariateSelect, "Generic univariate select")

Accuracy: 0.804 || AUROC 0.545 || (Accuracy, Precision) 0:( 0.136, 0.400)  1:( 0.954, 0.831) -> Logistic regression com Generic univariate select
Accuracy: 0.775 || AUROC 0.519 || (Accuracy, Precision) 0:( 0.114, 0.250)  1:( 0.923, 0.823) -> SGDClassifier com Generic univariate select
Accuracy: 0.804 || AUROC 0.501 || (Accuracy, Precision) 0:( 0.023, 0.200)  1:( 0.980, 0.817) -> KNearest Neighbors (5) com Generic univariate select
Accuracy: 0.812 || AUROC 0.524 || (Accuracy, Precision) 0:( 0.068, 0.429)  1:( 0.980, 0.824) -> SVM-rbf com Generic univariate select
Accuracy: 0.804 || AUROC 0.545 || (Accuracy, Precision) 0:( 0.136, 0.400)  1:( 0.954, 0.831) -> SMV-linear com Generic univariate select
Accuracy: 0.808 || AUROC 0.521 || (Accuracy, Precision) 0:( 0.068, 0.375)  1:( 0.974, 0.823) -> Gaussian naive bayes com Generic univariate select
Accuracy: 0.796 || AUROC 0.514 || (Accuracy, Precision) 0:( 0.068, 0.273)  1:( 0.959, 0.821) -> Gaussian Process com Generic univariate select
Accu

In [None]:
# Recursive feature elimination: each selector is paired with a model of the
# same family as the estimator that drove the elimination.

evaluateModelBasedSelector( rfeLogReg,  LogisticRegression(), "Log Reg w/ RFE LogReg")
evaluateModelBasedSelector( rfeCvLogReg, LogisticRegression(), "Log Reg w/ RFE Cross Validation LogReg")

evaluateModelBasedSelector( rfeSVC, SVC(kernel='linear'), "SVC Linear w/ RFE SVC")
evaluateModelBasedSelector( rfeCvSVC, SVC(kernel='linear'), "SVC Linear w/ RFE Cross Validation SVC")

# Feature selection using SelectFromModel


evaluateModelBasedSelector( sfmLcvLogReg, LogisticRegression(), "Log Reg w/ SelectFromModel and LassoCV")

# The L1-based selector run is currently disabled:
#evaluateModelBasedSelector( sfmL1, SVC(kernel='linear'), "SVC Linear w/ SelectFromModel L1")

evaluateModelBasedSelector( sfmTree, SVC(kernel='linear'), "SVC Linear w/ SelectFromModel Tree")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

Accuracy: 0.792 || AUROC 0.511 || (Accuracy, Precision) 0:( 0.068, 0.250)  1:( 0.954, 0.820) -> Log Reg w/ RFE LogReg
Accuracy: 0.804 || AUROC 0.545 || (Accuracy, Precision) 0:( 0.136, 0.400)  1:( 0.954, 0.831) -> Log Reg w/ RFE Cross Validation LogReg
