### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Load train and test dataset

In [2]:
# Training frame; the first CSV column becomes the index.
AbsenteeismAtWork = pd.read_csv('data/train_data.csv', index_col=0)

# 'Work load Average/day ' (the column name ends with a space) is stored as
# text containing ',' characters; strip them so the column can be cast to int.
AbsenteeismAtWork['Work load Average/day '] = (
    AbsenteeismAtWork['Work load Average/day ']
    .str.replace(',', '', regex=False)
    .astype(int)
)

# Split features from the 'Absent' target. `columns=` is used instead of a
# positional axis argument, which pandas 2.0 no longer accepts.
X_train = AbsenteeismAtWork.drop(columns='Absent')
y_train = AbsenteeismAtWork['Absent']
X_test = pd.read_csv('data/test_data.csv', index_col=0)
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)



# Helper functions

In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [4]:
def TransformBalanceSelectTrainPredict(featureSelector, model, name):
    """Run one full experiment: transform, balance, select features, fit, evaluate.

    The training data is taken from the module-level ``AbsenteeismAtWork``
    frame; the test features and labels are re-read from disk on every call,
    so each experiment starts from untouched inputs.

    Parameters
    ----------
    featureSelector : callable
        Called as ``featureSelector(X_train, y_train, X_test)`` and expected
        to return ``(X_train_selected, X_test_selected)``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted here.
    name : str
        Label appended to the printed metrics line.

    Returns
    -------
    None
        The metrics are printed by ``evaluateModel``.
    """
    # `columns=` replaces the positional axis argument, which pandas 2.0
    # no longer accepts.
    X_train = AbsenteeismAtWork.drop(columns='Absent')
    y_train = AbsenteeismAtWork['Absent']
    # NOTE(review): the training frame had ',' stripped from
    # 'Work load Average/day ' before this point; the test frame is used as
    # read - confirm test_data.csv already stores that column as a number.
    X_test = pd.read_csv('data/test_data.csv', index_col=0)
    # NOTE(review): labels come from sample_submission.csv - confirm these are
    # the true test labels rather than placeholder predictions.
    y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

    # Normalise, discretise or standardise (currently: discretise).
    X_train_transformed, X_test_transformed = discretize(X_train, X_test)

    # Balance the training set only; the test set keeps its class distribution.
    X_train_balanced, y_train_balanced = overSampler(X_train_transformed, y_train)

    # Feature selection, fitted on the balanced training data.
    X_train_selected, X_test_selected = featureSelector(
        X_train_balanced, y_train_balanced, X_test_transformed
    )

    # Train the model.
    model.fit(X_train_selected, y_train_balanced)

    # Predict on the test set.
    predicted = model.predict(X_test_selected)

    # Evaluate the model.
    evaluateModel(name, y_test, predicted)

def evaluateModel(name, y_test, predicted):
    """Print one summary line with per-class precision/recall and accuracy.

    Parameters
    ----------
    name : str
        Experiment label, written after the ``"; -> "`` marker.
    y_test : array-like
        Reference labels.
    predicted : array-like
        Labels predicted by the model.

    Returns
    -------
    None
    """
    # (label, value) pairs in the order they appear on the printed line.
    labelled_scores = (
        ("Prec 0: ", precision_score(y_test, predicted, pos_label=0)),
        ("; Prec 1: ", precision_score(y_test, predicted, pos_label=1)),
        ("; Rec 0: ", recall_score(y_test, predicted, pos_label=0)),
        ("; Rec 1: ", recall_score(y_test, predicted, pos_label=1)),
        ("; Acc: ", accuracy_score(y_test, predicted)),
    )

    pieces = []
    for label, score in labelled_scores:
        pieces.append(label)
        pieces.append('%.3f' % score)
    pieces.append("; -> ")
    pieces.append(name)

    print("".join(pieces))


# Balance Dataset

### Up-sample minority class

#### Resample with replacement

In [5]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils import shuffle

def overSampler(X_train, y_train, random_state=None):
    """Balance the training set with ``RandomOverSampler`` and shuffle it.

    Parameters
    ----------
    X_train : array-like or DataFrame
        Training features.
    y_train : array-like or Series
        Training labels.
    random_state : int or None, optional
        Seed forwarded to both the over-sampler and the shuffle. The default
        ``None`` keeps the previous unseeded behaviour; pass an int to make a
        run repeatable.

    Returns
    -------
    tuple
        ``(X_balanced, y_balanced)`` after resampling and shuffling.
    """
    ros = RandomOverSampler(random_state=random_state)
    # `fit_resample` is the current imbalanced-learn name for what used to be
    # `fit_sample`; the old alias has been removed from the library.
    X_balanced, y_balanced = ros.fit_resample(X_train, y_train)
    # Shuffle features and labels together so rows stay aligned.
    X_balanced, y_balanced = shuffle(X_balanced, y_balanced, random_state=random_state)
    return X_balanced, y_balanced

# Data transformation

### Discretization

In [6]:
from sklearn.preprocessing import KBinsDiscretizer

def discretize(X_train, X_test):
    """Bin the continuous columns of the train and test frames into 5 ordinal bins.

    The discretizer is fitted on the training columns only and then applied to
    the test columns. The inputs are left untouched: the binned values are
    written to copies, so calling this function again with the same frames
    does not re-bin data that was already binned.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must contain every column in ``featuresToDiscretize``.
    X_test : pandas.DataFrame
        Test features with the same columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_binned, X_test_binned)``.
    """
    featuresToDiscretize = [
        'Transportation expense',
        'Distance from Residence to Work',
        'Service time',
        'Age',
        'Work load Average/day ',
        'Hit target',
        'Weight',
        'Height',
        'Body mass index',
    ]
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

    # Work on copies so the caller's frames are not modified as a side effect.
    X_train_binned = X_train.copy()
    X_test_binned = X_test.copy()

    X_train_binned[featuresToDiscretize] = discretizer.fit_transform(X_train[featuresToDiscretize])
    X_test_binned[featuresToDiscretize] = discretizer.transform(X_test[featuresToDiscretize])
    return X_train_binned, X_test_binned

# Feature Selection

In [7]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, SelectPercentile, GenericUnivariateSelect
from sklearn.feature_selection import RFE, RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import f_classif, chi2
from sklearn.decomposition import PCA

### Helper functions

In [8]:
def getPickedFeatures(selector, data):
    """Return the names of the columns the fitted selector keeps.

    Parameters
    ----------
    selector : object
        Fitted selector exposing ``get_support(indices=True)``.
    data : pandas.DataFrame
        Frame whose column order matches the data the selector was fitted on.

    Returns
    -------
    list
        Column labels of the kept features, in the order reported by the selector.
    """
    selected_features_index = selector.get_support(indices=True)
    return list(data.columns[selected_features_index])


def getDroppedFeatures(selector, data):
    """Return the names of the columns the fitted selector discards.

    Parameters
    ----------
    selector : object
        Fitted selector exposing ``get_support(indices=True)``.
    data : pandas.DataFrame
        Frame whose column order matches the data the selector was fitted on.

    Returns
    -------
    list
        Column labels of the discarded features, in the frame's column order.
    """
    # Plain ints keep the membership test independent of the NumPy scalar type.
    kept_positions = {int(i) for i in selector.get_support(indices=True)}
    # Walking the columns in order gives a deterministic result, unlike
    # iterating over a set difference.
    return [
        column
        for position, column in enumerate(data.columns)
        if position not in kept_positions
    ]


def printFeatureSelection(selector, data):
    """Print which columns a fitted selector keeps and which it discards.

    Each feature is printed as a ``(position, column_label)`` tuple on its own
    tab-indented line, kept features first.

    Parameters
    ----------
    selector : object
        Fitted selector exposing ``get_support(indices=True)``.
    data : pandas.DataFrame
        Frame whose column order matches the data the selector was fitted on.

    Returns
    -------
    None
    """
    # Convert to plain ints so kept and dropped positions print the same way
    # regardless of how NumPy renders its scalar types.
    kept_positions = [int(i) for i in selector.get_support(indices=True)]
    kept_lookup = set(kept_positions)
    # Column order, not set-iteration order, so the listing is deterministic.
    dropped_positions = [
        position for position in range(data.columns.size) if position not in kept_lookup
    ]

    print("Features mantidas:")
    for position in kept_positions:
        print("\t" + str((position, data.columns[position])))

    print("Features eliminadas:")
    for position in dropped_positions:
        print("\t" + str((position, data.columns[position])))

#### VarianceThreshold

In [9]:
def selectVarianceThreshold(X_train, y_train, X_test):
    """Filter features with a ``VarianceThreshold`` built with default settings.

    The selector is fitted on the training data and the same column mask is
    applied to both the training and the test data.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    fitted = VarianceThreshold().fit(X_train, y_train)
    # printFeatureSelection(fitted, X_train)  # uncomment to list the kept/dropped columns
    train_kept = fitted.transform(X_train)
    test_kept = fitted.transform(X_test)
    return train_kept, test_kept

#### SelectKBest

In [10]:
def selectKBest_f_classif(X_train, y_train, X_test):
    """Keep the 12 best features according to ``SelectKBest`` scored with ``f_classif``.

    The selector is fitted on the training data only; the resulting mask is
    then applied to the training and test data.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    fitted = SelectKBest(f_classif, k=12).fit(X_train, y_train)
    # printFeatureSelection(fitted, X_train)  # uncomment to list the kept/dropped columns
    train_kept = fitted.transform(X_train)
    test_kept = fitted.transform(X_test)
    return train_kept, test_kept

def selectKBest_chi2(X_train, y_train, X_test):
    """Keep the 12 best features according to ``SelectKBest`` scored with ``chi2``.

    The selector is fitted on the training data only; the resulting mask is
    then applied to the training and test data.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    fitted = SelectKBest(chi2, k=12).fit(X_train, y_train)
    # printFeatureSelection(fitted, X_train)  # uncomment to list the kept/dropped columns
    train_kept = fitted.transform(X_train)
    test_kept = fitted.transform(X_test)
    return train_kept, test_kept

#### SelectPercentile

In [11]:
def selectPercentile_f_classif(X_train, y_train, X_test):
    """Select features with ``SelectPercentile(f_classif, percentile=10)``.

    The selector is fitted on the training data only; the resulting mask is
    then applied to the training and test data.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    fitted = SelectPercentile(f_classif, percentile=10).fit(X_train, y_train)
    train_kept = fitted.transform(X_train)
    test_kept = fitted.transform(X_test)
    return train_kept, test_kept

def selectPercentile_chi2(X_train, y_train, X_test):
    """Select features with ``SelectPercentile(chi2, percentile=10)``.

    The selector is fitted on the training data only; the resulting mask is
    then applied to the training and test data.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    fitted = SelectPercentile(chi2, percentile=10).fit(X_train, y_train)
    train_kept = fitted.transform(X_train)
    test_kept = fitted.transform(X_test)
    return train_kept, test_kept

#### GenericUnivariateSelect

In [12]:
def selectGenericUnivariateSelect(X_train, y_train, X_test):
    """Select features with ``GenericUnivariateSelect`` in ``'k_best'`` mode (param=19), scored with ``chi2``.

    The selector is fitted on the training data only; the resulting mask is
    then applied to the training and test data.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    # `mode` is passed by keyword: current scikit-learn accepts only the score
    # function positionally and raises TypeError for a positional mode.
    gus_selector = GenericUnivariateSelect(chi2, mode='k_best', param=19)
    gus_selector.fit(X_train, y_train)
    X_train_selected = gus_selector.transform(X_train)
    X_test_selected = gus_selector.transform(X_test)

    return X_train_selected, X_test_selected

#### Recursive Feature Elimination

In [13]:
def rfeLogReg(X_train, y_train, X_test):
    """Recursive feature elimination down to 12 features, ranked by ``LogisticRegression``.

    The selector is fitted on the training data only; the resulting mask is
    then applied to the training and test data.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    # The feature count is passed by keyword: current scikit-learn makes every
    # RFE argument after the estimator keyword-only.
    rfe_log_selector = RFE(LogisticRegression(), n_features_to_select=12)
    rfe_log_selector.fit(X_train, y_train)
    # printFeatureSelection(rfe_log_selector, X_train)  # uncomment to list the kept/dropped columns
    X_train_selected = rfe_log_selector.transform(X_train)
    X_test_selected = rfe_log_selector.transform(X_test)

    return X_train_selected, X_test_selected


def rfeSVC(X_train, y_train, X_test):
    """Recursive feature elimination down to 12 features, ranked by a linear-kernel ``SVC``.

    The selector is fitted on the training data only; the resulting mask is
    then applied to the training and test data.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    # The feature count is passed by keyword: current scikit-learn makes every
    # RFE argument after the estimator keyword-only.
    rfe_svc_selector = RFE(SVC(kernel='linear'), n_features_to_select=12)
    rfe_svc_selector.fit(X_train, y_train)
    # printFeatureSelection(rfe_svc_selector, X_train)  # uncomment to list the kept/dropped columns
    X_train_selected = rfe_svc_selector.transform(X_train)
    X_test_selected = rfe_svc_selector.transform(X_test)

    return X_train_selected, X_test_selected

#### Recursive Feature Elimination w/ Cross Validation

In [14]:
def rfeCvLogReg(X_train, y_train, X_test):
    """Cross-validated recursive feature elimination ranked by ``LogisticRegression``.

    The selector is fitted on the training data only; the resulting mask is
    then applied to the training and test data.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    # RFECV's second parameter is `step` (features removed per iteration), so
    # the value 12 that used to be passed positionally was a step size, not a
    # target feature count. It is now spelled out by keyword, which current
    # scikit-learn also requires.
    # NOTE(review): if "12 features" was intended (as in the RFE helpers),
    # RFECV has no such option - it chooses the count by cross-validation;
    # `min_features_to_select` only sets a lower bound. Confirm the intent.
    rfecv_log_selector = RFECV(LogisticRegression(), step=12)
    rfecv_log_selector.fit(X_train, y_train)
    # printFeatureSelection(rfecv_log_selector, X_train)  # uncomment to list the kept/dropped columns
    X_train_selected = rfecv_log_selector.transform(X_train)
    X_test_selected = rfecv_log_selector.transform(X_test)

    return X_train_selected, X_test_selected

def rfeCvSVC(X_train, y_train, X_test):
    """Cross-validated recursive feature elimination ranked by a linear-kernel ``SVC``.

    The selector is fitted on the training data only; the resulting mask is
    then applied to the training and test data.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    # RFECV's second parameter is `step` (features removed per iteration), so
    # the value 12 that used to be passed positionally was a step size, not a
    # target feature count. It is now spelled out by keyword, which current
    # scikit-learn also requires.
    # NOTE(review): if "12 features" was intended (as in the RFE helpers),
    # RFECV has no such option - it chooses the count by cross-validation;
    # `min_features_to_select` only sets a lower bound. Confirm the intent.
    rfecv_svc_selector = RFECV(SVC(kernel='linear'), step=12)
    rfecv_svc_selector.fit(X_train, y_train)
    # printFeatureSelection(rfecv_svc_selector, X_train)  # uncomment to list the kept/dropped columns
    X_train_selected = rfecv_svc_selector.transform(X_train)
    X_test_selected = rfecv_svc_selector.transform(X_test)

    return X_train_selected, X_test_selected

#### SelectFromModel

In [15]:
def sfmLogReg(X_train, y_train, X_test):
    """Select features with ``SelectFromModel`` wrapping a ``LogisticRegression``.

    The wrapped estimator is fitted on the training data by the selector; the
    resulting mask is then applied to the training and test data.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    fitted = SelectFromModel(estimator=LogisticRegression()).fit(X_train, y_train)
    # printFeatureSelection(fitted, X_train)  # uncomment to list the kept/dropped columns
    train_kept = fitted.transform(X_train)
    test_kept = fitted.transform(X_test)
    return train_kept, test_kept

#### SelectFromModel and LassoCV

In [16]:
def sfmLcvLogReg(X_train, y_train, X_test):
    """Select features with ``SelectFromModel`` wrapping ``LassoCV`` (threshold=0.25).

    Despite the ``LogReg`` in the function name, the estimator used for the
    selection is ``LassoCV``. The selector is fitted on the training data; the
    resulting mask is then applied to the training and test data.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    fitted = SelectFromModel(LassoCV(), threshold=0.25).fit(X_train, y_train)
    # printFeatureSelection(fitted, X_train)  # uncomment to list the kept/dropped columns
    train_kept = fitted.transform(X_train)
    test_kept = fitted.transform(X_test)
    return train_kept, test_kept

#### L1-based

In [17]:
def sfmL1(X_train, y_train, X_test):
    """Select features from an L1-penalised ``LinearSVC`` fitted on the training data.

    The ``LinearSVC`` is fitted first and handed to ``SelectFromModel`` with
    ``prefit=True``; the selector is then applied to the training and test data.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    l1_svc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, y_train)
    picker = SelectFromModel(l1_svc, prefit=True)
    # printFeatureSelection(picker, X_train)  # uncomment to list the kept/dropped columns
    train_kept = picker.transform(X_train)
    test_kept = picker.transform(X_test)
    return train_kept, test_kept

#### Tree-based

In [18]:
def sfmTree(X_train, y_train, X_test):
    """Select features from an ``ExtraTreesClassifier`` (50 trees) fitted on the training data.

    The forest is fitted first and handed to ``SelectFromModel`` with
    ``prefit=True``; the selector is then applied to the training and test
    data. No ``random_state`` is set on the forest.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    forest = ExtraTreesClassifier(n_estimators=50).fit(X_train, y_train)
    picker = SelectFromModel(forest, prefit=True)
    # printFeatureSelection(picker, X_train)  # uncomment to list the kept/dropped columns
    train_kept = picker.transform(X_train)
    test_kept = picker.transform(X_test)
    return train_kept, test_kept

#  Evaluate different models

In [19]:
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC

### Creating and Training the Models

In [21]:
# Each run is (feature selector, fresh estimator, label printed with the metrics).
# Runs execute in the listed order; a blank separator is printed between groups.

# Removing features with low variance
low_variance_runs = [
    (selectVarianceThreshold, KNeighborsClassifier(n_neighbors=5), "KNeighborsClassifier w/ VarianceThreshold"),
    (selectVarianceThreshold, SVC(), "SVC w/ VarianceThreshold"),
    (selectVarianceThreshold, SVC(kernel='linear'), "SVC Linear w/ VarianceThreshold"),
]

# Univariate feature selection
univariate_runs = [
    (selectKBest_f_classif, KNeighborsClassifier(n_neighbors=5), "KNeighborsClassifier w/ KBest FClassif"),
    (selectKBest_chi2, KNeighborsClassifier(n_neighbors=5), "KNeighborsClassifier w/ KBest Chi2"),
    (selectPercentile_f_classif, KNeighborsClassifier(n_neighbors=5), "KNeighborsClassifier w/ Percentile FClassif"),
    (selectPercentile_chi2, KNeighborsClassifier(n_neighbors=5), "KNeighborsClassifier w/ Percentile Chi2"),
    (selectGenericUnivariateSelect, KNeighborsClassifier(n_neighbors=5), "KNeighborsClassifier w/ GUS ---"),

    (selectKBest_f_classif, SVC(), "SVC w/ KBest FClassif"),
    (selectKBest_chi2, SVC(), "SVC w/ KBest Chi2"),
    (selectPercentile_f_classif, SVC(), "SVC w/ Percentile FClassif"),
    (selectPercentile_chi2, SVC(), "SVC w/ Percentile Chi2"),
    (selectGenericUnivariateSelect, SVC(), "SVC w/ GUS ---"),

    (selectKBest_f_classif, SVC(kernel='linear'), "SVC Linear w/ KBest FClassif"),
    (selectKBest_chi2, SVC(kernel='linear'), "SVC Linear w/ KBest Chi2"),
    (selectPercentile_f_classif, SVC(kernel='linear'), "SVC Linear w/ Percentile FClassif"),
    (selectPercentile_chi2, SVC(kernel='linear'), "SVC Linear w/ Percentile Chi2"),
    (selectGenericUnivariateSelect, SVC(kernel='linear'), "SVC Linear w/ GUS ---"),
]

# Recursive feature elimination
recursive_elimination_runs = [
    # disabled: (rfeLogReg, LogisticRegression(), "Log Reg w/ RFE LogReg"),
    (rfeSVC, SVC(kernel='linear'), "SVC Linear w/ RFE SVC"),
    # disabled: (rfeCvLogReg, LogisticRegression(), "SVC Linear w/ RFE Cross Validation LogReg"),
    (rfeCvSVC, SVC(kernel='linear'), "SVC Linear w/ RFE Cross Validation SVC"),
]

# Feature selection using SelectFromModel
select_from_model_runs = [
    # disabled: (sfmLogReg, LogisticRegression(), "Log Reg w/ SelectFromModel LogReg"),
    (sfmLcvLogReg, LogisticRegression(), "Log Reg w/ SelectFromModel and LassoCV"),
    (sfmL1, SVC(kernel='linear'), "SVC Linear w/ SelectFromModel L1"),
    (sfmTree, SVC(kernel='linear'), "SVC Linear w/ SelectFromModel Tree"),
]

experiment_groups = [
    low_variance_runs,
    univariate_runs,
    recursive_elimination_runs,
    select_from_model_runs,
]

for group_position, group_runs in enumerate(experiment_groups):
    # Separator goes between groups only, never before the first one.
    if group_position > 0:
        print('\n')
    for feature_selector, estimator, label in group_runs:
        TransformBalanceSelectTrainPredict(feature_selector, estimator, label)

Prec 0: 0.187; Prec 1: 0.818; Rec 0: 0.318; Rec 1: 0.689; Acc: 0.621; -> KNeighborsClassifier w/ VarianceThreshold
Prec 0: 0.150; Prec 1: 0.810; Rec 0: 0.136; Rec 1: 0.827; Acc: 0.700; -> SVC w/ VarianceThreshold
Prec 0: 0.207; Prec 1: 0.829; Rec 0: 0.386; Rec 1: 0.668; Acc: 0.617; -> SVC Linear w/ VarianceThreshold


Prec 0: 0.173; Prec 1: 0.812; Rec 0: 0.295; Rec 1: 0.684; Acc: 0.613; -> KNeighborsClassifier w/ KBest FClassif
Prec 0: 0.183; Prec 1: 0.816; Rec 0: 0.341; Rec 1: 0.658; Acc: 0.600; -> KNeighborsClassifier w/ KBest Chi2
Prec 0: 0.229; Prec 1: 0.835; Rec 0: 0.364; Rec 1: 0.724; Acc: 0.658; -> KNeighborsClassifier w/ Percentile FClassif
Prec 0: 0.264; Prec 1: 0.840; Rec 0: 0.318; Rec 1: 0.801; Acc: 0.713; -> KNeighborsClassifier w/ Percentile Chi2
Prec 0: 0.206; Prec 1: 0.825; Rec 0: 0.295; Rec 1: 0.745; Acc: 0.662; -> KNeighborsClassifier w/ GUS ---
Prec 0: 0.207; Prec 1: 0.820; Rec 0: 0.136; Rec 1: 0.883; Acc: 0.746; -> SVC w/ KBest FClassif
Prec 0: 0.158; Prec 1: 0.812; 