In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.feature_selection import f_classif, chi2
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.utils import shuffle
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, KBinsDiscretizer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectPercentile, GenericUnivariateSelect, RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier


### Load test and train data

In [3]:
# Training data; the first CSV column becomes the index.
AbsenteeismAtWork = pd.read_csv('data/train_data.csv', index_col=0)

# 'Work load Average/day ' (the trailing space is part of the column name) has its
# commas stripped before the cast to int. Casting to str first keeps the cleanup
# working even if pandas already parsed the column as numbers.
AbsenteeismAtWork['Work load Average/day '] = (
    AbsenteeismAtWork['Work load Average/day ']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(int)
)

# Keyword form of drop: the positional axis argument was removed in pandas 2.0.
X_train = AbsenteeismAtWork.drop(columns='Absent')
y_train = AbsenteeismAtWork['Absent']
X_test = pd.read_csv('data/test_data.csv', index_col=0)
# NOTE(review): the labels used for evaluation come from 'sample_submission.csv';
# confirm that this file really holds the true test labels and not placeholders.
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

### Funções auxiliares

#### Transformação

In [67]:
def robustScaling2(X_train, X_test):
    """Scale both sets with a RobustScaler fitted on the training set only.

    Returns:
        Tuple ``(scaled_train, scaled_test)`` as produced by the scaler.
    """
    robust = RobustScaler()
    robust.fit(X_train)
    return robust.transform(X_train), robust.transform(X_test)

def discretize2(X_train, X_test, featuresToDiscretize=None, n_bins=5):
    """Bin selected numeric columns into ordinal, equal-width buckets.

    The discretizer is fitted on the training columns only and then applied to
    the test columns. Both inputs are copied first, so the caller's DataFrames
    are left untouched.

    Args:
        X_train: training DataFrame containing the columns to discretize.
        X_test: test DataFrame containing the same columns.
        featuresToDiscretize: column names to bin; defaults to the notebook's
            original list of continuous features.
        n_bins: number of bins per column (default 5).

    Returns:
        Tuple ``(X_train, X_test)`` of new DataFrames with the selected columns
        replaced by their ordinal bin index.
    """
    if featuresToDiscretize is None:
        featuresToDiscretize = [
            'Transportation expense', 'Distance from Residence to Work', 'Service time',
            'Age', 'Work load Average/day ', 'Hit target', 'Weight', 'Height',
            'Body mass index',
        ]
    discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')

    # Work on copies: assigning into the arguments would silently modify the
    # frames owned by the caller.
    X_train = X_train.copy()
    X_test = X_test.copy()
    X_train[featuresToDiscretize] = discretizer.fit_transform(X_train[featuresToDiscretize])
    X_test[featuresToDiscretize] = discretizer.transform(X_test[featuresToDiscretize])
    return X_train, X_test


#### Class balancing (resampling)

In [12]:
def smoteeenSampler(X_train, y_train):
    """Resample the training set with SMOTEENN (random_state=0) and shuffle it.

    Returns:
        Tuple ``(X_balanced, y_balanced)`` with rows shuffled consistently.
    """
    smote_enn = SMOTEENN(random_state=0)
    # fit_resample replaces fit_sample, which was removed from imbalanced-learn.
    X_balanced, y_balanced = smote_enn.fit_resample(X_train, y_train)
    X_balanced, y_balanced = shuffle(X_balanced, y_balanced)
    return X_balanced, y_balanced

def smotetomekSampler(X_train, y_train):
    """Resample the training set with SMOTETomek (random_state=0) and shuffle it.

    Returns:
        Tuple ``(X_balanced, y_balanced)`` with rows shuffled consistently.
    """
    smote_tomek = SMOTETomek(random_state=0)
    # fit_resample replaces fit_sample, which was removed from imbalanced-learn.
    X_balanced, y_balanced = smote_tomek.fit_resample(X_train, y_train)
    X_balanced, y_balanced = shuffle(X_balanced, y_balanced)
    return X_balanced, y_balanced

def overSampler(X_train, y_train):
    """Resample the training set with RandomOverSampler and shuffle it.

    No random_state is set on the sampler or on the shuffle, so results vary
    between runs.

    Returns:
        Tuple ``(X_balanced, y_balanced)`` with rows shuffled consistently.
    """
    ros = RandomOverSampler()
    # fit_resample replaces fit_sample, which was removed from imbalanced-learn.
    X_balanced, y_balanced = ros.fit_resample(X_train, y_train)
    X_balanced, y_balanced = shuffle(X_balanced, y_balanced)
    return X_balanced, y_balanced


#### Feature selection

In [151]:
def selectKBest_f_classif(X_train, y_train, X_test):
    """Keep the 8 features with the best f_classif score.

    The selector is fitted on the training data and applied to both sets.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    top_features = SelectKBest(f_classif, k=8)
    top_features.fit(X_train, y_train)
    reduced_train = top_features.transform(X_train)
    reduced_test = top_features.transform(X_test)
    return reduced_train, reduced_test

def selectKBest_chi2(X_train, y_train, X_test):
    """Keep the 8 features with the best chi2 score.

    The selector is fitted on the training data and applied to both sets.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    top_features = SelectKBest(chi2, k=8)
    top_features.fit(X_train, y_train)
    reduced_train = top_features.transform(X_train)
    reduced_test = top_features.transform(X_test)
    return reduced_train, reduced_test

def selectPercentile_f_classif(X_train, y_train, X_test):
    """Keep the top 25 percent of features ranked by f_classif.

    The selector is fitted on the training data and applied to both sets.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    top_quarter = SelectPercentile(f_classif, percentile=25)
    top_quarter.fit(X_train, y_train)
    reduced_train = top_quarter.transform(X_train)
    reduced_test = top_quarter.transform(X_test)
    return reduced_train, reduced_test

def selectPercentile_chi2(X_train, y_train, X_test):
    """Keep the top 25 percent of features ranked by chi2.

    The selector is fitted on the training data and applied to both sets.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    top_quarter = SelectPercentile(chi2, percentile=25)
    top_quarter.fit(X_train, y_train)
    reduced_train = top_quarter.transform(X_train)
    reduced_test = top_quarter.transform(X_test)
    return reduced_train, reduced_test


def selectVarianceThreshold(X_train, y_train, X_test):
    """Filter features with a default-configured VarianceThreshold.

    The selector is fitted on the training data and applied to both sets.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    variance_filter = VarianceThreshold()
    variance_filter.fit(X_train, y_train)
    reduced_train = variance_filter.transform(X_train)
    reduced_test = variance_filter.transform(X_test)
    return reduced_train, reduced_test

def selectGenericUnivariateSelect(X_train, y_train, X_test):
    """Select features with GenericUnivariateSelect in 'k_best' mode (param=19).

    The selector scores features with f_classif, is fitted on the training data
    and is applied to both sets.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    # `mode` is passed by keyword: current scikit-learn releases only accept
    # score_func positionally.
    gus_selector = GenericUnivariateSelect(f_classif, mode='k_best', param=19)
    gus_selector.fit(X_train, y_train)
    X_train_selected = gus_selector.transform(X_train)
    X_test_selected = gus_selector.transform(X_test)

    return X_train_selected, X_test_selected


def rfeLogReg(X_train, y_train, X_test):
    """Recursive feature elimination down to 12 features using LogisticRegression.

    The selector is fitted on the training data and applied to both sets.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    # n_features_to_select is passed by keyword: current scikit-learn releases
    # only accept the estimator positionally.
    rfe_log_selector = RFE(LogisticRegression(), n_features_to_select=12)
    rfe_log_selector.fit(X_train, y_train)
    X_train_selected = rfe_log_selector.transform(X_train)
    X_test_selected = rfe_log_selector.transform(X_test)

    return X_train_selected, X_test_selected


def rfeSVC(X_train, y_train, X_test):
    """Recursive feature elimination down to 12 features using a linear-kernel SVC.

    The selector is fitted on the training data and applied to both sets.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    # n_features_to_select is passed by keyword: current scikit-learn releases
    # only accept the estimator positionally.
    rfe_svc_selector = RFE(SVC(kernel='linear'), n_features_to_select=12)
    rfe_svc_selector.fit(X_train, y_train)
    X_train_selected = rfe_svc_selector.transform(X_train)
    X_test_selected = rfe_svc_selector.transform(X_test)

    return X_train_selected, X_test_selected


def sfmTree(X_train, y_train, X_test):
    """Select features via SelectFromModel wrapped around a fitted ExtraTreesClassifier.

    A 50-tree ExtraTreesClassifier is fitted on the training data and handed to
    SelectFromModel as a prefit estimator, which then filters both sets.

    Returns:
        Tuple ``(X_train_selected, X_test_selected)``.
    """
    forest = ExtraTreesClassifier(n_estimators=50)
    forest.fit(X_train, y_train)
    importance_filter = SelectFromModel(forest, prefit=True)
    reduced_train = importance_filter.transform(X_train)
    reduced_test = importance_filter.transform(X_test)
    return reduced_train, reduced_test

#### Funções para grid search e aplicação das técnicas do pipeline

In [195]:
def gridSearch(model, param_grid, modelName, X_train, y_train, X_test, y_test):
    """Run a grid search, print the winning parameters and report test metrics.

    GridSearchCV refits the best configuration on the full training data; that
    refitted model predicts X_test and the predictions go to evaluateModel.

    Returns:
        None. Results are printed.
    """
    search = GridSearchCV(model, param_grid, refit=True, verbose=0)
    search.fit(X_train, y_train)
    print(search.best_params_)
    test_predictions = search.predict(X_test)
    evaluateModel(modelName, y_test, test_predictions)
    
    
def apllyGridSearchWithTransformation(model, transformer, param_grid, modelName):
    """Reload the data, apply a transformation and run the grid search.

    Args:
        model: estimator passed to gridSearch.
        transformer: callable ``(X_train, X_test) -> (X_train, X_test)``.
        param_grid: parameter grid passed to gridSearch.
        modelName: label printed next to the metrics.

    Returns:
        None. Results are printed by gridSearch.
    """
    # Keyword form of drop: the positional axis argument was removed in pandas 2.0.
    X_train = AbsenteeismAtWork.drop(columns='Absent')
    y_train = AbsenteeismAtWork['Absent']
    X_test = pd.read_csv('data/test_data.csv', index_col=0)
    # NOTE(review): evaluation labels come from 'sample_submission.csv'; confirm
    # that this file holds the true test labels.
    y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

    X_train, X_test = transformer(X_train, X_test)

    gridSearch(model, param_grid, modelName, X_train, y_train, X_test, y_test)
    return


def apllyGridSearchWithFSelect(model, transformer, selector, param_grid, modelName):
    """Reload the data, transform it, select features and run the grid search.

    Args:
        model: estimator passed to gridSearch.
        transformer: callable ``(X_train, X_test) -> (X_train, X_test)``.
        selector: callable ``(X_train, y_train, X_test) -> (X_train, X_test)``.
        param_grid: parameter grid passed to gridSearch.
        modelName: label printed next to the metrics.

    Returns:
        None. Results are printed by gridSearch.
    """
    # Keyword form of drop: the positional axis argument was removed in pandas 2.0.
    X_train = AbsenteeismAtWork.drop(columns='Absent')
    y_train = AbsenteeismAtWork['Absent']
    X_test = pd.read_csv('data/test_data.csv', index_col=0)
    # NOTE(review): evaluation labels come from 'sample_submission.csv'; confirm
    # that this file holds the true test labels.
    y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

    X_train, X_test = transformer(X_train, X_test)
    X_train, X_test = selector(X_train, y_train, X_test)

    gridSearch(model, param_grid, modelName, X_train, y_train, X_test, y_test)
    return

def apllyGridSearchWithLoadBalancing(model, transformer, selector, balancer, param_grid, modelName):
    """Reload the data, transform, rebalance, select features and run the grid search.

    The balancer is applied to the training set only, after the transformation
    and before feature selection, so the selector is fitted on resampled data.

    Args:
        model: estimator passed to gridSearch.
        transformer: callable ``(X_train, X_test) -> (X_train, X_test)``.
        selector: callable ``(X_train, y_train, X_test) -> (X_train, X_test)``.
        balancer: callable ``(X_train, y_train) -> (X_train, y_train)``.
        param_grid: parameter grid passed to gridSearch.
        modelName: label printed next to the metrics.

    Returns:
        None. Results are printed by gridSearch.
    """
    # Keyword form of drop: the positional axis argument was removed in pandas 2.0.
    X_train = AbsenteeismAtWork.drop(columns='Absent')
    y_train = AbsenteeismAtWork['Absent']
    X_test = pd.read_csv('data/test_data.csv', index_col=0)
    # NOTE(review): evaluation labels come from 'sample_submission.csv'; confirm
    # that this file holds the true test labels.
    y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

    X_train, X_test = transformer(X_train, X_test)
    X_train, y_train = balancer(X_train, y_train)
    X_train, X_test = selector(X_train, y_train, X_test)

    gridSearch(model, param_grid, modelName, X_train, y_train, X_test, y_test)
    return


def evaluateModel(name, y_test, predicted):
    """Print a one-line metric summary for a set of predictions.

    The line shows overall accuracy, AUROC computed from the hard predictions,
    and one pair of figures per class (0 and 1). The first figure of each pair
    is printed under the heading "Accuracy" but is computed with recall_score;
    the second is the precision for that class.

    Args:
        name: label printed at the end of the line.
        y_test: true labels.
        predicted: predicted labels.
    """
    overall_accuracy = accuracy_score(y_test, predicted)
    auroc = roc_auc_score(y_test, predicted)

    # Recall followed by precision, first for class 0 and then for class 1.
    per_class = []
    for label in (0, 1):
        per_class.append(recall_score(y_test, predicted, pos_label=label))
        per_class.append(precision_score(y_test, predicted, pos_label=label))

    template = "Accuracy: %0.3f || AUROC %0.3f || (Accuracy, Precision) 0:( %0.3f, %0.3f)  1:( %0.3f, %0.3f) ->"
    print(template % (overall_accuracy, auroc, *per_class), name)


## Avaliação dos modelos

#### SVC

In [208]:

# Hyper-parameter grid shared by the three SVC experiments below.
param_grid_svc = {
    'class_weight': ['balanced', None],
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}
# The label now describes the pipeline that is actually run (robust scaling,
# selectPercentile_f_classif, overSampler); it used to name a different
# selector and sampler than the ones passed to the call.
print("SVC com robust scaling + selectPercentile_f_classif + over sampler:")
apllyGridSearchWithLoadBalancing(SVC(), robustScaling2, selectPercentile_f_classif, overSampler, param_grid_svc, "SVC")
print("SVC com discretizacao e select selectPercentile_chi2 + over sampler:")
apllyGridSearchWithLoadBalancing(SVC(), discretize2, selectPercentile_chi2, overSampler, param_grid_svc, "SVC")
print("SVC com discretizacao e select selectVarianceThreshold + over sampler:")
apllyGridSearchWithLoadBalancing(SVC(), discretize2, selectVarianceThreshold, overSampler, param_grid_svc, "SVC")

SVC com robust scaling + select variance threshold + smoteenSampler:
{'C': 100, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'rbf'}
Accuracy: 0.704 || AUROC 0.510 || (Accuracy, Precision) 0:( 0.205, 0.200)  1:( 0.816, 0.821) -> SVC
SVC com discretizacao e select selectPercentile_chi2 + over sampler:
{'C': 10, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'rbf'}
Accuracy: 0.692 || AUROC 0.494 || (Accuracy, Precision) 0:( 0.182, 0.174)  1:( 0.806, 0.814) -> SVC
SVC com discretizacao e select selectVarianceThreshold + over sampler:
{'C': 1, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'rbf'}
Accuracy: 0.812 || AUROC 0.497 || (Accuracy, Precision) 0:( 0.000, 0.000)  1:( 0.995, 0.816) -> SVC


#### KNN

In [198]:
# Hyper-parameter grid shared by the three KNN experiments below.
grid_params_knn = dict(
    n_neighbors=[3, 5, 7, 11, 13, 15, 17, 25, 30, 50],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# Discretization + chi2 percentile selection + random over-sampling.
apllyGridSearchWithLoadBalancing(
    model=KNeighborsClassifier(),
    transformer=discretize2,
    selector=selectPercentile_chi2,
    balancer=overSampler,
    param_grid=grid_params_knn,
    modelName="knn",
)

# Discretization + chi2 percentile selection + SMOTETomek.
apllyGridSearchWithLoadBalancing(
    model=KNeighborsClassifier(),
    transformer=discretize2,
    selector=selectPercentile_chi2,
    balancer=smotetomekSampler,
    param_grid=grid_params_knn,
    modelName="knn",
)

# Discretization + k-best f_classif selection + random over-sampling.
apllyGridSearchWithLoadBalancing(
    model=KNeighborsClassifier(),
    transformer=discretize2,
    selector=selectKBest_f_classif,
    balancer=overSampler,
    param_grid=grid_params_knn,
    modelName="knn",
)

{'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'distance'}
Accuracy: 0.667 || AUROC 0.540 || (Accuracy, Precision) 0:( 0.341, 0.227)  1:( 0.740, 0.833) -> knn
{'metric': 'euclidean', 'n_neighbors': 50, 'weights': 'distance'}
Accuracy: 0.637 || AUROC 0.531 || (Accuracy, Precision) 0:( 0.364, 0.213)  1:( 0.699, 0.830) -> knn
{'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'distance'}
Accuracy: 0.688 || AUROC 0.535 || (Accuracy, Precision) 0:( 0.295, 0.228)  1:( 0.776, 0.831) -> knn


#### Random Forest

In [199]:
# Hyper-parameter grid for the random forest experiment.
grid_params_randomforest = dict(
    n_estimators=[10, 20, 30, 50, 100, 200, 1000],
    max_depth=[1, 10, 20, None],
    bootstrap=[True, False],
)

# Discretization + chi2 percentile selection + random over-sampling.
apllyGridSearchWithLoadBalancing(
    model=RandomForestClassifier(),
    transformer=discretize2,
    selector=selectPercentile_chi2,
    balancer=overSampler,
    param_grid=grid_params_randomforest,
    modelName="Random Forest",
)


{'bootstrap': False, 'max_depth': 20, 'n_estimators': 100}
Accuracy: 0.704 || AUROC 0.528 || (Accuracy, Precision) 0:( 0.250, 0.224)  1:( 0.806, 0.827) -> Random Forest


#### MultiLayer Perceptron

In [200]:
# Hyper-parameter grid for the multi-layer perceptron experiment.
# alpha takes the values 10^-3 through 10^-6.
grid_params_mlp = dict(
    solver=['adam'],
    activation=['identity', 'logistic', 'tanh'],
    max_iter=[1000],
    shuffle=[True],
    alpha=10.0 ** -np.arange(3, 7),
    hidden_layer_sizes=[100, 150, 200],
)

# Discretization + chi2 percentile selection + random over-sampling.
apllyGridSearchWithLoadBalancing(
    model=MLPClassifier(),
    transformer=discretize2,
    selector=selectPercentile_chi2,
    balancer=overSampler,
    param_grid=grid_params_mlp,
    modelName="mlp",
)

{'activation': 'tanh', 'alpha': 1e-05, 'hidden_layer_sizes': 200, 'max_iter': 1000, 'shuffle': True, 'solver': 'adam'}
Accuracy: 0.713 || AUROC 0.533 || (Accuracy, Precision) 0:( 0.250, 0.234)  1:( 0.816, 0.829) -> mlp


#### Categorical NB

In [201]:
# Rebuild the raw splits so this cell does not depend on state left by earlier cells.
# Keyword form of drop: the positional axis argument was removed in pandas 2.0.
X_train = AbsenteeismAtWork.drop(columns='Absent')
y_train = AbsenteeismAtWork['Absent']
X_test = pd.read_csv('data/test_data.csv', index_col=0)
# NOTE(review): evaluation labels come from 'sample_submission.csv'; confirm
# that this file holds the true test labels.
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

# Pipeline: discretize -> chi2 percentile selection -> random over-sampling.
X_train, X_test = discretize2(X_train, X_test)
X_train, X_test = selectPercentile_chi2(X_train, y_train, X_test)
X_train, y_train = overSampler(X_train, y_train)

clf = CategoricalNB()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
# Reuse the shared reporting helper; it prints the same line as the inline
# format string it replaces.
evaluateModel("categoricalnb", y_test, predicted)

Accuracy: 0.729 || AUROC 0.570 || (Accuracy, Precision) 0:( 0.318, 0.286)  1:( 0.821, 0.843) -> categoricalnb


#### Decision Tree

In [202]:
# Rebuild the raw splits so this cell does not depend on state left by earlier cells.
# Keyword form of drop: the positional axis argument was removed in pandas 2.0.
X_train = AbsenteeismAtWork.drop(columns='Absent')
y_train = AbsenteeismAtWork['Absent']
X_test = pd.read_csv('data/test_data.csv', index_col=0)
# NOTE(review): evaluation labels come from 'sample_submission.csv'; confirm
# that this file holds the true test labels.
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

# Baseline: a default decision tree on the untransformed features.
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
# The result is labelled "decision tree"; the label used to read "categoricalnb",
# copied over from the Categorical NB cell.
evaluateModel("decision tree", y_test, predicted)
    

Accuracy: 0.733 || AUROC 0.564 || (Accuracy, Precision) 0:( 0.295, 0.283)  1:( 0.832, 0.840) -> categoricalnb


### Voting classifier

In [192]:
# Rebuild the raw splits so this cell does not depend on state left by earlier cells.
# Keyword form of drop: the positional axis argument was removed in pandas 2.0.
X_train = AbsenteeismAtWork.drop(columns='Absent')
y_train = AbsenteeismAtWork['Absent']
X_test = pd.read_csv('data/test_data.csv', index_col=0)
# NOTE(review): evaluation labels come from 'sample_submission.csv'; confirm
# that this file holds the true test labels.
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

# Pipeline: discretize -> chi2 percentile selection -> random over-sampling.
X_train, X_test = discretize2(X_train, X_test)
X_train, X_test = selectPercentile_chi2(X_train, y_train, X_test)
X_train, y_train = overSampler(X_train, y_train)

# Base estimators combined by majority (hard) vote.
clf1 = LogisticRegression()
clf2 = SVC(C= 100, class_weight = 'balanced', gamma= 0.1, kernel= 'rbf')
clf3 = KNeighborsClassifier(metric= 'manhattan', n_neighbors = 17, weights = 'distance')
clf4 = CategoricalNB()
clf5 = RandomForestClassifier(bootstrap= False, max_depth= 10, n_estimators= 2000)

eclf1 = VotingClassifier(estimators=[
    ('lr', clf1), ('svc', clf2), ('knn', clf3), ('cnb', clf4), ('rfc', clf5)], voting='hard')

eclf1 = eclf1.fit(X_train, y_train)

predicted = eclf1.predict(X_test)

# Reuse the shared reporting helper; it prints the same line as the inline
# format string it replaces.
evaluateModel("voting classifier", y_test, predicted)
    

Accuracy: 0.721 || AUROC 0.529 || (Accuracy, Precision) 0:( 0.227, 0.233)  1:( 0.832, 0.827) -> voting classifier
