### Import Libraries

In [134]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline

### Load train and test dataset

In [135]:
# Training set; the first CSV column is used as the row index.
AbsenteeismAtWork = pd.read_csv('data/train_data.csv', index_col=0)

# The work-load column is read as text that uses ',' where a float literal
# needs '.'; swap the separator (literal match, not a regex) and parse it.
AbsenteeismAtWork['Work load Average/day '] = (
    AbsenteeismAtWork['Work load Average/day ']
    .str.replace(',', '.', regex=False)
    .astype(float)
)

# Split features from the 'Absent' target. `columns=` is used because the
# positional axis argument (`drop('Absent', 1)`) is rejected by pandas >= 2.0.
X_train = AbsenteeismAtWork.drop(columns='Absent')
y_train = AbsenteeismAtWork['Absent']

# Test features, and the labels the predictions are scored against.
X_test = pd.read_csv('data/test_data.csv', index_col=0)
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)


# Funções auxiliares

In [136]:
def TransformBalanceSelectTrainPredict(X_train, y_train, X_test, y_test, transformer, balancer, featureSelector, model, name):
    """Run one transform -> balance -> select -> fit -> predict experiment.

    Prints the accuracy followed by the classification report of ``model``
    on the test split.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : test features and the labels predictions are scored against.
    transformer : callable ``(X_train, X_test) -> (train, test)``.
    balancer : callable ``(X, y) -> (X_balanced, y_balanced)``; applied to the
        training split only.
    featureSelector : callable ``(X_train, y_train, X_test) -> (train, test)``.
    model : estimator exposing ``fit`` and ``predict``; it is fitted in place.
    name : accepted for the caller's convenience; not used by this function.

    Returns
    -------
    None
    """
    # Step 1 - normalise, discretise or standardise both splits.
    train_tx, test_tx = transformer(X_train, X_test)

    # Step 2 - rebalance the classes of the training split.
    train_bal, labels_bal = balancer(train_tx, y_train)

    # Step 3 - feature selection, fitted on the balanced training data.
    train_sel, test_sel = featureSelector(train_bal, labels_bal, test_tx)

    # Step 4 - train the model and predict the test split.
    model.fit(train_sel, labels_bal)
    predicted = model.predict(test_sel)

    # Step 5 - report.
    print(accuracy_score(y_test, predicted))
    print(classification_report(y_test, predicted))



# Balance Dataset

### Up-sample minority class

#### Resample with replacement

Método mais simples, que consiste em replicar aleatoriamente (com reposição) amostras da classe minoritária até atingir um rácio de 1:1 entre as classes.

In [137]:
from imblearn.over_sampling import RandomOverSampler

def overSampler(X_train, y_train):
    """Balance the training set with ``RandomOverSampler`` and shuffle the result.

    Parameters
    ----------
    X_train : training features.
    y_train : training labels.

    Returns
    -------
    tuple
        ``(X_balanced, y_balanced)`` after resampling and shuffling.
    """
    ros = RandomOverSampler()
    # `fit_resample` replaces `fit_sample`, which was removed from imbalanced-learn.
    X_balanced, y_balanced = ros.fit_resample(X_train, y_train)
    # Shuffle features and labels together so they stay aligned.
    X_balanced, y_balanced = shuffle(X_balanced, y_balanced)
    return X_balanced, y_balanced

#### SMOTE - Synthetic Minority Over-sampling Technique

In [138]:
from imblearn.over_sampling import SMOTE

# Module-level instance kept as in the original cell; smoteSampler builds its own.
smote = SMOTE(sampling_strategy='minority')

def smoteSampler(X_train, y_train):
    """Balance the training set with SMOTE (minority strategy) and shuffle the result.

    Parameters
    ----------
    X_train : training features.
    y_train : training labels.

    Returns
    -------
    tuple
        ``(X_balanced, y_balanced)`` after resampling and shuffling.
    """
    sampler = SMOTE(sampling_strategy='minority')
    # `fit_resample` replaces `fit_sample`, which was removed from imbalanced-learn.
    X_balanced, y_balanced = sampler.fit_resample(X_train, y_train)
    # Shuffle features and labels together so they stay aligned.
    X_balanced, y_balanced = shuffle(X_balanced, y_balanced)
    return X_balanced, y_balanced

### Down-sample majority class

#### Resample without replacement

In [139]:
from imblearn.under_sampling import RandomUnderSampler

# Module-level instance kept as in the original cell; underSampler builds its own.
rus = RandomUnderSampler()

def underSampler(X_train, y_train):
    """Balance the training set with ``RandomUnderSampler`` and shuffle the result.

    Parameters
    ----------
    X_train : training features.
    y_train : training labels.

    Returns
    -------
    tuple
        ``(X_balanced, y_balanced)`` after resampling and shuffling.
    """
    sampler = RandomUnderSampler()
    # `fit_resample` replaces `fit_sample`, which was removed from imbalanced-learn.
    X_balanced, y_balanced = sampler.fit_resample(X_train, y_train)
    # Shuffle features and labels together so they stay aligned.
    X_balanced, y_balanced = shuffle(X_balanced, y_balanced)
    return X_balanced, y_balanced


#### Tomek Links

In [140]:
from imblearn.under_sampling import TomekLinks

def tomekSampler(X_train, y_train):
    """Under-sample with ``TomekLinks`` (majority strategy) and shuffle the result.

    Parameters
    ----------
    X_train : training features.
    y_train : training labels.

    Returns
    -------
    tuple
        ``(X_balanced, y_balanced)`` after resampling and shuffling.
    """
    tl = TomekLinks(sampling_strategy='majority')
    # `fit_resample` replaces `fit_sample`, which was removed from imbalanced-learn.
    X_balanced, y_balanced = tl.fit_resample(X_train, y_train)
    # Shuffle features and labels together so they stay aligned.
    X_balanced, y_balanced = shuffle(X_balanced, y_balanced)
    return X_balanced, y_balanced

#### Cluster Centroids

In [141]:
from imblearn.under_sampling import ClusterCentroids

def centroidSampler(X_train, y_train):
    """Under-sample with ``ClusterCentroids`` (majority strategy) and shuffle the result.

    Parameters
    ----------
    X_train : training features.
    y_train : training labels.

    Returns
    -------
    tuple
        ``(X_balanced, y_balanced)`` after resampling and shuffling.
    """
    cc = ClusterCentroids(sampling_strategy='majority')
    # `fit_resample` replaces `fit_sample`, which was removed from imbalanced-learn.
    X_balanced, y_balanced = cc.fit_resample(X_train, y_train)
    # Shuffle features and labels together so they stay aligned.
    X_balanced, y_balanced = shuffle(X_balanced, y_balanced)
    return X_balanced, y_balanced

# Data transformation

### Discretization

In [142]:
from sklearn.preprocessing import KBinsDiscretizer

def discretize(X_train, X_test):
    """Bin the listed numeric columns into 5 uniform-width ordinal bins.

    The discretizer is fitted on the training frame and then applied to the
    test frame. Both inputs are left unmodified: the binned values are written
    into copies, so the same frames can safely be passed to further experiments.

    Parameters
    ----------
    X_train : pandas.DataFrame containing the columns listed below.
    X_test : pandas.DataFrame containing the same columns.

    Returns
    -------
    tuple
        ``(X_train_discretized, X_test_discretized)`` as new DataFrames.
    """
    featuresToDiscretize = ['Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Weight', 'Height', 'Body mass index']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

    # Work on copies: assigning into the caller's frames would make every
    # later use of X_train/X_test see already-binned data.
    X_train_discretized = X_train.copy()
    X_test_discretized = X_test.copy()

    X_train_discretized[featuresToDiscretize] = discretizer.fit_transform(X_train[featuresToDiscretize])
    X_test_discretized[featuresToDiscretize] = discretizer.transform(X_test[featuresToDiscretize])
    return X_train_discretized, X_test_discretized

### Standardization

In [143]:
from sklearn.preprocessing import StandardScaler

def scalerFunc(X_train, X_test):
    """Standardise both splits with a ``StandardScaler`` fitted on the training data.

    Parameters
    ----------
    X_train : training features; used to fit the scaler.
    X_test : test features; transformed with the fitted scaler.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)`` as returned by the scaler.
    """
    standardizer = StandardScaler()
    train_scaled = standardizer.fit_transform(X_train)
    test_scaled = standardizer.transform(X_test)
    return train_scaled, test_scaled

# Feature Selection

### Funções auxiliares

In [144]:
def getPickedFeatures(selector, data):
    """Return the names of the columns of ``data`` kept by a fitted selector.

    Parameters
    ----------
    selector : object exposing ``get_support(indices=True)``.
    data : object with a ``columns`` index (e.g. a pandas DataFrame) whose
        positions match the indices reported by the selector.

    Returns
    -------
    list
        Column names at the selected positions.
    """
    selected_features_index = selector.get_support(indices=True)
    return list(data.columns[selected_features_index])


def getDroppedFeatures(selector, data):
    """Return the names of the columns of ``data`` rejected by a fitted selector.

    Parameters
    ----------
    selector : object exposing ``get_support(indices=True)``.
    data : object with a ``columns`` index (e.g. a pandas DataFrame) whose
        positions match the indices reported by the selector.

    Returns
    -------
    list
        Names of the columns that were not selected, in column order.
    """
    selected_features_index = selector.get_support(indices=True)
    # Sort the set difference so the result follows column order rather than
    # whatever order set iteration happens to produce.
    dropped_features_index = sorted(set(range(data.columns.size)) - set(selected_features_index))
    return list(data.columns[dropped_features_index])


def printFeatureSelection(selector, data):
    """Print which columns of ``data`` a fitted selector kept and which it dropped.

    Each column is printed as an ``(index, name)`` tuple, first under the
    kept-features heading and then under the dropped-features heading.

    Parameters
    ----------
    selector : object exposing ``get_support(indices=True)``.
    data : object with a ``columns`` index (e.g. a pandas DataFrame).

    Returns
    -------
    None
    """
    kept_idx = selector.get_support(indices=True)
    every_idx = set(list(range(0, data.columns.size)))
    removed_idx = list(every_idx - set(kept_idx))

    # Resolve both name lists before printing anything.
    kept_pairs = list(zip(kept_idx, list(data.columns[kept_idx])))
    removed_pairs = list(zip(removed_idx, list(data.columns[removed_idx])))

    print("Features mantidas:")
    for pair in kept_pairs:
        print("\t" + str(pair))

    print("Features eliminadas:")
    for pair in removed_pairs:
        print("\t" + str(pair))



#### SelectKBest (Filter)

In [145]:
from sklearn.feature_selection import SelectKBest, f_classif

def selectKBest(X_train, y_train, X_test):
    """Keep the 12 best features according to ``f_classif`` scores.

    The selector is fitted on the training data only and then applied to
    both splits.

    Parameters
    ----------
    X_train : training features.
    y_train : training labels.
    X_test : test features with the same columns as ``X_train``.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    fitted_selector = SelectKBest(f_classif, k=12).fit(X_train, y_train)
    train_reduced = fitted_selector.transform(X_train)
    test_reduced = fitted_selector.transform(X_test)
    return train_reduced, test_reduced

#### Recursive Feature Elimination (Wrapper)

In [146]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


def rfeLogReg(X_train, y_train, X_test):
    """Select 12 features by recursive feature elimination with logistic regression.

    The selector is fitted on the training data only and then applied to
    both splits.

    Parameters
    ----------
    X_train : training features.
    y_train : training labels.
    X_test : test features with the same columns as ``X_train``.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    # `n_features_to_select` is passed by keyword: current scikit-learn
    # releases no longer accept it positionally.
    rfe_log_selector = RFE(LogisticRegression(), n_features_to_select=12)
    rfe_log_selector = rfe_log_selector.fit(X_train, y_train)
    X_train_selected = rfe_log_selector.transform(X_train)
    X_test_selected = rfe_log_selector.transform(X_test)

    return X_train_selected, X_test_selected

def rfeSVC(X_train, y_train, X_test):
    """Select 12 features by recursive feature elimination with a linear SVC.

    The selector is fitted on the training data only and then applied to
    both splits.

    Parameters
    ----------
    X_train : training features.
    y_train : training labels.
    X_test : test features with the same columns as ``X_train``.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    # `n_features_to_select` is passed by keyword: current scikit-learn
    # releases no longer accept it positionally.
    rfe_svc_selector = RFE(SVC(kernel='linear'), n_features_to_select=12)
    rfe_svc_selector = rfe_svc_selector.fit(X_train, y_train)
    X_train_selected = rfe_svc_selector.transform(X_train)
    X_test_selected = rfe_svc_selector.transform(X_test)

    return X_train_selected, X_test_selected

#  Evaluate different models

In [147]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

### Creating and Training the Models

In [148]:
# Experiment grid: every transformer x (feature selector, model) x balancer.
# Transformer     : discretize, scalerFunc
# Balancer        : overSampler, smoteSampler, underSampler, tomekSampler, centroidSampler
# FeatureSelector : selectKBest, rfeLogReg, rfeSVC

exp_transformers = [discretize, scalerFunc]

exp_balancers = [overSampler, smoteSampler, underSampler, tomekSampler, centroidSampler]

# Each feature selector is paired with a factory so that every run gets a
# freshly constructed, unfitted model.
exp_selector_models = [
    (selectKBest, lambda: KNeighborsClassifier(n_neighbors=5)),
    (selectKBest, lambda: SVC()),
    (rfeLogReg, lambda: LogisticRegression()),
    (rfeSVC, lambda: SVC(kernel='linear')),
]

# Loop nesting reproduces the run order: transformer, then selector/model, then balancer.
for exp_transformer in exp_transformers:
    for exp_selector, exp_make_model in exp_selector_models:
        for exp_balancer in exp_balancers:
            TransformBalanceSelectTrainPredict(
                X_train, y_train, X_test, y_test,
                exp_transformer, exp_balancer, exp_selector,
                exp_make_model(), "test",
            )


     Reason for absence  Month of absence  Day of the week  Seasons  \
ID                                                                    
1                    26                 7                3        1   
2                     0                 7                3        1   
3                    23                 7                4        1   
4                     7                 7                5        1   
5                    23                 7                5        1   
..                  ...               ...              ...      ...   
496                  28                 9                3        1   
497                  28                 9                3        1   
498                  28                 9                3        1   
499                  23                 9                3        1   
500                  28                 9                5        1   

     Transportation expense  Distance from Residence to Work  Service time  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

0.6708333333333333
              precision    recall  f1-score   support

           0       0.20      0.27      0.23        44
           1       0.82      0.76      0.79       196

    accuracy                           0.67       240
   macro avg       0.51      0.52      0.51       240
weighted avg       0.71      0.67      0.69       240



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

0.7125
              precision    recall  f1-score   support

           0       0.25      0.30      0.27        44
           1       0.84      0.81      0.82       196

    accuracy                           0.71       240
   macro avg       0.55      0.55      0.55       240
weighted avg       0.73      0.71      0.72       240

0.6375
              precision    recall  f1-score   support

           0       0.20      0.32      0.24        44
           1       0.82      0.71      0.76       196

    accuracy                           0.64       240
   macro avg       0.51      0.51      0.50       240
weighted avg       0.71      0.64      0.67       240



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

0.7958333333333333
              precision    recall  f1-score   support

           0       0.38      0.18      0.25        44
           1       0.84      0.93      0.88       196

    accuracy                           0.80       240
   macro avg       0.61      0.56      0.56       240
weighted avg       0.75      0.80      0.77       240



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

0.45
              precision    recall  f1-score   support

           0       0.19      0.64      0.30        44
           1       0.83      0.41      0.55       196

    accuracy                           0.45       240
   macro avg       0.51      0.52      0.42       240
weighted avg       0.72      0.45      0.50       240

0.6166666666666667
              precision    recall  f1-score   support

           0       0.23      0.48      0.31        44
           1       0.85      0.65      0.73       196

    accuracy                           0.62       240
   macro avg       0.54      0.56      0.52       240
weighted avg       0.73      0.62      0.66       240

0.6125
              precision    recall  f1-score   support

           0       0.18      0.32      0.23        44
           1       0.82      0.68      0.74       196

    accuracy                           0.61       240
   macro avg       0.50      0.50      0.49       240
weighted avg       0.70      0.61      0.65

0.55
              precision    recall  f1-score   support

           0       0.21      0.52      0.30        44
           1       0.84      0.56      0.67       196

    accuracy                           0.55       240
   macro avg       0.52      0.54      0.48       240
weighted avg       0.72      0.55      0.60       240



### Random Forest

Abordagem com bagging, através de random forests, para lidar com o problema do dataset desbalanceado.

In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


# Reload the test features and the labels the predictions are scored against.
X_test = pd.read_csv('data/test_data.csv', index_col=0)
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

# Split target and features. `columns=` is used because the positional axis
# argument (`drop('Absent', 1)`) is rejected by pandas >= 2.0.
target = AbsenteeismAtWork['Absent']
data = AbsenteeismAtWork.drop(columns='Absent')

# Standardise using statistics from the training data only.
scaler = StandardScaler()
data = scaler.fit_transform(data)

# NOTE(review): the original cell called createKBestSelector(data, target, 10),
# which is not defined anywhere in this notebook. It is assumed to have been a
# SelectKBest(f_classif, k=10) fitted on (data, target) - confirm.
kbest_selector = SelectKBest(f_classif, k=10).fit(data, target)
selected = kbest_selector.transform(data)

# Apply the same scaling and feature selection to the test split.
X_test_scaled = scaler.transform(X_test)
X_test_selected = kbest_selector.transform(X_test_scaled)

clf = RandomForestClassifier()
clf.fit(selected, target)

pred = clf.predict(X_test_selected)

print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.725
              precision    recall  f1-score   support

           0       0.13      0.09      0.11        44
           1       0.81      0.87      0.84       196

    accuracy                           0.73       240
   macro avg       0.47      0.48      0.47       240
weighted avg       0.69      0.72      0.70       240



### Cost-Sensitive Training

In [48]:
from sklearn.svm import SVC


# Linear SVC trained with class_weight='balanced'.
# Reuses `selected`, `target` and `X_test_selected` from the Random Forest cell.
svc = SVC(probability=True, class_weight='balanced', kernel='linear')
svc.fit(selected, target)

predSvc = svc.predict(X_test_selected)

print(accuracy_score(y_test, predSvc))
print(classification_report(y_test, predSvc))

0.6083333333333333
              precision    recall  f1-score   support

           0       0.16      0.27      0.20        44
           1       0.81      0.68      0.74       196

    accuracy                           0.61       240
   macro avg       0.48      0.48      0.47       240
weighted avg       0.69      0.61      0.64       240



### AdaBoost

In [57]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Reload the test features and the labels the predictions are scored against.
X_test = pd.read_csv('data/test_data.csv', index_col=0)
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

# Split target and features. `columns=` is used because the positional axis
# argument (`drop('Absent', 1)`) is rejected by pandas >= 2.0.
target = AbsenteeismAtWork['Absent']
data = AbsenteeismAtWork.drop(columns='Absent')

# Standardise using statistics from the training data only.
scaler = StandardScaler()
data = scaler.fit_transform(data)

X_test_scaled = scaler.transform(X_test)

# AdaBoost over 200 depth-1 decision trees; no feature selection in this cell.
classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200
)

classifier.fit(data, target)

predAda = classifier.predict(X_test_scaled)

print(accuracy_score(y_test, predAda))
print(classification_report(y_test, predAda))

0.7375
              precision    recall  f1-score   support

           0       0.19      0.14      0.16        44
           1       0.82      0.87      0.84       196

    accuracy                           0.74       240
   macro avg       0.51      0.50      0.50       240
weighted avg       0.70      0.74      0.72       240

