### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
%matplotlib inline

### Load train and test dataset

In [2]:
AbsenteeismAtWork = pd.read_csv('data/train_data.csv', index_col=0)
AbsenteeismAtWork['Work load Average/day '] = [x.replace(',', '') for x in AbsenteeismAtWork['Work load Average/day ']]
AbsenteeismAtWork['Work load Average/day '] = AbsenteeismAtWork['Work load Average/day '].astype(int)

X_train = AbsenteeismAtWork.drop('Absent', 1)
y_train =  AbsenteeismAtWork['Absent']
X_test = pd.read_csv('data/test_data.csv', index_col=0)
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)



# Funções auxiliares

In [3]:
def TransformBalanceSelectTrainPredict( transformer, balancer, featureSelector, model, name):
    X_train = AbsenteeismAtWork.drop('Absent', 1)
    y_train =  AbsenteeismAtWork['Absent']
    X_test = pd.read_csv('data/test_data.csv', index_col=0)
    y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

    # Normalizar, discretizar ou standardizar
    X_train_transformed, X_test_transformed = transformer(X_train, X_test)
    
    # Balancear Data Set
    X_train_balanced, y_train_balanced = balancer(X_train_transformed, y_train)
    
    # Feature Selection
    X_train_selected, X_test_selected = featureSelector(X_train_balanced, y_train_balanced, X_test_transformed)
    
    # Treinar modelo
    model.fit(X_train_selected, y_train_balanced)
    
    # Prever resultados para test set
    predicted = model.predict(X_test_selected)
    
    # Avaliar modelo
    evaluateModel(name, y_test, predicted)
    return;

def evaluateModel(name, y_test, predicted):
    print("".join(["Precision 0: ",'%.3f' % precision_score(y_test,predicted,pos_label=0),
                   "; Precision 1: ",'%.3f' % precision_score(y_test,predicted,pos_label=1),
                   "; Recall 0: ",'%.3f' % recall_score(y_test,predicted,pos_label=0),
                   "; Recall 1: ",'%.3f' % recall_score(y_test,predicted,pos_label=1),
                  "; Accuracy: ",'%.3f' % accuracy_score(y_test,predicted), "; -> " , name]))
    return;


# Balance Dataset

### Up-sample minority class

#### Resample with replacement

Método mais simples que consiste em replicar aleatoriamente (com reposição) dados da classe minoritária até atingir ratio de 1:1

In [4]:
from imblearn.over_sampling import RandomOverSampler

def overSampler(X_train, y_train):
    ros = RandomOverSampler()
    X_balanced, y_train = ros.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

#### SMOTE - Synthetic Minority Over-sampling Technique

In [5]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')

def smoteSampler(X_train, y_train):
    smote = SMOTE(sampling_strategy='minority')
    X_balanced, y_train = smote.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

### Down-sample majority class

#### Resample without replacement

In [6]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler()

def underSampler(X_train, y_train):
    rus = RandomUnderSampler()
    X_balanced, y_train = rus.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;


#### Tomek Links

In [7]:
from imblearn.under_sampling import TomekLinks

def tomekSampler(X_train, y_train):
    tl = TomekLinks(sampling_strategy='majority')
    X_balanced, y_train = tl.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

#### Cluster Centroids

In [8]:
from imblearn.under_sampling import ClusterCentroids

def centroidSampler(X_train, y_train):
    cc = ClusterCentroids(sampling_strategy='majority')
    X_balanced, y_train = cc.fit_sample(X_train, y_train)
    X_balanced, y_train = shuffle(X_balanced, y_train)
    return X_balanced, y_train;

# Data transformation

### Discretization

In [9]:
from sklearn.preprocessing import KBinsDiscretizer

def discretize(X_train, X_test):
    featuresToDiscretize = ['Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Weight', 'Height', 'Body mass index']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    X_train[featuresToDiscretize] = discretizer.fit_transform(X_train[featuresToDiscretize])
    X_test[featuresToDiscretize] = discretizer.transform(X_test[featuresToDiscretize])
    return X_train, X_test;

### Standardization

In [10]:
from sklearn.preprocessing import StandardScaler

def scalerFunc(X_train, X_test): 
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform( X_train )
    X_test_transformed = scaler.transform( X_test )
    return scaled_data, X_test_transformed;

# Feature Selection

### Funções auxiliares

In [11]:
def getPickedFeatures(selector, data):
    selected_features_index = selector.get_support(indices=True)
    dropped_features_index = list( set(list(range(0, data.columns.size))) - (set(selected_features_index)))

    selected_features_names = list(data.columns[selected_features_index])
    return selected_features_names;


def getDroppedFeatures(selector, data):
    selected_features_index = selector.get_support(indices=True)
    dropped_features_index = list( set(list(range(0, data.columns.size))) - (set(selected_features_index)))

    dropped_features_names = list(data.columns[dropped_features_index])
    return dropped_features_names;


def printFeatureSelection(selector, data):
    selected_features_index = selector.get_support(indices=True)
    dropped_features_index = list( set(list(range(0, data.columns.size))) - (set(selected_features_index)))

    selected_features_names = zip(selected_features_index,  list(data.columns[selected_features_index]))
    dropped_features_names = zip(dropped_features_index, list(data.columns[dropped_features_index]))

    print("Features mantidas:")
    for cn in selected_features_names:
        print("\t" + str(cn))

    print("Features eliminadas:")
    for cn in dropped_features_names:
        print("\t" + str(cn))
    return;



#### SelectKBest (Filter)

In [12]:
from sklearn.feature_selection import SelectKBest, f_classif

def selectKBest(X_train, y_train, X_test):
    kbest_selector = SelectKBest(f_classif, k=12)
    selector = kbest_selector.fit(X_train, y_train)
    #printFeatureSelection(selector, X_balanced)
    X_train_selected = kbest_selector.transform(X_train)
    X_test_selected = kbest_selector.transform(X_test)
    
    return X_train_selected, X_test_selected;

#### Recursive Feature Elimination (Wrapper)

In [13]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


def rfeLogReg(X_train, y_train, X_test):
    rfe_log_selector = RFE(LogisticRegression(), 12)
    rfe_log_selector = rfe_log_selector.fit(X_train, y_train)
    #printFeatureSelection(selector, X_balanced)
    X_train_selected = rfe_log_selector.transform(X_train)
    X_test_selected = rfe_log_selector.transform(X_test)
    
    return X_train_selected, X_test_selected;

def rfeSVC(X_train, y_train, X_test):
    rfe_svc_selector = RFE(SVC(kernel='linear'), 12)
    rfe_svc_selector = rfe_svc_selector.fit(X_train, y_train)
    #printFeatureSelection(selector, X_balanced)
    X_train_selected = rfe_svc_selector.transform(X_train)
    X_test_selected = rfe_svc_selector.transform(X_test)
    
    return X_train_selected, X_test_selected;

#  Evaluate different models

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

### Creating and Training the Models

In [15]:
# Transformer : discretize , scalerFunc
# Balancer: overSampler , smoteSampler , underSampler , tomekSampler , centroidSampler
# FeatureSelector : selectKBest , rfeLogReg , rfeSVC

TransformBalanceSelectTrainPredict( discretize, overSampler, selectKBest, KNeighborsClassifier(n_neighbors=5), "test")
TransformBalanceSelectTrainPredict( discretize, smoteSampler, selectKBest, KNeighborsClassifier(n_neighbors=5), "test")
TransformBalanceSelectTrainPredict( discretize, underSampler, selectKBest, KNeighborsClassifier(n_neighbors=5), "test")
TransformBalanceSelectTrainPredict( discretize, tomekSampler, selectKBest, KNeighborsClassifier(n_neighbors=5), "test")
TransformBalanceSelectTrainPredict( discretize, centroidSampler, selectKBest, KNeighborsClassifier(n_neighbors=5), "test")

TransformBalanceSelectTrainPredict( discretize, overSampler, selectKBest, SVC(), "test")
TransformBalanceSelectTrainPredict( discretize, smoteSampler, selectKBest, SVC(), "test")
TransformBalanceSelectTrainPredict( discretize, underSampler, selectKBest, SVC(), "test")
TransformBalanceSelectTrainPredict( discretize, tomekSampler, selectKBest, SVC(), "test")
TransformBalanceSelectTrainPredict( discretize, centroidSampler, selectKBest, SVC(), "test")


TransformBalanceSelectTrainPredict( discretize, overSampler, rfeLogReg,  LogisticRegression(), "test")
TransformBalanceSelectTrainPredict( discretize, smoteSampler, rfeLogReg, LogisticRegression(), "test")
TransformBalanceSelectTrainPredict( discretize, underSampler, rfeLogReg, LogisticRegression(), "test")
TransformBalanceSelectTrainPredict( discretize, tomekSampler, rfeLogReg, LogisticRegression(), "test")
TransformBalanceSelectTrainPredict( discretize, centroidSampler, rfeLogReg, LogisticRegression(), "test")


TransformBalanceSelectTrainPredict( discretize, overSampler, rfeSVC, SVC(kernel='linear'), "test")
TransformBalanceSelectTrainPredict( discretize, smoteSampler, rfeSVC, SVC(kernel='linear'), "test")
TransformBalanceSelectTrainPredict( discretize, underSampler, rfeSVC, SVC(kernel='linear'), "test")
TransformBalanceSelectTrainPredict( discretize, tomekSampler, rfeSVC, SVC(kernel='linear'), "test")
TransformBalanceSelectTrainPredict( discretize, centroidSampler, rfeSVC, SVC(kernel='linear'), "test")


TransformBalanceSelectTrainPredict( scalerFunc, overSampler, selectKBest, KNeighborsClassifier(n_neighbors=5), "test")
TransformBalanceSelectTrainPredict( scalerFunc, smoteSampler, selectKBest, KNeighborsClassifier(n_neighbors=5), "test")
TransformBalanceSelectTrainPredict( scalerFunc, underSampler, selectKBest, KNeighborsClassifier(n_neighbors=5), "test")
TransformBalanceSelectTrainPredict( scalerFunc, tomekSampler, selectKBest, KNeighborsClassifier(n_neighbors=5), "test")
TransformBalanceSelectTrainPredict( scalerFunc, centroidSampler, selectKBest, KNeighborsClassifier(n_neighbors=5), "test")


TransformBalanceSelectTrainPredict( scalerFunc, overSampler, selectKBest, SVC(), "test")
TransformBalanceSelectTrainPredict( scalerFunc, smoteSampler, selectKBest, SVC(), "test")
TransformBalanceSelectTrainPredict( scalerFunc, underSampler, selectKBest, SVC(), "test")
TransformBalanceSelectTrainPredict( scalerFunc, tomekSampler, selectKBest, SVC(), "test")
TransformBalanceSelectTrainPredict( scalerFunc, centroidSampler, selectKBest, SVC(), "test")


TransformBalanceSelectTrainPredict( scalerFunc, overSampler, rfeLogReg, LogisticRegression(), "test")
TransformBalanceSelectTrainPredict( scalerFunc, smoteSampler, rfeLogReg, LogisticRegression(), "test")
TransformBalanceSelectTrainPredict( scalerFunc, underSampler, rfeLogReg, LogisticRegression(), "test")
TransformBalanceSelectTrainPredict( scalerFunc, tomekSampler, rfeLogReg, LogisticRegression(), "test")
TransformBalanceSelectTrainPredict( scalerFunc, centroidSampler, rfeLogReg, LogisticRegression(), "test")


TransformBalanceSelectTrainPredict( scalerFunc, overSampler, rfeSVC, SVC(kernel='linear'), "test")
TransformBalanceSelectTrainPredict( scalerFunc, smoteSampler, rfeSVC, SVC(kernel='linear'), "test")
TransformBalanceSelectTrainPredict( scalerFunc, underSampler, rfeSVC, SVC(kernel='linear'), "test")
TransformBalanceSelectTrainPredict( scalerFunc, tomekSampler, rfeSVC, SVC(kernel='linear'), "test")
TransformBalanceSelectTrainPredict( scalerFunc, centroidSampler, rfeSVC, SVC(kernel='linear'), "test")


Precision 0: 0.175; Precision 1: 0.814; Recall 0: 0.227; Recall 1: 0.760; Accuracy: 0.662; -> test
Precision 0: 0.250; Precision 1: 0.841; Recall 0: 0.364; Recall 1: 0.755; Accuracy: 0.683; -> test
Precision 0: 0.215; Precision 1: 0.837; Recall 0: 0.455; Recall 1: 0.628; Accuracy: 0.596; -> test
Precision 0: 0.217; Precision 1: 0.820; Recall 0: 0.114; Recall 1: 0.908; Accuracy: 0.762; -> test
Precision 0: 0.217; Precision 1: 0.834; Recall 0: 0.409; Recall 1: 0.668; Accuracy: 0.621; -> test
Precision 0: 0.185; Precision 1: 0.817; Recall 0: 0.273; Recall 1: 0.730; Accuracy: 0.646; -> test
Precision 0: 0.205; Precision 1: 0.826; Recall 0: 0.341; Recall 1: 0.704; Accuracy: 0.637; -> test
Precision 0: 0.211; Precision 1: 0.822; Recall 0: 0.182; Recall 1: 0.847; Accuracy: 0.725; -> test
Precision 0: 0.312; Precision 1: 0.826; Recall 0: 0.114; Recall 1: 0.944; Accuracy: 0.792; -> test
Precision 0: 0.208; Precision 1: 0.833; Recall 0: 0.455; Recall 1: 0.612; Accuracy: 0.583; -> test


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

Precision 0: 0.226; Precision 1: 0.829; Recall 0: 0.273; Recall 1: 0.791; Accuracy: 0.696; -> test


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Precision 0: 0.211; Precision 1: 0.825; Recall 0: 0.273; Recall 1: 0.770; Accuracy: 0.679; -> test


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

Precision 0: 0.223; Precision 1: 0.842; Recall 0: 0.477; Recall 1: 0.628; Accuracy: 0.600; -> test


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

Precision 0: 0.381; Precision 1: 0.836; Recall 0: 0.182; Recall 1: 0.934; Accuracy: 0.796; -> test


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

Precision 0: 0.195; Precision 1: 0.832; Recall 0: 0.591; Recall 1: 0.454; Accuracy: 0.479; -> test
Precision 0: 0.222; Precision 1: 0.848; Recall 0: 0.545; Recall 1: 0.571; Accuracy: 0.567; -> test
Precision 0: 0.159; Precision 1: 0.811; Recall 0: 0.159; Recall 1: 0.811; Accuracy: 0.692; -> test
Precision 0: 0.221; Precision 1: 0.838; Recall 0: 0.432; Recall 1: 0.658; Accuracy: 0.617; -> test
Precision 0: 0.375; Precision 1: 0.823; Recall 0: 0.068; Recall 1: 0.974; Accuracy: 0.808; -> test
Precision 0: 0.189; Precision 1: 0.824; Recall 0: 0.568; Recall 1: 0.454; Accuracy: 0.475; -> test
Precision 0: 0.189; Precision 1: 0.819; Recall 0: 0.318; Recall 1: 0.694; Accuracy: 0.625; -> test
Precision 0: 0.132; Precision 1: 0.793; Recall 0: 0.227; Recall 1: 0.663; Accuracy: 0.583; -> test
Precision 0: 0.217; Precision 1: 0.834; Recall 0: 0.409; Recall 1: 0.668; Accuracy: 0.621; -> test
Precision 0: 0.294; Precision 1: 0.825; Recall 0: 0.114; Recall 1: 0.939; Accuracy: 0.787; -> test
Precision 

### Random Forest

Abordagem com bagging através de random forests para superar problema de dataset desbalanceado

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


X_test = pd.read_csv('data/test_data.csv', index_col=0)
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

target = AbsenteeismAtWork['Absent']
data = AbsenteeismAtWork.drop('Absent', 1)

scaler = StandardScaler()
data = scaler.fit_transform( data )
X_test_scaled = scaler.transform(X_test)


clf = RandomForestClassifier()
clf.fit(data, target)
 
pred = clf.predict(X_test_scaled)
 
print( accuracy_score(y_test, pred) )
print(classification_report(y_test, pred))

0.7833333333333333
              precision    recall  f1-score   support

           0       0.28      0.11      0.16        44
           1       0.82      0.93      0.88       196

    accuracy                           0.78       240
   macro avg       0.55      0.52      0.52       240
weighted avg       0.72      0.78      0.74       240



### Cost-Sensitive Training

In [17]:
from sklearn.svm import SVC


svc = SVC(kernel='linear', 
            class_weight='balanced', 
            probability=True)

svc.fit(selected, target)

predSvc = svc.predict(X_test_selected)
 
print( accuracy_score(y_test, predSvc) )
print(classification_report(y_test, predSvc))

NameError: name 'selected' is not defined

### Adaboosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

X_test = pd.read_csv('data/test_data.csv', index_col=0)
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

target = AbsenteeismAtWork['Absent']
data = AbsenteeismAtWork.drop('Absent', 1)

scaler = StandardScaler()
data = scaler.fit_transform( data )

X_test_scaled = scaler.transform(X_test)

classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200
)


classifier.fit(data, target)

predAda = classifier.predict(X_test_scaled)
 
print( accuracy_score(y_test, predAda) )
print(classification_report(y_test, predAda))