### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
%matplotlib inline

### Load train and test datasets

In [2]:
# Load the training data; the first CSV column becomes the index.
AbsenteeismAtWork = pd.read_csv('data/train_data.csv', index_col=0)

# 'Work load Average/day ' (the trailing space is part of the column name) is
# read as text containing commas; strip them so the column can be cast to int.
AbsenteeismAtWork['Work load Average/day '] = (
    AbsenteeismAtWork['Work load Average/day ']
    .str.replace(',', '', regex=False)
    .astype(int)
)

# Split features from the 'Absent' target. The axis is named explicitly
# because recent pandas releases no longer accept it positionally.
X_train = AbsenteeismAtWork.drop(columns='Absent')
y_train = AbsenteeismAtWork['Absent']

X_test = pd.read_csv('data/test_data.csv', index_col=0)
# NOTE(review): the test labels are read from sample_submission.csv --
# confirm this file holds true labels rather than placeholder predictions.
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)



# Funções auxiliares

In [3]:
def TransformBalanceSelectTrainPredict(featureSelector, model, name):
    """Run one end-to-end experiment and print its test-set metrics.

    The pipeline is: rebuild train/test frames, discretize, oversample the
    training data, select features, fit ``model``, predict on the test
    features and print the scores through ``evaluateModel``.

    Parameters
    ----------
    featureSelector : callable
        Called as ``featureSelector(X_train, y_train, X_test)``; must return
        the pair ``(X_train_selected, X_test_selected)``.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
        in place.
    name : str
        Label appended to the printed metrics line.

    Returns
    -------
    None
    """
    # Rebuild the inputs on every call so each experiment starts from
    # untouched data, whatever earlier runs did to their frames.
    # The axis is named explicitly: recent pandas releases reject the old
    # positional form drop('Absent', 1).
    X_train = AbsenteeismAtWork.drop(columns='Absent')
    y_train = AbsenteeismAtWork['Absent']
    X_test = pd.read_csv('data/test_data.csv', index_col=0)
    y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

    # Transform: bin the continuous features.
    X_train_transformed, X_test_transformed = discretize(X_train, X_test)

    # Balance the training set only; the test set is left as is.
    X_train_balanced, y_train_balanced = overSampler(X_train_transformed, y_train)

    # Feature selection is fitted on the balanced training data and then
    # applied to the (unbalanced) transformed test data.
    X_train_selected, X_test_selected = featureSelector(
        X_train_balanced, y_train_balanced, X_test_transformed
    )

    # Train the model.
    model.fit(X_train_selected, y_train_balanced)

    # Predict on the test set.
    predicted = model.predict(X_test_selected)

    # Report the scores.
    evaluateModel(name, y_test, predicted)

def evaluateModel(name, y_test, predicted):
    """Print a one-line summary of per-class precision/recall and accuracy.

    Parameters
    ----------
    name : str
        Label printed at the end of the line, after ``"; -> "``.
    y_test : array-like
        Reference labels.
    predicted : array-like
        Labels predicted by the model.

    Returns
    -------
    None
    """
    # (prefix, value) pairs in the exact order they appear on the line.
    scores = [
        ("Prec 0: ", precision_score(y_test, predicted, pos_label=0)),
        ("; Prec 1: ", precision_score(y_test, predicted, pos_label=1)),
        ("; Rec 0: ", recall_score(y_test, predicted, pos_label=0)),
        ("; Rec 1: ", recall_score(y_test, predicted, pos_label=1)),
        ("; Acc: ", accuracy_score(y_test, predicted)),
    ]
    pieces = []
    for prefix, value in scores:
        pieces.append(prefix)
        pieces.append('%.3f' % value)
    pieces.append("; -> ")
    pieces.append(name)
    print("".join(pieces))
    return


# Balance Dataset

### Up-sample minority class

#### Resample with replacement

In [4]:
from imblearn.over_sampling import RandomOverSampler

def overSampler(X_train, y_train, random_state=None):
    """Balance the training set by random over-sampling, then shuffle it.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    random_state : int or None, optional
        Seed forwarded to both the over-sampler and the shuffle. The default
        ``None`` keeps the original unseeded behaviour; pass an int to make
        the experiment reproducible.

    Returns
    -------
    tuple
        ``(X_balanced, y_balanced)``, resampled and shuffled together.
    """
    ros = RandomOverSampler(random_state=random_state)
    # fit_resample is the current name of the resampling call; the older
    # fit_sample alias has been removed from imbalanced-learn.
    X_balanced, y_balanced = ros.fit_resample(X_train, y_train)
    # Shuffle features and labels together so rows stay aligned.
    X_balanced, y_balanced = shuffle(X_balanced, y_balanced, random_state=random_state)
    return X_balanced, y_balanced

# Data transformation

### Discretization

In [5]:
from sklearn.preprocessing import KBinsDiscretizer

def discretize(X_train, X_test, n_bins=5):
    """Bin the continuous features of both frames into ordinal bin codes.

    The discretizer is fitted on the training frame only and then applied
    to the test frame. The inputs are not modified; binned copies are
    returned.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must contain every column listed below.
    X_test : pandas.DataFrame
        Test features; must contain the same columns.
    n_bins : int, optional
        Number of bins per feature (default 5).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_binned, X_test_binned)``.
    """
    # Note: 'Work load Average/day ' really does end with a space.
    featuresToDiscretize = [
        'Transportation expense',
        'Distance from Residence to Work',
        'Service time',
        'Age',
        'Work load Average/day ',
        'Hit target',
        'Weight',
        'Height',
        'Body mass index',
    ]
    discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')

    # Work on copies so the caller's frames are never overwritten.
    X_train_binned = X_train.copy()
    X_test_binned = X_test.copy()
    X_train_binned[featuresToDiscretize] = discretizer.fit_transform(X_train[featuresToDiscretize])
    X_test_binned[featuresToDiscretize] = discretizer.transform(X_test[featuresToDiscretize])
    return X_train_binned, X_test_binned

# Feature Selection

### Funções auxiliares

In [6]:
def getPickedFeatures(selector, data):
    """Return the names of the columns kept by a fitted feature selector.

    Parameters
    ----------
    selector : object
        Fitted selector exposing ``get_support(indices=True)``.
    data : pandas.DataFrame
        Frame whose columns line up positionally with the selector's input.

    Returns
    -------
    list
        Column names at the selected positions, in the order the selector
        reports them.
    """
    selected_features_index = selector.get_support(indices=True)
    return list(data.columns[selected_features_index])


def getDroppedFeatures(selector, data):
    """Return the names of the columns rejected by a fitted feature selector.

    Parameters
    ----------
    selector : object
        Fitted selector exposing ``get_support(indices=True)``.
    data : pandas.DataFrame
        Frame whose columns line up positionally with the selector's input.

    Returns
    -------
    list
        Names of the columns that were not selected, in column order.
    """
    selected_positions = set(selector.get_support(indices=True))
    all_positions = set(range(data.columns.size))
    # Set iteration order is unspecified, so sort to get a stable,
    # column-ordered result.
    dropped_features_index = sorted(all_positions - selected_positions)
    return list(data.columns[dropped_features_index])


def printFeatureSelection(selector, data):
    """Print which columns a fitted selector kept and which it dropped.

    Each column is printed as an ``(index, name)`` tuple on its own
    tab-indented line, first under "Features mantidas:" (kept) and then
    under "Features eliminadas:" (dropped).

    Parameters
    ----------
    selector : object
        Fitted selector exposing ``get_support(indices=True)``.
    data : pandas.DataFrame
        Frame whose columns line up positionally with the selector's input.

    Returns
    -------
    None
    """
    selected_features_index = selector.get_support(indices=True)
    # Sort the complement: set iteration order is unspecified, and the
    # listing should follow column order.
    dropped_features_index = sorted(
        set(range(data.columns.size)) - set(selected_features_index)
    )

    print("Features mantidas:")
    for cn in zip(selected_features_index, list(data.columns[selected_features_index])):
        print("\t" + str(cn))

    print("Features eliminadas:")
    for cn in zip(dropped_features_index, list(data.columns[dropped_features_index])):
        print("\t" + str(cn))

#### VarianceThreshold

In [7]:
from sklearn.feature_selection import VarianceThreshold

def selectVarianceThreshold(X_train, y_train, X_test):
    """Filter features with a default-configured ``VarianceThreshold``.

    The selector is fitted on the training data and the same fitted
    selector is then applied to both the training and the test features.

    Parameters
    ----------
    X_train : array-like
        Training features used to fit the selector.
    y_train : array-like
        Training labels, forwarded to ``fit``.
    X_test : array-like
        Test features to reduce with the fitted selector.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    fitted = VarianceThreshold().fit(X_train, y_train)
    train_kept = fitted.transform(X_train)
    test_kept = fitted.transform(X_test)
    return train_kept, test_kept

#### SelectKBest (Filter)

In [8]:
from sklearn.feature_selection import SelectKBest, f_classif

def selectKBest(X_train, y_train, X_test, k=12):
    """Keep the ``k`` highest-scoring features according to ``f_classif``.

    The selector is fitted on the training data only and then applied to
    both the training and the test features.

    Parameters
    ----------
    X_train : array-like
        Training features used to fit the selector.
    y_train : array-like
        Training labels.
    X_test : array-like
        Test features to reduce with the fitted selector.
    k : int, optional
        Number of features to keep (default 12, the value previously
        hard-coded).

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    kbest_selector = SelectKBest(f_classif, k=k)
    kbest_selector.fit(X_train, y_train)
    X_train_selected = kbest_selector.transform(X_train)
    X_test_selected = kbest_selector.transform(X_test)
    return X_train_selected, X_test_selected

#### SelectPercentile

In [9]:
from sklearn.feature_selection import SelectPercentile, f_classif

def selectPercentile(X_train, y_train, X_test, percentile=10):
    """Keep the top ``percentile`` percent of features ranked by ``f_classif``.

    The selector is fitted on the training data only and then applied to
    both the training and the test features.

    Parameters
    ----------
    X_train : array-like
        Training features used to fit the selector.
    y_train : array-like
        Training labels.
    X_test : array-like
        Test features to reduce with the fitted selector.
    percentile : int, optional
        Percentage of features to keep (default 10, the value previously
        hard-coded).

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    percentile_selector = SelectPercentile(f_classif, percentile=percentile)
    percentile_selector.fit(X_train, y_train)
    X_train_selected = percentile_selector.transform(X_train)
    X_test_selected = percentile_selector.transform(X_test)
    return X_train_selected, X_test_selected

#### GenericUnivariateSelect

In [10]:
from sklearn.feature_selection import GenericUnivariateSelect, chi2

def selectGenericUnivariateSelect(X_train, y_train, X_test, mode='k_best', param=19):
    """Select features with ``GenericUnivariateSelect`` scored by ``chi2``.

    The selector is fitted on the training data only and then applied to
    both the training and the test features.

    Parameters
    ----------
    X_train : array-like
        Training features used to fit the selector.
    y_train : array-like
        Training labels.
    X_test : array-like
        Test features to reduce with the fitted selector.
    mode : str, optional
        Selection mode forwarded to the selector (default ``'k_best'``).
    param : int or float, optional
        Parameter of the chosen mode (default 19, i.e. keep 19 features in
        ``'k_best'`` mode).

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    # NOTE(review): chi2 is documented to require non-negative feature
    # values -- confirm the inputs satisfy this.
    # `mode` is passed by keyword: current scikit-learn accepts only the
    # score function positionally.
    gus_selector = GenericUnivariateSelect(chi2, mode=mode, param=param)
    gus_selector.fit(X_train, y_train)
    X_train_selected = gus_selector.transform(X_train)
    X_test_selected = gus_selector.transform(X_test)
    return X_train_selected, X_test_selected

#### Recursive Feature Elimination (Wrapper)

In [11]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


def rfeLogReg(X_train, y_train, X_test, n_features_to_select=12):
    """Recursive feature elimination driven by a ``LogisticRegression``.

    The selector is fitted on the training data only and then applied to
    both the training and the test features.

    Parameters
    ----------
    X_train : array-like
        Training features used to fit the selector.
    y_train : array-like
        Training labels.
    X_test : array-like
        Test features to reduce with the fitted selector.
    n_features_to_select : int, optional
        Number of features to keep (default 12, the value previously
        hard-coded).

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    # The feature count is passed by keyword: current scikit-learn accepts
    # only the estimator positionally.
    rfe_log_selector = RFE(LogisticRegression(), n_features_to_select=n_features_to_select)
    rfe_log_selector = rfe_log_selector.fit(X_train, y_train)
    X_train_selected = rfe_log_selector.transform(X_train)
    X_test_selected = rfe_log_selector.transform(X_test)
    return X_train_selected, X_test_selected

def rfeSVC(X_train, y_train, X_test, n_features_to_select=12):
    """Recursive feature elimination driven by a linear-kernel ``SVC``.

    The selector is fitted on the training data only and then applied to
    both the training and the test features.

    Parameters
    ----------
    X_train : array-like
        Training features used to fit the selector.
    y_train : array-like
        Training labels.
    X_test : array-like
        Test features to reduce with the fitted selector.
    n_features_to_select : int, optional
        Number of features to keep (default 12, the value previously
        hard-coded).

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    # The feature count is passed by keyword: current scikit-learn accepts
    # only the estimator positionally.
    rfe_svc_selector = RFE(SVC(kernel='linear'), n_features_to_select=n_features_to_select)
    rfe_svc_selector = rfe_svc_selector.fit(X_train, y_train)
    X_train_selected = rfe_svc_selector.transform(X_train)
    X_test_selected = rfe_svc_selector.transform(X_test)
    return X_train_selected, X_test_selected

# Evaluate different models

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

### Creating and Training the Models

In [13]:
# Available feature selectors: selectVarianceThreshold, selectKBest, rfeLogReg, rfeSVC
#
# Each row is (feature selector, unfitted model, label printed with the scores).
# Rows run top to bottom, one full pipeline per row.
experiments = [
    # k-nearest neighbours
    (selectVarianceThreshold, KNeighborsClassifier(n_neighbors=5), "KNeighborsClassifier w/ VarianceThreshold"),
    (selectKBest, KNeighborsClassifier(n_neighbors=5), "KNeighborsClassifier w/ KBest"),
    (selectPercentile, KNeighborsClassifier(n_neighbors=5), "KNeighborsClassifier w/ Percentile"),
    (selectGenericUnivariateSelect, KNeighborsClassifier(n_neighbors=5), "KNeighborsClassifier w/ GUS"),
    # SVC with its default kernel
    (selectVarianceThreshold, SVC(), "SVC w/ VarianceThreshold"),
    (selectKBest, SVC(), "SVC w/ KBest"),
    # SVC with a linear kernel
    (selectVarianceThreshold, SVC(kernel='linear'), "SVC Linear w/ VarianceThreshold"),
    (selectKBest, SVC(kernel='linear'), "SVC Linear w/ KBest"),
    # Disabled run, kept for reference:
    # (rfeLogReg, LogisticRegression(), "test"),
    (rfeSVC, SVC(kernel='linear'), "SVC Linear w/ RFE SVC"),
]

for feature_selector, classifier, label in experiments:
    TransformBalanceSelectTrainPredict(feature_selector, classifier, label)


Prec 0: 0.213; Prec 1: 0.830; Rec 0: 0.364; Rec 1: 0.699; Acc: 0.637; -> KNeighborsClassifier w/ VarianceThreshold
Prec 0: 0.188; Prec 1: 0.819; Rec 0: 0.364; Rec 1: 0.648; Acc: 0.596; -> KNeighborsClassifier w/ KBest
Prec 0: 0.198; Prec 1: 0.824; Rec 0: 0.364; Rec 1: 0.668; Acc: 0.613; -> KNeighborsClassifier w/ Percentile
Prec 0: 0.194; Prec 1: 0.821; Rec 0: 0.295; Rec 1: 0.724; Acc: 0.646; -> KNeighborsClassifier w/ GUS
Prec 0: 0.176; Prec 1: 0.816; Rec 0: 0.136; Rec 1: 0.857; Acc: 0.725; -> SVC w/ VarianceThreshold
Prec 0: 0.222; Prec 1: 0.822; Rec 0: 0.136; Rec 1: 0.893; Acc: 0.754; -> SVC w/ KBest
Prec 0: 0.225; Prec 1: 0.838; Rec 0: 0.409; Rec 1: 0.684; Acc: 0.633; -> SVC Linear w/ VarianceThreshold
Prec 0: 0.152; Prec 1: 0.805; Rec 0: 0.227; Rec 1: 0.714; Acc: 0.625; -> SVC Linear w/ KBest
Prec 0: 0.235; Prec 1: 0.868; Rec 0: 0.636; Rec 1: 0.536; Acc: 0.554; -> SVC Linear w/ RFE SVC
