### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Load train dataset

In [2]:
AbsenteeismAtWork = pd.read_csv('data/train_data.csv', index_col=0)
AbsenteeismAtWork['Work load Average/day '] = [x.replace(',', '.') for x in AbsenteeismAtWork['Work load Average/day ']]
AbsenteeismAtWork['Work load Average/day '] = AbsenteeismAtWork['Work load Average/day '].astype(float)

# Balance Dataset

In [11]:
df_majority = AbsenteeismAtWork[AbsenteeismAtWork.Absent==1]
df_minority = AbsenteeismAtWork[AbsenteeismAtWork.Absent==0]

## Up-sample minority class

### Resample with replacement

Método mais simples que consiste em replicar aleatoriamente (com reposição) dados da classe minoritária até atingir ratio de 1:1

In [12]:
from sklearn.utils import resample
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     
                                 n_samples=395,    
                                 random_state=123) 
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
print(df_upsampled.Absent.value_counts())

df_upsampled_data = df_upsampled.drop('Absent', 1)
df_upsampled_target = df_upsampled['Absent']

1    395
0    395
Name: Absent, dtype: int64


### SMOTE - Synthetic Minority Over-sampling Technique

In [13]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_sample(data, target)

print(y_sm.value_counts())

1    395
0    395
Name: Absent, dtype: int64


## Down-sample majority class

### Resample without replacement

In [14]:
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=105,     # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
 
# Display new class counts
print(df_downsampled.Absent.value_counts())

df_downsampled_data = df_downsampled.drop('Absent', 1)
df_downsampled_target = df_downsampled['Absent']


1    105
0    105
Name: Absent, dtype: int64


### Tomek Links

In [15]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks(sampling_strategy='majority')
X_tl, y_tl= tl.fit_sample(data, target)

y_tl.value_counts()

1    360
0    105
Name: Absent, dtype: int64

### Cluster Centroids

In [16]:
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(sampling_strategy='majority')
X_cc, y_cc = cc.fit_sample(data, target)
y_cc.value_counts()

1    105
0    105
Name: Absent, dtype: int64

# Discretization

In [17]:
from sklearn.preprocessing import KBinsDiscretizer

def discretize(data):
    featuresToDiscretize = ['Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Weight', 'Height', 'Body mass index']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    data[featuresToDiscretize] = discretizer.fit_transform(data[featuresToDiscretize])
    return data;


df_upsampled = discretize(df_upsampled)
X_sm = discretize(X_sm)
df_downsampled = discretize(df_downsampled)
X_tl = discretize(X_tl)
X_cc = discretize(X_cc)

# Feature Selection

### SelectKBest (Filter)

In [18]:
from sklearn.feature_selection import SelectKBest, f_classif


def createKBestSelector(data, target, numFeatures):
    kbest_selector = SelectKBest(f_classif, k=numFeatures)
    kbest_selector = kbest_selector.fit(data, target)
    return kbest_selector;


def getPickedFeatures(selector, data):
    selected_features_index = selector.get_support(indices=True)
    dropped_features_index = list( set(list(range(0, data.columns.size))) - (set(selected_features_index)))

    selected_features_names = list(data.columns[selected_features_index])
    return selected_features_names;


def getDroppedFeatures(selector, data):
    selected_features_index = selector.get_support(indices=True)
    dropped_features_index = list( set(list(range(0, data.columns.size))) - (set(selected_features_index)))

    dropped_features_names = list(data.columns[dropped_features_index])
    return dropped_features_names;


def printFeatureSelection(selector, data):
    selected_features_index = selector.get_support(indices=True)
    dropped_features_index = list( set(list(range(0, data.columns.size))) - (set(selected_features_index)))

    selected_features_names = zip(selected_features_index,  list(data.columns[selected_features_index]))
    dropped_features_names = zip(dropped_features_index, list(data.columns[dropped_features_index]))

    print("Features mantidas:")
    for cn in selected_features_names:
        print("\t" + str(cn))

    print("Features eliminadas:")
    for cn in dropped_features_names:
        print("\t" + str(cn))
    return;

numFeaturesToSelect = 12

print("##### SelectKBest for randomly upsampled data #####")
df_upsampled_kbest_selector =  createKBestSelector(df_upsampled_data, df_upsampled_target, numFeaturesToSelect)
printFeatureSelection(df_upsampled_kbest_selector, df_upsampled_data)

print("\n##### SelectKBest for SMOTE #####")
smote_kbest_selector =  createKBestSelector(X_sm, y_sm, numFeaturesToSelect)
printFeatureSelection(smote_kbest_selector, X_sm)


print("\n##### SelectKBest for randomly downsampled data #####")
df_downsampled_kbest_selector =  createKBestSelector(df_downsampled_data, df_downsampled_target, numFeaturesToSelect)
printFeatureSelection(df_downsampled_kbest_selector, df_downsampled_data)

print("\n##### SelectKBest for Tomek Links #####")
tomek_kbest_selector =  createKBestSelector(X_tl, y_tl, numFeaturesToSelect)
printFeatureSelection(tomek_kbest_selector, X_tl)


print("\n##### SelectKBest for Cluster Centroids #####")
cluster_kbest_selector =  createKBestSelector(X_cc, y_cc, numFeaturesToSelect)
printFeatureSelection(cluster_kbest_selector, X_cc)

##### SelectKBest for randomly upsampled data #####
Features mantidas:
	(0, 'Reason for absence')
	(1, 'Month of absence')
	(3, 'Seasons')
	(4, 'Transportation expense')
	(6, 'Service time')
	(7, 'Age')
	(9, 'Hit target')
	(10, 'Disciplinary failure')
	(12, 'Son')
	(13, 'Social drinker')
	(16, 'Weight')
	(18, 'Body mass index')
Features eliminadas:
	(2, 'Day of the week')
	(5, 'Distance from Residence to Work')
	(8, 'Work load Average/day ')
	(11, 'Education')
	(14, 'Social smoker')
	(15, 'Pet')
	(17, 'Height')

##### SelectKBest for SMOTE #####
Features mantidas:
	(0, 'Reason for absence')
	(2, 'Day of the week')
	(3, 'Seasons')
	(5, 'Distance from Residence to Work')
	(7, 'Age')
	(9, 'Hit target')
	(10, 'Disciplinary failure')
	(12, 'Son')
	(13, 'Social drinker')
	(14, 'Social smoker')
	(16, 'Weight')
	(18, 'Body mass index')
Features eliminadas:
	(1, 'Month of absence')
	(4, 'Transportation expense')
	(6, 'Service time')
	(8, 'Work load Average/day ')
	(11, 'Education')
	(15, 'Pet')

### Recursive Feature Elimination (Wrapper)

In [19]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


print("##### RFE with Logistic Regression for randomly upsampled data #####")
df_upsampled_rfe_log_selector = RFE(LogisticRegression(), 12)
df_upsampled_rfe_log_selector = df_upsampled_rfe_log_selector.fit(df_upsampled_data, df_upsampled_target)
printFeatureSelection(df_upsampled_rfe_log_selector, df_upsampled_data)

print("##### RFE with Linear SVC for randomly upsampled data #####")
df_upsampled_rfe_svc_selector = RFE(SVC(kernel='linear'), 12)
df_upsampled_rfe_svc_selector = df_upsampled_rfe_svc_selector.fit(df_upsampled_data, df_upsampled_target)
printFeatureSelection(df_upsampled_rfe_svc_selector, df_upsampled_data)

print("\n##### RFE with Logistic Regression for SMOTE #####")
smote_rfe_log_selector = RFE(LogisticRegression(), 12)
smote_rfe_log_selector = smote_rfe_log_selector.fit(data, target)
printFeatureSelection(smote_rfe_log_selector, X_sm)

print("\n##### RFE with Linear SVC for SMOTE #####")
smote_rfe_svc_selector = RFE(SVC(kernel='linear'), 12)
smote_rfe_svc_selector = smote_rfe_svc_selector.fit(X_sm, y_sm)
printFeatureSelection(smote_rfe_svc_selector, X_sm)


print("\n##### RFE with Logistic Regression for randomly downsampled data #####")
df_downsampled_rfe_log_selector = RFE(LogisticRegression(), 12)
df_downsampled_rfe_log_selector = df_downsampled_rfe_log_selector.fit(df_downsampled_data, df_downsampled_target)
printFeatureSelection(df_downsampled_rfe_log_selector, df_downsampled_data)


print("\n##### RFE with Linear SVC for randomly downsampled data #####")
df_downsampled_rfe_svc_selector = RFE(SVC(kernel='linear'), 12)
df_downsampled_rfe_svc_selector = df_downsampled_rfe_svc_selector.fit(df_downsampled_data, df_downsampled_target)
printFeatureSelection(df_downsampled_rfe_svc_selector, df_downsampled_data)


print("\n##### RFE with Logistic Regression for Tomek Links #####")
tomek_rfe_log_selector = RFE(LogisticRegression(), 12)
tomek_rfe_log_selector = tomek_rfe_log_selector.fit(X_tl, y_tl)
printFeatureSelection(tomek_rfe_log_selector, X_tl)


print("\n##### RFE with Linear SVC for Tomek Links #####")
tomek_rfe_svc_selector = RFE(SVC(kernel='linear'), 12)
tomek_rfe_svc_selector = tomek_rfe_svc_selector.fit(X_tl, y_tl)
printFeatureSelection(tomek_rfe_svc_selector, X_tl)


print("\n##### RFE with Logistic Regression for Cluster Centroids #####")
cluster_rfe_log_selector = RFE(LogisticRegression(), 12)
cluster_rfe_log_selector = cluster_rfe_log_selector.fit(X_cc, y_cc)
printFeatureSelection(cluster_rfe_log_selector, X_cc)


print("\n##### RFE with Linear SVC for Cluster Centroids #####")
cluster_rfe_svc_selector = RFE(SVC(kernel='linear'), 12)
cluster_rfe_svc_selector = cluster_rfe_svc_selector.fit(X_cc, y_cc)
printFeatureSelection(cluster_rfe_svc_selector, X_cc)




##### RFE with Logistic Regression for randomly upsampled data #####


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

Features mantidas:
	(0, 'Reason for absence')
	(3, 'Seasons')
	(6, 'Service time')
	(9, 'Hit target')
	(10, 'Disciplinary failure')
	(11, 'Education')
	(12, 'Son')
	(13, 'Social drinker')
	(14, 'Social smoker')
	(15, 'Pet')
	(16, 'Weight')
	(18, 'Body mass index')
Features eliminadas:
	(1, 'Month of absence')
	(2, 'Day of the week')
	(4, 'Transportation expense')
	(5, 'Distance from Residence to Work')
	(7, 'Age')
	(8, 'Work load Average/day ')
	(17, 'Height')
##### RFE with Linear SVC for randomly upsampled data #####
Features mantidas:
	(0, 'Reason for absence')
	(5, 'Distance from Residence to Work')
	(6, 'Service time')
	(7, 'Age')
	(10, 'Disciplinary failure')
	(11, 'Education')
	(12, 'Son')
	(13, 'Social drinker')
	(14, 'Social smoker')
	(15, 'Pet')
	(16, 'Weight')
	(18, 'Body mass index')
Features eliminadas:
	(1, 'Month of absence')
	(2, 'Day of the week')
	(3, 'Seasons')
	(4, 'Transportation expense')
	(8, 'Work load Average/day ')
	(9, 'Hit target')
	(17, 'Height')

##### RFE

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

Features mantidas:
	(3, 'Seasons')
	(6, 'Service time')
	(7, 'Age')
	(9, 'Hit target')
	(10, 'Disciplinary failure')
	(11, 'Education')
	(12, 'Son')
	(13, 'Social drinker')
	(14, 'Social smoker')
	(15, 'Pet')
	(16, 'Weight')
	(18, 'Body mass index')
Features eliminadas:
	(0, 'Reason for absence')
	(1, 'Month of absence')
	(2, 'Day of the week')
	(4, 'Transportation expense')
	(5, 'Distance from Residence to Work')
	(8, 'Work load Average/day ')
	(17, 'Height')

##### RFE with Linear SVC for SMOTE #####
Features mantidas:
	(2, 'Day of the week')
	(5, 'Distance from Residence to Work')
	(7, 'Age')
	(9, 'Hit target')
	(10, 'Disciplinary failure')
	(11, 'Education')
	(12, 'Son')
	(13, 'Social drinker')
	(14, 'Social smoker')
	(16, 'Weight')
	(17, 'Height')
	(18, 'Body mass index')
Features eliminadas:
	(0, 'Reason for absence')
	(1, 'Month of absence')
	(3, 'Seasons')
	(4, 'Transportation expense')
	(6, 'Service time')
	(8, 'Work load Average/day ')
	(15, 'Pet')

##### RFE with Logistic Re

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

Features mantidas:
	(0, 'Reason for absence')
	(1, 'Month of absence')
	(3, 'Seasons')
	(7, 'Age')
	(9, 'Hit target')
	(10, 'Disciplinary failure')
	(11, 'Education')
	(12, 'Son')
	(13, 'Social drinker')
	(14, 'Social smoker')
	(15, 'Pet')
	(18, 'Body mass index')
Features eliminadas:
	(2, 'Day of the week')
	(4, 'Transportation expense')
	(5, 'Distance from Residence to Work')
	(6, 'Service time')
	(8, 'Work load Average/day ')
	(16, 'Weight')
	(17, 'Height')

##### RFE with Linear SVC for randomly downsampled data #####
Features mantidas:
	(0, 'Reason for absence')
	(3, 'Seasons')
	(7, 'Age')
	(10, 'Disciplinary failure')
	(11, 'Education')
	(12, 'Son')
	(13, 'Social drinker')
	(14, 'Social smoker')
	(15, 'Pet')
	(16, 'Weight')
	(17, 'Height')
	(18, 'Body mass index')
Features eliminadas:
	(1, 'Month of absence')
	(2, 'Day of the week')
	(4, 'Transportation expense')
	(5, 'Distance from Residence to Work')
	(6, 'Service time')
	(8, 'Work load Average/day ')
	(9, 'Hit target')

##### 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

Features mantidas:
	(0, 'Reason for absence')
	(4, 'Transportation expense')
	(5, 'Distance from Residence to Work')
	(6, 'Service time')
	(7, 'Age')
	(10, 'Disciplinary failure')
	(11, 'Education')
	(13, 'Social drinker')
	(14, 'Social smoker')
	(15, 'Pet')
	(16, 'Weight')
	(18, 'Body mass index')
Features eliminadas:
	(1, 'Month of absence')
	(2, 'Day of the week')
	(3, 'Seasons')
	(8, 'Work load Average/day ')
	(9, 'Hit target')
	(12, 'Son')
	(17, 'Height')

##### RFE with Logistic Regression for Cluster Centroids #####
Features mantidas:
	(3, 'Seasons')
	(4, 'Transportation expense')
	(6, 'Service time')
	(8, 'Work load Average/day ')
	(10, 'Disciplinary failure')
	(11, 'Education')
	(12, 'Son')
	(13, 'Social drinker')
	(14, 'Social smoker')
	(15, 'Pet')
	(16, 'Weight')
	(17, 'Height')
Features eliminadas:
	(0, 'Reason for absence')
	(1, 'Month of absence')
	(2, 'Day of the week')
	(5, 'Distance from Residence to Work')
	(7, 'Age')
	(9, 'Hit target')
	(18, 'Body mass index')

#####

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Training models

### Model types that will be trained

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

### Train Test Split

Target var is 'Absent'

In [21]:
y_train = AbsenteeismAtWork['Absent']

df_upsampled_kbest_selected = df_upsampled_kbest_selector.transform(df_upsampled_data)
smote_kbest_selected = smote_kbest_selector.transform(X_sm)
df_downsampled_kbest_selected = df_downsampled_kbest_selector.transform(df_downsampled_data)
tomek_kbest_selected = tomek_kbest_selector.transform(X_tl)
cluster_kbest_selected = cluster_kbest_selector.transform(X_cc)

df_upsampled_rfe_log_selected = df_upsampled_rfe_log_selector.transform(df_upsampled_data)
df_upsampled_rfe_svc_selected = df_upsampled_rfe_svc_selector.transform(df_upsampled_data)

smote_rfe_log_selected = smote_rfe_log_selector.transform(X_sm) 
smote_rfe_svc_selected = smote_rfe_svc_selector.transform(X_sm)

df_downsampled_rfe_log_selected = df_downsampled_rfe_log_selector.transform(df_downsampled_data)
df_downsampled_rfe_svc_selected = df_downsampled_rfe_svc_selector.transform(df_downsampled_data)

tomek_rfe_log_selected = tomek_rfe_log_selector.transform(X_tl)
tomek_rfe_svc_selected = tomek_rfe_svc_selector.transform(X_tl)

cluster_rfe_log_selected = cluster_rfe_log_selector.transform(X_cc)
cluster_rfe_svc_selected = cluster_rfe_svc_selector.transform(X_cc)



X_test = pd.read_csv('data/test_data.csv', index_col=0)
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

# muita atencao a estes passos, podem surgir incoerencias entre teste/treino devido aos pre processamentos aplicados
def preprocessTestSet(X_test, selector, data):
    X_test_processed = discretize(X_test)
    X_test_processed = X_test_processed.drop(getDroppedFeatures(selector, data), axis=1)
    return X_test_processed;

X_test1 = preprocessTestSet(X_test, df_upsampled_kbest_selector, df_upsampled_data)
X_test2 = preprocessTestSet(X_test, smote_kbest_selector, X_sm)
X_test3 = preprocessTestSet(X_test, df_downsampled_kbest_selector, df_downsampled_data)
X_test4 = preprocessTestSet(X_test, tomek_kbest_selector, X_tl)
X_test5 = preprocessTestSet(X_test, cluster_kbest_selector, X_cc)
X_test6 = preprocessTestSet(X_test, df_upsampled_rfe_log_selector, df_upsampled_data)
X_test7 = preprocessTestSet(X_test, df_upsampled_rfe_svc_selector, df_upsampled_data)
X_test8 = preprocessTestSet(X_test, smote_rfe_log_selector, X_sm)
X_test9 = preprocessTestSet(X_test, smote_rfe_svc_selector, X_sm)
X_test10 = preprocessTestSet(X_test, df_downsampled_rfe_log_selector, df_downsampled_data)
X_test11 = preprocessTestSet(X_test, df_downsampled_rfe_svc_selector, df_downsampled_data)
X_test12 = preprocessTestSet(X_test, tomek_rfe_log_selector, X_tl)
X_test13 = preprocessTestSet(X_test, tomek_rfe_svc_selector, X_tl)
X_test14 = preprocessTestSet(X_test, cluster_rfe_log_selector, X_cc)
X_test15 = preprocessTestSet(X_test, cluster_rfe_svc_selector, X_cc)


### Creating and Training the Models

In [22]:
knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=5)
knn3 = KNeighborsClassifier(n_neighbors=5)
knn4 = KNeighborsClassifier(n_neighbors=5)
knn5 = KNeighborsClassifier(n_neighbors=5)

rbfSVC1 = SVC()
rbfSVC2 = SVC()
rbfSVC3 = SVC()
rbfSVC4 = SVC()
rbfSVC5 = SVC()

log1 = LogisticRegression()
log2 = LogisticRegression()
log3 = LogisticRegression()
log4 = LogisticRegression()
log5 = LogisticRegression()

linearSVC1 = SVC(kernel='linear')
linearSVC2 = SVC(kernel='linear')
linearSVC3 = SVC(kernel='linear')
linearSVC4 = SVC(kernel='linear')
linearSVC5 = SVC(kernel='linear')

In [23]:
knn1.fit(df_upsampled_kbest_selected, df_upsampled_target)
knn2.fit(smote_kbest_selected, y_sm)
knn3.fit(df_downsampled_kbest_selected, df_downsampled_target)
knn4.fit(tomek_kbest_selected, y_tl)
knn5.fit(cluster_kbest_selected, y_cc)

rbfSVC1.fit(df_upsampled_kbest_selected, df_upsampled_target)
rbfSVC2.fit(smote_kbest_selected, y_sm)
rbfSVC3.fit(df_downsampled_kbest_selected, df_downsampled_target)
rbfSVC4.fit(tomek_kbest_selected, y_tl)
rbfSVC5.fit(cluster_kbest_selected, y_cc)

log1.fit(df_upsampled_rfe_log_selected, df_upsampled_target)
log2.fit(smote_rfe_log_selected, y_sm)
log3.fit(df_downsampled_rfe_log_selected, df_downsampled_target)
log4.fit(tomek_rfe_log_selected, y_tl)
log5.fit(cluster_rfe_log_selected, y_cc)

linearSVC1.fit(df_upsampled_rfe_svc_selected, df_upsampled_target)
linearSVC2.fit(smote_rfe_svc_selected, y_sm)
linearSVC3.fit(df_downsampled_rfe_svc_selected, df_downsampled_target)
linearSVC4.fit(tomek_rfe_svc_selected, y_tl)
linearSVC5.fit(cluster_rfe_svc_selected, y_cc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# Models Prediction and Evaluation



In [24]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [25]:
knn_kbest_up_pred = knn1.predict(X_test1)
knn_kbest_sm_pred = knn2.predict(X_test2)
knn_kbest_down_pred = knn3.predict(X_test3)
knn_kbest_tl_pred = knn4.predict(X_test4)
knn_kbest_cc_pred = knn5.predict(X_test5)

rbf_kbest_up_pred = rbfSVC1.predict(X_test1)
rbf_kbest_sm_pred = rbfSVC2.predict(X_test2)
rbf_kbest_down_pred = rbfSVC3.predict(X_test3)
rbf_kbest_tl_pred = rbfSVC4.predict(X_test4)
rbf_kbest_cc_pred = rbfSVC5.predict(X_test5)

log_rfe_up_pred = log1.predict(X_test6)
log_rfe_sm_pred = log2.predict(X_test8)
log_rfe_down_pred = log3.predict(X_test10)
log_rfe_tl_pred = log4.predict(X_test12)
log_rfe_cc_pred = log5.predict(X_test14)

linear_rfe_up_pred = linearSVC1.predict(X_test7)
linear_rfe_sm_pred = linearSVC2.predict(X_test9)
linear_rfe_down_pred = linearSVC3.predict(X_test11)
linear_rfe_tl_pred = linearSVC4.predict(X_test13)
linear_rfe_cc_pred = linearSVC5.predict(X_test15)

print("Accuracy knn_kbest_up_pred: " + str(accuracy_score(y_test, knn_kbest_up_pred)))
print("Accuracy knn_kbest_sm_pred: " + str(accuracy_score(y_test, knn_kbest_sm_pred)))
print("Accuracy knn_kbest_down_pred: " + str(accuracy_score(y_test, knn_kbest_down_pred)))
print("Accuracy knn_kbest_tl_pred: " + str(accuracy_score(y_test, knn_kbest_tl_pred)))
print("Accuracy knn_kbest_cc_pred: " + str(accuracy_score(y_test, knn_kbest_cc_pred)))

print("Accuracy rbf_kbest_up_pred: " + str(accuracy_score(y_test, rbf_kbest_up_pred)))
print("Accuracy rbf_kbest_sm_pred: " + str(accuracy_score(y_test, rbf_kbest_sm_pred)))
print("Accuracy rbf_kbest_down_pred: " + str(accuracy_score(y_test, rbf_kbest_down_pred)))
print("Accuracy rbf_kbest_tl_pred: " + str(accuracy_score(y_test, rbf_kbest_tl_pred)))
print("Accuracy rbf_kbest_cc_pred: " + str(accuracy_score(y_test, rbf_kbest_cc_pred)))

print("Accuracy log_rfe_up_pred: " + str(accuracy_score(y_test, log_rfe_up_pred)))
print("Accuracy log_rfe_sm_pred: " + str(accuracy_score(y_test, log_rfe_sm_pred)))
print("Accuracy log_rfe_down_pred: " + str(accuracy_score(y_test, log_rfe_down_pred)))
print("Accuracy log_rfe_tl_pred: " + str(accuracy_score(y_test, log_rfe_tl_pred)))
print("Accuracy log_rfe_cc_pred: " + str(accuracy_score(y_test, log_rfe_cc_pred)))

print("Accuracy linear_rfe_up_pred: " + str(accuracy_score(y_test, linear_rfe_up_pred)))
print("Accuracy linear_rfe_sm_pred: " + str(accuracy_score(y_test, linear_rfe_sm_pred)))
print("Accuracy linear_rfe_down_pred: " + str(accuracy_score(y_test, linear_rfe_down_pred)))
print("Accuracy linear_rfe_tl_pred: " + str(accuracy_score(y_test, linear_rfe_tl_pred)))
print("Accuracy linear_rfe_cc_pred: " + str(accuracy_score(y_test, linear_rfe_cc_pred)))

Accuracy knn_kbest_up_pred: 0.8166666666666667
Accuracy knn_kbest_sm_pred: 0.6291666666666667
Accuracy knn_kbest_down_pred: 0.7
Accuracy knn_kbest_tl_pred: 0.7541666666666667
Accuracy knn_kbest_cc_pred: 0.5291666666666667
Accuracy rbf_kbest_up_pred: 0.8166666666666667
Accuracy rbf_kbest_sm_pred: 0.7083333333333334
Accuracy rbf_kbest_down_pred: 0.18333333333333332
Accuracy rbf_kbest_tl_pred: 0.7916666666666666
Accuracy rbf_kbest_cc_pred: 0.5666666666666667
Accuracy log_rfe_up_pred: 0.20833333333333334
Accuracy log_rfe_sm_pred: 0.5666666666666667
Accuracy log_rfe_down_pred: 0.18333333333333332
Accuracy log_rfe_tl_pred: 0.7875
Accuracy log_rfe_cc_pred: 0.4666666666666667
Accuracy linear_rfe_up_pred: 0.8166666666666667
Accuracy linear_rfe_sm_pred: 0.6041666666666666
Accuracy linear_rfe_down_pred: 0.8166666666666667
Accuracy linear_rfe_tl_pred: 0.8083333333333333
Accuracy linear_rfe_cc_pred: 0.5083333333333333


In [26]:
print("\n\n")
print("##### knn_kbest_up_pred #####")
print(classification_report(y_test, knn_kbest_up_pred))
print("\n\n")
print("##### knn_kbest_sm_pred #####")
print(classification_report(y_test, knn_kbest_sm_pred))
print("\n\n")
print("##### knn_kbest_down_pred #####")
print(classification_report(y_test, knn_kbest_down_pred))
print("\n\n")
print("##### knn_kbest_tl_pred #####")
print(classification_report(y_test, knn_kbest_tl_pred))
print("\n\n")
print("##### knn_kbest_cc_pred  #####")
print(classification_report(y_test, knn_kbest_cc_pred))
print("\n\n")
print("##### rbf_kbest_up_pred #####")
print(classification_report(y_test, rbf_kbest_up_pred))
print("\n\n")
print("##### rbf_kbest_sm_pred #####")
print(classification_report(y_test, rbf_kbest_sm_pred))
print("\n\n")
print("#####  rbf_kbest_down_pred #####")
print(classification_report(y_test, rbf_kbest_down_pred))
print("\n\n")
print("#####  rbf_kbest_tl_pred #####")
print(classification_report(y_test, rbf_kbest_tl_pred))
print("\n\n")
print("#####  rbf_kbest_cc_pred #####")
print(classification_report(y_test, rbf_kbest_cc_pred))
print("\n\n")
print("#####  log_rfe_up_pred #####")
print(classification_report(y_test, log_rfe_up_pred))
print("\n\n")
print("#####  log_rfe_sm_pred #####")
print(classification_report(y_test, log_rfe_sm_pred))
print("\n\n")
print("#####  log_rfe_down_pred #####")
print(classification_report(y_test, log_rfe_down_pred))
print("\n\n")
print("#####  log_rfe_tl_pred #####")
print(classification_report(y_test, log_rfe_tl_pred))
print("\n\n")
print("#####  log_rfe_cc_pred #####")
print(classification_report(y_test, log_rfe_cc_pred))
print("\n\n")
print("#####  linear_rfe_up_pred #####")
print(classification_report(y_test, linear_rfe_up_pred))
print("\n\n")
print("#####  linear_rfe_sm_pred #####")
print(classification_report(y_test, linear_rfe_sm_pred))
print("\n\n")
print("#####  linear_rfe_down_pred #####")
print(classification_report(y_test, linear_rfe_down_pred))
print("\n\n")
print("#####  linear_rfe_tl_pred #####")
print(classification_report(y_test, linear_rfe_tl_pred))
print("\n\n")
print("#####  linear_rfe_cc_pred #####")
print(classification_report(y_test, linear_rfe_cc_pred))





##### knn_kbest_up_pred #####
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        44
           1       0.82      1.00      0.90       196

    accuracy                           0.82       240
   macro avg       0.41      0.50      0.45       240
weighted avg       0.67      0.82      0.73       240




##### knn_kbest_sm_pred #####
              precision    recall  f1-score   support

           0       0.24      0.48      0.32        44
           1       0.85      0.66      0.74       196

    accuracy                           0.63       240
   macro avg       0.55      0.57      0.53       240
weighted avg       0.74      0.63      0.67       240




##### knn_kbest_down_pred #####
              precision    recall  f1-score   support

           0       0.25      0.32      0.28        44
           1       0.84      0.79      0.81       196

    accuracy                           0.70       240
   macro avg       0.54    

  _warn_prf(average, modifier, msg_start, len(result))


### Random Forest

Abordagem com bagging através de random forests para superar problema de dataset desbalanceado

In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


X_test = pd.read_csv('data/test_data.csv', index_col=0)
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

target = AbsenteeismAtWork['Absent']
data = AbsenteeismAtWork.drop('Absent', 1)

scaler = StandardScaler()
data = scaler.fit_transform( data )

kbest_selector = createKBestSelector(data, target, 10)
selected = kbest_selector.transform(data)

X_test_scaled = scaler.transform(X_test)
X_test_selected = kbest_selector.transform(X_test_scaled)



clf = RandomForestClassifier()
clf.fit(selected, target)
 
pred = clf.predict(X_test_selected)
 
print( accuracy_score(y_test, pred) )
print(classification_report(y_test, pred))

0.7375
              precision    recall  f1-score   support

           0       0.17      0.11      0.14        44
           1       0.82      0.88      0.85       196

    accuracy                           0.74       240
   macro avg       0.49      0.50      0.49       240
weighted avg       0.70      0.74      0.72       240



### Cost-Sensitive Training

In [48]:
from sklearn.svm import SVC


svc = SVC(kernel='linear', 
            class_weight='balanced', 
            probability=True)

svc.fit(selected, target)

predSvc = svc.predict(X_test_selected)
 
print( accuracy_score(y_test, predSvc) )
print(classification_report(y_test, predSvc))

0.6083333333333333
              precision    recall  f1-score   support

           0       0.16      0.27      0.20        44
           1       0.81      0.68      0.74       196

    accuracy                           0.61       240
   macro avg       0.48      0.48      0.47       240
weighted avg       0.69      0.61      0.64       240



### Adaboosting

In [57]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

X_test = pd.read_csv('data/test_data.csv', index_col=0)
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

target = AbsenteeismAtWork['Absent']
data = AbsenteeismAtWork.drop('Absent', 1)

scaler = StandardScaler()
data = scaler.fit_transform( data )

X_test_scaled = scaler.transform(X_test)

classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200
)


classifier.fit(data, target)

predAda = classifier.predict(X_test_scaled)
 
print( accuracy_score(y_test, predAda) )
print(classification_report(y_test, predAda))

0.7375
              precision    recall  f1-score   support

           0       0.19      0.14      0.16        44
           1       0.82      0.87      0.84       196

    accuracy                           0.74       240
   macro avg       0.51      0.50      0.50       240
weighted avg       0.70      0.74      0.72       240

