In [10]:
import pandas as pd
from sklearn import svm

path = 'datasets/labor.csv'
dataset_D = pd.read_csv(path)

dataset_D.head()

Unnamed: 0,dur,wage1,wage2,wage3,cola,hours,pension,stby_pay,shift_diff,educ_allw,holidays,vacation,vacation_quant,lngtrm_disabil,dntl_ins,dntl_ins_quant,bereavement,empl_hplan,event
0,2,3.0,7.0,,,38.0,,12.0,25.0,True,11.0,ba,0.0,True,half,1.0,True,,1
1,2,4.0,5.0,,tcf,35.0,,13.0,5.0,,15.0,gnr,2.0,,,,,,1
2,2,4.5,5.8,,,35.0,ret_allw,,,True,11.0,ba,0.0,,full,2.0,,full,1
3,2,7.0,5.3,,,,,,,,11.0,,,True,full,2.0,,,1
4,2,4.3,4.4,,,38.0,,,4.0,,12.0,gnr,2.0,,full,2.0,,full,1


In [11]:

selected_feature_columns = ['dur', 'wage1', 'hours', 'holidays', 'vacation_quant', 'dntl_ins_quant']
selected_feature = dataset_D.loc[:, selected_feature_columns]
selected_feature.head()

Unnamed: 0,dur,wage1,hours,holidays,vacation_quant,dntl_ins_quant
0,2,3.0,38.0,11.0,0.0,1.0
1,2,4.0,35.0,15.0,2.0,
2,2,4.5,35.0,11.0,0.0,2.0
3,2,7.0,,11.0,,2.0
4,2,4.3,38.0,12.0,2.0,2.0


In [12]:
import numpy as np
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
selected_feature = imputer.fit_transform(selected_feature)
selected_feature

array([[ 2. ,  3. , 38. , 11. ,  0. ,  1. ],
       [ 2. ,  4. , 35. , 15. ,  2. ,  1. ],
       [ 2. ,  4.5, 35. , 11. ,  0. ,  2. ],
       [ 2. ,  7. , 38. , 11. ,  0. ,  2. ],
       [ 2. ,  4.3, 38. , 12. ,  2. ,  2. ],
       [ 3. ,  3.7, 38. , 10. ,  0. ,  1. ],
       [ 3. ,  4. , 38. , 12. ,  2. ,  0. ],
       [ 2. ,  4.5, 38. , 10. ,  0. ,  0. ],
       [ 1. ,  2.8, 35. , 12. ,  0. ,  1. ],
       [ 1. ,  5. , 40. , 11. ,  1. ,  1. ],
       [ 3. ,  6.9, 40. , 12. ,  0. ,  1. ],
       [ 2. ,  6.4, 38. , 15. ,  0. ,  2. ],
       [ 2. ,  2. , 35. , 12. ,  1. ,  1. ],
       [ 1. ,  6. , 38. ,  9. ,  2. ,  1. ],
       [ 3. ,  6. , 35. ,  9. ,  2. ,  2. ],
       [ 2. ,  4.5, 40. , 10. ,  2. ,  1. ],
       [ 3. ,  5. , 40. , 12. ,  1. ,  1. ],
       [ 2. ,  4.6, 38. , 10. ,  0. ,  1. ],
       [ 2. ,  2. , 38. , 12. ,  2. ,  0. ],
       [ 1. ,  4. , 38. , 11. ,  1. ,  0. ],
       [ 3. ,  3. , 40. , 10. ,  0. ,  1. ],
       [ 2. ,  2.5, 40. , 11. ,  0. ,  1. ],
       [ 2

In [13]:
selected_target = dataset_D.event
selected_target.head()

0    1
1    1
2    1
3    1
4    1
Name: event, dtype: int64

In [14]:
# Source: https://www.kaggle.com/code/pbizil/machine-learning-models-and-friedman-test/notebook
def ranking_model(results_aggregate):
    ranking = pd.DataFrame(columns=results_aggregate.columns)
    for i in range(results_aggregate.shape[0]):
        ranking.loc[i, results_aggregate.iloc[i].rank(ascending=False).index]=results_aggregate.iloc[i].rank(ascending=False)
    return ranking

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing

imbalanced_results = pd.DataFrame()

from sklearn import tree 
dt_classifer = make_pipeline(preprocessing.MinMaxScaler(),tree.DecisionTreeClassifier())
dt_scores = cross_val_score(dt_classifer, selected_feature, selected_target, cv=10)
imbalanced_results["dt_scores"] = dt_scores 

from sklearn.neighbors import KNeighborsClassifier
knn_classifer = make_pipeline(preprocessing.MinMaxScaler(), KNeighborsClassifier())
knn_scores = cross_val_score(knn_classifer, selected_feature, selected_target, cv=10)
imbalanced_results["knn_scores"] = knn_scores 

from sklearn.ensemble import RandomForestClassifier
rf_classifer = make_pipeline(preprocessing.MinMaxScaler(), RandomForestClassifier())
rf_scores = cross_val_score(rf_classifer, selected_feature, selected_target, cv=10)
imbalanced_results["rf_scores"] = rf_scores 

from sklearn import svm
svm_classifer = make_pipeline(preprocessing.MinMaxScaler(), svm.SVC())
svm_scores = cross_val_score(svm_classifer, selected_feature, selected_target, cv=10)
imbalanced_results["svm_scores"] = svm_scores 

from sklearn.neural_network import MLPClassifier
mlp_classifer = make_pipeline(preprocessing.MinMaxScaler(), MLPClassifier(max_iter=600))
mlp_scores = cross_val_score(mlp_classifer, selected_feature, selected_target, cv=10)
imbalanced_results["mlp_scores"] = mlp_scores 

from sklearn.ensemble import GradientBoostingClassifier
gb_classifer = make_pipeline(preprocessing.MinMaxScaler(), GradientBoostingClassifier())
gb_scores = cross_val_score(gb_classifer, selected_feature, selected_target, cv=10)
imbalanced_results["gb_scores"] = gb_scores 

ranks = ranking_model(imbalanced_results)
print(ranks)




  dt_scores knn_scores rf_scores svm_scores mlp_scores gb_scores
0       4.0        4.0       4.0        4.0        1.0       4.0
1       4.0        1.0       4.0        4.0        4.0       4.0
2       5.5        5.5       2.5        2.5        2.5       2.5
3       5.5        1.0       5.5        2.0        3.5       3.5
4       6.0        4.0       4.0        4.0        1.5       1.5
5       2.0        5.0       2.0        5.0        2.0       5.0
6       5.0        1.5       5.0        3.0        1.5       5.0
7       5.0        2.0       2.0        2.0        5.0       5.0
8       3.5        3.5       3.5        3.5        3.5       3.5
9       4.0        1.0       4.0        4.0        4.0       4.0


In [16]:
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn import metrics

db1_kfold = dataset_D.copy(deep=True)

selected_feature = db1_kfold.loc[:, selected_feature_columns]
selected_target = db1_kfold.event

imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
selected_feature = imputer.fit_transform(selected_feature)

preprocessing.MinMaxScaler()
selected_feature = preprocessing.MinMaxScaler().fit_transform(selected_feature)

from sklearn import tree 
dt_classifer_os = tree.DecisionTreeClassifier()

from sklearn.neighbors import KNeighborsClassifier
knn_classifer_os = KNeighborsClassifier()

from sklearn.ensemble import RandomForestClassifier
rf_classifer_os =  RandomForestClassifier()

from sklearn import svm
svm_classifer_os = svm.SVC()

from sklearn.neural_network import MLPClassifier
mlp_classifer_os = MLPClassifier(max_iter=600)

from sklearn.ensemble import GradientBoostingClassifier
gb_classifer_os = GradientBoostingClassifier()

k_fold_cv_os = KFold(n_splits=10)
smote_os = SMOTE()

dt_scores_os = []
knn_scores_os = []
rf_scores_os = []
svm_scores_os = []
mlp_scores_os = []
gb_scores_os = []
results_os = pd.DataFrame()

for train_index, test_index in k_fold_cv_os.split(selected_feature, selected_target):
    train_selected_feature_fold, train_selected_target_fold = selected_feature[train_index], selected_target[train_index]
    test_selected_feature_fold, test_selected_target_fold = selected_feature[test_index], selected_target[test_index]

    train_selected_feature_os, train_selected_target_os = smote_os.fit_resample(train_selected_feature_fold, train_selected_target_fold)

    dt_model_os = dt_classifer_os.fit(train_selected_feature_os, train_selected_target_os)
    knn_model_os = knn_classifer_os.fit(train_selected_feature_os, train_selected_target_os)
    rf_model_os = rf_classifer_os.fit(train_selected_feature_os, train_selected_target_os)
    svm_model_os = svm_classifer_os.fit(train_selected_feature_os, train_selected_target_os)
    mlp_model_os = mlp_classifer_os.fit(train_selected_feature_os, train_selected_target_os)
    gb_model_os = gb_classifer_os.fit(train_selected_feature_os, train_selected_target_os)
    
    dt_scores_os.append(metrics.recall_score(test_selected_target_fold, dt_model_os.predict(test_selected_feature_fold)))
    knn_scores_os.append(metrics.recall_score(test_selected_target_fold, knn_model_os.predict(test_selected_feature_fold)))
    rf_scores_os.append(metrics.recall_score(test_selected_target_fold, rf_model_os.predict(test_selected_feature_fold)))
    svm_scores_os.append(metrics.recall_score(test_selected_target_fold, svm_model_os.predict(test_selected_feature_fold)))
    mlp_scores_os.append(metrics.recall_score(test_selected_target_fold, mlp_model_os.predict(test_selected_feature_fold)))
    gb_scores_os.append(metrics.recall_score(test_selected_target_fold, gb_model_os.predict(test_selected_feature_fold)))

results_os["dt_scores_os"] = dt_scores_os
results_os["knn_scores_os"] = knn_scores_os
results_os["rf_scores_os"] = rf_scores_os
results_os["svm_scores_os"] = svm_scores_os
results_os["mlp_scores_os"] = mlp_scores_os
results_os["gb_scores_os"] = gb_scores_os

ranks_os = ranking_model(results_os)
print(ranks_os)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  dt_scores_os knn_scores_os rf_scores_os svm_scores_os mlp_scores_os  \
0          3.5           3.5          3.5           3.5           3.5   
1          6.0           3.5          3.5           3.5           1.0   
2          2.0           2.0          5.0           2.0           5.0   
3          4.5           4.5          1.5           4.5           1.5   
4          4.0           4.0          4.0           4.0           1.0   
5          3.5           3.5          3.5           3.5           3.5   
6          3.5           3.5          3.5           3.5           3.5   
7          3.5           3.5          3.5           3.5           3.5   
8          3.5           3.5          3.5           3.5           3.5   
9          3.5           3.5          3.5           3.5           3.5   

  gb_scores_os  
0          3.5  
1          3.5  
2          5.0  
3          4.5  
4          4.0  
5          3.5  
6          3.5  
7          3.5  
8          3.5  
9          3.5  


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
from sklearn.model_selection import KFold
from imblearn.under_sampling import EditedNearestNeighbours, ClusterCentroids
from sklearn import metrics

db2_kfold = dataset_D.copy(deep=True)

selected_feature = db2_kfold.loc[:, selected_feature_columns]
selected_target = db2_kfold.event

imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
selected_feature = imputer.fit_transform(selected_feature)

preprocessing.MinMaxScaler()
selected_feature = preprocessing.MinMaxScaler().fit_transform(selected_feature)

from sklearn import tree 
dt_classifer_us = tree.DecisionTreeClassifier()

from sklearn.neighbors import KNeighborsClassifier
knn_classifer_us = KNeighborsClassifier()

from sklearn.ensemble import RandomForestClassifier
rf_classifer_us =  RandomForestClassifier()

from sklearn import svm
svm_classifer_us = svm.SVC()

from sklearn.neural_network import MLPClassifier
mlp_classifer_us = MLPClassifier(max_iter=600)

from sklearn.ensemble import GradientBoostingClassifier
gb_classifer_us = GradientBoostingClassifier()

k_fold_cv_us = KFold(n_splits=10)
enn_us = ClusterCentroids()  #EditedNearestNeighbours()

dt_scores_us = []
knn_scores_us = []
rf_scores_us = []
svm_scores_us = []
mlp_scores_us = []
gb_scores_us = []
results_us = pd.DataFrame()

for train_index, test_index in k_fold_cv_us.split(selected_feature, selected_target):
    train_selected_feature_fold, train_selected_target_fold = selected_feature[train_index], selected_target[train_index]
    test_selected_feature_fold, test_selected_target_fold = selected_feature[test_index], selected_target[test_index]

    train_selected_feature_us, train_selected_target_us = enn_us.fit_resample(train_selected_feature_fold, train_selected_target_fold)

    dt_model_us = dt_classifer_us.fit(train_selected_feature_us, train_selected_target_us)
    knn_model_us = knn_classifer_us.fit(train_selected_feature_us, train_selected_target_us)
    rf_model_us = rf_classifer_us.fit(train_selected_feature_us, train_selected_target_us)
    svm_model_us = svm_classifer_us.fit(train_selected_feature_us, train_selected_target_us)
    mlp_model_us = mlp_classifer_us.fit(train_selected_feature_us, train_selected_target_us)
    gb_model_us = gb_classifer_us.fit(train_selected_feature_us, train_selected_target_us)
    
    dt_scores_us.append(metrics.recall_score(test_selected_target_fold, dt_model_us.predict(test_selected_feature_fold)))
    knn_scores_us.append(metrics.recall_score(test_selected_target_fold, knn_model_us.predict(test_selected_feature_fold)))
    rf_scores_us.append(metrics.recall_score(test_selected_target_fold, rf_model_us.predict(test_selected_feature_fold)))
    svm_scores_us.append(metrics.recall_score(test_selected_target_fold, svm_model_us.predict(test_selected_feature_fold)))
    mlp_scores_us.append(metrics.recall_score(test_selected_target_fold, mlp_model_us.predict(test_selected_feature_fold)))
    gb_scores_us.append(metrics.recall_score(test_selected_target_fold, gb_model_us.predict(test_selected_feature_fold)))

results_us["dt_scores_us"] = dt_scores_us
results_us["knn_scores_us"] = knn_scores_us
results_us["rf_scores_us"] = rf_scores_us
results_us["svm_scores_us"] = svm_scores_us
results_us["mlp_scores_us"] = mlp_scores_us
results_us["gb_scores_us"] = gb_scores_us

ranks_us = ranking_model(results_us)
print(ranks_us)



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  dt_scores_us knn_scores_us rf_scores_us svm_scores_us mlp_scores_us  \
0          3.5           3.5          3.5           3.5           3.5   
1          1.5           4.5          4.5           4.5           4.5   
2          4.0           1.0          4.0           4.0           4.0   
3          4.0           6.0          1.5           4.0           4.0   
4          5.5           2.5          4.0           2.5           1.0   
5          5.0           2.0          5.0           2.0           2.0   
6          3.5           3.5          3.5           3.5           3.5   
7          3.5           3.5          3.5           3.5           3.5   
8          3.5           3.5          3.5           3.5           3.5   
9          3.5           3.5          3.5           3.5           3.5   

  gb_scores_us  
0          3.5  
1          1.5  
2          4.0  
3          1.5  
4          5.5  
5          5.0  
6          3.5  
7          3.5  
8          3.5  
9          3.5  


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
print(imbalanced_results["dt_scores"].mean(), results_os["dt_scores_os"].mean(), results_us["dt_scores_us"].mean())
print(imbalanced_results["knn_scores"].mean(), results_os["knn_scores_os"].mean(), results_us["knn_scores_us"].mean())
print(imbalanced_results["rf_scores"].mean(), results_os["rf_scores_os"].mean(), results_us["rf_scores_us"].mean())
print(imbalanced_results["svm_scores"].mean(), results_os["svm_scores_os"].mean(), results_us["svm_scores_us"].mean())
print(imbalanced_results["mlp_scores"].mean(), results_os["mlp_scores_os"].mean(), results_us["mlp_scores_us"].mean())
print(imbalanced_results["gb_scores"].mean(), results_os["gb_scores_os"].mean(), results_us["gb_scores_us"].mean())

0.4833333333333333 0.3666666666666666 0.3
0.7833333333333333 0.4 0.3666666666666666
0.6 0.4 0.3333333333333333
0.6666666666666666 0.4 0.3666666666666666
0.7166666666666666 0.4666666666666666 0.39999999999999997
0.5833333333333333 0.3666666666666666 0.33333333333333337
