In [91]:
# imports
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [92]:
hr_df=pd.read_csv(r'..\Cases\human-resources-analytics\HR_comma_sep.csv')
hr_df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.1,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.0,5,224,5,0,1,0,sales,low


In [93]:
X,y = hr_df.drop('left', axis=1),hr_df[['left']]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=25,stratify=y)


In [94]:
ohe  = OneHotEncoder(sparse_output=False, drop='first').set_output(transform='pandas')
ct = make_column_transformer((ohe, make_column_selector(dtype_include=object)),
                              ('passthrough', make_column_selector(dtype_exclude=object)),
                              verbose_feature_names_out=False)
ct = ct.set_output(transform='pandas')
X_trn_ohe = ct.fit_transform(X_train)
X_tst_ohe = ct.transform(X_test)

In [95]:
log_reg_l1=LogisticRegression(penalty='l2')
log_reg_none=LogisticRegression(penalty='none')
knn=KNeighborsClassifier()  #Create a pipeline for scaling and knn and then pass that pipeline in the voting algo
dtree=DecisionTreeClassifier()
dtree_max_depth=DecisionTreeClassifier(max_depth=3)


scaler=StandardScaler().set_output(transform='pandas')
pi=Pipeline([('scaler',scaler),('knn',knn)])


In [96]:
voting=VotingClassifier([('LogisticWithL1',log_reg_l1),('LogisticWithNone',log_reg_none),('knn',pi),('DecisionTree',dtree),('Decison Tree with max_depth',dtree_max_depth)],voting='soft')
voting.fit(X_trn_ohe,y_train)
y_pred_prob=voting.predict_proba(X_tst_ohe)
print(roc_auc_score(y_test,y_pred_prob[:,1]))

0.9916522895697227


In [97]:
voting.named_estimators_

{'LogisticWithL1': LogisticRegression(),
 'LogisticWithNone': LogisticRegression(penalty='none'),
 'knn': Pipeline(steps=[('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]),
 'DecisionTree': DecisionTreeClassifier(),
 'Decison Tree with max_depth': DecisionTreeClassifier(max_depth=3)}

## Individual predictions

In [98]:
y_pred_proba=voting.named_estimators_['LogisticWithL1'].predict_proba(X_tst_ohe)
roc_auc_score(y_test,y_pred_proba[:,1])

0.8161685786161466

In [99]:
y_pred_proba=voting.named_estimators_['LogisticWithNone'].predict_proba(X_tst_ohe)
roc_auc_score(y_test,y_pred_proba[:,1])

0.8185378696821776

In [100]:
y_pred_proba=voting.named_estimators_['knn'].predict_proba(X_tst_ohe)
roc_auc_score(y_test,y_pred_proba[:,1])

0.9760233631232232

In [101]:
y_pred_proba=voting.named_estimators_['DecisionTree'].predict_proba(X_tst_ohe)
roc_auc_score(y_test,y_pred_proba[:,1])

0.976910927411332

In [102]:
y_pred_proba=voting.named_estimators_['Decison Tree with max_depth'].predict_proba(X_tst_ohe)
roc_auc_score(y_test,y_pred_proba[:,1])

0.9717655620150285