In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import os


In [2]:
df = pd.read_csv('Datasets/HR_comma_sep.csv')
df.dropna()
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,0.10,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.00,5,224,5,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
14990,0.40,0.57,2,151,3,0,1,0,support,low
14991,0.37,0.48,2,160,3,0,1,0,support,low
14992,0.37,0.53,2,143,3,0,1,0,support,low
14993,0.11,0.96,6,280,4,0,1,0,support,low


In [3]:
X, y = df.drop('left', axis = 1), df['left']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 25)

In [4]:
trf1 = ColumnTransformer(transformers=[
    ('encoder', OneHotEncoder(drop = 'first', sparse_output = False), [7,8])
],remainder = 'passthrough')

trf2 = ColumnTransformer(transformers=[
    ('scaler', StandardScaler(), slice(0,9))
],remainder = 'passthrough')

trf3 = ColumnTransformer(transformers=[
    ('pca', PCA(n_components=0.9), slice(0,9))
],remainder = 'passthrough')

preprocessing = Pipeline([
    ('encode',trf1),
    ('scale',trf2),
    ('pca',trf3)
])

pipe = Pipeline([
    ('preprocess', preprocessing),
    ('model', LogisticRegression(max_iter = 1000))
])

In [5]:
pipe.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('encode', ...), ('scale', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('encoder', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,transformers,"[('scaler', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,transformers,"[('pca', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,n_components,0.9
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [6]:
y_pred = pipe.predict(X_test)
y_pred_proba = pipe.predict_proba(X_test)

In [7]:
accuracy_score(y_test, y_pred)

0.7883974216492554

In [8]:
log_loss(y_test, y_pred_proba)

0.4410601246493791

In [9]:
pca_obj = preprocessing.named_steps['pca'].named_transformers_['pca']
pca_obj.explained_variance_ratio_

array([0.1467176 , 0.13312661, 0.12381906, 0.11785273, 0.1174178 ,
       0.11699618, 0.1168788 , 0.11595681])

In [10]:
np.cumsum(pca_obj.explained_variance_ratio_)

array([0.1467176 , 0.27984421, 0.40366327, 0.521516  , 0.6389338 ,
       0.75592998, 0.87280879, 0.98876559])