In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, roc_auc_score, RocCurveDisplay, ConfusionMatrixDisplay
from factor_analyzer import FactorAnalyzer

In [3]:
from data.utils import add_experience, feature_classification, astype_category

In [4]:
# DataFrame 초기 세팅
hr_df = pd.read_csv(Path.cwd() / 'data' / 'HR-Employee-Attrition.csv')
hr_df = (hr_df
         .pipe(add_experience)
         .pipe(feature_classification, return_dataframe=True)
         .pipe(astype_category)
         .set_index('EmployeeNumber')
)
hr_df = (hr_df
         .assign(Attrition=lambda df: df['Attrition'].replace({'Yes':1, 'No':0}))
         .assign(OverTime=lambda df: df['OverTime'].replace({'Yes':1, 'No':0}))
)
hr_df.columns

Index(['MonthlyIncome', 'PercentSalaryHike', 'StockOptionLevel', 'Age',
       'Gender', 'MaritalStatus', 'Education', 'EducationField',
       'DistanceFromHome', 'NumCompaniesWorked', 'TotalWorkingYears',
       'Experienced', 'Department', 'BusinessTravel', 'JobLevel', 'JobRole',
       'JobInvolvement', 'OverTime', 'WorkLifeBalance',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager',
       'EnvironmentSatisfaction', 'JobSatisfaction', 'PerformanceRating',
       'RelationshipSatisfaction', 'Attrition'],
      dtype='object')

In [5]:
# 필요 내용 category 변경 확인
hr_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1470 entries, 1 to 2068
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   MonthlyIncome             1470 non-null   int64   
 1   PercentSalaryHike         1470 non-null   int64   
 2   StockOptionLevel          1470 non-null   int64   
 3   Age                       1470 non-null   int64   
 4   Gender                    1470 non-null   category
 5   MaritalStatus             1470 non-null   category
 6   Education                 1470 non-null   category
 7   EducationField            1470 non-null   category
 8   DistanceFromHome          1470 non-null   int64   
 9   NumCompaniesWorked        1470 non-null   int64   
 10  TotalWorkingYears         1470 non-null   int64   
 11  Experienced               1470 non-null   category
 12  Department                1470 non-null   category
 13  BusinessTravel            1470 non-null   catego

In [6]:
# categorical variables에 대한 one-hot encoding
hr_df_onehot = pd.get_dummies(hr_df, columns=['BusinessTravel', 'Department', 'Education', 'EducationField', 'EnvironmentSatisfaction',
                                              'Experienced', 'Gender', 'JobRole', 'MaritalStatus'])
hr_df_onehot

Unnamed: 0_level_0,MonthlyIncome,PercentSalaryHike,StockOptionLevel,Age,DistanceFromHome,NumCompaniesWorked,TotalWorkingYears,JobLevel,JobInvolvement,OverTime,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5993,11,0,41,1,8,8,2,3,1,...,0,0,0,0,0,1,0,0,0,1
2,5130,23,1,49,8,1,10,2,2,0,...,0,0,0,0,1,0,0,0,1,0
4,2090,15,0,37,2,6,7,1,2,1,...,1,0,0,0,0,0,0,0,0,1
5,2909,11,0,33,3,1,8,1,3,1,...,0,0,0,0,1,0,0,0,1,0
7,3468,12,1,27,2,9,6,1,3,0,...,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2061,2571,17,1,36,23,4,17,2,4,0,...,1,0,0,0,0,0,0,0,1,0
2062,9991,15,1,39,6,4,9,3,2,0,...,0,0,0,0,0,0,0,0,1,0
2064,6142,20,1,27,4,1,6,2,4,1,...,0,0,1,0,0,0,0,0,1,0
2065,5390,14,0,49,2,2,17,2,2,0,...,0,0,0,0,0,1,0,0,1,0


In [7]:
X = hr_df_onehot.drop('Attrition', axis=1)
y = hr_df_onehot['Attrition']
X.shape, y.shape

((1470, 57), (1470,))

In [8]:
# Standard Scaler 진행
scaler = StandardScaler().fit(X)
X_sc = scaler.transform(X)

In [9]:
# Factor analysis를 통한 Factor 확인
# Dimensionality Reduction 이후 Variance 설명량을 확인하여 적절한 factor 개수 추출
cumul_var_list = []
for i in range(3, 50):
    fa = FactorAnalyzer(n_factors=i, method='principal', rotation='varimax').fit(X_sc)
    cumul_var_list.append(fa.get_factor_variance()[2][i-1])
np.array(cumul_var_list)

array([0.20608721, 0.25222376, 0.2893682 , 0.32482034, 0.35815825,
       0.39003412, 0.41997676, 0.44876959, 0.47616802, 0.50256196,
       0.52766375, 0.55187591, 0.57644902, 0.59921788, 0.62147352,
       0.64264125, 0.66326924, 0.68361028, 0.70381067, 0.72368677,
       0.74291487, 0.76168947, 0.77995457, 0.79792148, 0.81523364,
       0.83227569, 0.84900917, 0.86559946, 0.88176429, 0.89768792,
       0.91272463, 0.92702985, 0.93927324, 0.94942109, 0.9585669 ,
       0.96732173, 0.97473727, 0.98094771, 0.98582011, 0.98971396,
       0.99353061, 0.99567085, 0.99708568, 0.99849442, 0.9993969 ,
       1.        , 1.02365641])

In [10]:
# Variance 설명량이 60%가 넘는 n_factors를 선택하여 Factor Loadings 확인
n_factors = 17
fa = FactorAnalyzer(n_factors=n_factors, method='principal', rotation='varimax').fit(X_sc)
fl_matrix = pd.DataFrame(fa.loadings_, columns=[f'Factor{i:02}' for i in range(1, n_factors + 1)], index=X.columns)
fl_matrix

Unnamed: 0,Factor01,Factor02,Factor03,Factor04,Factor05,Factor06,Factor07,Factor08,Factor09,Factor10,Factor11,Factor12,Factor13,Factor14,Factor15,Factor16,Factor17
MonthlyIncome,0.853112,0.048425,0.342935,-0.002229,0.019648,-0.002123,-0.027512,-0.020166,0.051695,-0.007744,-0.057871,0.018668,-0.008477,0.072609,-0.116938,0.022519,-0.000653
PercentSalaryHike,-0.006718,-0.023198,-0.022272,-0.01922,0.003283,-0.007654,0.014406,0.934696,-0.018929,0.007265,0.001841,-0.011471,0.023622,0.01706,-0.003699,0.000159,-0.004415
StockOptionLevel,-0.009066,0.009834,0.010117,-0.012643,0.860137,0.001685,0.015524,0.003616,0.014218,0.024792,0.005524,0.151349,0.005423,0.011766,0.057477,-0.00439,0.011346
Age,0.691093,-0.012532,0.102819,0.007227,0.045501,0.031977,-0.002215,0.012086,0.13138,-0.003162,0.15918,-0.044305,0.012526,-0.050423,0.229476,0.019305,0.019462
DistanceFromHome,-0.050577,0.056664,0.03194,-0.026037,0.032177,-0.001086,0.071281,0.076844,0.00153,0.083482,-0.032197,-0.048989,0.03959,0.069161,0.290429,0.065416,0.031971
NumCompaniesWorked,0.281236,-0.00683,-0.189047,0.028011,0.006666,0.039516,-0.02542,-0.008121,0.67175,0.036599,0.095329,0.014038,-0.004337,-0.05009,0.149279,-0.026439,-0.003126
TotalWorkingYears,0.831715,-0.026073,0.382171,-0.010933,0.002203,0.025063,-0.009011,0.002695,0.062275,0.017246,0.048018,-0.005039,0.003347,0.000731,0.09745,0.008731,-0.00122
JobLevel,0.839603,0.106895,0.36547,-0.017791,0.018395,0.009218,-0.004196,-0.025625,0.038963,-0.015162,-0.056978,0.025959,0.006036,0.050759,-0.053452,0.050147,0.043419
JobInvolvement,-0.008169,-0.018621,-0.00858,0.008219,0.04216,-0.027039,-0.040759,-0.045503,-0.027035,0.020959,0.020991,-0.031616,-0.012586,0.088682,0.207266,0.045774,-0.050023
OverTime,0.004757,0.001693,-0.041954,-0.003306,-0.000694,0.051983,0.028335,-0.005196,-0.010675,0.006252,0.104782,0.038867,-0.044184,0.079945,0.021753,0.407639,-0.026585


In [11]:
# 각 Columns가 어떤 Factor에 속하는지, 그 Factor에서의 Factor Loadings는 어떻게 되는지 확인
fl_df = (pd.concat([fl_matrix.idxmax(axis=1), fl_matrix.max(axis=1)], axis=1)
         .rename({0:'Idxmax', 1:'Max'}, axis=1)
         .sort_values(by='Idxmax', ascending=True)
)
fl_df

Unnamed: 0,Idxmax,Max
MonthlyIncome,Factor01,0.853112
Age,Factor01,0.691093
JobRole_Manager,Factor01,0.533891
TotalWorkingYears,Factor01,0.831715
JobLevel,Factor01,0.839603
JobRole_Research Director,Factor01,0.425075
BusinessTravel_Travel_Rarely,Factor01,0.033151
Experienced_ExpEmp,Factor01,0.691611
EducationField_Marketing,Factor02,0.682078
Department_Sales,Factor02,0.956303


In [12]:
# Max Factor Loadings가 0.5 이하인 Features를 택하여 해당 Features를 제거
not_sig_feats_in_fa = (fl_df
                       .query("Max < 0.5")
                       .index
)
not_sig_feats_in_fa = not_sig_feats_in_fa.to_list()
X_remove = X.drop(not_sig_feats_in_fa, axis=1)

In [13]:
# Standard Scaler 진행
scaler = StandardScaler().fit(X_remove)
X_remove_sc = scaler.transform(X_remove)

In [14]:
# 기존 설정한 n_factors를 바탕으로 다시 fl_matrix 확인
fa = FactorAnalyzer(n_factors=n_factors, method='principal', rotation='varimax').fit(X_remove_sc)
fl_matrix_changed = pd.DataFrame(fa.loadings_, columns=[f'Factor{i:02}' for i in range(1, n_factors + 1)], index=X_remove.columns)
fl_matrix_changed

Unnamed: 0,Factor01,Factor02,Factor03,Factor04,Factor05,Factor06,Factor07,Factor08,Factor09,Factor10,Factor11,Factor12,Factor13,Factor14,Factor15,Factor16,Factor17
MonthlyIncome,0.361904,0.256841,-0.015287,0.075421,-0.021322,0.009891,-0.007451,-0.034343,0.129877,0.822254,-0.008487,0.00023,0.004616,-0.021128,0.003552,-0.04205,-0.068777
PercentSalaryHike,-0.017103,0.010636,-0.017437,-0.012393,0.94118,-0.004728,0.011227,-0.01499,-0.002276,-0.014753,-0.012874,-0.007522,0.014862,-0.001212,0.020002,-0.012598,-0.004296
StockOptionLevel,0.009653,0.016679,0.003225,0.001988,0.009875,0.869917,0.029248,0.003112,0.018718,-0.02621,-0.025848,0.014057,0.028444,0.013546,0.029104,0.041065,0.31016
Age,0.315704,0.517518,0.007377,-0.092506,0.008373,0.021194,-0.015014,0.088092,0.427965,0.300938,0.00852,0.016594,0.016778,0.174162,0.006127,0.1739,-0.007241
NumCompaniesWorked,-0.111781,0.027152,0.018735,0.008639,-0.007999,0.024469,-0.008236,-0.024186,0.948287,0.080527,-0.022865,0.020899,0.008027,0.029001,-0.014155,-0.034657,0.007083
TotalWorkingYears,0.554766,0.467532,-0.011354,-0.042644,-0.005362,0.005346,-0.019713,-0.01417,0.272611,0.536259,0.011806,0.018125,0.029386,0.029647,-0.009156,0.029919,-0.049697
JobLevel,0.405048,0.280463,-0.022944,0.147021,-0.026964,0.01774,-0.007263,-0.016228,0.139769,0.770131,0.001807,0.006871,-0.004295,-0.029227,0.005328,-0.042714,-0.090091
YearsAtCompany,0.881055,-0.098556,-0.000201,-0.009117,-0.019492,0.001181,-0.00499,0.003654,-0.050432,0.310398,0.013077,-0.000431,0.02549,0.038915,0.011044,0.037498,-9.1e-05
YearsInCurrentRole,0.874424,-0.062038,-0.02842,0.04769,0.020868,0.029166,0.019685,-0.039013,-0.029937,0.084442,-0.016616,0.023787,-0.038287,-0.01786,0.02267,-0.026252,0.012989
YearsSinceLastPromotion,0.684778,-0.030021,-0.019522,0.028469,-0.008251,0.004534,-0.004697,0.001473,0.024541,0.150156,0.034227,-0.012746,0.030083,0.002626,-0.012551,-0.03587,0.517796


In [15]:
# 각 Columns가 어떤 Factor에 속하는지, 그 Factor에서의 Factor Loadings는 어떻게 되는지 확인
fl_df_changed = (pd.concat([fl_matrix_changed.idxmax(axis=1), fl_matrix_changed.max(axis=1)], axis=1)
                 .rename({0:'Idxmax', 1:'Max'}, axis=1)
                 .sort_values(by='Idxmax', ascending=True)
)
fl_df_changed

Unnamed: 0,Idxmax,Max
TotalWorkingYears,Factor01,0.554766
YearsAtCompany,Factor01,0.881055
YearsInCurrentRole,Factor01,0.874424
YearsSinceLastPromotion,Factor01,0.684778
YearsWithCurrManager,Factor01,0.876315
Age,Factor02,0.517518
Experienced_ExpEmp,Factor02,0.878248
JobRole_Human Resources,Factor03,0.923553
EducationField_Human Resources,Factor03,0.795523
Department_Human Resources,Factor03,0.956566
