In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# load in the pre-cleaned HR dataset
df = pd.read_csv('HR_data_Clean.csv')

# examine the data
df.head()

Unnamed: 0.1,Unnamed: 0,EmployeeID,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,...,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,JobInvolvement,PerformanceRating,PaytoEffortRatio,SatisfactionRatio,DevelopmentRatio,FlightRatio,AgeGroup
0,0,1,51,No,Travel_Rarely,Sales,3.728226,2,Life Sciences,Female,...,3.0,4.0,2.0,3,3,106.826196,3.493333,7.0,0.333333,48-53
1,1,2,31,Yes,Travel_Frequently,Research & Development,6.21371,1,Life Sciences,Female,...,3.0,2.0,4.0,2,4,32.039517,3.280494,2.5,0.0,30-35
2,2,3,32,No,Travel_Frequently,Research & Development,10.563307,4,Other,Male,...,2.0,2.0,1.0,3,3,158.671791,0.586847,9.0,0.1,30-35
3,3,4,38,No,Non-Travel,Research & Development,1.242742,5,Life Sciences,Male,...,4.0,4.0,3.0,2,3,68.580955,7.070037,1.375,0.057692,36-41
4,4,5,32,No,Travel_Rarely,Research & Development,6.21371,1,Medical,Male,...,4.0,1.0,3.0,3,3,16.63576,1.523314,5.0,0.111111,30-35


In [3]:
# detect any remaining null values
print(df.isnull().sum())

Unnamed: 0                  0
EmployeeID                  0
Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
Salary                      0
NumCompaniesWorked          0
PercentSalaryHike           0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
MeanHrsWorked               0
MaxHrsWorked                0
SumHrsWorked                0
StdHrsWorked                0
SickDays                    0
EnvironmentSatisfaction    25
JobSatisfaction            20
WorkLifeBalance            37
JobInvolvement              0
PerformanceRating           0
PaytoEffortRatio            0
Satisfacti

In [4]:
# clean null values, replacing with median (categorical)
nan_columns = ['EnvironmentSatisfaction','JobSatisfaction','WorkLifeBalance','SatisfactionRatio','FlightRatio']

for col in nan_columns:
    df[col] = df[col].fillna(df[col].median())
    
# drop rows with fewer than 20 null values
df.dropna(how='any', axis='rows', inplace=True)

# ensure null values have been filled
print(df.isnull().sum())

Unnamed: 0                 0
EmployeeID                 0
Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
Salary                     0
NumCompaniesWorked         0
PercentSalaryHike          0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
MeanHrsWorked              0
MaxHrsWorked               0
SumHrsWorked               0
StdHrsWorked               0
SickDays                   0
EnvironmentSatisfaction    0
JobSatisfaction            0
WorkLifeBalance            0
JobInvolvement             0
PerformanceRating          0
PaytoEffortRatio           0
SatisfactionRatio          0
DevelopmentRat

In [5]:
# drop redundant columns
df.drop(columns=['Unnamed: 0'], inplace=True)

# select ordinal categorical features to be encoded rather than scaled
ordinal = ['EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance', 'JobInvolvement', 'PerformanceRating']

# change dtype to categorical for ordinal features
df[ordinal] = df[ordinal].astype('object')

In [6]:
# get numerical columns and assign to a dataframe
numerical = df.select_dtypes(include=['float64', 'int64'])

# get categorical columns and assign to a DataFrame for encoding
categorical  = df.select_dtypes(include=['object'])

# Drop the target variable 'Attrition' from the DataFrame to be encoded
categorical.drop(columns='Attrition', inplace=True)

# manually map 'Attrition' to a binary encoding
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

# get categorical dummies and assign to a DataFrame for scaling
df_dummies = pd.get_dummies(categorical)
df_dummies = pd.concat([numerical, df_dummies], axis=1)

# ensure all dummy datatypes are numerical
print(df_dummies.dtypes)

EmployeeID            int64
Age                   int64
DistanceFromHome    float64
Education             int64
JobLevel              int64
                     ...   
AgeGroup_30-35        uint8
AgeGroup_36-41        uint8
AgeGroup_42-47        uint8
AgeGroup_48-53        uint8
AgeGroup_54-59        uint8
Length: 74, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [7]:
# assign feature array and label array
X = df_dummies.values
y = df['Attrition'].values

# split X and y into testing and training datasets, stratify the split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# make a StandardScaler object 
scaler = StandardScaler() 

# fit numerical test and train data to the scaler object independently
X_train_scaled = scaler.fit_transform(X_train[:,0:24])
X_test_scaled = scaler.transform(X_test[:,0:24]) 

# concatenate the scaled numerical data to the encoded categorical data for both test and train sets
X_train = np.concatenate([X_train_scaled, X_train[:,24:]], axis=1)
X_test = np.concatenate([X_test_scaled, X_test[:,24:]], axis=1)

# examine the results
print(pd.DataFrame(X_train))

            0         1         2         3         4         5         6   \
0    -1.406041 -0.641489 -1.004548 -0.914427 -0.036889 -0.789683 -0.672768   
1     1.354494 -0.530782  2.076565  0.064378 -0.036889  0.479244  1.324103   
2     1.233751 -0.309369 -0.881303  0.064378 -0.036889 -0.784422 -0.672768   
3    -0.293304 -0.420076 -0.141836  1.043184  0.879632 -0.861652 -0.672768   
4    -0.020249 -1.305727 -0.388325 -0.914427  1.796153 -0.947088  0.125981   
...        ...       ...       ...       ...       ...       ...       ...   
3051 -1.406830  1.240521 -1.004548  1.043184 -0.953410 -0.919311  0.525355   
3052 -0.586877 -0.309369  2.076565  0.064378 -0.953410  2.628845  1.324103   
3053 -1.044598 -1.637847  0.967365 -0.914427  0.879632  0.664849 -0.672768   
3054 -0.948319 -0.752195 -0.881303 -1.893232 -0.953410  0.820571  0.125981   
3055 -1.192963  0.244163  2.076565 -1.893232 -0.036889 -0.652058 -1.072142   

            7         8         9   ...   64   65   66   67   6

In [8]:
# export train & test arrays for later use
pd.DataFrame(X_train).to_csv("X_train.csv")
pd.DataFrame(X_test ).to_csv("X_test.csv" )
pd.DataFrame(y_train).to_csv("y_train.csv")
pd.DataFrame(y_test ).to_csv("y_test.csv" )

# export feature labels
pd.DataFrame(df_dummies.columns).to_csv("features.csv" )