In [1]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [2]:
data = pd.read_csv('watson_healthcare_modified.csv')

In [3]:
data

Unnamed: 0,EmployeeID,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,...,RelationshipSatisfaction,StandardHours,Shift,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1313919,41,No,Travel_Rarely,1102,Cardiology,1,2,Life Sciences,1,...,1,80,0,8,0,1,6,4,0,5
1,1200302,49,No,Travel_Frequently,279,Maternity,8,1,Life Sciences,1,...,4,80,1,10,3,3,10,7,1,7
2,1060315,37,Yes,Travel_Rarely,1373,Maternity,2,2,Other,1,...,2,80,0,7,3,3,0,0,0,0
3,1272912,33,No,Travel_Frequently,1392,Maternity,3,4,Life Sciences,1,...,3,80,0,8,3,3,8,7,3,0
4,1414939,27,No,Travel_Rarely,591,Maternity,2,1,Medical,1,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1671,1117656,26,Yes,Travel_Rarely,471,Neurology,24,3,Technical Degree,1,...,2,80,0,1,3,1,1,0,0,0
1672,1152327,46,No,Travel_Rarely,1125,Cardiology,10,3,Marketing,1,...,3,80,1,15,3,3,3,2,1,2
1673,1812428,20,No,Travel_Rarely,959,Maternity,1,3,Life Sciences,1,...,4,80,0,1,0,4,1,0,0,0
1674,1812429,39,No,Travel_Rarely,466,Neurology,1,1,Life Sciences,1,...,3,80,1,21,3,3,21,6,11,8


In [4]:
y = data['Attrition']

In [5]:
X = data.drop('Attrition', axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [7]:
X_train.head()

Unnamed: 0,EmployeeID,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,Shift,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1494,1469038,42,Travel_Rarely,933,Maternity,29,3,Life Sciences,1,2,...,4,80,1,10,3,2,9,8,7,8
450,1882668,42,Travel_Rarely,319,Maternity,24,3,Medical,1,4,...,4,80,1,10,5,2,10,9,5,8
252,1322829,29,Travel_Rarely,665,Maternity,15,3,Life Sciences,1,3,...,1,80,0,6,1,3,6,5,1,5
507,1179356,33,Travel_Frequently,1296,Maternity,6,3,Life Sciences,1,3,...,3,80,1,15,2,1,13,11,4,7
1139,1135463,45,Travel_Rarely,1015,Maternity,5,5,Medical,1,3,...,1,80,0,10,3,3,10,7,1,4


In [8]:
def normalise_data(X_train, X_test, method_flag):
    
    X_train_inp_nr = X_train.select_dtypes(exclude=['object'])
    X_test_inp_nr = X_test.select_dtypes(exclude=['object'])
    
    if method_flag == '1':
        sc = StandardScaler()

    elif method_flag == '2':
        sc = MinMaxScaler()

    X_train[X_train_inp_nr.columns] = sc.fit_transform(X_train_inp_nr)
    X_test[X_test_inp_nr.columns] = sc.transform(X_test_inp_nr)
    return X_train, X_test

In [27]:
def simple_data_impute(X_train, X_test, strategy_flag):
    
    if strategy_flag == '1':
        imputer = SimpleImputer(strategy = 'mean')
    elif strategy_flag == '2':
        imputer = SimpleImputer(strategy = 'median')
    else:
        imputer = SimpleImputer(strategy= 'mode')
        
    X_train_num = X_train.select_dtypes(exclude=['object'])
    X_test_num = X_test.select_dtypes(exclude=['object'])
    
    num_col_names = list(X_train_num.columns)
    X_train_imp = pd.DataFrame(imputer.fit_transform(X_train_num), columns = num_col_names)
    X_test_imp = pd.DataFrame(imputer.transform(X_test_num), columns = num_col_names)
    
    imputer = SimpleImputer(strategy='most_frequent')
    
    X_train_obj = X_train.select_dtypes(include=['object'])
    X_test_obj = X_test.select_dtypes(include=['object'])
    
    obj_col_names = list(X_train_obj.columns)
    X_train_obj_imp = pd.DataFrame(imputer.fit_transform(X_train_obj), columns = obj_col_names)
    X_test_obj_imp = pd.DataFrame(imputer.transform(X_test_obj), columns = obj_col_names)
    
    
    return X_train_imp, X_test_imp, X_train_obj_imp, X_test_obj_imp

In [28]:
X_train_imputed, X_test_imputed, X_train_obj_imp, X_test_obj_imp = simple_data_impute(X_train, X_test, '1')

In [29]:
X_train_imputed = pd.concat([X_train_imputed, X_train_obj_imp], axis=1, join='inner')

Unnamed: 0,EmployeeID,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,YearsSinceLastPromotion,YearsWithCurrManager,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,1469038.0,42.0,933.0,29.0,3.0,1.0,2.0,98.0,3.0,2.0,...,7.0,8.0,Travel_Rarely,Maternity,Life Sciences,Male,Therapist,Married,Y,No
1,1882668.0,42.0,319.0,24.0,3.0,1.0,4.0,56.0,3.0,3.0,...,5.0,8.0,Travel_Rarely,Maternity,Medical,Male,Therapist,Married,Y,Yes
2,1322829.0,29.0,665.0,15.0,3.0,1.0,3.0,60.0,3.0,1.0,...,1.0,5.0,Travel_Rarely,Maternity,Life Sciences,Male,Other,Single,Y,No
3,1179356.0,33.0,1296.0,6.0,3.0,1.0,3.0,30.0,3.0,2.0,...,4.0,7.0,Travel_Frequently,Maternity,Life Sciences,Male,Nurse,Divorced,Y,No
4,1135463.0,45.0,1015.0,5.0,5.0,1.0,3.0,50.0,1.0,2.0,...,1.0,4.0,Travel_Rarely,Maternity,Medical,Female,Nurse,Single,Y,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335,1087770.0,45.0,930.0,9.0,3.0,1.0,4.0,74.0,3.0,3.0,...,0.0,2.0,Travel_Rarely,Cardiology,Marketing,Male,Nurse,Divorced,Y,Yes
1336,1080875.0,45.0,306.0,26.0,4.0,1.0,1.0,100.0,3.0,2.0,...,0.0,0.0,Travel_Frequently,Cardiology,Life Sciences,Female,Nurse,Married,Y,No
1337,1713058.0,50.0,316.0,8.0,4.0,1.0,4.0,54.0,3.0,1.0,...,2.0,2.0,Travel_Rarely,Cardiology,Marketing,Male,Other,Married,Y,No
1338,1675357.0,47.0,1093.0,9.0,3.0,1.0,3.0,82.0,1.0,4.0,...,14.0,10.0,Travel_Frequently,Cardiology,Life Sciences,Male,Nurse,Married,Y,No


In [30]:
X_test_imputed = pd.concat([X_test_imputed, X_test_obj_imp], axis=1, join='inner')

In [33]:
X_test_imputed

Unnamed: 0,EmployeeID,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,YearsSinceLastPromotion,YearsWithCurrManager,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,1476766.0,29.0,410.0,2.0,1.0,1.0,4.0,97.0,3.0,1.0,...,0.0,2.0,Travel_Frequently,Maternity,Life Sciences,Female,Nurse,Married,Y,No
1,1125672.0,36.0,329.0,2.0,3.0,1.0,4.0,96.0,3.0,1.0,...,2.0,2.0,Travel_Rarely,Maternity,Life Sciences,Female,Other,Married,Y,No
2,1602610.0,36.0,1040.0,3.0,2.0,1.0,4.0,79.0,4.0,2.0,...,0.0,4.0,Travel_Rarely,Maternity,Life Sciences,Male,Nurse,Divorced,Y,No
3,1236111.0,32.0,601.0,7.0,5.0,1.0,4.0,97.0,3.0,2.0,...,0.0,3.0,Travel_Rarely,Cardiology,Marketing,Male,Nurse,Married,Y,No
4,1810061.0,33.0,508.0,10.0,3.0,1.0,2.0,46.0,2.0,2.0,...,0.0,1.0,Travel_Frequently,Cardiology,Marketing,Male,Nurse,Single,Y,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,1649314.0,37.0,1372.0,1.0,3.0,1.0,4.0,42.0,3.0,1.0,...,5.0,7.0,Travel_Rarely,Maternity,Life Sciences,Female,Other,Single,Y,No
332,1544429.0,42.0,810.0,23.0,5.0,1.0,1.0,44.0,3.0,4.0,...,0.0,0.0,Travel_Rarely,Maternity,Life Sciences,Female,Administrative,Single,Y,No
333,1342581.0,43.0,559.0,10.0,4.0,1.0,3.0,82.0,2.0,2.0,...,0.0,0.0,Travel_Frequently,Maternity,Life Sciences,Female,Nurse,Divorced,Y,No
334,1317075.0,40.0,1492.0,20.0,4.0,1.0,1.0,61.0,3.0,3.0,...,11.0,1.0,Travel_Rarely,Maternity,Technical Degree,Male,Nurse,Married,Y,No


In [None]:
# test
X_train_norm, X_test_norm = normalise_data(X_train, X_test, '1')

In [37]:
X_train_norm, X_test_norm = normalise_data(X_train_imputed, X_test_imputed, '2')

In [32]:
X_train_imputed

Unnamed: 0,EmployeeID,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,Shift,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1469038.0,42.0,933.0,29.0,3.0,1.0,2.0,98.0,3.0,2.0,...,4.0,80.0,1.0,10.0,3.0,2.0,9.0,8.0,7.0,8.0
1,1882668.0,42.0,319.0,24.0,3.0,1.0,4.0,56.0,3.0,3.0,...,4.0,80.0,1.0,10.0,5.0,2.0,10.0,9.0,5.0,8.0
2,1322829.0,29.0,665.0,15.0,3.0,1.0,3.0,60.0,3.0,1.0,...,1.0,80.0,0.0,6.0,1.0,3.0,6.0,5.0,1.0,5.0
3,1179356.0,33.0,1296.0,6.0,3.0,1.0,3.0,30.0,3.0,2.0,...,3.0,80.0,1.0,15.0,2.0,1.0,13.0,11.0,4.0,7.0
4,1135463.0,45.0,1015.0,5.0,5.0,1.0,3.0,50.0,1.0,2.0,...,1.0,80.0,0.0,10.0,3.0,3.0,10.0,7.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335,1087770.0,45.0,930.0,9.0,3.0,1.0,4.0,74.0,3.0,3.0,...,3.0,80.0,1.0,18.0,2.0,3.0,5.0,4.0,0.0,2.0
1336,1080875.0,45.0,306.0,26.0,4.0,1.0,1.0,100.0,3.0,2.0,...,4.0,80.0,2.0,5.0,4.0,3.0,1.0,1.0,0.0,0.0
1337,1713058.0,50.0,316.0,8.0,4.0,1.0,4.0,54.0,3.0,1.0,...,4.0,80.0,1.0,4.0,2.0,3.0,2.0,2.0,2.0,2.0
1338,1675357.0,47.0,1093.0,9.0,3.0,1.0,3.0,82.0,1.0,4.0,...,3.0,80.0,0.0,25.0,3.0,1.0,23.0,5.0,14.0,10.0


In [38]:
X_train_norm

Unnamed: 0,EmployeeID,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,Shift,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,0.515398,0.571429,0.594846,1.000000,0.50,0.0,0.333333,0.971429,0.666667,0.25,...,1.000000,0.0,0.333333,0.250,0.500000,0.333333,0.225,0.444444,0.466667,0.470588
1,0.995692,0.571429,0.155333,0.821429,0.50,0.0,1.000000,0.371429,0.666667,0.50,...,1.000000,0.0,0.333333,0.250,0.833333,0.333333,0.250,0.500000,0.333333,0.470588
2,0.345624,0.261905,0.403006,0.500000,0.50,0.0,0.666667,0.428571,0.666667,0.00,...,0.000000,0.0,0.000000,0.150,0.166667,0.666667,0.150,0.277778,0.066667,0.294118
3,0.179028,0.357143,0.854689,0.178571,0.50,0.0,0.666667,0.000000,0.666667,0.25,...,0.666667,0.0,0.333333,0.375,0.333333,0.000000,0.325,0.611111,0.266667,0.411765
4,0.128061,0.642857,0.653543,0.142857,1.00,0.0,0.666667,0.285714,0.000000,0.25,...,0.000000,0.0,0.000000,0.250,0.500000,0.666667,0.250,0.388889,0.066667,0.235294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335,0.072681,0.642857,0.592699,0.285714,0.50,0.0,1.000000,0.628571,0.666667,0.50,...,0.666667,0.0,0.333333,0.450,0.333333,0.666667,0.125,0.222222,0.000000,0.117647
1336,0.064675,0.642857,0.146027,0.892857,0.75,0.0,0.000000,1.000000,0.666667,0.25,...,1.000000,0.0,0.666667,0.125,0.666667,0.666667,0.025,0.055556,0.000000,0.000000
1337,0.798746,0.761905,0.153185,0.250000,0.75,0.0,1.000000,0.342857,0.666667,0.00,...,1.000000,0.0,0.333333,0.100,0.333333,0.666667,0.050,0.111111,0.133333,0.117647
1338,0.754969,0.690476,0.709377,0.285714,0.50,0.0,0.666667,0.742857,0.000000,0.75,...,0.666667,0.0,0.000000,0.625,0.500000,0.000000,0.575,0.277778,0.933333,0.588235


In [None]:
X_train_norm

In [None]:
X_test_norm.columns = list(X_test_norm.columns)

In [None]:
X_train_norm.columns = list(X_train_norm.columns)

In [None]:
X_train_norm