In [229]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os

In [230]:
# Directory containing the HR employee-attrition CSV, relative to the notebook.
data_path = './'
csv_file = os.path.join(data_path, 'WA_Fn-UseC_-HR-Employee-Attrition.csv')

# Read the whole file into a DataFrame and preview its first rows.
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [231]:
# Columns retained for the analysis; every other column in df is discarded.
to_keep = {
    'Age', 'Attrition', 'Department', 'DistanceFromHome', 'Education',
    'EnvironmentSatisfaction', 'Gender', 'JobSatisfaction', 'MaritalStatus',
    'MonthlyIncome', 'OverTime', 'PerformanceRating',
    'RelationshipSatisfaction', 'TotalWorkingYears', 'YearsAtCompany',
}

# Whatever is present in the frame but not in to_keep gets dropped in place.
to_drop = set(df.columns).difference(to_keep)
df.drop(to_drop, axis='columns', inplace=True)

# Summarise the remaining columns, their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 15 columns):
Age                         1470 non-null int64
Attrition                   1470 non-null object
Department                  1470 non-null object
DistanceFromHome            1470 non-null int64
Education                   1470 non-null int64
EnvironmentSatisfaction     1470 non-null int64
Gender                      1470 non-null object
JobSatisfaction             1470 non-null int64
MaritalStatus               1470 non-null object
MonthlyIncome               1470 non-null int64
OverTime                    1470 non-null object
PerformanceRating           1470 non-null int64
RelationshipSatisfaction    1470 non-null int64
TotalWorkingYears           1470 non-null int64
YearsAtCompany              1470 non-null int64
dtypes: int64(10), object(5)
memory usage: 143.6+ KB


In [232]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# String-valued columns that are replaced by integer codes.
to_encode = {'Attrition', 'Department','Gender','MaritalStatus','OverTime'}

# One LabelEncoder per column, stored in `encoders` under the column name.
encoders = {column: LabelEncoder() for column in to_encode}

# Fit each encoder on its own column and overwrite the column with the codes.
for column, encoder in encoders.items():
    df[column] = encoder.fit_transform(df[column])

In [233]:
# Every column that was not label-encoded is treated as a numeric feature.
numerics = set(df.columns) - to_encode
for atr in numerics:
    # `np.float` was only an alias of the builtin `float`; it was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError. The
    # builtin yields the same float64 dtype on every NumPy version.
    df[atr] = df[atr].astype(float)
    # A fresh scaler per column: standardise to zero mean and unit variance.
    ss = StandardScaler()
    # fit_transform needs a 2-D (n, 1) array; flatten its (n, 1) result back to
    # 1-D so it is stored as an ordinary column.
    df[atr] = ss.fit_transform(df[atr].values.reshape(-1, 1)).ravel()

In [234]:
# Show the column dtypes again now that df has been label-encoded and scaled.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 15 columns):
Age                         1470 non-null float64
Attrition                   1470 non-null int32
Department                  1470 non-null int32
DistanceFromHome            1470 non-null float64
Education                   1470 non-null float64
EnvironmentSatisfaction     1470 non-null float64
Gender                      1470 non-null int32
JobSatisfaction             1470 non-null float64
MaritalStatus               1470 non-null int32
MonthlyIncome               1470 non-null float64
OverTime                    1470 non-null int32
PerformanceRating           1470 non-null float64
RelationshipSatisfaction    1470 non-null float64
TotalWorkingYears           1470 non-null float64
YearsAtCompany              1470 non-null float64
dtypes: float64(10), int32(5)
memory usage: 143.6 KB


For data expansion, we will try several approaches. First, we copy some rows of the original dataset and append them to it, expanding it to 2000 rows.

In [235]:
# Take the first 530 rows by position; appended to the 1470-row frame they
# bring the total to 2000 rows.
df_slice = df.iloc[:530]

In [236]:
# Display the slice taken from the top of df.
df_slice

Unnamed: 0,Age,Attrition,Department,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,JobSatisfaction,MaritalStatus,MonthlyIncome,OverTime,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,YearsAtCompany
0,0.446350,1,2,-1.010909,-0.891688,-0.660531,0,1.153254,2,-0.108350,1,-0.426230,-1.584178,-0.421642,-0.164613
1,1.322365,0,1,-0.147150,-1.868426,0.254625,1,-0.660853,1,-0.291719,0,2.346151,1.191438,-0.164511,0.488508
2,0.008343,1,1,-0.887515,-0.891688,1.169781,1,0.246200,2,-0.937654,1,-0.426230,-0.658973,-0.550208,-1.144294
3,-0.429664,0,1,-0.764121,1.061787,1.169781,0,0.246200,1,-0.763634,1,-0.426230,0.266233,-0.421642,0.161947
4,-1.086676,0,1,-0.887515,-1.868426,-1.575686,1,-0.660853,1,-0.644858,0,-0.426230,1.191438,-0.678774,-0.817734
5,-0.539166,0,1,-0.887515,-0.891688,1.169781,1,1.153254,2,-0.729850,0,-0.426230,0.266233,-0.421642,-0.001333
6,2.417384,0,1,-0.764121,0.085049,0.254625,0,-1.567907,1,-0.814416,1,2.346151,-1.584178,0.092620,-0.981014
7,-0.758170,0,1,1.827158,-1.868426,1.169781,1,0.246200,0,-0.809529,0,2.346151,-0.658973,-1.321601,-0.981014
8,0.117845,0,1,1.703764,0.085049,1.169781,1,0.246200,2,0.642338,0,2.346151,-0.658973,-0.164511,0.325228
9,-0.101159,0,1,2.197341,0.085049,0.254625,1,0.246200,1,-0.268983,0,-0.426230,-0.658973,0.735447,-0.001333


In [237]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat with ignore_index=True stacks the duplicated rows under the
# original ones and renumbers the index 0..n-1, which is what the former
# append + reset_index(drop=True) pair produced.
df_new = pd.concat([df, df_slice], ignore_index=True)

In [238]:
# Display the original frame with the copied slice appended.
df_new

Unnamed: 0,Age,Attrition,Department,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,JobSatisfaction,MaritalStatus,MonthlyIncome,OverTime,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,YearsAtCompany
0,0.446350,1,2,-1.010909,-0.891688,-0.660531,0,1.153254,2,-0.108350,1,-0.426230,-1.584178,-0.421642,-0.164613
1,1.322365,0,1,-0.147150,-1.868426,0.254625,1,-0.660853,1,-0.291719,0,2.346151,1.191438,-0.164511,0.488508
2,0.008343,1,1,-0.887515,-0.891688,1.169781,1,0.246200,2,-0.937654,1,-0.426230,-0.658973,-0.550208,-1.144294
3,-0.429664,0,1,-0.764121,1.061787,1.169781,0,0.246200,1,-0.763634,1,-0.426230,0.266233,-0.421642,0.161947
4,-1.086676,0,1,-0.887515,-1.868426,-1.575686,1,-0.660853,1,-0.644858,0,-0.426230,1.191438,-0.678774,-0.817734
5,-0.539166,0,1,-0.887515,-0.891688,1.169781,1,1.153254,2,-0.729850,0,-0.426230,0.266233,-0.421642,-0.001333
6,2.417384,0,1,-0.764121,0.085049,0.254625,0,-1.567907,1,-0.814416,1,2.346151,-1.584178,0.092620,-0.981014
7,-0.758170,0,1,1.827158,-1.868426,1.169781,1,0.246200,0,-0.809529,0,2.346151,-0.658973,-1.321601,-0.981014
8,0.117845,0,1,1.703764,0.085049,1.169781,1,0.246200,2,0.642338,0,2.346151,-0.658973,-0.164511,0.325228
9,-0.101159,0,1,2.197341,0.085049,0.254625,1,0.246200,1,-0.268983,0,-0.426230,-0.658973,0.735447,-0.001333


In [239]:
# Column-wise maximum of the encoded and scaled frame.
df.max()

Age                         2.526886
Attrition                   1.000000
Department                  2.000000
DistanceFromHome            2.444129
Education                   2.038524
EnvironmentSatisfaction     1.169781
Gender                      1.000000
JobSatisfaction             1.153254
MaritalStatus               2.000000
MonthlyIncome               2.867626
OverTime                    1.000000
PerformanceRating           2.346151
RelationshipSatisfaction    1.191438
TotalWorkingYears           3.692454
YearsAtCompany              5.386914
dtype: float64

In [240]:
# Column-wise minimum of the encoded and scaled frame.
df.min()

Age                        -2.072192
Attrition                   0.000000
Department                  0.000000
DistanceFromHome           -1.010909
Education                  -1.868426
EnvironmentSatisfaction    -1.575686
Gender                      0.000000
JobSatisfaction            -1.567907
MaritalStatus               0.000000
MonthlyIncome              -1.167343
OverTime                    0.000000
PerformanceRating          -0.426230
RelationshipSatisfaction   -1.584178
TotalWorkingYears          -1.450167
YearsAtCompany             -1.144294
dtype: float64

In [242]:
# Scaled numeric columns of the slice, in the order they appear in df.
# The label-encoded columns are left out here.
numeric_columns = [
    'Age',
    'DistanceFromHome',
    'Education',
    'EnvironmentSatisfaction',
    'JobSatisfaction',
    'MonthlyIncome',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'TotalWorkingYears',
    'YearsAtCompany',
]
df_slice2 = df_slice[numeric_columns]

We use df_slice2 to hold the numerical columns from the raw dataframe slice, and add some randomly generated noise to these numerical values.

In [243]:
# Perturb every numeric value by a relative factor drawn uniformly from
# [-1%, +1%). A dedicated, seeded generator makes the augmented rows
# reproducible under Restart & Run All; RandomState is used because it is
# available on old and new NumPy releases alike.
noise_rng = np.random.RandomState(42)
relative_noise = noise_rng.uniform(-0.01, 0.01, df_slice2.shape)
df_slice2 = df_slice2 * (1 + relative_noise)

In [244]:
# Put the label-encoded columns back, unperturbed, at the positions they
# occupy in df. The insertions run in ascending position order because each
# one shifts the columns to its right.
categorical_positions = [
    (1, 'Attrition'),
    (2, 'Department'),
    (6, 'Gender'),
    (8, 'MaritalStatus'),
    (10, 'OverTime'),
]
for position, column in categorical_positions:
    df_slice2.insert(position, column, df_slice[column])

In [245]:
# Display the noisy slice with the encoded categorical columns re-inserted.
df_slice2

Unnamed: 0,Age,Attrition,Department,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,JobSatisfaction,MaritalStatus,MonthlyIncome,OverTime,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,YearsAtCompany
0,0.443180,1,2,-1.011936,-0.888173,-0.657699,0,1.154822,2,-0.108666,1,-0.427038,-1.581851,-0.424108,-0.164833
1,1.327912,0,1,-0.148205,-1.850394,0.254348,1,-0.657313,1,-0.289421,0,2.363321,1.189862,-0.165677,0.491328
2,0.008345,1,1,-0.883216,-0.891086,1.164833,1,0.247661,2,-0.938845,1,-0.422824,-0.659322,-0.550547,-1.152242
3,-0.429728,0,1,-0.767598,1.067304,1.179433,0,0.246976,1,-0.757215,1,-0.423901,0.268523,-0.424462,0.161799
4,-1.093976,0,1,-0.893891,-1.878732,-1.586058,1,-0.656016,1,-0.643304,0,-0.423341,1.202496,-0.680373,-0.818250
5,-0.538510,0,1,-0.892254,-0.886547,1.160970,1,1.150141,2,-0.726630,0,-0.425332,0.268335,-0.425182,-0.001330
6,2.435036,0,1,-0.771495,0.085769,0.254557,0,-1.576358,1,-0.812865,1,2.344860,-1.586001,0.092567,-0.974244
7,-0.762782,0,1,1.839170,-1.878880,1.178116,1,0.245270,0,-0.811958,0,2.335995,-0.658948,-1.325016,-0.978843
8,0.117867,0,1,1.719147,0.085239,1.171648,1,0.245541,2,0.639797,0,2.331032,-0.665086,-0.163589,0.327537
9,-0.101318,0,1,2.212993,0.084473,0.256447,1,0.246942,1,-0.269277,0,-0.424289,-0.654750,0.733745,-0.001336


In [246]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat with ignore_index=True stacks the noisy rows under the original
# ones and renumbers the index 0..n-1, which is what the former
# append + reset_index(drop=True) pair produced.
df_new2 = pd.concat([df, df_slice2], ignore_index=True)

In [247]:
# Display the original frame extended with the noisy slice.
df_new2

Unnamed: 0,Age,Attrition,Department,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,JobSatisfaction,MaritalStatus,MonthlyIncome,OverTime,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,YearsAtCompany
0,0.446350,1,2,-1.010909,-0.891688,-0.660531,0,1.153254,2,-0.108350,1,-0.426230,-1.584178,-0.421642,-0.164613
1,1.322365,0,1,-0.147150,-1.868426,0.254625,1,-0.660853,1,-0.291719,0,2.346151,1.191438,-0.164511,0.488508
2,0.008343,1,1,-0.887515,-0.891688,1.169781,1,0.246200,2,-0.937654,1,-0.426230,-0.658973,-0.550208,-1.144294
3,-0.429664,0,1,-0.764121,1.061787,1.169781,0,0.246200,1,-0.763634,1,-0.426230,0.266233,-0.421642,0.161947
4,-1.086676,0,1,-0.887515,-1.868426,-1.575686,1,-0.660853,1,-0.644858,0,-0.426230,1.191438,-0.678774,-0.817734
5,-0.539166,0,1,-0.887515,-0.891688,1.169781,1,1.153254,2,-0.729850,0,-0.426230,0.266233,-0.421642,-0.001333
6,2.417384,0,1,-0.764121,0.085049,0.254625,0,-1.567907,1,-0.814416,1,2.346151,-1.584178,0.092620,-0.981014
7,-0.758170,0,1,1.827158,-1.868426,1.169781,1,0.246200,0,-0.809529,0,2.346151,-0.658973,-1.321601,-0.981014
8,0.117845,0,1,1.703764,0.085049,1.169781,1,0.246200,2,0.642338,0,2.346151,-0.658973,-0.164511,0.325228
9,-0.101159,0,1,2.197341,0.085049,0.254625,1,0.246200,1,-0.268983,0,-0.426230,-0.658973,0.735447,-0.001333
