In [2]:
import pandas as pd
# Raise pandas' display limits (row count, column count, line width) for this notebook.
for option_name, option_value in {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 3000,
}.items():
    pd.set_option(option_name, option_value)

In [4]:

# Load the employee-attrition data; the path is relative to the notebook's working directory.
df = pd.read_csv("../Data/EmployeeAttrition.csv")

In [None]:
# List every column label so the string-valued columns can be picked out for encoding.
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'], dtype='object')

# Encoding

Convert string data to int / float format

Most ML / DL algorithms and tools do not support string data directly, so we have to convert strings to int/float.

### 2 ways

#### 1. Label Encoding for Ordinal Data

#### 2. Onehot Encoding for Nominal Data

# Label Encoding

convert string values to 0,1,2,...n

1. First make list of ordinal columns which are in string format

'BusinessTravel'

2. Apply label encoding

    a. create a label map for every ordinal column
    A label map is a dictionary keyed by column name; each value is itself a dictionary that maps that column's labels to integer codes.

    ex. grade column
    labelmap = {'grade' : {'A+' : 3, 'A':2,'B':1,'C' : 0}}

    b. Use replace function of pandas dataframe
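    (remember: once every string in a column has been replaced, df.replace also changes that column's data type to int; newer pandas releases deprecate this automatic downcast, so add an explicit .astype(int) if the dtype matters)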

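#### NOTE: sklearn provides a label-encoding class (LabelEncoder), but it is very limited (it assigns codes in sorted label order rather than an order you choose) and is NOT used for this in practice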

In [None]:
# Inspect the distinct BusinessTravel labels before writing the label map.
df['BusinessTravel'].unique()

array(['Travel_Rarely', 'Travel_Frequently', 'Non-Travel'], dtype=object)

In [None]:
# Column name -> {label: integer code}; the codes rise with how often the employee travels.
label_map = {
    'BusinessTravel': {
        'Non-Travel': 0,
        'Travel_Rarely': 1,
        'Travel_Frequently': 2,
    },
}

In [None]:
# Preview the label-encoded frame. The result of replace() is not assigned,
# so df itself still holds the original strings after this cell.
df.replace(label_map).head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,1,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,2,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,1,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,2,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,1,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


# One hot Encoding

1. first find the columns which are nominal

'Attrition','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime'

2. Convert them to one-hot encoding with drop_first=True (each column gets one fewer dummy column than it has distinct values)

In [None]:
# Inspect the distinct Department labels; they have no natural order, so the column is nominal.
df['Department'].unique()

array(['Sales', 'Research & Development', 'Human Resources'], dtype=object)

In [None]:
# One-hot encode the nominal columns. The BusinessTravel label map is applied
# first so the ordinal encoding is actually carried into df_ord; the earlier
# replace() cell only displays its result and leaves df unchanged.
# NOTE: relies on label_map still being the BusinessTravel map defined earlier
# in the notebook (run the cells top to bottom).
nominal_cols = ['Attrition', 'Department', 'EducationField', 'Gender',
                'JobRole', 'MaritalStatus', 'OverTime']
df_ord = pd.get_dummies(
    # astype pins the integer dtype even on pandas versions where replace()
    # no longer downcasts object columns automatically.
    df.replace(label_map).astype({'BusinessTravel': 'int64'}),
    columns=nominal_cols,
    drop_first=True,  # one fewer dummy column than distinct values per feature
)

In [None]:
# Check which dummy columns were created and which original columns were left as-is.
df_ord.columns

Index(['Age', 'BusinessTravel', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Attrition_Yes', 'Department_Research & Development', 'Department_Sales', 'EducationField_Life Sciences', 'EducationField_Marketing', 'EducationField_Medical', 'EducationField_Other', 'EducationField_Technical Degree', 'Gender_Male', 'JobRole_Human Resources', 'JobRole_Laboratory Technician', 'JobRole_Manager', 'JobRole_Manufacturing Director', 'JobRole_Research Director', 'JobRole_Research Scientist', 'JobRole_Sales Executive', 'JobRole_Sales Representative', 'Mari

In [None]:
# (rows, columns) of the encoded frame.
df_ord.shape

(1470, 48)

# Practice on DT_data.csv

In [None]:
# Load the practice data set from the notebook's working directory.
# NOTE(review): the first column comes through as 'Unnamed: 0' (see the columns
# listing below); it looks like a saved index, so index_col=0 may be wanted. Confirm.
df_dt = pd.read_csv("DT_data.csv")

In [None]:
# List the column labels of the practice data.
df_dt.columns

Index(['Unnamed: 0', 'Outlook', 'Temp', 'Humidity', 'Windy', 'Play Golf'], dtype='object')

In [None]:
# Peek at the first rows to see which columns hold strings.
df_dt.head()

Unnamed: 0.1,Unnamed: 0,Outlook,Temp,Humidity,Windy,Play Golf
0,1,Rainy,Hot,High,0,No
1,2,Rainy,Hot,High,1,No
2,3,Ovecast,Hot,High,0,Yes
3,4,Sunny,Mild,High,0,Yes
4,5,Sunny,Cool,Normal,0,Yes


# Label Encoding

In [None]:
# Label maps for the two ordered string columns of the practice data.
# This rebinds label_map, replacing the BusinessTravel map used earlier.
label_map = {
    'Temp': dict(Hot=2, Mild=1, Cool=0),
    'Humidity': dict(High=1, Normal=0),
}

In [None]:
# Apply the label map and keep the result in a new frame; df_dt is left unchanged.
df_dt_ord = df_dt.replace(label_map)

In [None]:
# Display the label-encoded frame; Temp and Humidity now hold integer codes.
df_dt_ord

Unnamed: 0.1,Unnamed: 0,Outlook,Temp,Humidity,Windy,Play Golf
0,1,Rainy,2,1,0,No
1,2,Rainy,2,1,1,No
2,3,Ovecast,2,1,0,Yes
3,4,Sunny,1,1,0,Yes
4,5,Sunny,0,0,0,Yes
5,6,Sunny,0,0,1,No
6,7,Ovecast,0,0,1,Yes
7,8,Rainy,1,1,0,No
8,9,Rainy,0,0,0,Yes
9,10,Sunny,1,0,0,Yes


# One Hot Encoding

In [None]:
# Show the columns still stored as object dtype, i.e. the ones get_dummies will convert next.
df_dt_ord.select_dtypes(include=['object'])

Unnamed: 0,Outlook,Play Golf
0,Rainy,No
1,Rainy,No
2,Ovecast,Yes
3,Sunny,Yes
4,Sunny,Yes
5,Sunny,No
6,Ovecast,Yes
7,Rainy,No
8,Rainy,Yes
9,Sunny,Yes


In [None]:
# When `columns` is not given, get_dummies one-hot encodes every column whose
# dtype is object, string, or category (here: Outlook and Play Golf).
# drop_first=True drops the first level of each, giving one fewer dummy column.
df_dt_converted = pd.get_dummies(df_dt_ord,drop_first=True)

In [None]:
# Confirm the final column set after label + one-hot encoding.
df_dt_converted.columns

Index(['Unnamed: 0', 'Temp', 'Humidity', 'Windy', 'Outlook_Rainy', 'Outlook_Sunny', 'Play Golf_Yes'], dtype='object')