Data Encoding - Conversion of Categorical Data Into Numerical Data

In [27]:
# !pip install scikit-learn

In [1]:
# Put the sibling ../libs directory on the import path so modules stored there
# can be imported by later cells (presumably where InputOptimization lives — confirm).
import sys, os
lib_path = os.path.abspath('../libs')
sys.path.append(lib_path)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from InputOptimization import InputOptimization

In [3]:
# Project-local helper object; used below for its validate_types() column summary.
io = InputOptimization()

In [4]:
# Load the input CSV for this notebook and preview the first rows.
df = pd.read_csv('../00_Data/04_NoOutliers.csv')
df.head()

Unnamed: 0,Gender,Generation,Term Sub Reason,Employee Grade,Location,Career Bucket,Age Bucket,Manager ID,Talent,Department,active_months
0,Female,Millennials,Career Growth,Junior,Bengaluru,Below 30,26 To 30,102292,,AC,59.0
1,Female,Millennials,Career Growth,Mid,Bengaluru,31 to 35,26 To 30,100411,,AD,4.0
2,Male,Millennials,Career Growth,Junior,Bengaluru,Below 30,26 To 30,102304,,AG,26.0
3,Male,Gen X,Career Growth,Senior,Bengaluru,Above 35,36 T0 40,102324,,AI,126.0
4,Male,Millennials,Personal Reason,Mid,Bengaluru,31 to 35,31 To 35,100619,,AI,126.0


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(552, 11)

In [6]:
# Per-column summary from the project helper; the output lists a data type and
# a unique-value count for each column (exact semantics live in InputOptimization — confirm).
io.validate_types(df)

Unnamed: 0,Column,Data Type,Unique Values
7,Manager ID,numerical,207
10,active_months,numerical,130
9,Department,categorical,58
2,Term Sub Reason,categorical,16
6,Age Bucket,categorical,7
4,Location,categorical,6
1,Generation,categorical,4
8,Talent,categorical,4
3,Employee Grade,categorical,3
5,Career Bucket,categorical,3


Encoding Binary Columns

In [7]:
# Confirm Gender holds exactly two distinct values before binary encoding it.
df['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [8]:
def binary_encoding(df:pd.DataFrame, column:str, true_value:str):
    """Encode a two-valued column as integer 0/1 flags.

    Every entry equal to ``true_value`` becomes 1; every other entry,
    including missing values, becomes 0.

    The comparison is done in one vectorised pass instead of replacing one
    unique value at a time. Sequential replacement goes wrong when the column
    already contains 0 or 1, because the output of an earlier replacement is
    picked up by a later one, and assigning through ``df.loc[:, column]`` can
    leave the column with ``object`` dtype.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the column. It is modified in place.
    column : str
        Name of the column to encode.
    true_value : str
        The value that maps to 1.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with ``column`` replaced by an int64 0/1 column.
    """
    # A missing value never compares equal to true_value, so it lands in the
    # 0 bucket, just like every other non-matching value.
    df[column] = (df[column] == true_value).astype("int64")

    return df

In [None]:
# Gender: 'Female' -> 1, every other value -> 0.
df = binary_encoding(df, 'Gender', 'Female')
df.head()

In [10]:
# Inspect Talent's distinct values; the output shows it contains NaN as well as letters.
df['Talent'].unique()

array([nan, 'C', 'B', 'D', 'A'], dtype=object)

Columns with More Than Two Unique Categorical Values

1. Label Encoding

In [11]:
# Columns that are passed to label_encoding further down.
label_cols = ['Term Sub Reason', 'Location', 'Talent', 'Department']

In [12]:
# Label encoding: for columns whose categories have no inherent order
def label_encoding(df:pd.DataFrame, columns:list):
    """Replace each listed column with 1-based integer labels.

    One LabelEncoder is re-fitted on every column in turn, so the labels of
    one column are independent of the others. ``df`` is modified in place and
    the same object is returned.
    """

    encoder = LabelEncoder()
    for name in columns:
        # fit_transform yields codes starting at 0; shift them to start at 1.
        codes = encoder.fit_transform(df[name])
        df[name] = pd.to_numeric(codes + 1)

    return df

In [13]:
# Distinct Term Sub Reason values before label encoding.
df['Term Sub Reason'].unique()

array(['Career Growth', 'Personal Reason', 'Relocated to Other Branch',
       'Salary Hike', 'Higher Studies', 'Relocation to Native',
       'Internal Politics', 'Company Rules', 'Health Issues',
       'Work Life Balance', 'Marriage', 'Not Suitable Job for the Skills',
       'Miscellaneous', 'Family Problems', 'Not Interested in JOB',
       'Work Stress'], dtype=object)

In [14]:
# Label-encode every column named in label_cols and preview the result.
df = label_encoding(df, label_cols)
df.head()

Unnamed: 0,Gender,Generation,Term Sub Reason,Employee Grade,Location,Career Bucket,Age Bucket,Manager ID,Talent,Department,active_months
0,1,Millennials,1,Junior,1,Below 30,26 To 30,102292,5,2,59.0
1,1,Millennials,1,Mid,1,31 to 35,26 To 30,100411,5,3,4.0
2,0,Millennials,1,Junior,1,Below 30,26 To 30,102304,5,5,26.0
3,0,Gen X,1,Senior,1,Above 35,36 T0 40,102324,5,7,126.0
4,0,Millennials,11,Mid,1,31 to 35,31 To 35,100619,5,7,126.0


In [15]:
# Same column after encoding: the values are now integer labels.
df['Term Sub Reason'].unique()

array([ 1, 11, 12, 14,  5, 13,  6,  2,  4, 15,  7, 10,  8,  3,  9, 16])

2. Ordinal Encoding - For Ordered Values

In [16]:
# Preview before ordinal encoding: Generation, Employee Grade, Career Bucket
# and Age Bucket are still text at this point.
df.head()

Unnamed: 0,Gender,Generation,Term Sub Reason,Employee Grade,Location,Career Bucket,Age Bucket,Manager ID,Talent,Department,active_months
0,1,Millennials,1,Junior,1,Below 30,26 To 30,102292,5,2,59.0
1,1,Millennials,1,Mid,1,31 to 35,26 To 30,100411,5,3,4.0
2,0,Millennials,1,Junior,1,Below 30,26 To 30,102304,5,5,26.0
3,0,Gen X,1,Senior,1,Above 35,36 T0 40,102324,5,7,126.0
4,0,Millennials,11,Mid,1,31 to 35,31 To 35,100619,5,7,126.0


In [17]:
# Distinct Generation values, needed to write the ordered list for ordinal_encoding.
df['Generation'].unique()

array(['Millennials', 'Gen X', 'Baby Boomers', 'Gen Z'], dtype=object)

In [18]:
def ordinal_encoding(df:pd.DataFrame, column:str, order_list:list):
    """Encode an ordered categorical column as 1-based integer ranks.

    The first entry of ``order_list`` maps to 1, the second to 2, and so on.
    All values are translated in a single pass through a lookup table, so a
    rank written for one category can never be mistaken for another category
    and replaced again (which happens with one-at-a-time replacement when the
    column already contains small integers).

    Parameters
    ----------
    df : pd.DataFrame
        Source frame. It is not modified; a copy is encoded and returned.
    column : str
        Name of the column to encode.
    order_list : list
        Category values in ascending order. If a value is listed more than
        once, its first position is used.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with ``column`` converted to numeric ranks.

    Raises
    ------
    ValueError
        Raised by ``pd.to_numeric`` when the column still contains a
        non-numeric value that is not in ``order_list``.
    """

    data = df.copy()

    # Build the category -> rank table; setdefault keeps the first rank for duplicates.
    rank_of = {}
    for rank, item in enumerate(order_list, start=1):
        rank_of.setdefault(item, rank)

    # Values without an entry (missing values, already-numeric values) pass
    # through unchanged so that to_numeric decides whether they are acceptable.
    data[column] = data[column].map(lambda value: rank_of.get(value, value))

    data[column] = pd.to_numeric(data[column])
    return data

In [None]:
# Generations listed from youngest to oldest, so Gen Z -> 1 ... Baby Boomers -> 4.
gen_list = ['Gen Z', 'Millennials', 'Gen X', 'Baby Boomers']
df = ordinal_encoding(df, 'Generation', gen_list)
df.head()

In [20]:
# Distinct Employee Grade values, needed to write the ordered list for ordinal_encoding.
df['Employee Grade'].unique()

array(['Junior', 'Mid', 'Senior'], dtype=object)

In [21]:
# Employee Grade in ascending seniority: Junior -> 1, Mid -> 2, Senior -> 3.
# Named for what it holds instead of reusing the generation list's name.
grade_order = ['Junior', 'Mid', 'Senior']
df = ordinal_encoding(df, 'Employee Grade', grade_order)
df.head()

  data[column] = data[column].replace(item, count)


Unnamed: 0,Gender,Generation,Term Sub Reason,Employee Grade,Location,Career Bucket,Age Bucket,Manager ID,Talent,Department,active_months
0,1,2,1,1,1,Below 30,26 To 30,102292,5,2,59.0
1,1,2,1,2,1,31 to 35,26 To 30,100411,5,3,4.0
2,0,2,1,1,1,Below 30,26 To 30,102304,5,5,26.0
3,0,3,1,3,1,Above 35,36 T0 40,102324,5,7,126.0
4,0,2,11,2,1,31 to 35,31 To 35,100619,5,7,126.0


In [22]:
# Distinct Career Bucket values, needed to write the ordered list for ordinal_encoding.
df['Career Bucket'].unique()

array(['Below 30', '31 to 35', 'Above 35'], dtype=object)

In [None]:
# Career Bucket in ascending order: Below 30 -> 1, 31 to 35 -> 2, Above 35 -> 3.
# Named for what it holds instead of reusing the generation list's name.
career_order = ['Below 30', '31 to 35', 'Above 35']
df = ordinal_encoding(df, 'Career Bucket', career_order)
df.head()

In [24]:
# Distinct Age Bucket values, needed to write the ordered list for ordinal_encoding.
df['Age Bucket'].unique()

array(['26 To 30', '36 T0 40', '31 To 35', '21 To 25', '41 T0 45',
       '46 T0 50', '51 T0 55'], dtype=object)

In [None]:
# Age buckets in ascending order. The strings must match the data exactly,
# including the 'T0' spelling (digit zero) that the data uses from '36 T0 40' upward.
order_list = ['21 To 25', '26 To 30', '31 To 35', '36 T0 40',  '41 T0 45', '46 T0 50', '51 T0 55']
df = ordinal_encoding(df, 'Age Bucket', order_list)
df.head()

In [28]:
# Write the encoded frame to disk; index=False keeps the row index out of the CSV.
df.to_csv('../00_Data/05_Encoded.csv', index=False)