In [47]:
import pandas as pd
import numpy as np
import yaml
import os
import tqdm
import src.util as util

# 1. Load config data

In [24]:
# Load the project configuration through the project's util helper.
# Later cells index the result by 'raw_dataset_path', 'selected_columns' and
# 'raw_dataset_collected_path', so it must behave like a mapping with those keys.
config_data = util.load_config()

# 2. Load raw data

In [28]:
def read_raw_data(config: dict) -> pd.DataFrame:
    """Load the raw CSV and keep only the configured columns.

    Parameters
    ----------
    config : dict
        Must provide 'raw_dataset_path' (location handed to ``pd.read_csv``)
        and 'selected_columns' (column selector applied to the loaded frame).

    Returns
    -------
    pd.DataFrame
        The selected columns; when 'selected_columns' is a list of labels the
        result follows the order of that list.

    Raises
    ------
    KeyError
        If either config key is missing, or a selected column is not present
        in the CSV.
    """
    # Both config entries are looked up before any file access.
    csv_path, wanted_columns = config['raw_dataset_path'], config['selected_columns']
    full_frame = pd.read_csv(csv_path)
    return full_frame[wanted_columns]

In [29]:
raw_dataset = read_raw_data(config_data)

# Preview 10 random rows. A fixed random_state keeps the preview identical
# across "Restart & Run All", so the rendered output stays reproducible.
raw_dataset.sample(10, random_state=42)

Unnamed: 0,Attrition,Department,JobRole,JobLevel,Gender,Age,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,MonthlyIncome,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,PerformanceRating
227,No,Sales,Sales Executive,3,Female,29,11,10,4,1,7918,2,4,3,3
302,No,Research & Development,Healthcare Representative,2,Male,28,8,3,0,7,5661,2,1,3,3
1179,No,Research & Development,Research Scientist,2,Female,34,11,8,7,9,5433,4,2,3,3
587,No,Research & Development,Laboratory Technician,2,Female,52,5,2,1,4,3149,4,3,3,4
1008,No,Research & Development,Research Director,4,Female,54,20,7,12,7,17328,4,4,2,3
1336,No,Research & Development,Research Scientist,1,Male,55,5,2,0,4,2662,2,4,4,4
1427,No,Research & Development,Laboratory Technician,1,Male,40,1,0,0,0,2406,1,4,2,3
942,No,Research & Development,Healthcare Representative,3,Female,36,7,7,1,7,7094,4,3,3,3
861,No,Sales,Manager,4,Female,46,26,15,15,9,17048,3,1,3,4
1122,No,Research & Development,Laboratory Technician,1,Male,29,10,9,1,5,4723,2,1,3,3


In [30]:
# Save raw dataset to file.
# The column-filtered DataFrame is handed to the project helper together with
# the path configured under "raw_dataset_collected_path".
# NOTE(review): util.pickle_dump is not shown here; presumably it pickles the
# object to that path. Confirm its behaviour in src/util.py.
util.pickle_dump(raw_dataset, config_data["raw_dataset_collected_path"])

# 3. Data Definition

To simplify the project, we have selected several attributes to be included:

Target:
1. Attrition: This is a binary variable indicating whether the employee has left the company or not. This is the primary target variable for most analyses using this dataset.


Employee’s basic information:
1. Department: This categorical variable represents the department in which an employee works, such as 'Sales', 'Research & Development', or 'Human Resources'.
2. JobRole: This categorical variable indicates the role of the employee within the company.
3. JobLevel: This is an ordinal variable that represents the level of job held by the employee, where a higher number indicates a more senior role.
4. Gender: This categorical variable indicates the gender of the employee.
5. Age: This is the age of the employee, measured in years.


Employee’s work experience and income:
1. YearsAtCompany: This is the number of years an employee has worked at the current company.
2. YearsInCurrentRole: This is the number of years an employee has been in their current role within the company.
3. YearsSinceLastPromotion: This is the number of years since the employee's last promotion.
4. YearsWithCurrManager: This is the number of years an employee has been with their current manager.
5. MonthlyIncome: This is the monthly income of the employee.


Employee’s satisfaction and performance rating:
1. EnvironmentSatisfaction: This ordinal variable reflects the employee's satisfaction with the work environment, where 1 = 'Low', 2 = 'Medium', 3 = 'High', and 4 = 'Very High'.
2. JobSatisfaction: This ordinal variable reflects the employee's satisfaction with their job, where 1 = 'Low', 2 = 'Medium', 3 = 'High', and 4 = 'Very High'.
3. WorkLifeBalance: This ordinal variable reflects the employee's work-life balance, where 1 = 'Bad', 2 = 'Good', 3 = 'Better', and 4 = 'Best'.
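4. PerformanceRating: This ordinal variable reflects the employee's performance rating, where 1 = 'Low', 2 = 'Good', 3 = 'Excellent', and 4 = 'Outstanding'. Note that only ratings 3 and 4 occur in this dataset (see the range check in the Data Validation section).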


# 4. Data Validation

In [44]:
# Check the dimensions of the data
print(f'Rows : {raw_dataset.shape[0]}')
print(f'Cols : {raw_dataset.shape[1]}')

# Check the data type of each variable.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" after the report.
print('\nColumns types:')
raw_dataset.info()

Rows : 1470
Cols : 15

Columns types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Attrition                1470 non-null   object
 1   Department               1470 non-null   object
 2   JobRole                  1470 non-null   object
 3   JobLevel                 1470 non-null   int64 
 4   Gender                   1470 non-null   object
 5   Age                      1470 non-null   int64 
 6   YearsAtCompany           1470 non-null   int64 
 7   YearsInCurrentRole       1470 non-null   int64 
 8   YearsSinceLastPromotion  1470 non-null   int64 
 9   YearsWithCurrManager     1470 non-null   int64 
 10  MonthlyIncome            1470 non-null   int64 
 11  EnvironmentSatisfaction  1470 non-null   int64 
 12  JobSatisfaction          1470 non-null   int64 
 13  WorkLifeBalance          1470 non-null   int64 
 14  PerformanceRating        1470 non-null   int64 

None

In [32]:
# Check the value range of each numeric variable.
# DataFrame.describe() summarises only numeric columns by default, so the
# object-typed columns (Attrition, Department, JobRole, Gender) are not
# covered by this table.
raw_dataset.describe()

Unnamed: 0,JobLevel,Age,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,MonthlyIncome,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,PerformanceRating
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,2.063946,36.92381,7.008163,4.229252,2.187755,4.123129,6502.931293,2.721769,2.728571,2.761224,3.153741
std,1.10694,9.135373,6.126525,3.623137,3.22243,3.568136,4707.956783,1.093082,1.102846,0.706476,0.360824
min,1.0,18.0,0.0,0.0,0.0,0.0,1009.0,1.0,1.0,1.0,3.0
25%,1.0,30.0,3.0,2.0,0.0,2.0,2911.0,2.0,2.0,2.0,3.0
50%,2.0,36.0,5.0,3.0,1.0,3.0,4919.0,3.0,3.0,3.0,3.0
75%,3.0,43.0,9.0,7.0,3.0,7.0,8379.0,4.0,4.0,3.0,3.0
max,5.0,60.0,40.0,18.0,15.0,17.0,19999.0,4.0,4.0,4.0,4.0


In [50]:
# Predictor columns grouped by the pandas dtype that raw_dataset.info() reports
# for them. The target column 'Attrition' is not listed in either group.
object_columns = ['Department', 'JobRole', 'Gender']

int64_columns = [
    'JobLevel', 'Age',
    'YearsAtCompany', 'YearsInCurrentRole',
    'YearsSinceLastPromotion', 'YearsWithCurrManager',
    'MonthlyIncome',
    'EnvironmentSatisfaction', 'JobSatisfaction',
    'WorkLifeBalance', 'PerformanceRating',
]

# Bundle both groups under the keys that appear in the YAML dump below.
columns_config = dict(object_columns=object_columns, int64_columns=int64_columns)

print(yaml.dump(columns_config))

int64_columns:
- JobLevel
- Age
- YearsAtCompany
- YearsInCurrentRole
- YearsSinceLastPromotion
- YearsWithCurrManager
- MonthlyIncome
- EnvironmentSatisfaction
- JobSatisfaction
- WorkLifeBalance
- PerformanceRating
object_columns:
- Department
- JobRole
- Gender



In [65]:
# Map every categorical column to the distinct values observed in the raw data.
object_column_range = dict()

for col in object_columns:
    # list(...) turns the array returned by unique() into a plain Python list
    # before it is handed to yaml.dump.
    observed_values = raw_dataset[col].unique()
    object_column_range[col] = list(observed_values)

print(yaml.dump(object_column_range))

Department:
- Sales
- Research & Development
- Human Resources
Gender:
- Female
- Male
JobRole:
- Sales Executive
- Research Scientist
- Laboratory Technician
- Manufacturing Director
- Healthcare Representative
- Manager
- Sales Representative
- Research Director
- Human Resources



In [87]:
# Map every integer column to the smallest and largest value observed in the raw data.
int64_column_range = dict()

for col in int64_columns:
    # NOTE(review): the builtin min()/max() are kept on purpose; presumably they
    # yield plain Python ints that yaml.dump renders as bare numbers. Confirm
    # before switching to Series.min()/Series.max().
    int64_column_range[col] = dict(
        min=min(raw_dataset[col]),
        max=max(raw_dataset[col]),
    )

print(yaml.dump(int64_column_range))

Age:
  max: 60
  min: 18
EnvironmentSatisfaction:
  max: 4
  min: 1
JobLevel:
  max: 5
  min: 1
JobSatisfaction:
  max: 4
  min: 1
MonthlyIncome:
  max: 19999
  min: 1009
PerformanceRating:
  max: 4
  min: 3
WorkLifeBalance:
  max: 4
  min: 1
YearsAtCompany:
  max: 40
  min: 0
YearsInCurrentRole:
  max: 18
  min: 0
YearsSinceLastPromotion:
  max: 15
  min: 0
YearsWithCurrManager:
  max: 17
  min: 0

