## --- 0.0. Library imports ---

In [1]:
# Silence all warnings. This runs before the heavier imports below so that
# warnings raised while importing them are suppressed as well.
import warnings
warnings.filterwarnings("ignore")

# set notebook width to 100%
# `display` and `HTML` are imported from the public `IPython.display` module;
# the old `IPython.core.display` location is deprecated for these names.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# dataframe manipulation
import numpy as np
import pandas as pd
# Show every row and every column when a dataframe is displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# train test split
from sklearn.model_selection import train_test_split

# Path
import os
from pathlib import Path

# Move the working directory one level up; the relative 'data/...' paths used
# later in the notebook are resolved from there.
# The target directory is computed only once per kernel session, so re-running
# this cell does not climb one directory higher on every execution.
if '_project_root' not in globals():
    _project_root = Path(os.getcwd()).parent
os.chdir(_project_root)

## --- 0.1. Parameters ---

### ------ 0.1.1. Independent parameters ------

In [2]:
# Raw input is read from data/raw_data/<file_name>.csv; prepared outputs are
# written under data/prepared_data/<uc_name>/.
file_name, uc_name = 'Employee', 'employees'

# Column that is split off from the features as the target.
target_column = 'LeaveOrNot'

# Seed used for NumPy's RNG and for the train/test split, and the fraction of
# rows held out as the test set.
seed, test_size = 1999, 0.3

# 1. Data Loading

In [3]:
# Location of the raw CSV, relative to the working directory set in the setup cell.
raw_csv_path = f'data/raw_data/{file_name}.csv'

# low_memory=False makes pandas read the file in one pass instead of in
# chunks, which avoids mixed-dtype column inference.
df = pd.read_csv(raw_csv_path, low_memory=False)

# Quick sanity check: first rows and overall dimensions.
display(df.head())
print(f"Shape of the data: {df.shape}.")

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Masters,2018,New Delhi,3,28,Female,No,2,1
1,Bachelors,2018,New Delhi,3,36,Female,Yes,2,1
2,Bachelors,2017,Bangalore,3,28,Male,No,1,0
3,Bachelors,2014,Bangalore,3,33,Male,No,1,1
4,Bachelors,2017,Pune,2,24,Male,No,2,1


Shape of the data: (3200, 9).


# 2. Data Transformation

In [4]:
# Re-seed NumPy's global RNG inside this cell so the synthetic columns below
# come out identical on every execution of the cell.
np.random.seed(seed)

# Row count is unaffected by the column drop, so compute it once up front.
n_rows = len(df)

df = (df
      # Non-inplace drop: the frame is rebuilt rather than mutated, and
      # errors='ignore' lets the cell be re-run after the columns are already
      # gone (the previous inplace drop raised KeyError on a second run).
      .drop(columns=['Education', 'City'],
            errors='ignore')
      # numerical noise features (uniform on [0, 1) and [0, 5))
      .assign(Irrelevant_Predictor1=np.random.uniform(0, 1, size=n_rows))
      .assign(Irrelevant_Predictor2=np.random.uniform(0, 5, size=n_rows))
      # categorical noise feature (random choice between two labels)
      .assign(Irrelevant_Predictor3=np.random.choice(['a1', 'a2'], size=n_rows))
     )

display(df.head())
print(f"Shape of the data: {df.shape}.")

Unnamed: 0,JoiningYear,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot,Irrelevant_Predictor1,Irrelevant_Predictor2,Irrelevant_Predictor3
0,2018,3,28,Female,No,2,1,0.82452,4.034387,a1
1,2018,3,36,Female,Yes,2,1,0.984627,3.445169,a2
2,2017,3,28,Male,No,1,0,0.893145,3.348348,a2
3,2014,3,33,Male,No,1,1,0.319647,3.399166,a2
4,2017,2,24,Male,No,2,1,0.593257,4.171233,a2


Shape of the data: (3200, 10).


# 3. Train/Test Split

In [5]:
# Target is copied out of the frame; features are every remaining column.
y = df[target_column].copy()
X = df.drop(columns=target_column)

# Stratified split on the target, seeded for reproducibility. Each resulting
# piece gets a fresh 0..n-1 index so the shuffled original row labels are
# discarded.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X,
                                 y,
                                 test_size=test_size,
                                 random_state=seed,
                                 stratify=y)
)

print(f"Shape -> X_train: {X_train.shape}, y_train: {y_train.shape}.")
print(f"Shape -> X_test: {X_test.shape}, y_test: {y_test.shape}.")

Shape -> X_train: (2240, 9), y_train: (2240,).
Shape -> X_test: (960, 9), y_test: (960,).


# 4. Save Prepared Data

In [6]:
# Destination folder for this use case. Create it (and any missing parents)
# first, since to_csv does not create directories and would otherwise fail
# when the folder does not exist yet.
output_dir = Path('data/prepared_data') / uc_name
output_dir.mkdir(parents=True, exist_ok=True)

# File stem -> object to persist. Insertion order fixes the order in which
# the files are written and the confirmation messages are printed.
prepared_splits = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
}

for split_name, split_data in prepared_splits.items():
    # index=False: the index was reset to 0..n-1 after the split and carries
    # no information, so it is not written to disk.
    split_data.to_csv(output_dir / f'{split_name}.csv',
                      index=False)
    print(f"{split_name}.csv is successfully saved!")

X_train.csv is successfully saved!
y_train.csv is successfully saved!
X_test.csv is successfully saved!
y_test.csv is successfully saved!
