In [155]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
import pickle
import joblib


In [156]:
## Load dataset
data=pd.read_csv("test.csv")
data.head()

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,52685,36,Male,13,Healthcare,8029,Excellent,High,Average,1,...,1,Mid,Large,22,No,No,No,Poor,Medium,Stayed
1,30585,35,Male,7,Education,4563,Good,High,Average,1,...,4,Entry,Medium,27,No,No,No,Good,High,Left
2,54656,50,Male,7,Education,5583,Fair,High,Average,3,...,2,Senior,Medium,76,No,No,Yes,Good,Low,Stayed
3,33442,58,Male,44,Media,5525,Fair,Very High,High,0,...,4,Entry,Medium,96,No,No,No,Poor,Low,Left
4,15667,39,Male,24,Education,4604,Good,High,Average,0,...,6,Mid,Large,45,Yes,No,No,Good,High,Stayed


In [157]:
data=data.drop(['Employee ID'],axis =1)
data

Unnamed: 0,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,Overtime,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,36,Male,13,Healthcare,8029,Excellent,High,Average,1,Yes,...,1,Mid,Large,22,No,No,No,Poor,Medium,Stayed
1,35,Male,7,Education,4563,Good,High,Average,1,Yes,...,4,Entry,Medium,27,No,No,No,Good,High,Left
2,50,Male,7,Education,5583,Fair,High,Average,3,Yes,...,2,Senior,Medium,76,No,No,Yes,Good,Low,Stayed
3,58,Male,44,Media,5525,Fair,Very High,High,0,Yes,...,4,Entry,Medium,96,No,No,No,Poor,Low,Left
4,39,Male,24,Education,4604,Good,High,Average,0,Yes,...,6,Mid,Large,45,Yes,No,No,Good,High,Stayed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14895,56,Female,42,Healthcare,7830,Poor,Medium,Average,0,Yes,...,0,Senior,Medium,60,No,No,No,Poor,Medium,Stayed
14896,30,Female,15,Education,3856,Good,Medium,Average,2,Yes,...,0,Entry,Medium,20,No,No,No,Good,Medium,Left
14897,52,Male,5,Education,5654,Good,Very High,Below Average,0,No,...,4,Mid,Small,7,No,No,No,Good,High,Left
14898,18,Male,4,Education,5276,Fair,High,Average,0,No,...,3,Mid,Large,5,No,No,No,Poor,High,Stayed


In [158]:

# List of columns to label encode
columns_to_encode = ['Gender', 'Overtime','Remote Work','Leadership Opportunities','Innovation Opportunities','Attrition']

# Apply LabelEncoder to each column
for col in columns_to_encode:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    joblib.dump(le, f'label_encoder_{col.lower().replace(" ", "_")}.pkl') 
    

In [159]:
data

Unnamed: 0,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,Overtime,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,36,1,13,Healthcare,8029,Excellent,High,Average,1,1,...,1,Mid,Large,22,0,0,0,Poor,Medium,1
1,35,1,7,Education,4563,Good,High,Average,1,1,...,4,Entry,Medium,27,0,0,0,Good,High,0
2,50,1,7,Education,5583,Fair,High,Average,3,1,...,2,Senior,Medium,76,0,0,1,Good,Low,1
3,58,1,44,Media,5525,Fair,Very High,High,0,1,...,4,Entry,Medium,96,0,0,0,Poor,Low,0
4,39,1,24,Education,4604,Good,High,Average,0,1,...,6,Mid,Large,45,1,0,0,Good,High,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14895,56,0,42,Healthcare,7830,Poor,Medium,Average,0,1,...,0,Senior,Medium,60,0,0,0,Poor,Medium,1
14896,30,0,15,Education,3856,Good,Medium,Average,2,1,...,0,Entry,Medium,20,0,0,0,Good,Medium,0
14897,52,1,5,Education,5654,Good,Very High,Below Average,0,0,...,4,Mid,Small,7,0,0,0,Good,High,0
14898,18,1,4,Education,5276,Fair,High,Average,0,0,...,3,Mid,Large,5,0,0,0,Poor,High,1


In [160]:


# Step 1: Define categorical columns to encode
columns_to_encode = [
    'Company Size', 'Job Role', 'Work-Life Balance', 'Job Satisfaction',
    'Performance Rating', 'Education Level', 'Marital Status',
    'Job Level', 'Company Reputation', 'Employee Recognition']


 # Step 2: OneHotEncoder with sparse=False to get a dense array
encoder = OneHotEncoder(sparse_output=False , handle_unknown='ignore')

# Step 3: Fit and transform the data
encoded_array = encoder.fit_transform(data[columns_to_encode])

# Step 4: Create DataFrame with encoded column names
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(columns_to_encode))

# Step 5: Reset index to align DataFrames before concatenation
encoded_df.index = data.index  # Align indices

# Step 6: Drop original columns and combine with encoded
data_encoded = pd.concat([data.drop(columns=columns_to_encode), encoded_df], axis=1)

## Step 7: Save the emcoders 
cols_str = "_".join([col.lower().replace(" ", "_") for col in columns_to_encode])
filename = f"onehot_encoder_{cols_str}.pkl"
joblib.dump(encoder, filename)

# Step 8: Show result
print("✅ Final Encoded DataFrame:")
print(data_encoded.head())

✅ Final Encoded DataFrame:
   Age  Gender  Years at Company  Monthly Income  Number of Promotions  \
0   36       1                13            8029                     1   
1   35       1                 7            4563                     1   
2   50       1                 7            5583                     3   
3   58       1                44            5525                     0   
4   39       1                24            4604                     0   

   Overtime  Distance from Home  Number of Dependents  Company Tenure  \
0         1                  83                     1              22   
1         1                  55                     4              27   
2         1                  14                     2              76   
3         1                  43                     4              96   
4         1                  47                     6              45   

   Remote Work  ...  Job Level_Mid  Job Level_Senior  \
0            0  ...            1.

In [161]:
data_encoded


Unnamed: 0,Age,Gender,Years at Company,Monthly Income,Number of Promotions,Overtime,Distance from Home,Number of Dependents,Company Tenure,Remote Work,...,Job Level_Mid,Job Level_Senior,Company Reputation_Excellent,Company Reputation_Fair,Company Reputation_Good,Company Reputation_Poor,Employee Recognition_High,Employee Recognition_Low,Employee Recognition_Medium,Employee Recognition_Very High
0,36,1,13,8029,1,1,83,1,22,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,35,1,7,4563,1,1,55,4,27,0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,50,1,7,5583,3,1,14,2,76,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,58,1,44,5525,0,1,43,4,96,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,39,1,24,4604,0,1,47,6,45,1,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14895,56,0,42,7830,0,1,40,0,60,0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
14896,30,0,15,3856,2,1,45,0,20,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
14897,52,1,5,5654,0,0,4,4,7,0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
14898,18,1,4,5276,0,0,13,3,5,0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [162]:
## Divide the data in dependent and independent features
X= data_encoded.drop('Attrition',axis =1)
y=data_encoded['Attrition']

## Split data in training and testing sets

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

##Scale these features
scaler=StandardScaler()
X_Train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [163]:
 data_encoded

Unnamed: 0,Age,Gender,Years at Company,Monthly Income,Number of Promotions,Overtime,Distance from Home,Number of Dependents,Company Tenure,Remote Work,...,Job Level_Mid,Job Level_Senior,Company Reputation_Excellent,Company Reputation_Fair,Company Reputation_Good,Company Reputation_Poor,Employee Recognition_High,Employee Recognition_Low,Employee Recognition_Medium,Employee Recognition_Very High
0,36,1,13,8029,1,1,83,1,22,0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,35,1,7,4563,1,1,55,4,27,0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,50,1,7,5583,3,1,14,2,76,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,58,1,44,5525,0,1,43,4,96,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,39,1,24,4604,0,1,47,6,45,1,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14895,56,0,42,7830,0,1,40,0,60,0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
14896,30,0,15,3856,2,1,45,0,20,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
14897,52,1,5,5654,0,0,4,4,7,0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
14898,18,1,4,5276,0,0,13,3,5,0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [164]:
##Save the emcoders and scaler
with open('scaler.pkl','wb') as file:
          pickle.dump(scaler,file)


ANN Implementation

In [165]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
import datetime

In [166]:
## Build ANN model
model=Sequential([Dense(64,activation='relu',input_shape=(X_train.shape[1],)), ##HL! connected with input layer
                  Dense(32,activation='relu'), ##HL@
                  Dense(1,activation='sigmoid')]) ##Output layer

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [167]:
model.summary()

In [168]:
import tensorflow
opt=tensorflow.keras.optimizers.Adam(learning_rate=0.5)
loss=tensorflow.keras.losses.BinaryCrossentropy()


In [169]:
model.compile(optimizer=opt,loss='binary_crossentropy',metrics=['accuracy'])

In [170]:
##Setup Tensorboard
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard

log_dir='logs/fit/' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorflow_callback=TensorBoard(log_dir=log_dir,histogram_freq=1)

In [171]:
## Setup Early stoppping

early_stopping_callback=EarlyStopping(monitor='val_loss',patience=10,restore_best_weights=True)

In [172]:
history=model.fit(
    X_Train,y_train,validation_data=(X_test,y_test),epochs=100,
    callbacks=[tensorflow_callback,early_stopping_callback]
)

Epoch 1/100
[1m373/373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5067 - loss: 4.8403 - val_accuracy: 0.5332 - val_loss: 0.7325
Epoch 2/100
[1m373/373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5065 - loss: 0.7048 - val_accuracy: 0.5332 - val_loss: 0.6931
Epoch 3/100
[1m373/373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5023 - loss: 0.6998 - val_accuracy: 0.5332 - val_loss: 0.6980
Epoch 4/100
[1m373/373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5049 - loss: 0.7010 - val_accuracy: 0.5332 - val_loss: 0.6929
Epoch 5/100
[1m373/373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5078 - loss: 0.7014 - val_accuracy: 0.4664 - val_loss: 0.7326
Epoch 6/100
[1m373/373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5144 - loss: 0.7015 - val_accuracy: 0.5332 - val_loss: 0.6936
Epoch 7/100
[1m373/37

In [173]:
model.save('model.h5')
#model.save("model.keras", save_format="keras_v3")



In [174]:
## Load Tensorboard Extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [175]:
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6006 (pid 11792), started 20:04:21 ago. (Use '!kill 11792' to kill it.)

In [176]:
## Load the pickle file

