In [63]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle
import numpy as np

In [64]:
# Load the dataset

data  = pd.read_csv("salary prediction.csv")

data.head()

Unnamed: 0,FIRST NAME,LAST NAME,SEX,DOJ,CURRENT DATE,DESIGNATION,AGE,SALARY,UNIT,LEAVES USED,LEAVES REMAINING,RATINGS,PAST EXP
0,TOMASA,ARMEN,F,5-18-2014,01-07-2016,Analyst,21.0,44570,Finance,24.0,6.0,2.0,0
1,ANNIE,,F,,01-07-2016,Associate,,89207,Web,,13.0,,7
2,OLIVE,ANCY,F,7-28-2014,01-07-2016,Analyst,21.0,40955,Finance,23.0,7.0,3.0,0
3,CHERRY,AQUILAR,F,04-03-2013,01-07-2016,Analyst,22.0,45550,IT,22.0,8.0,3.0,0
4,LEON,ABOULAHOUD,M,11-20-2014,01-07-2016,Analyst,,43161,Operations,27.0,3.0,,3


In [65]:
# ## Preprocess the data 
# ## Drop irrelevant columns

data = data.drop(['FIRST NAME','LAST NAME','DOJ', 'CURRENT DATE','LEAVES USED','LEAVES REMAINING' ],axis=1)




choices = data['RATINGS'].dropna().unique()

data['RATINGS'] = data['RATINGS'].apply(
    lambda x: float(np.random.choice(choices)) if pd.isnull(x) else x
)


choices_age = data['AGE'].dropna().unique()

data['AGE'] = data['AGE'].apply(
    lambda x: float(np.random.choice(choices_age)) if pd.isnull(x) else x
)



data


Unnamed: 0,SEX,DESIGNATION,AGE,SALARY,UNIT,RATINGS,PAST EXP
0,F,Analyst,21.0,44570,Finance,2.0,0
1,F,Associate,45.0,89207,Web,3.0,7
2,F,Analyst,21.0,40955,Finance,3.0,0
3,F,Analyst,22.0,45550,IT,3.0,0
4,M,Analyst,29.0,43161,Operations,3.0,3
...,...,...,...,...,...,...,...
2634,F,Senior Manager,36.0,185977,Management,5.0,10
2635,F,Analyst,23.0,45758,IT,2.0,0
2636,F,Analyst,21.0,47315,Web,5.0,0
2637,F,Analyst,24.0,45172,Web,3.0,1


In [66]:
## Encode categorical variables

label_encoder_sex = LabelEncoder()
data['SEX'] = label_encoder_sex.fit_transform(data['SEX'])

data

Unnamed: 0,SEX,DESIGNATION,AGE,SALARY,UNIT,RATINGS,PAST EXP
0,0,Analyst,21.0,44570,Finance,2.0,0
1,0,Associate,45.0,89207,Web,3.0,7
2,0,Analyst,21.0,40955,Finance,3.0,0
3,0,Analyst,22.0,45550,IT,3.0,0
4,1,Analyst,29.0,43161,Operations,3.0,3
...,...,...,...,...,...,...,...
2634,0,Senior Manager,36.0,185977,Management,5.0,10
2635,0,Analyst,23.0,45758,IT,2.0,0
2636,0,Analyst,21.0,47315,Web,5.0,0
2637,0,Analyst,24.0,45172,Web,3.0,1


In [67]:
## Onehot encode Geography

from sklearn.preprocessing import OneHotEncoder

onehot_encoder_des = OneHotEncoder()

des_encoder = onehot_encoder_des.fit_transform(data[['DESIGNATION']]).toarray()

des_encoder

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.]])

In [68]:
onehot_encoder_des.get_feature_names_out(['DESIGNATION'])

array(['DESIGNATION_Analyst', 'DESIGNATION_Associate',
       'DESIGNATION_Director', 'DESIGNATION_Manager',
       'DESIGNATION_Senior Analyst', 'DESIGNATION_Senior Manager'],
      dtype=object)

In [69]:
des_encoded_df = pd.DataFrame(des_encoder,columns=onehot_encoder_des.get_feature_names_out(['DESIGNATION']))

des_encoded_df

Unnamed: 0,DESIGNATION_Analyst,DESIGNATION_Associate,DESIGNATION_Director,DESIGNATION_Manager,DESIGNATION_Senior Analyst,DESIGNATION_Senior Manager
0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
2634,0.0,0.0,0.0,0.0,0.0,1.0
2635,1.0,0.0,0.0,0.0,0.0,0.0
2636,1.0,0.0,0.0,0.0,0.0,0.0
2637,1.0,0.0,0.0,0.0,0.0,0.0


In [70]:
data = pd.concat([data.drop('DESIGNATION',axis=1),des_encoded_df],axis=1)

data.head()

Unnamed: 0,SEX,AGE,SALARY,UNIT,RATINGS,PAST EXP,DESIGNATION_Analyst,DESIGNATION_Associate,DESIGNATION_Director,DESIGNATION_Manager,DESIGNATION_Senior Analyst,DESIGNATION_Senior Manager
0,0,21.0,44570,Finance,2.0,0,1.0,0.0,0.0,0.0,0.0,0.0
1,0,45.0,89207,Web,3.0,7,0.0,1.0,0.0,0.0,0.0,0.0
2,0,21.0,40955,Finance,3.0,0,1.0,0.0,0.0,0.0,0.0,0.0
3,0,22.0,45550,IT,3.0,0,1.0,0.0,0.0,0.0,0.0,0.0
4,1,29.0,43161,Operations,3.0,3,1.0,0.0,0.0,0.0,0.0,0.0


In [71]:
onehot_encoder_unit = OneHotEncoder()

unit_encoder = onehot_encoder_unit.fit_transform(data[['UNIT']]).toarray()

unit_encoder

array([[1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.]])

In [72]:
onehot_encoder_unit.get_feature_names_out(['UNIT'])

array(['UNIT_Finance', 'UNIT_IT', 'UNIT_Management', 'UNIT_Marketing',
       'UNIT_Operations', 'UNIT_Web'], dtype=object)

In [73]:
unit_encoded_df = pd.DataFrame(unit_encoder,columns=onehot_encoder_unit.get_feature_names_out(['UNIT']))

unit_encoded_df

Unnamed: 0,UNIT_Finance,UNIT_IT,UNIT_Management,UNIT_Marketing,UNIT_Operations,UNIT_Web
0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...
2634,0.0,0.0,1.0,0.0,0.0,0.0
2635,0.0,1.0,0.0,0.0,0.0,0.0
2636,0.0,0.0,0.0,0.0,0.0,1.0
2637,0.0,0.0,0.0,0.0,0.0,1.0


In [74]:
data = pd.concat([data.drop('UNIT',axis=1),unit_encoded_df],axis=1)

data.head()

Unnamed: 0,SEX,AGE,SALARY,RATINGS,PAST EXP,DESIGNATION_Analyst,DESIGNATION_Associate,DESIGNATION_Director,DESIGNATION_Manager,DESIGNATION_Senior Analyst,DESIGNATION_Senior Manager,UNIT_Finance,UNIT_IT,UNIT_Management,UNIT_Marketing,UNIT_Operations,UNIT_Web
0,0,21.0,44570,2.0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0,45.0,89207,3.0,7,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,21.0,40955,3.0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0,22.0,45550,3.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1,29.0,43161,3.0,3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [75]:
## Save the encoder and scaler

with open('label_encoder_sex.pkl','wb') as file:
    pickle.dump(label_encoder_sex,file)

with open('onehot_encoder_des.pkl','wb') as file:
    pickle.dump(onehot_encoder_des,file)

with open('onehot_encoder_unit.pkl','wb') as file:
    pickle.dump(onehot_encoder_unit,file)

In [76]:
data.head()

Unnamed: 0,SEX,AGE,SALARY,RATINGS,PAST EXP,DESIGNATION_Analyst,DESIGNATION_Associate,DESIGNATION_Director,DESIGNATION_Manager,DESIGNATION_Senior Analyst,DESIGNATION_Senior Manager,UNIT_Finance,UNIT_IT,UNIT_Management,UNIT_Marketing,UNIT_Operations,UNIT_Web
0,0,21.0,44570,2.0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0,45.0,89207,3.0,7,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,21.0,40955,3.0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0,22.0,45550,3.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1,29.0,43161,3.0,3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [77]:
X = data.drop('SALARY', axis=1)
y = data['SALARY']

# ✅ Save the feature order (the exact column names & order)
import pickle
with open('feature_order.pkl', 'wb') as f:
    pickle.dump(list(X.columns), f)

# Then do scaling
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save scaler
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [78]:
X_train

array([[-0.98542127, -0.70803086, -1.32941042, ..., -0.44581466,
        -0.44962652, -0.44045945],
       [-0.98542127, -0.19660121,  0.45923547, ..., -0.44581466,
        -0.44962652, -0.44045945],
       [ 1.01479441,  0.57054326,  1.35355841, ..., -0.44581466,
        -0.44962652, -0.44045945],
       ...,
       [-0.98542127, -0.45231603,  0.45923547, ..., -0.44581466,
        -0.44962652,  2.27035657],
       [ 1.01479441, -0.45231603,  1.35355841, ..., -0.44581466,
        -0.44962652,  2.27035657],
       [-0.98542127, -0.45231603,  0.45923547, ...,  2.24308461,
        -0.44962652, -0.44045945]])

In [79]:
X_test

array([[-0.98542127,  2.87197666, -0.43508748, ..., -0.44581466,
        -0.44962652, -0.44045945],
       [-0.98542127, -0.19660121, -0.43508748, ..., -0.44581466,
        -0.44962652, -0.44045945],
       [-0.98542127, -0.19660121,  0.45923547, ..., -0.44581466,
        -0.44962652,  2.27035657],
       ...,
       [ 1.01479441,  0.05911361,  0.45923547, ..., -0.44581466,
        -0.44962652, -0.44045945],
       [ 1.01479441,  0.05911361, -0.43508748, ..., -0.44581466,
        -0.44962652, -0.44045945],
       [-0.98542127, -0.70803086,  1.35355841, ..., -0.44581466,
        -0.44962652, -0.44045945]])

In [80]:
y_test

1322    179845
1185     48441
2572     40707
1709     43144
809      84967
         ...  
812      54166
544      55693
1278     42014
1199     45188
2257     43064
Name: SALARY, Length: 528, dtype: int64

# ANN IMPLEMENTATION

In [81]:
import tensorflow as tf 
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Dense # type: ignore
from tensorflow.keras.callbacks import EarlyStopping # type: ignore
#, Tensorboard
import datetime 

In [82]:
## Building our ANN Model

(X_train.shape[1],)

(16,)

In [83]:
## Build our ANN Model

model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)  # No activation, because we are predicting a real number (salary)
])

In [84]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 64)                1088      
                                                                 
 dense_7 (Dense)             (None, 32)                2080      
                                                                 
 dense_8 (Dense)             (None, 1)                 33        
                                                                 
Total params: 3201 (12.50 KB)
Trainable params: 3201 (12.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [85]:
import tensorflow 

opt = tensorflow.keras.optimizers.Adam(learning_rate=0.01)
loss = tensorflow.keras.losses.BinaryCrossentropy()

loss

<keras.src.losses.BinaryCrossentropy at 0x20a07375630>

In [86]:
## compile the model

model.compile(optimizer='adam', loss='mean_squared_error')

In [87]:
## Train the model

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=10, factor=0.5)
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

history = model.fit(X_train, y_train, epochs=800, batch_size=32, validation_split=0.2, callbacks=[early_stop, reduce_lr, checkpoint])
# history = model.fit(
#     X_train,y_train,validation_data=(X_test,y_test),epochs=100, callbacks=[early_stopping_callback]
# )

Epoch 1/800
Epoch 2/800
 1/53 [..............................] - ETA: 0s - loss: 5872109056.0000

  saving_api.save_model(


Epoch 3/800
Epoch 4/800
Epoch 5/800
Epoch 6/800
Epoch 7/800
Epoch 8/800
Epoch 9/800
Epoch 10/800
Epoch 11/800
Epoch 12/800
Epoch 13/800
Epoch 14/800
Epoch 15/800
Epoch 16/800
Epoch 17/800
Epoch 18/800
Epoch 19/800
Epoch 20/800
Epoch 21/800
Epoch 22/800
Epoch 23/800
Epoch 24/800
Epoch 25/800
Epoch 26/800
Epoch 27/800
Epoch 28/800
Epoch 29/800
Epoch 30/800
Epoch 31/800
Epoch 32/800
Epoch 33/800
Epoch 34/800
Epoch 35/800
Epoch 36/800
Epoch 37/800
Epoch 38/800
Epoch 39/800
Epoch 40/800
Epoch 41/800
Epoch 42/800
Epoch 43/800
Epoch 44/800
Epoch 45/800
Epoch 46/800
Epoch 47/800
Epoch 48/800
Epoch 49/800
Epoch 50/800
Epoch 51/800
Epoch 52/800
Epoch 53/800
Epoch 54/800
Epoch 55/800
Epoch 56/800
Epoch 57/800
Epoch 58/800
Epoch 59/800
Epoch 60/800
Epoch 61/800
Epoch 62/800
Epoch 63/800
Epoch 64/800
Epoch 65/800
Epoch 66/800
Epoch 67/800
Epoch 68/800
Epoch 69/800
Epoch 70/800
Epoch 71/800
Epoch 72/800
Epoch 73/800
Epoch 74/800
Epoch 75/800
Epoch 76/800
Epoch 77/800
Epoch 78/800
Epoch 79/800
Epoch 

In [88]:
model.save('model.h5') # .keras

In [89]:
# feature_order = list(X.columns)

# with open('feature_order.pkl', 'wb') as f:
#     pickle.dump(feature_order, f)
