In [12]:
import pandas as pd

### Synthetic data generation

In [13]:
df = pd.read_csv('employee_data.csv')
df = df.drop(columns=['Employee_ID', 'Hire_Date'])
# one hot endoding of the categorical features
categorical_features= ['Department', 'Job_Title', 'Gender', 'Education_Level']
df = pd.get_dummies(df, columns=categorical_features)
# rename 'Performance_Score' column to 'target'
df.rename(columns={'Performance_Score': 'target'}, inplace=True)
# rename the features to `featX` where X is the feature number when the column name is not 'target'
feature_counter = 1
new_columns = {}
col_names = []
for col in df.columns:
    if col == 'target':
        new_columns[col] = col  # Keep 'target' as is
    else:
        new_columns[col] = f'feat{feature_counter}'
        col_names.append(f'feat{feature_counter}')
        feature_counter += 1
df.rename(columns=new_columns, inplace=True)
df = df[col_names+['target']]
# finally make booleans values 0 and 1
df.replace({True: 1, False: 0}, inplace=True)
df.to_csv('dataset.csv', index=False)
df.head()

  df.replace({True: 1, False: 0}, inplace=True)


Unnamed: 0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,...,feat28,feat29,feat30,feat31,feat32,feat33,feat34,feat35,feat36,target
0,55,2,6750.0,33,32,22,2,0,14,66,...,1,0,0,1,0,0,1,0,0,5
1,29,0,7500.0,34,34,13,14,100,12,61,...,0,0,0,1,0,0,1,0,0,5
2,55,8,5850.0,37,27,6,3,50,10,1,...,1,0,0,1,0,0,1,0,0,3
3,48,7,4800.0,52,10,28,12,100,10,0,...,0,0,1,0,0,1,0,0,0,2
4,36,3,4800.0,38,11,29,13,100,15,9,...,0,0,1,0,0,1,0,0,0,2


### Train XGboost model on the dataset. 

In [14]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib


In [15]:

X = df.drop('target', axis=1)
y = df['target']

feature_names = X.columns.tolist()
target_name = 'target'

# Remap target classes to start from 0 (XGBoost requirement)
class_mapping = {int(cls): int(idx) for idx, cls in enumerate(sorted(y.unique()))}
y_remapped = y.map(class_mapping)
y = y_remapped


In [16]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1111, stratify=y
)

In [17]:
# Normalize the data 
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
# Train XGBoost
xgb_params = {
    'objective': 'multi:softprob',  # for multi-class classification
    'num_class': len(y.unique()),   # number of classes
    'max_depth': 5,
    'learning_rate': 0.11,
    'n_estimators': 11,
    'random_state': 1111,
    'eval_metric': 'mlogloss'
}

model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train_scaled, y_train)

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [19]:
# predict
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

Model Accuracy: 0.9427

Classification Report:
              precision    recall  f1-score   support

           0       0.78      1.00      0.88      8048
           1       1.00      0.86      0.92      8005
           2       1.00      0.86      0.92      8000
           3       1.00      1.00      1.00      7976
           4       1.00      1.00      1.00      7971

    accuracy                           0.94     40000
   macro avg       0.96      0.94      0.94     40000
weighted avg       0.96      0.94      0.94     40000


Confusion Matrix:
[[8048    0    0    0    0]
 [1148 6857    0    0    0]
 [1142    0 6858    0    0]
 [   0    0    0 7976    0]
 [   0    0    0    0 7971]]


In [20]:
class_mapping

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4}

In [21]:
import json

# save model
model_filename = 'xgboost_model.pkl'
scaler_filename = 'scaler.pkl'
metadata_filename = 'model_metadata.json'

joblib.dump(model, model_filename)
print(f"Model saved as: {model_filename}")

# save scaler
joblib.dump(scaler, scaler_filename)
print(f"Scaler saved as: {scaler_filename}")

# save metadata
metadata = {
    'feature_names': feature_names,
    'target_name': target_name,
    'num_features': len(feature_names),
    'num_classes': len(y.unique()),
    'class_labels': sorted(y.unique().tolist()),
    'class_mapping': json.dumps(class_mapping)
}

with open(metadata_filename, 'w') as f:
    json.dump(metadata, f, indent=2)


Model saved as: xgboost_model.pkl
Scaler saved as: scaler.pkl
