In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
print ("Model Import Complete")

Model Import Complete


In [3]:
# Load dataset
url = "https://www.dropbox.com/scl/fi/zhzjr8qbh7f9orydgv1dw/chimera_data.csv?rlkey=apfdvomfm7guxmckfs4p0pzzj&dl=1"
chimera_data = pd.read_csv(url)

print(chimera_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18132 entries, 0 to 18131
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   admin_support          18132 non-null  int64  
 1   age                    18132 non-null  int64  
 2   boss_survey            18132 non-null  float64
 3   boss_tenure            18132 non-null  int64  
 4   city_size              18132 non-null  float64
 5   clock_in               18132 non-null  int64  
 6   core                   18132 non-null  int64  
 7   education              18132 non-null  int64  
 8   gender                 18132 non-null  int64  
 9   half_day_leaves        18132 non-null  int64  
 10  high_potential         18132 non-null  int64  
 11  job_satisfaction       18132 non-null  float64
 12  kpi_performance        18132 non-null  float64
 13  local                  18132 non-null  int64  
 14  part_time              18132 non-null  int64  
 15  ra

In [4]:
# Data Cleaning: Removing duplicates and checking for missing values
chimera_data = chimera_data.drop_duplicates()
chimera_data = chimera_data.dropna()


In [5]:
# Define features and target variable
X = chimera_data.drop(columns=["exit"])  # Features
y = chimera_data["exit"]  # Target variable



In [6]:
# Standardizing numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
# Splitting dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)


In [8]:
# Training a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Training a Logistic Regression model
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train, y_train)

# Training a Gradient Boosting model
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

# Training a Support Vector Machine model
svm_model = SVC(kernel='rbf', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# Training a K-Nearest Neighbors model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Training an XGBoost model
xgb_model = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.



In [9]:
# Making predictions
rf_pred = rf_model.predict(X_test)
lr_pred = lr_model.predict(X_test)
gb_pred = gb_model.predict(X_test)
svm_pred = svm_model.predict(X_test)
knn_pred = knn_model.predict(X_test)
xgb_pred = xgb_model.predict(X_test)


In [10]:
# Evaluating model performance
print("Random Forest Performance:\n", classification_report(y_test, rf_pred))
print("Logistic Regression Performance:\n", classification_report(y_test, lr_pred))
print("Gradient Boosting Performance:\n", classification_report(y_test, gb_pred))
print("Support Vector Machine Performance:\n", classification_report(y_test, svm_pred))
print("K-Nearest Neighbors Performance:\n", classification_report(y_test, knn_pred))
print("XGBoost Performance:\n", classification_report(y_test, xgb_pred))


Random Forest Performance:
               precision    recall  f1-score   support

           0       0.89      0.98      0.93      3136
           1       0.65      0.19      0.30       491

    accuracy                           0.88      3627
   macro avg       0.77      0.59      0.61      3627
weighted avg       0.85      0.88      0.85      3627

Logistic Regression Performance:
               precision    recall  f1-score   support

           0       0.89      0.98      0.93      3136
           1       0.65      0.24      0.35       491

    accuracy                           0.88      3627
   macro avg       0.77      0.61      0.64      3627
weighted avg       0.86      0.88      0.85      3627

Gradient Boosting Performance:
               precision    recall  f1-score   support

           0       0.89      0.97      0.93      3136
           1       0.59      0.23      0.34       491

    accuracy                           0.87      3627
   macro avg       0.74      0.60 

In [11]:
# Extracting feature importance for tree-based models
feature_importances_rf = rf_model.feature_importances_
feature_importances_gb = gb_model.feature_importances_
feature_importances_xgb = xgb_model.feature_importances_
feature_names = chimera_data.drop(columns=["exit"]).columns

importance_df_rf = pd.DataFrame({"Feature": feature_names, "Importance": feature_importances_rf})
importance_df_rf = importance_df_rf.sort_values(by="Importance", ascending=False)

importance_df_gb = pd.DataFrame({"Feature": feature_names, "Importance": feature_importances_gb})
importance_df_gb = importance_df_gb.sort_values(by="Importance", ascending=False)

importance_df_xgb = pd.DataFrame({"Feature": feature_names, "Importance": feature_importances_xgb})
importance_df_xgb = importance_df_xgb.sort_values(by="Importance", ascending=False)

In [12]:
# Display feature importance
print("Random Forest Feature Importance:")
print(importance_df_rf)
print("Gradient Boosting Feature Importance:")
print(importance_df_gb)
print("XGBoost Feature Importance:")
print(importance_df_xgb)

Random Forest Feature Importance:
                  Feature  Importance
2             boss_survey    0.266440
11       job_satisfaction    0.103295
12        kpi_performance    0.089696
17                 salary    0.085076
1                     age    0.057792
23           variable_pay    0.049894
19              team_size    0.045108
9         half_day_leaves    0.033848
4               city_size    0.030302
24  years_since_promotion    0.023598
3             boss_tenure    0.023461
18           subordinates    0.022720
20                 tenure    0.019197
22               training    0.018423
7               education    0.016956
21            tenure_unit    0.015260
13                  local    0.013139
15                   rank    0.012342
5                clock_in    0.012171
16                 remote    0.011744
8                  gender    0.011300
6                    core    0.010717
14              part_time    0.010409
0           admin_support    0.009935
10         high_