In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LassoCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt #to plot
import seaborn as sns #to plot

print ("Model Import Complete")

Model Import Complete


In [34]:
# Load dataset
url = "https://www.dropbox.com/scl/fi/zhzjr8qbh7f9orydgv1dw/chimera_data.csv?rlkey=apfdvomfm7guxmckfs4p0pzzj&dl=1"
chimera_data = pd.read_csv(url)

print(chimera_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18132 entries, 0 to 18131
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   admin_support          18132 non-null  int64  
 1   age                    18132 non-null  int64  
 2   boss_survey            18132 non-null  float64
 3   boss_tenure            18132 non-null  int64  
 4   city_size              18132 non-null  float64
 5   clock_in               18132 non-null  int64  
 6   core                   18132 non-null  int64  
 7   education              18132 non-null  int64  
 8   gender                 18132 non-null  int64  
 9   half_day_leaves        18132 non-null  int64  
 10  high_potential         18132 non-null  int64  
 11  job_satisfaction       18132 non-null  float64
 12  kpi_performance        18132 non-null  float64
 13  local                  18132 non-null  int64  
 14  part_time              18132 non-null  int64  
 15  ra

In [35]:
# Data Cleaning: Removing duplicates and checking for missing values
chimera_data = chimera_data.drop_duplicates()
chimera_data = chimera_data.dropna()


In [36]:
# Dropping the 'half_day_leaves' column
chimera_data = chimera_data.drop(columns=["half_day_leaves"], errors='ignore')


In [37]:
# Define features and target variable
X = chimera_data.drop(columns=["exit"])  # Features
y = chimera_data["exit"]  # Target variable


In [38]:
# Standardizing numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [39]:
# Apply Lasso Regression to remove unnecessary features
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_scaled, y)
selected_features = np.where(lasso.coef_ != 0)[0]  # Indices of selected features
X_selected = X.iloc[:, selected_features]  # Select only relevant features

In [40]:
# Splitting dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42, stratify=y)

In [41]:
# Training a Ridge Classifier
ridge_model = RidgeClassifier(alpha=1.0)
ridge_model.fit(X_train, y_train)


In [42]:
# Training a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Training a Logistic Regression model
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train, y_train)

# Training a Gradient Boosting model
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

# Training a Support Vector Machine model
svm_model = SVC(kernel='rbf', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# Training a K-Nearest Neighbors model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Training an XGBoost model
xgb_model = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.



In [43]:
# Making predictions
ridge_pred = ridge_model.predict(X_test)
rf_pred = rf_model.predict(X_test)
lr_pred = lr_model.predict(X_test)
gb_pred = gb_model.predict(X_test)
svm_pred = svm_model.predict(X_test)
knn_pred = knn_model.predict(X_test)
xgb_pred = xgb_model.predict(X_test)

In [44]:
# Evaluating model performance
print("Ridge Classifier Performance:\n", classification_report(y_test, ridge_pred))
print("Random Forest Performance:\n", classification_report(y_test, rf_pred))
print("Logistic Regression Performance:\n", classification_report(y_test, lr_pred))
print("Gradient Boosting Performance:\n", classification_report(y_test, gb_pred))
print("Support Vector Machine Performance:\n", classification_report(y_test, svm_pred))
print("K-Nearest Neighbors Performance:\n", classification_report(y_test, knn_pred))
print("XGBoost Performance:\n", classification_report(y_test, xgb_pred))

Ridge Classifier Performance:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93      3136
           1       0.93      0.03      0.06       491

    accuracy                           0.87      3627
   macro avg       0.90      0.51      0.49      3627
weighted avg       0.88      0.87      0.81      3627

Random Forest Performance:
               precision    recall  f1-score   support

           0       0.89      0.98      0.93      3136
           1       0.60      0.20      0.30       491

    accuracy                           0.87      3627
   macro avg       0.74      0.59      0.61      3627
weighted avg       0.85      0.87      0.84      3627

Logistic Regression Performance:
               precision    recall  f1-score   support

           0       0.89      0.98      0.93      3136
           1       0.66      0.22      0.33       491

    accuracy                           0.88      3627
   macro avg       0.77      0.60  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [45]:
# Extracting feature importance for tree-based models
feature_importances_rf = rf_model.feature_importances_
feature_importances_gb = gb_model.feature_importances_
feature_importances_xgb = xgb_model.feature_importances_
selected_feature_names = X.columns[selected_features]

importance_df_rf = pd.DataFrame({"Feature": selected_feature_names, "Importance": feature_importances_rf})
importance_df_rf = importance_df_rf.sort_values(by="Importance", ascending=False)

importance_df_gb = pd.DataFrame({"Feature": selected_feature_names, "Importance": feature_importances_gb})
importance_df_gb = importance_df_gb.sort_values(by="Importance", ascending=False)

importance_df_xgb = pd.DataFrame({"Feature": selected_feature_names, "Importance": feature_importances_xgb})
importance_df_xgb = importance_df_xgb.sort_values(by="Importance", ascending=False)


In [46]:
# Display feature importance
print("Random Forest Feature Importance:")
print(importance_df_rf)
print("Gradient Boosting Feature Importance:")
print(importance_df_gb)
print("XGBoost Feature Importance:")
print(importance_df_xgb)

Random Forest Feature Importance:
                  Feature  Importance
1             boss_survey    0.290238
7        job_satisfaction    0.134191
8         kpi_performance    0.116878
13                 salary    0.111029
0                     age    0.071269
2               city_size    0.038920
14           subordinates    0.030308
18  years_since_promotion    0.027796
15                 tenure    0.024059
17               training    0.022481
16            tenure_unit    0.019405
11                   rank    0.017887
9                   local    0.016438
3                clock_in    0.015260
12                 remote    0.014219
4                    core    0.013891
10              part_time    0.013397
5                  gender    0.013241
6          high_potential    0.009096
Gradient Boosting Feature Importance:
                  Feature  Importance
1             boss_survey    0.857133
7        job_satisfaction    0.043055
8         kpi_performance    0.032881
13              

In [49]:
# Feature Importance Visualization (For Tree-Based Models)
def plot_feature_importance(model, model_name):
    if hasattr(model, 'feature_importances_'):
        importance = model.feature_importances_
        feature_names = chimera_data.drop(columns=["exit"]).columns

        #  # Check if lengths match
        # if len(feature_names) != len(importance):
        #     raise ValueError("Length of feature_names and importance do not match")
        
        importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importance})
        importance_df = importance_df.sort_values(by="Importance", ascending=False)
        
        plt.figure(figsize=(10, 6))
        sns.barplot(x=importance_df["Importance"], y=importance_df["Feature"], palette="viridis")
        plt.title(f"Feature Importance - {model_name}")
        plt.xlabel("Importance")
        plt.ylabel("Feature")
        plt.show()

models = {
    "Ridge Classifier": ridge_model,
    "Random Forest": rf_model,
    "Logistic Regression": lr_model,
    "Gradient Boosting": gb_model,
    "Support Vector Machine": svm_model,
    "K-Nearest Neighbors": knn_model,
    "XGBoost": xgb_model
}

for name, model in models.items():
    plot_feature_importance(model, name)

ValueError: All arrays must be of the same length