## Imports / Setup

In [None]:
# Bootstrap conda/mamba inside the Colab runtime; the `!mamba` cells below depend on it.
!pip install -q condacolab
import condacolab
# NOTE(review): presumably this restarts the kernel on first run, so the notebook has to be
# continued from the next cell afterwards — confirm against the condacolab documentation.
condacolab.install()


✨🍰✨ Everything looks OK!


In [None]:
# Create a dedicated conda env (Python 3.9 plus build tools) and pip-install auto-sklearn into it.
# NOTE(review): this env is separate from the notebook kernel. The in-kernel
# `from autosklearn.classification import AutoSklearnClassifier` further down fails with
# ModuleNotFoundError, so the kernel never sees this install — confirm how the env is meant to be used.
!mamba create -n autosklearn-env -c conda-forge python=3.9 jupyterlab swig cmake -y
!mamba run -n autosklearn-env pip install auto-sklearn


Looking for: ['python=3.9', 'jupyterlab', 'swig', 'cmake']

conda-forge/linux-64                                        Using cache
conda-forge/noarch                                          Using cache
Transaction

  Prefix: /usr/local/envs/autosklearn-env

  Updating specs:

   - python=3.9
   - jupyterlab
   - swig
   - cmake


  Package                                 Version  Build               Channel           Size
───────────────────────────────────────────────────────────────────────────────────────────────
  Install:
───────────────────────────────────────────────────────────────────────────────────────────────

  [32m+ python_abi                   [0m             3.9  8_cp39              conda-forge        7kB
  [32m+ tzdata                       [0m           2025b  h78e105d_0          conda-forge      123kB
  [32m+ ca-certificates              [0m       2025.7.14  hbd8a1cb_0          conda-forge      156kB
  [32m+ ld_impl_linux-64             [0m            2.44

In [None]:
# Smoke-test the install by importing auto-sklearn inside the conda env.
# NOTE(review): the recorded run fails with "numpy.dtype size changed, may indicate binary
# incompatibility" raised from ConfigSpace's compiled extension, i.e. the installed numpy does not
# match the one the extension was built against; the version pins need revisiting before this passes.
!mamba run -n autosklearn-env python -c "import autosklearn.classification; print('Auto-Sklearn Installed!')"


  import pkg_resources
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/usr/local/envs/autosklearn-env/lib/python3.9/site-packages/autosklearn/classification.py", line 1, in <module>
    from autosklearn.estimators import AutoSklearnClassifier  # noqa (imported but unused)
  File "/usr/local/envs/autosklearn-env/lib/python3.9/site-packages/autosklearn/estimators.py", line 23, in <module>
    from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
  File "/usr/local/envs/autosklearn-env/lib/python3.9/site-packages/ConfigSpace/__init__.py", line 37, in <module>
    from ConfigSpace.configuration_space import Configuration, \
  File "ConfigSpace/configuration_space.pyx", line 40, in init ConfigSpace.configuration_space
  File "ConfigSpace/hyperparameters.pyx", line 1, in init ConfigSpace.hyperparameters
ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

ERROR co

In [None]:
import pandas as pd
import kagglehub
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from matplotlib import pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB

from autosklearn.classification import AutoSklearnClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import tabulate
import seaborn as sns
import time


ModuleNotFoundError: No module named 'autosklearn'

In [None]:

# Fetch the latest copy of the IBM HR attrition dataset via kagglehub and report where it was stored
path = kagglehub.dataset_download(
    "pavansubhasht/ibm-hr-analytics-attrition-dataset"
)
print("Path to dataset files:", path)

In [None]:
# Show every column whenever a frame is displayed, then load the attrition CSV from the download folder
pd.set_option('display.max_columns', None)
csv_path = path + "/WA_Fn-UseC_-HR-Employee-Attrition.csv"
df = pd.read_csv(csv_path)

## Data Preprocessing

In [None]:
# Count missing cells over the whole frame.
# `isnull` is an alias of `isna`, so the two printed totals are expected to be identical.
missing_via_isna = df.isna().sum().sum()
missing_via_isnull = df.isnull().sum().sum()
print(missing_via_isna)
print(missing_via_isnull)

In [None]:
# Display the freshly loaded frame for a first look at the columns and values.
df

In [None]:
## Let's analyze how some protected-class attributes relate to attrition to gain some insight

def plot_attrition(column):
  """Draw a stacked bar chart of the attrition split within each value of `column`.

  Reads the module-level `df`. Every bar is one category of `column`; its
  segments are the row-normalised shares of each `Attrition` value, labelled
  as percentages (segments with zero height get an empty label).
  """
  counts = pd.crosstab(df[column], df['Attrition'])
  row_totals = counts.sum(axis=1)
  shares = counts.div(row_totals, axis=0)

  ax = shares.plot(kind='bar', stacked=True, figsize=(10, 5))
  for bars in ax.containers:
    heights = [bar.get_height() for bar in bars]
    texts = [f'{height:.1%}' if height > 0 else '' for height in heights]
    ax.bar_label(bars, labels=texts, label_type='center')

  plt.title(column + ' vs Attrition')
  plt.xlabel(column)
  plt.xticks(rotation = 45)
  plt.ylabel('Attrition')
  plt.show()


# Attrition share within each business-travel category.
plot_attrition('BusinessTravel')

In [None]:
# Attrition share within each department.
plot_attrition('Department')

In [None]:
# Attrition share within each education field.
plot_attrition('EducationField')

In [None]:
# Attrition share within each job role.
plot_attrition('JobRole')

In [None]:
# Attrition share within each marital status.
plot_attrition('MaritalStatus')

In [None]:
# Attrition share within each gender.
plot_attrition('Gender')

In [None]:
# One-Hot Encode Categorical Variables
#
# Each entry is (source column, prefix for the generated dummy columns).
# MaritalStatus uses no prefix, so its dummy columns are named after the raw category values.
one_hot_specs = [
    ('BusinessTravel', 'BusinessTravel'),
    ('Department', 'Department'),
    ('EducationField', 'Education'),
    ('JobRole', 'Job'),
    ('MaritalStatus', None),
]

for source_column, dummy_prefix in one_hot_specs:
  dummies = pd.get_dummies(df[source_column], prefix = dummy_prefix)
  # Append the indicator columns, then remove the categorical column they replace
  df = df.join(dummies).drop(source_column, axis = 1)

In [None]:
# Binary Encode
#
# The encoded column is assigned back to `df` rather than calling
# `df[col].replace(..., inplace=True)`: that form mutates an intermediate Series, which pandas
# warns about and which no longer updates `df` once Copy-on-Write is the default.
binary_encodings = {
    'Attrition': {'Yes': 1, 'No': 0},
    'Gender': {'Male': 1, 'Female': 0},
    'Over18': {'Y': 1, 'N': 0},
    'OverTime': {'Yes': 1, 'No': 0},
}

for column, mapping in binary_encodings.items():
  # Values missing from the mapping pass through unchanged, so re-running the cell on already
  # encoded 0/1 data is a no-op; astype(int) fails loudly if an unexpected label is left over.
  encoded = df[column].map(lambda value, mapping=mapping: mapping.get(value, value))
  df[column] = encoded.astype(int)

In [None]:
# Binary Encode the One-Hot Encoded Parameters
#
# Cast only the boolean indicator columns to 0/1 integers. An explicit astype avoids the
# frame-wide, in-place replace({True: 1, False: 0}), which depends on pandas implicitly
# downcasting the result. If there are no bool columns this is a no-op.
bool_columns = df.select_dtypes(include = 'bool').columns
df = df.astype({name: int for name in bool_columns})

In [None]:
# Histogram grid over the frame's columns to eyeball each distribution.
df.hist(figsize = (20, 20))

In [None]:
# Drop unnecessary columns

constant_columns = ['EmployeeCount', 'Over18', 'StandardHours']  # Entire column has the same value
identifier_columns = ['EmployeeNumber']  # Can be used as an identifier but we are using dataset indexing
df = df.drop(columns = constant_columns + identifier_columns)
df

In [None]:
# Rescale every column with MinMaxScaler (min-max normalisation, not z-score standardisation)
# NOTE(review): the scaler is fitted on the full dataset before any train/test split, so rows that
# later end up in a test split influence the scaling — consider fitting on the training split only.

scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)
df

In [None]:
# List feature pairs whose absolute correlation exceeds the threshold.
# The threshold is defined once so the filter and the printed message cannot drift apart.
CORR_THRESHOLD = 0.95

corr_matrix = df.corr()
stacked_corr = corr_matrix.stack()
filtered_corr = stacked_corr[stacked_corr.abs() > CORR_THRESHOLD]
# Discard the diagonal, i.e. each feature's correlation with itself
filtered_corr = filtered_corr[filtered_corr.index.get_level_values(0) != filtered_corr.index.get_level_values(1)]

print(f"Combinations with correlation greater than {CORR_THRESHOLD} (absolute value):")
filtered_corr


In [None]:
# Heatmap of the full correlation matrix, drawn on the axes created here
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix,
            linewidths=0.2,
            fmt=".1f",  # annotation format; only has an effect when annot=True is enabled
            ax=ax
           )
# Remove extremely correlated columns.
# errors='ignore' keeps the cell re-runnable: a second run no longer raises KeyError on the missing column.
df = df.drop(columns=['JobLevel'], errors='ignore')

## Training / Testing

In [None]:
def training_testing(df, model, batch_size ,title, scores_title, verbose = False,
                     random_state = None, min_accuracy = 0.88):
  """Repeatedly split `df`, refit `model`, and report its mean test accuracy.

  Parameters
  ----------
  df : DataFrame holding the feature columns plus the 'Attrition' target column.
  model : estimator exposing fit / predict / score; it is refitted in place on every split.
  batch_size : number of random 80/20 train/test splits to average over (must be >= 1).
  title : heading printed above the verbose report.
  scores_title : key under which the mean test accuracy is returned.
  verbose : when True, print timing, a classification report for the LAST split,
      and the mean train / test accuracies.
  random_state : optional integer seed; split i uses `random_state + i`, which makes the
      run reproducible. None (default) keeps the splits unseeded.
  min_accuracy : mean test accuracy the model must exceed to be returned as `optimal_model`.

  Returns
  -------
  (model, {scores_title: mean_test_accuracy}, optimal_model) where `model` is left fitted
  on the last split and `optimal_model` is `model` if it beat `min_accuracy`, else None.

  Raises
  ------
  ValueError if `batch_size` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be a positive integer")

  # Split the dataset into features and target
  X, y = df.drop('Attrition', axis = 1), df['Attrition']

  start = time.time()
  test = []
  train = []

  for i in range(batch_size):
    # A distinct seed per iteration keeps the splits different while staying reproducible
    seed = None if random_state is None else random_state + i
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = seed)
    model.fit(X_train, y_train)
    test.append(model.score(X_test, y_test))
    train.append(model.score(X_train, y_train))

  end = time.time()
  mean_accuracy = sum(test) / batch_size

  optimal_model = None
  if mean_accuracy > min_accuracy:
    optimal_model = model
    print(optimal_model, f"Accuracy: {mean_accuracy}")

  if verbose:
    # The report describes the final split only, since that is the split the model was last fitted on
    y_pred = model.predict(X_test)
    print("############################ " + title + " ############################ \n")
    print('Total Training Time: ' + str(end - start))
    print(classification_report(y_test, y_pred))
    print('Mean Train Accuracy: ' , sum(train) / batch_size)
    print('Mean Test Accuracy: ' , mean_accuracy)
    print("########################################################################################################################## \n")
  return model, {scores_title: mean_accuracy}, optimal_model


In [None]:
# Random Forest Classifier
rt_clf = RandomForestClassifier(n_jobs = -1, n_estimators = 150)

# Decision Tree Classifier
dt_clf = DecisionTreeClassifier()

# Logistic Regression Classifier
#   tol tried: [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1] -> ideal tol = 0.01
#   C tried:   [0.01, 0.1, 1, 10, 100]                           -> ideal C = 10
log_reg = LogisticRegression(penalty = 'l2', C = 10, tol = 0.01, solver = 'liblinear')

# K Nearest Neighbors
#   k tried: list(range(1, 20)) -> ideal k = 16
knn_clf = KNeighborsClassifier(n_neighbors = 16, p = 1, weights = 'distance', n_jobs = -1)

# Support Vector Machine
#   C tried:      [0.001, 0.01, 0.1, 1, 10, 100]                     -> ideal C = 50
#   degree tried: list(range(1, 10))                                 -> ideal degree = 1
#   tol tried:    [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1]   -> ideal tol = 1
svm_clf = svm.SVC(kernel = 'poly', degree = 1, C = 50, tol = 1)

# Gaussian Naive Bayes Classifier
#   var_smoothing tried: [(10 * 10**(-x)) for x in range(11)] -> ideal var_smoothing = 1
nb_clf = GaussianNB(var_smoothing = 1)



In [None]:
# Custom Validation and optimal parameters
#
# Every classifier is evaluated over `batch_size` random splits. Each call's third return value is
# kept under its own name so a later call no longer overwrites the result of an earlier one.

batch_size = 20

rtc_model, rtc_accuracy, rtc_optimal = training_testing(df, rt_clf , batch_size, "All Features - Random Forest Classifier", "Random_Forest")
dtc_model, dtc_accuracy, dtc_optimal = training_testing(df, dt_clf, batch_size, "All Features - Decision Tree Classifier", "Decision_Tree")
lr_model, lr_accuracy, lr_optimal = training_testing(df, log_reg, batch_size, "All Features - Logistic Regression - Tol: 0.01 - C: 10" , "Logisitic_Regression")
knn_model, knn_accuracy, knn_optimal = training_testing(df, knn_clf, batch_size, "All Features - KNN - K = 16", "KNN")
svm_model, svm_accuracy, svm_optimal = training_testing(df, svm_clf, batch_size, f"All Features - SVM - C: {50} - Degree: {1} - Tol: {1}", "SVM")
nb_model, nb_accuracy, nb_optimal = training_testing(df, nb_clf, batch_size, f"All Features - Naive Bayes - var_smooting = 1", "Naive_Bayes")

evaluations = [
    (rtc_accuracy, rtc_optimal),
    (dtc_accuracy, dtc_optimal),
    (lr_accuracy, lr_optimal),
    (knn_accuracy, knn_optimal),
    (svm_accuracy, svm_optimal),
    (nb_accuracy, nb_optimal),
]

# One row of mean test accuracies, one column per classifier
combined_accuracy = {}
for accuracy, _candidate in evaluations:
  combined_accuracy.update(accuracy)
scores = pd.DataFrame([combined_accuracy])

# Best-scoring classifier among those training_testing flagged as good enough (None if none qualified)
optimal_model = None
best_accuracy = 0
for accuracy, candidate in evaluations:
  mean_accuracy = next(iter(accuracy.values()))
  if candidate is not None and mean_accuracy > best_accuracy:
    best_accuracy, optimal_model = mean_accuracy, candidate

scores









In [None]:
## AUTOML
# Resampling settings handed to the auto-sklearn classifier below
resample_s = dict(train_size = 0.8, shuffle = True, folds = 5)

automl_clf = AutoSklearnClassifier(
    time_left_for_this_task = 180,
    per_run_time_limit = 30,
    ensemble_size = 5,
    resampling_strategy = 'cv-iterative-fit',
    resampling_strategy_arguments = resample_s,
    n_jobs = -1,
)



In [None]:
# Rank the features by the fitted random forest's importances.
# `X` only exists inside training_testing, so the feature names are rebuilt from `df` the same
# way that function derives them (every column except the 'Attrition' target).
feature_names = df.drop('Attrition', axis = 1).columns
important_features = pd.Series(rt_clf.feature_importances_, index = feature_names)

# Compute the top 30 once and reuse it for both the chart and the displayed index
top_features = important_features.nlargest(30)
top_features.plot(kind='barh')
top_features.index

In [None]:
## Train only using the most impactful features

mi_df = df[important_features.nlargest(30).index].join(df['Attrition'])


## Split the dataset
# Kept for ad-hoc inspection; training_testing below performs its own repeated splits

mi_X, mi_y = mi_df.drop('Attrition', axis = 1), mi_df['Attrition']
mi_X_train, mi_X_test, mi_y_train, mi_y_test = train_test_split(mi_X, mi_y, test_size = 0.2)


## Retrain and check accuracy
# training_testing expects (df, model, batch_size, title, scores_title) and splits the frame itself,
# so it receives the reduced frame rather than pre-split arrays. It returns a 3-tuple, unpacked here.

mi_rt_clf, mi_rt_accuracy, mi_rt_optimal = training_testing(mi_df, RandomForestClassifier(n_estimators = 150, n_jobs = -1), batch_size, "Preprocessed Dataset - Most Important Features", "Random_Forest_Most_Important")

In [None]:
# (rows, columns) of the preprocessed frame at this point in the notebook.
df.shape