In [1]:
#!pip install hyperopt

In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import warnings
warnings.filterwarnings('ignore')

## Reading in the data, preprocessing

In [3]:
df= pd.read_csv('archive/Employee Attrition.csv', index_col=0)
df = df.dropna(thresh=4)
df.head()

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary
Emp ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1.0,0.38,0.53,2.0,157.0,3.0,0.0,0.0,sales,low
2.0,0.8,0.86,5.0,262.0,6.0,0.0,0.0,sales,medium
3.0,0.11,0.88,7.0,272.0,4.0,0.0,0.0,sales,medium
4.0,0.72,0.87,5.0,223.0,5.0,0.0,0.0,sales,low
5.0,0.37,0.52,2.0,159.0,3.0,0.0,0.0,sales,low


In [4]:
encoded_df = pd.get_dummies(df, columns=['dept', 'salary'])

encoded_df = encoded_df.astype('float64')

scaler = MinMaxScaler()

scaled_data = scaler.fit_transform(encoded_df)

encoded_df = pd.DataFrame(scaled_data, columns=encoded_df.columns)

encoded_df.dtypes

satisfaction_level       float64
last_evaluation          float64
number_project           float64
average_montly_hours     float64
time_spend_company       float64
Work_accident            float64
promotion_last_5years    float64
dept_IT                  float64
dept_RandD               float64
dept_accounting          float64
dept_hr                  float64
dept_management          float64
dept_marketing           float64
dept_product_mng         float64
dept_sales               float64
dept_support             float64
dept_technical           float64
salary_high              float64
salary_low               float64
salary_medium            float64
dtype: object

## Baseline

In [5]:


X = encoded_df.drop('satisfaction_level', axis=1)
y = encoded_df['satisfaction_level']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Support Vector Regressor": SVR()
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    results[name] = mse

for name, mse in results.items():
    print(f"{name}: MSE = {mse}")

baseline_model_name = min(results, key=results.get)
baseline_mse = results[baseline_model_name]

print(f"\nBaseline Model: {baseline_model_name} with MSE = {baseline_mse}")


Linear Regression: MSE = 0.06929717137997814
Decision Tree: MSE = 0.06656922993063101
Random Forest: MSE = 0.03737279179514238
Support Vector Regressor: MSE = 0.05027313125030984

Baseline Model: Random Forest with MSE = 0.03737279179514238


## Trial

In [6]:
X = encoded_df.drop('satisfaction_level', axis=1)  # Features
y = encoded_df['satisfaction_level']               # Target

space = {
    'n_estimators': hp.choice('n_estimators', [5, 10, 25, 50, 100, 200, 250, 300, 350, 400]),
    'max_depth': hp.choice('max_depth', [3, 5, 10, 20, 25, 30, 40, None]),
    'min_samples_split': hp.choice('min_samples_split', [1, 2, 5, 7, 10, 15, 20, 30]),
    'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 5, 7, 10, 15, 20, 30]),
    'max_features': hp.choice('max_features', ['auto', 'sqrt']),
    'bootstrap': hp.choice('bootstrap', [True, False]),
    'max_leaf_nodes': hp.choice('max_leaf_nodes', [None, 3, 5, 10, 20, 30, 50, 100])
}

def objective(params):
    model = RandomForestRegressor(**params)
    score = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
    return {'loss': score, 'status': STATUS_OK}

trials = Trials()
best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

best_params_converted = {
    'n_estimators': [5, 10, 25, 50, 100, 200, 250, 300, 350, 400][best_params['n_estimators']],
    'max_depth': [3, 5, 10, 20, 25, 30, 40, None][best_params['max_depth']],
    'min_samples_split': [1, 2, 5, 7, 10, 15, 20, 30][best_params['min_samples_split']],
    'min_samples_leaf': [1, 2, 5, 7, 10, 15, 20, 30][best_params['min_samples_leaf']],
    'max_features': ['auto', 'sqrt'][best_params['max_features']],
    'bootstrap': [True, False][best_params['bootstrap']],
    'max_leaf_nodes': [None, 3, 5, 10, 20, 30, 50, 100][best_params['max_leaf_nodes']]
}

best_model = RandomForestRegressor(**best_params_converted)
best_model.fit(X, y)

final_scores = cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_squared_error')
final_mse = -final_scores.mean()

print(f"Best Hyperparameters: {best_params_converted}")
print(f"Final Cross-Validated MSE: {final_mse}")


100%|██████| 100/100 [02:44<00:00,  1.64s/trial, best loss: 0.03846289528928744]
Best Hyperparameters: {'n_estimators': 50, 'max_depth': None, 'min_samples_split': 15, 'min_samples_leaf': 7, 'max_features': 'auto', 'bootstrap': True, 'max_leaf_nodes': None}
Final Cross-Validated MSE: 0.03847167233587463
