# AutoML project 1



Project team: Oliver Püvi, Li Merila, Karolin Rips, Susanna Metsla, Annika Talvet

In [1]:
#!pip install hyperopt

In [55]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import cross_val_score
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings('ignore')

## Reading in the data, preprocessing

In [3]:
df = pd.read_csv('archive/Employee Attrition.csv', index_col = 0)
df.head()

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary
Emp ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1.0,0.38,0.53,2.0,157.0,3.0,0.0,0.0,sales,low
2.0,0.8,0.86,5.0,262.0,6.0,0.0,0.0,sales,medium
3.0,0.11,0.88,7.0,272.0,4.0,0.0,0.0,sales,medium
4.0,0.72,0.87,5.0,223.0,5.0,0.0,0.0,sales,low
5.0,0.37,0.52,2.0,159.0,3.0,0.0,0.0,sales,low


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 15787 entries, 1.0 to 14999.0
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  float64
 3   average_montly_hours   14999 non-null  float64
 4   time_spend_company     14999 non-null  float64
 5   Work_accident          14999 non-null  float64
 6   promotion_last_5years  14999 non-null  float64
 7   dept                   14999 non-null  object 
 8   salary                 14999 non-null  object 
dtypes: float64(7), object(2)
memory usage: 1.2+ MB


There are 788 missing values in each column.

In [6]:
df2 = df.dropna(subset = ["satisfaction_level"])
df2.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 14999 entries, 1.0 to 14999.0
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  float64
 3   average_montly_hours   14999 non-null  float64
 4   time_spend_company     14999 non-null  float64
 5   Work_accident          14999 non-null  float64
 6   promotion_last_5years  14999 non-null  float64
 7   dept                   14999 non-null  object 
 8   salary                 14999 non-null  object 
dtypes: float64(7), object(2)
memory usage: 1.1+ MB


Since all the values for those rows are missing, there is no need to impute them; we can simply drop them which is implemented in the code below. We also replace nominal variables (dept and salary) with dummy variables.

In [7]:
df = df.dropna()

encoded_df = pd.get_dummies(df, columns = ['dept', 'salary'])

encoded_df = encoded_df.astype('float64')

encoded_df.dtypes

satisfaction_level       float64
last_evaluation          float64
number_project           float64
average_montly_hours     float64
time_spend_company       float64
Work_accident            float64
promotion_last_5years    float64
dept_IT                  float64
dept_RandD               float64
dept_accounting          float64
dept_hr                  float64
dept_management          float64
dept_marketing           float64
dept_product_mng         float64
dept_sales               float64
dept_support             float64
dept_technical           float64
salary_high              float64
salary_low               float64
salary_medium            float64
dtype: object

In [None]:
scaler = MinMaxScaler()

scaled_data = scaler.fit_transform(encoded_df)

encoded_df = pd.DataFrame(scaled_data, columns=encoded_df.columns)

In [10]:
encoded_df.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept_IT,dept_RandD,dept_accounting,dept_hr,dept_management,dept_marketing,dept_product_mng,dept_sales,dept_support,dept_technical,salary_high,salary_low,salary_medium
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.021268,0.081805,0.05247,0.051137,0.04927,0.042003,0.057204,0.060137,0.276018,0.14861,0.181345,0.082472,0.487766,0.429762
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.144281,0.274077,0.222981,0.220284,0.216438,0.200602,0.232239,0.237749,0.447041,0.355715,0.385317,0.275092,0.499867,0.495059
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Baseline

To construct the baseline, we are trying a set of possible machine learning algorithms (13 algorithms) using their default hyperparamters and we choose the one with the highest performance for comparison (baseline model).

In [12]:
X = encoded_df.drop('satisfaction_level', axis = 1)
y = encoded_df['satisfaction_level']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso": Lasso(),
    "Elastic Net": ElasticNet(),
    "Bayesian Ridge Regression": BayesianRidge(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting Regressor": GradientBoostingRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor(),
    "ExtraTrees Regressor": ExtraTreesRegressor(),
    "KNeighbors Regressor": KNeighborsRegressor(),
    "Support Vector Regressor": SVR(),
    "Gaussian Process Regressor": GaussianProcessRegressor()
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    results[name] = mse

for name, mse in results.items():
    print(f"{name}: MSE = {mse}")

baseline_model_name = min(results, key=results.get)
baseline_mse = results[baseline_model_name]

print(f"\nBaseline Model: {baseline_model_name} with MSE = {baseline_mse}")

Linear Regression: MSE = 0.05738498761975986
Ridge Regression: MSE = 0.05738517252173684
Lasso: MSE = 0.06110425235056756
Elastic Net: MSE = 0.06110425235056756
Bayesian Ridge Regression: MSE = 0.057390525920065216
Decision Tree: MSE = 0.05360571156481482
Random Forest: MSE = 0.030755876979083795
Gradient Boosting Regressor: MSE = 0.035652170209841405
AdaBoost Regressor: MSE = 0.04034675644080943
ExtraTrees Regressor: MSE = 0.031636772444557405
KNeighbors Regressor: MSE = 0.04275508666666667
Support Vector Regressor: MSE = 0.05217102513494156
Gaussian Process Regressor: MSE = 2.8273858640024203

Baseline Model: Random Forest with MSE = 0.030755876979083795


## Studying the potential pipeline structure

Based on the problem at hand, we study the potential pipeline structure,
algorithms or feature transformers at each step, hyper-parameters ranges. We are using hyperOpt with the potential search space to beat the baseline.

In [85]:
X = encoded_df.drop('satisfaction_level', axis=1)  # Features
y = encoded_df['satisfaction_level']               # Target

# Defining a joint search space for hyperparameter
space = {
    'regressor': hp.choice('regressor', [
        {
            'model': RandomForestRegressor,
            'n_estimators': hp.choice('n_estimators', [5, 10, 25, 50, 100, 200, 250, 300]),
            'max_depth': hp.choice('max_depth', [3, 5, 7, 10, 15, 20, None]),
            'min_samples_split': hp.choice('min_samples_split', [0.1, 0.25, 0.4, 0.55, 0.7, 0.85]),
            'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 5, 7, 10, 15, 20, 30]),
            'max_features': hp.choice('max_features', ['sqrt', 'log2', None]),
            'bootstrap': hp.choice('bootstrap', [True, False]),
            'max_leaf_nodes': hp.choice('max_leaf_nodes', [None, 3, 5, 10, 20, 30, 50, 100])
        },
        {
            'model': LinearRegression,
            'fit_intercept': hp.choice('fit_intercept', [True, False])
        }
    ]),

    'preprocessor': hp.choice('preprocessor', [
        {
            'standardscaler': StandardScaler,
        },
        {
            'minmaxscaler': MinMaxScaler,
        }
    ])
}

# Defining a function to optimize
def objective(params):
    # Extracting the regressor and preprocessor from the parameters
    regressor_params = params['regressor']
    preprocessor_params = params['preprocessor']

    # Creating a pipeline with the chosen regressor and preprocessor
    preprocessor = list(preprocessor_params.values())[0]()
    regressor_model = regressor_params['model']
    regressor_args = {k: v for k, v in regressor_params.items() if k != 'model'}
    regressor = regressor_model(**regressor_args)

    # Creating a pipeline with the chosen regressor and preprocessor
    pipeline = make_pipeline(preprocessor, regressor)

    # Evaluating the pipeline using cross-validation and mean squared error
    score = -cross_val_score(pipeline, X, y, cv=5, scoring = 'neg_mean_squared_error').mean()

    return {'loss': score, 'status': STATUS_OK}

# Using the fmin function from HyperOpt to find the optimal hyperparameters
trials = Trials()
best_params = fmin(
    fn = objective,
    space = space,
    algo = tpe.suggest,
    max_evals = 40,
    trials = trials
)

# Converting indices back to hyperparameter values
parameter_values = space_eval(space, best_params)
best_preprocessor = list(parameter_values['preprocessor'].values())[0]()
best_regressor = parameter_values['regressor']['model']()
remove_model = parameter_values['regressor'].pop('model')
best_param_values = parameter_values['regressor']

# Final evaluation
best_pipeline = make_pipeline(best_preprocessor, best_regressor)
final_scores = cross_val_score(best_pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
final_mse = -final_scores.mean()

# Results
print(f"Best preprocessor: {best_preprocessor}")
print(f"Best model: {best_regressor}")
print(f"Best hyperparameters: {best_param_values}")
print(f"Final Cross-Validated MSE: {final_mse}")

100%|██████████| 40/40 [01:48<00:00,  2.72s/trial, best loss: 0.0385248348047848]
Best preprocessor: StandardScaler()
Best model: RandomForestRegressor()
Best hyperparameters: {'bootstrap': False, 'max_depth': 10, 'max_features': None, 'max_leaf_nodes': 30, 'min_samples_leaf': 10, 'min_samples_split': 0.1, 'n_estimators': 10}
Final Cross-Validated MSE: 0.028232419007511535


## Monitoring the performance of the constructed pipeline

Monitoring the the performance of the constructed pipeline from the previous step across different time budgets (number of iterations) and reporting the least time budget that you are able to outperform the baseline.

In [98]:
for trial in trials:
  trial_nr = trial['tid']+1
  pipeline_mse_trial = trial['result']['loss']
  if (pipeline_mse_trial < baseline_mse):
    print(f"Trial {trial_nr}: Pipeline MSE is SMALLER than baseline MSE.")
    break
  else:
    print(f"Trial {trial_nr}: Pipeline MSE is bigger than baseline MSE.")

Trial 1: Pipeline MSE is bigger than baseline MSE.
Trial 2: Pipeline MSE is bigger than baseline MSE.
Trial 3: Pipeline MSE is bigger than baseline MSE.
Trial 4: Pipeline MSE is bigger than baseline MSE.
Trial 5: Pipeline MSE is bigger than baseline MSE.
Trial 6: Pipeline MSE is bigger than baseline MSE.
Trial 7: Pipeline MSE is bigger than baseline MSE.
Trial 8: Pipeline MSE is bigger than baseline MSE.
Trial 9: Pipeline MSE is bigger than baseline MSE.
Trial 10: Pipeline MSE is bigger than baseline MSE.
Trial 11: Pipeline MSE is bigger than baseline MSE.
Trial 12: Pipeline MSE is bigger than baseline MSE.
Trial 13: Pipeline MSE is bigger than baseline MSE.
Trial 14: Pipeline MSE is bigger than baseline MSE.
Trial 15: Pipeline MSE is bigger than baseline MSE.
Trial 16: Pipeline MSE is bigger than baseline MSE.
Trial 17: Pipeline MSE is bigger than baseline MSE.
Trial 18: Pipeline MSE is bigger than baseline MSE.
Trial 19: Pipeline MSE is bigger than baseline MSE.
Trial 20: Pipeline MS

## Statistical test

Determining whether the difference in performance between the constructed pipeline and the baseline is statistically significant.

In [None]:
# Two-Matched-Samples t-test või McNemar's Test või Wilcoxon's Signed-Rank Test for matched pairs
# baseline ja eelmise sammu parima mudeli võrdlemiseks
# (kaks algoritmi ühe andmesstikuga)