In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import SCORERS, mean_squared_error, mean_absolute_error
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer

from xgboost import XGBRegressor

import numpy as np
from math import sqrt
pd.set_option("display.max_columns", 999)

### Load + preprocessing

In [2]:
# define Root MSE function
def rmse(x, y):
    return sqrt(mean_squared_error(x, y))

In [3]:
# load the dataset into a Pandas dataframe
df = pd.read_csv('data/attrition.csv')

# drop columns with 
df.drop(['EmployeeNumber', 'Attrition', 'Over18', 'StandardHours', 'EmployeeCount'], axis=1, inplace=True)

In [4]:
# lets make it some fun
df.drop('JobLevel', axis=1, inplace=True)

In [5]:
# select response variable and features
target_col_name = 'MonthlyIncome'
num_feature_cols = [
        'Age', 'DailyRate','DistanceFromHome', 'Education',
        'HourlyRate', 'EnvironmentSatisfaction', 'JobInvolvement',
        'JobSatisfaction', 'NumCompaniesWorked', 'PercentSalaryHike',
        'RelationshipSatisfaction', 'StockOptionLevel', 'PerformanceRating',
        'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
        'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
        'YearsWithCurrManager', 'MonthlyRate']
cat_feature_cols = [x for x in df.columns if x not in num_feature_cols and x not in [target_col_name]]

In [6]:
# cast numerical columns as float
for col in num_feature_cols:
    df[col] = df[col].astype(float)

In [7]:
# create target array and numeric features dataframe
df_target = np.ravel(df[[target_col_name]])
df_features = df[num_feature_cols]

In [8]:
# create target array and features dataframe
df_target = np.ravel(df[[target_col_name]])
df_features_all = df[num_feature_cols + cat_feature_cols]

# split the dataframe to train and test parts
X_train, X_test, y_train, y_test = train_test_split(df_features_all, df_target, test_size=0.3, random_state=666)

### Best performing regressors (based on $R^2$ and MSE) from class

In [9]:
num_transformer = Pipeline(steps=[
                  ('imputer', SimpleImputer(strategy='median')),
                  ('scaler', RobustScaler())])

cat_transformer = Pipeline(steps=[
                  ('imputer', SimpleImputer(strategy='most_frequent')),
                  ('onehot', OneHotEncoder(categories='auto', 
                                     sparse=False, 
                                     handle_unknown='ignore'))])

pipeline_preprocess = ColumnTransformer(transformers=[
        ('numerical_preprocessing', num_transformer, num_feature_cols),
        ('categorical_preprocessing', cat_transformer, cat_feature_cols)],
        remainder='passthrough')

#### Tuned ElasticNet

In [10]:
pipe0 = Pipeline([("transform_inputs", pipeline_preprocess), ("reg", ElasticNet())])

param_grid = {'reg__l1_ratio': [.1, .5, .7, .9, .95, 1], 'reg__alpha':[0.2, 0.5, 1, 4, 10, 20, 40, 100]}
m4 = GridSearchCV(estimator=pipe0, param_grid=param_grid, cv=5, scoring='r2', iid=False).fit(X_train, y_train)

m4_mse = rmse(m4.predict(X_test), y_test)
m4_mae = mean_absolute_error(m4.predict(X_test), y_test)
m4_r2 = m4.best_estimator_.score(X_test, y_test)

print(f"Model mse: {m4_mse} \n")
print(f"Model mae: {m4_mae} \n")
print(f"Model R2: {m4_r2} \n")
print(f"Model coefs: {[int(x) for x in m4.best_estimator_.named_steps['reg'].coef_]} \n")
print(f"Model hyper_params: {m4.best_params_}")

Model mse: 1683.6212040893834 

Model mae: 1333.1106306639845 

Model R2: 0.8686077406910069 

Model coefs: [-113, 31, 0, -82, -16, 16, -65, 81, 28, 58, 103, 0, -269, 1802, -14, 0, 181, -66, 168, -292, -136, 0, 0, 0, 0, 381, 0, 0, 0, 0, -93, 0, 0, -65, 0, 0, -1297, -2852, 7648, 110, 6787, -2923, 408, -2451, -76, 0, 0, -6, 0] 

Model hyper_params: {'reg__alpha': 10, 'reg__l1_ratio': 1}


#### Ensemble models - Random Forests

In [11]:
pipe2 = Pipeline([("transform_inputs", pipeline_preprocess), ("reg", RandomForestRegressor())])

param_grid = {'reg__max_depth': [5, 10, 15], 'reg__n_estimators':[40, 100, 150], 'reg__max_features':['auto', 'sqrt']}
m6 = GridSearchCV(estimator=pipe2, param_grid=param_grid, cv=5, scoring='r2', iid=False).fit(X_train, y_train)

m6_mse = rmse(m6.predict(X_test), y_test)
m6_mae = mean_absolute_error(m6.predict(X_test), y_test)
m6_r2 = m6.best_estimator_.score(X_test, y_test)

print(f"Model mse: {m6_mse} \n")
print(f"Model mae: {m6_mae} \n")
print(f"Model R2: {m6_r2} \n")
print(f"Model hyper_params: {m6.best_params_}")

Model mse: 1611.7270100233652 

Model mae: 1232.3738221746291 

Model R2: 0.8795896066643341 

Model hyper_params: {'reg__max_depth': 10, 'reg__max_features': 'auto', 'reg__n_estimators': 150}


#### Ensemble models - Gradient Boosting

In [12]:
pipe3 = Pipeline([("transform_inputs", pipeline_preprocess), ("reg", GradientBoostingRegressor())])

param_grid = {'reg__max_features': ['auto', 'sqrt'], 'reg__subsample': [0.1, 0.05, 0.4], 'reg__min_samples_leaf': [0.0025, 0.005, 0.01, 0.05, 0.1], 'reg__n_estimators':[30, 40, 50, 70, 200]}
m7 = GridSearchCV(estimator=pipe3, param_grid=param_grid, cv=5, scoring='r2', iid=False).fit(X_train, y_train)

m7_mse = rmse(m7.predict(X_test), y_test)
m7_mae = mean_absolute_error(m7.predict(X_test), y_test)
m7_r2 = m7.best_estimator_.score(X_test, y_test)

print(f"Model mse: {m7_mse} \n")
print(f"Model mae: {m7_mae} \n")
print(f"Model R2: {m7_r2} \n")
print(f"Model hyper_params: {m7.best_params_}")

Model mse: 1679.1742128032363 

Model mae: 1302.094636683161 

Model R2: 0.8693009234179803 

Model hyper_params: {'reg__max_features': 'auto', 'reg__min_samples_leaf': 0.005, 'reg__n_estimators': 50, 'reg__subsample': 0.4}


# YOUR QUEST: Get better R2 and MSE then this ^^

### XGBoost

In [13]:
#original param_grid
param_grid = {'reg__max_depth': [3, 5, 10], 
              'reg__n_estimators': [50, 55, 60],
              'reg__reg_alpha': [0.1, 0.2, 0.3],
              'reg__reg_lambda': [0.1, 0.2, 0.3],
             }
#tried even more extreme values

Chosen hyperparameters: `max_depth = 3`, `n_estimators = 50`, `reg_alpha = 0.1`, `reg_lambda = 0.2`.

In [14]:
pipe01 = Pipeline([("transform_inputs", pipeline_preprocess), ("reg", XGBRegressor(objective ='reg:squarederror'))])

param_grid = {'reg__max_depth': [3], 
              'reg__n_estimators': [50],
              'reg__reg_alpha': [0.1],
              'reg__reg_lambda': [0.2],
             }
m01 = GridSearchCV(estimator=pipe01, param_grid=param_grid, cv=5, scoring='r2', iid=False).fit(X_train, y_train)

m01_mse = rmse(m01.predict(X_test), y_test)
m01_mae = mean_absolute_error(m01.predict(X_test), y_test)
m01_r2 = m01.best_estimator_.score(X_test, y_test)

In [15]:
print(f"Model mse: {m01_mse} \n")
print(f"Model mae: {m01_mae} \n")
print(f"Model R2: {m01_r2} \n")
print(f"Model hyper_params: {m01.best_params_}")

Model mse: 1551.8630327365893 

Model mae: 1194.3279102005386 

Model R2: 0.8883682372600944 

Model hyper_params: {'reg__max_depth': 3, 'reg__n_estimators': 50, 'reg__reg_alpha': 0.1, 'reg__reg_lambda': 0.2}


### Catboost does not perform that well