In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

#  [Indian Liver Patient Dataset](https://archive.ics.uci.edu/ml/datasets/ILPD+(Indian+Liver+Patient+Dataset))

In [2]:
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split

In [74]:
liver = pd.read_csv('./liver.csv')
liver.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [75]:
liver.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    object 
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [76]:
np.sum(liver.isna()) / len(liver)
# <5% ok to drop

Age                           0.000000
Gender                        0.000000
Total_Bilirubin               0.000000
Direct_Bilirubin              0.000000
Alkaline_Phosphotase          0.000000
Alamine_Aminotransferase      0.000000
Aspartate_Aminotransferase    0.000000
Total_Protiens                0.000000
Albumin                       0.000000
Albumin_and_Globulin_Ratio    0.006861
Dataset                       0.000000
dtype: float64

In [77]:
liver.dropna(inplace=True)
liver.reset_index(drop=True, inplace=True)

In [78]:
liver_ = liver.copy()
liver_.columns.values

array(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio', 'Dataset'], dtype=object)

In [79]:
gender = liver_['Gender'].map({'Female': 0, 'Male': 1})
liver_.drop(columns=['Gender', 'Dataset'], inplace=True)
liver_ = scale(liver_)

In [80]:
cols = ['Age_std', 'Total_Bilirubin_std', 'Direct_Bilirubin_std',
        'Alkaline_Phosphotase_std', 'Alamine_Aminotransferase_std',
        'Aspartate_Aminotransferase_std', 'Total_Proteins_std', 'Albumin_std',
        'Albumin_and_Globulin_Ratio_std']
liver_prepped = pd.DataFrame(liver_, columns=cols)
liver_prepped['is_male'] = gender.astype('int')
liver_prepped.head()

Unnamed: 0,Age_std,Total_Bilirubin_std,Direct_Bilirubin_std,Alkaline_Phosphotase_std,Alamine_Aminotransferase_std,Aspartate_Aminotransferase_std,Total_Proteins_std,Albumin_std,Albumin_and_Globulin_Ratio_std,is_male
0,1.247403,-0.42032,-0.495414,-0.42887,-0.355832,-0.319111,0.293722,0.203446,-0.14739,0
1,1.062306,1.218936,1.423518,1.675083,-0.093573,-0.035962,0.939655,0.077462,-0.648461,1
2,1.062306,0.640375,0.926017,0.816243,-0.115428,-0.146459,0.478274,0.203446,-0.178707,1
3,0.815511,-0.372106,-0.388807,-0.449416,-0.36676,-0.312205,0.293722,0.329431,0.16578,1
4,1.679294,0.093956,0.179766,-0.395996,-0.295731,-0.177537,0.755102,-0.930414,-1.713237,1


In [81]:
X = liver_prepped.copy()
y = liver['Dataset'] - 1#this col corresponds to whether a patient as liver disease or not

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    random_state=3)

In [83]:
X_train.shape

(463, 10)

In [84]:
X_test.shape

(116, 10)

## AdaBoost (Adaptive Boost) Classifier

In [42]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.metrics import roc_auc_score

In [43]:
# initiate tree classifier
dt = DecisionTreeClassifier(max_depth=2, random_state=1)

# initiate ada
ada = AdaBoostClassifier(base_estimator=dt,
                         n_estimators=180,
                         random_state=1)

In [44]:
# fit and compute probabilities of obtaining positive cases
ada.fit(X_train, y_train)

y_pred_proba = ada.predict_proba(X_test)[:, 1]

In [45]:
# eval with roc_auc_score
ada_roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'ROC AUC score: {ada_roc_auc: .2f}')

ROC AUC score:  0.72


## Hyperparameter Tuning

In [71]:
from sklearn.model_selection import GridSearchCV

In [85]:
dt = DecisionTreeClassifier(random_state=1)

In [86]:
dt.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': 1,
 'splitter': 'best'}

In [87]:
params_dt = {'max_depth': [2, 3, 4],
             'min_samples_leaf': [0.12, 0.14, 0.16, 0.18]}

In [88]:
# initiate search object
grid_dt = GridSearchCV(estimator=dt,
                       param_grid=params_dt,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)

In [89]:
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1), n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4],
                         'min_samples_leaf': [0.12, 0.14, 0.16, 0.18]},
             scoring='roc_auc')

In [90]:
# extract best estimator and evaluate
best_model = grid_dt.best_estimator_

y_pred_proba = best_model.predict_proba(X_test)[:, 1]

test_roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'Test set ROC AUC score: {test_roc_auc: .3f}')

Test set ROC AUC score:  0.694


In [92]:
# let's try it on an untuned dt, to compare
dt = DecisionTreeClassifier(random_state=1)
dt.fit(X_train, y_train)

y_pred_proba = dt.predict_proba(X_test)[:, 1]

test_roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'Test set ROC AUC score: {test_roc_auc: .3f}')

Test set ROC AUC score:  0.573


# [Bike Sharing Demand](https://www.kaggle.com/c/bike-sharing-demand) Dataset

In [97]:
bikes = pd.read_csv('./bikes.csv')
bikes.head()

Unnamed: 0,hr,holiday,workingday,temp,hum,windspeed,cnt,instant,mnth,yr,Clear to partly cloudy,Light Precipitation,Misty
0,0,0,0,0.76,0.66,0.0,149,13004,7,1,1,0,0
1,1,0,0,0.74,0.7,0.1343,93,13005,7,1,1,0,0
2,2,0,0,0.72,0.74,0.0896,90,13006,7,1,1,0,0
3,3,0,0,0.72,0.84,0.1343,33,13007,7,1,1,0,0
4,4,0,0,0.7,0.79,0.194,4,13008,7,1,1,0,0


In [98]:
bikes.shape

(1488, 13)

In [99]:
bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1488 entries, 0 to 1487
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   hr                      1488 non-null   int64  
 1   holiday                 1488 non-null   int64  
 2   workingday              1488 non-null   int64  
 3   temp                    1488 non-null   float64
 4   hum                     1488 non-null   float64
 5   windspeed               1488 non-null   float64
 6   cnt                     1488 non-null   int64  
 7   instant                 1488 non-null   int64  
 8   mnth                    1488 non-null   int64  
 9   yr                      1488 non-null   int64  
 10  Clear to partly cloudy  1488 non-null   int64  
 11  Light Precipitation     1488 non-null   int64  
 12  Misty                   1488 non-null   int64  
dtypes: float64(3), int64(10)
memory usage: 151.2 KB


In [100]:
X = bikes.drop('cnt', axis=1)
y = bikes['cnt']

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=1)

In [102]:
X_train.shape

(1190, 12)

In [103]:
X_test.shape

(298, 12)

## Gradient Boosting Regressor

In [56]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as mse

In [61]:
# initiate gb
gb = GradientBoostingRegressor(max_depth=4,
                               n_estimators=200,
                               random_state=3)

In [62]:
# fit and predict
gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)

In [63]:
# eval
rmse = np.sqrt(mse(y_test, y_pred))
print(f'RMSE of gb: {rmse: .3f}')

RMSE of gb:  42.640


## Stochastic Gradient Boosting

In [64]:
# inititate sgb
sgb = GradientBoostingRegressor(max_depth=4,
                                n_estimators=200,
                                subsample=0.9,
                                max_features=0.75, 
                                random_state=3)

In [65]:
# fit and predict
sgb.fit(X_train, y_train)

y_pred = sgb.predict(X_test)

In [66]:
# eval
rmse = np.sqrt(mse(y_test, y_pred))
print(f'RMSE of sgb: {rmse: .3f}')

RMSE of sgb:  41.648


## Random Forest Hyperparameter Tuning

In [93]:
from sklearn.ensemble import RandomForestRegressor

In [95]:
rf = RandomForestRegressor(random_state=2)

In [94]:
params_rf = {'n_estimators': [100, 350, 500], 
             'max_features': ['log2', 'auto', 'sqrt'], 
             'min_samples_leaf': [2, 10, 30]}

In [96]:
# initiate search object
grid_rf = GridSearchCV(estimator=rf,
                       param_grid=params_rf,
                       scoring='neg_mean_squared_error',
                       cv=3,
                       verbose=1,
                       n_jobs=-1)

In [104]:
# fit to training
grid_rf.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   33.1s finished


GridSearchCV(cv=3, estimator=RandomForestRegressor(random_state=2), n_jobs=-1,
             param_grid={'max_features': ['log2', 'auto', 'sqrt'],
                         'min_samples_leaf': [2, 10, 30],
                         'n_estimators': [100, 350, 500]},
             scoring='neg_mean_squared_error', verbose=1)

In [105]:
# evaluate
best_model = grid_rf.best_estimator_

y_pred = best_model.predict(X_test)

rmse = np.sqrt(mse(y_test, y_pred))
print(f'Test RMSE of best model: {rmse: .3f}')

Test RMSE of best model:  51.755
