##### preprocessing

In [2]:
import numpy as np
import pandas as pd

trees = pd.read_csv('../data/raw/new_york_tree_census_2015.csv')
# print(trees.columns)

target = trees['health']
keep_rows = target.notna()

drop_cols = ['health','tree_id', 'block_id', 'created_at', 'stump_diam','status','spc_common','problems','address',\
             'zip_city','cb_num', 'borocode', 'cncldist', 'st_assem', 'st_senate', 'nta', 'boro_ct', 'state',\
             'latitude', 'longitude']

group = ['nta_name']
spatial_geo = ['x_sp','y_sp'] # encode numerical, standardize
spatial_fine = ['zipcode'] # encode categorical
spatial_coarse = ['boroname'] # encode categorical

## base ftrs, without spatial ##
numerical_ftrs = ['tree_dbh']
categorical_ftrs = ['curb_loc', 'spc_latin', 'user_type', 'root_stone',
                    'root_grate', 'root_other', 'trunk_wire', 'trnk_light', 'trnk_other',
                    'brch_light', 'brch_shoe', 'brch_other']
ordinal_ftrs = ['steward','guards','sidewalk']
ordinal_cats = [['None','1or2','3or4','4orMore'],
                ['None','Harmful','Unsure','Helpful'],
                ['Damage','None','NoDamage']]

In [3]:
# check we have accounted for all columns
listed = set(drop_cols).union(group,spatial_geo,spatial_fine,spatial_coarse,\
                              numerical_ftrs,categorical_ftrs,ordinal_ftrs)
print(set(trees.columns).difference(listed) == set())

True


In [4]:
# handle ordinal ftrs to fill in na
for ftr in ordinal_ftrs:
    trees[ftr] = trees[ftr].fillna('None')

In [None]:
from sklearn.model_selection import GroupKFold, GroupShuffleSplit, KFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.metrics import make_scorer, fbeta_score, accuracy_score
from sklearn.linear_model import LogisticRegression
## CROSS VALIDATION PIPELINE ##
RANDOM_STATE = 1
# define feature sets
drop_cols_set = drop_cols + group # + spatial_fine + spatial_coarse
categorical_ftrs_set = categorical_ftrs + spatial_fine + spatial_coarse
numerical_ftrs_set = numerical_ftrs + spatial_geo

# define a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', OrdinalEncoder(categories=ordinal_cats), ordinal_ftrs),
        ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'), categorical_ftrs_set),
        ('std', StandardScaler(), numerical_ftrs_set)])
# define splitters
# gkf = GroupKFold(n_splits=4,shuffle=True,random_state=RANDOM_STATE) # not using gkf due to time constraint
gss = GroupShuffleSplit(n_splits=1,random_state=RANDOM_STATE)
# define algo
algo = LogisticRegression(penalty='elasticnet',solver='saga',max_iter=1000)
# make pipeline
pipe = make_pipeline(preprocessor,algo)
# define params
param_grid = {
            'logisticregression__C': [0.01, 0.1, 1, 10], 
            'logisticregression__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
            'logisticregression__random_state': [RANDOM_STATE]
            }
# make grid
grid = GridSearchCV(
    estimator=pipe, 
    param_grid=param_grid, 
    scoring='f1_macro',
    cv=gss,
    verbose=2,
    n_jobs=-1,
    refit=False
)

In [None]:
# DATA SAMPLING #
SAMPLE_PROP = 0.05
y = target[keep_rows]
X = trees[keep_rows].drop(drop_cols_set, axis=1)
groups = trees[keep_rows][group]
## hold out a test set by groups
gss_test = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
train_idx, test_idx = next(gss_test.split(X, y, groups))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
groups_train, groups_test = groups.iloc[train_idx], groups.iloc[test_idx]
## sample 10% for cross validation
X_sub, _, y_sub, _, groups_sub, _ = train_test_split(
    X_train, y_train, groups_train,
    train_size=SAMPLE_PROP,
    stratify=y_train,
    random_state=RANDOM_STATE
)

```python
# testing different hyperparameters
random_states = [1,33,42,44,99]
results = []
for rs in random_states:
    gss = GroupShuffleSplit(n_splits=1,random_state=rs)
    algo = LogisticRegression(penalty='elasticnet',solver='saga',max_iter=1000)
    pipe = make_pipeline(preprocessor,algo)
    param_grid = {
            'logisticregression__C': [0.01, 0.1, 1, 10], 
            'logisticregression__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
            'logisticregression__random_state': [rs]
            }
    grid = GridSearchCV(
        estimator=pipe, 
        param_grid=param_grid, 
        scoring='f1_macro',
        cv=gss,
        verbose=2,
        n_jobs=-1,
        refit=False
    )
    grid.fit(X_sub, y_sub, groups=groups_sub)
    results.append(grid.best_params_)
```

```python
from json import dumps
print(dumps(results,indent=4))
```

In [9]:
# final hyperparameters
# C = 10
# l1_ratio = 0.1

#### cross validation on logistic regression with elastic net

In [10]:
# testing balanced vs None class_weights
random_states = [1,99]
results_scores = []
results_params = []
results_model = []
for rs in random_states:
    gss = GroupShuffleSplit(n_splits=1,random_state=rs)
    algo = LogisticRegression(penalty='elasticnet',solver='saga',max_iter=1000)
    pipe = make_pipeline(preprocessor,algo)
    param_grid = {
            'logisticregression__C': [0.01, 0.1, 1, 10], 
            'logisticregression__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
            'logisticregression__class_weight': ['balanced', None],
            'logisticregression__random_state': [rs]
            }
    grid = GridSearchCV(
        estimator=pipe, 
        param_grid=param_grid, 
        scoring='f1_macro',
        cv=gss,
        verbose=2,
        n_jobs=-1,
        refit=True,
        return_train_score=True
    )
    grid.fit(X_sub, y_sub, groups=groups_sub)
    results_params.append(grid.best_params_)
    results_scores.append(grid.cv_results_)
    results_model.append(grid.best_estimator_)

Fitting 1 folds for each of 40 candidates, totalling 40 fits
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=1; total time=  38.2s
[CV] END logisticregression__C=0.01, logisticregression__class_weight=None, logisticregression__l1_ratio=0.3, logisticregression__random_state=1; total time=  39.4s
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time=  42.7s
[CV] END logisticregression__C=0.01, logisticregression__class_weight=None, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time=  44.0s
[CV] END logisticregression__C=0.01, logisticregression__class_weight=None, logisticregression__l1_ratio=0.1, logisticregression__random_state=1; total time=  45.9s
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_rat



[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=1; total time= 9.5min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=1; total time= 9.7min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time= 9.9min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.1, logisticregression__random_state=1; total time=10.0min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time= 9.6min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.3, logisticregression__random_state=1; total time= 9.8min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.7, logisticregression__random_state=1; total time= 9.3min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.9, logisticregression__random_state=1; total time= 9.0min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=1; total time=10.0min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=1; total time= 8.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time= 8.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=1; total time= 8.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=1; total time= 8.4min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.1, logisticregression__random_state=1; total time= 8.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.3, logisticregression__random_state=1; total time= 8.6min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time= 8.4min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.9, logisticregression__random_state=1; total time= 7.9min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.7, logisticregression__random_state=1; total time= 8.0min
Fitting 1 folds for each of 40 candidates, totalling 40 fits
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 1.6min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=None, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 1.6min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 1.7min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=None, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 1.8min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_r



[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=99; total time= 6.8min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 7.5min
[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=99; total time= 7.6min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=99; total time= 7.0min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 7.2min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.1, logisticregression__random_state=99; total time= 7.6min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 7.4min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 7.1min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.7, logisticregression__random_state=99; total time= 6.8min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.9, logisticregression__random_state=99; total time= 6.7min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=99; total time= 7.6min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 7.9min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 7.9min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=99; total time= 7.9min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=99; total time= 7.9min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.1, logisticregression__random_state=99; total time= 7.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 7.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 7.4min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.7, logisticregression__random_state=99; total time= 7.2min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.9, logisticregression__random_state=99; total time= 6.8min


In [17]:
from json import dumps
print(dumps(results_params,indent=4))
# print(results_scores)
# print(results_model)

# our best hyperparameters are:
# C = 0.1, L1 = 0.5 or 0.7
# class_weight = balanced

[
    {
        "logisticregression__C": 0.1,
        "logisticregression__class_weight": "balanced",
        "logisticregression__l1_ratio": 0.7,
        "logisticregression__random_state": 1
    },
    {
        "logisticregression__C": 0.1,
        "logisticregression__class_weight": "balanced",
        "logisticregression__l1_ratio": 0.5,
        "logisticregression__random_state": 99
    }
]


##### cross validation on logistic regression: `stratified vs group`
We train only on `'balanced'` class weights, but use all hyperparameter combos.

In [None]:
# testing using random splitting instead of Group
from sklearn.model_selection import StratifiedShuffleSplit
random_states = [1,99]
results_scores = []
results_params = []
results_model = []
results_grid = []
for rs in random_states:
    gss = StratifiedShuffleSplit(n_splits=1,random_state=rs)
    algo = LogisticRegression(penalty='elasticnet',solver='saga',max_iter=1000)
    pipe = make_pipeline(preprocessor,algo)
    param_grid = {
            'logisticregression__C': [0.01, 0.1, 1, 10], 
            'logisticregression__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
            'logisticregression__class_weight': ['balanced'],
            'logisticregression__random_state': [rs]
            }
    grid = GridSearchCV(
        estimator=pipe, 
        param_grid=param_grid, 
        scoring='f1_macro',
        cv=gss,
        verbose=2,
        n_jobs=-1,
        refit=True,
        return_train_score=True
    )
    grid.fit(X_sub, y_sub)
    results_params.append(grid.best_params_)
    results_scores.append(grid.cv_results_)
    results_model.append(grid.best_estimator_)
    results_grid.append(grid)

Fitting 1 folds for each of 20 candidates, totalling 20 fits




[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=1; total time= 1.1min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time= 1.2min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=1; total time= 1.3min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=1; total time= 1.4min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=1; total time= 2.4min
[CV] END logisticregression__C=0.1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total 



[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=1; total time= 8.5min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time= 8.8min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=1; total time= 9.1min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=1; total time= 9.1min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=1; total time= 8.2min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=1; total time= 8.4min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=1; total time= 8.6min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time= 8.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=1; total time= 8.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=1; total time= 8.0min
Fitting 1 folds for each of 20 candidates, totalling 20 fits




[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 1.7min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 1.7min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=99; total time= 1.8min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=99; total time= 2.0min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=99; total time= 3.0min
[CV] END logisticregression__C=0.1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; 



[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 8.3min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=99; total time= 8.1min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=99; total time= 8.7min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 8.7min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=99; total time= 7.7min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=99; total time= 7.7min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 7.7min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 7.6min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=99; total time= 7.6min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=99; total time= 7.1min


In [None]:
print(dumps(results_params,indent=4))
# print(results_scores)
cv_df = pd.DataFrame(results_scores[0])
mask = (
    (cv_df['param_logisticregression__C'] == 0.1) &
    (cv_df['param_logisticregression__l1_ratio'] == 0.3) &
    (cv_df['param_logisticregression__class_weight'] == 'balanced')
)
mean_test_score = cv_df.loc[mask, 'mean_test_score'].iloc[0]
print(f'C 0.1, l1 0.3: {mean_test_score}')

cv_df = pd.DataFrame(results_scores[1])
mask = (
    (cv_df['param_logisticregression__C'] == 0.1) &
    (cv_df['param_logisticregression__l1_ratio'] == 0.9) &
    (cv_df['param_logisticregression__class_weight'] == 'balanced')
)
mean_test_score = cv_df.loc[mask, 'mean_test_score'].iloc[0]
print(f'C 0.1, l1 0.9: {mean_test_score}')

# print(results_model)

# our best hyperparameters are:
# C = 0.1, L1 = 0.3
# class_weight = balanced
# shuffling = random
# score = 0.37142152187216704

[
    {
        "logisticregression__C": 0.1,
        "logisticregression__class_weight": "balanced",
        "logisticregression__l1_ratio": 0.3,
        "logisticregression__random_state": 1
    },
    {
        "logisticregression__C": 0.1,
        "logisticregression__class_weight": "balanced",
        "logisticregression__l1_ratio": 0.9,
        "logisticregression__random_state": 99
    }
]
C 0.1, l1 0.3: 0.37142152187216704
C 0.1, l1 0.9: 0.3682450200379508


In [23]:
# GROUP
random_states = [1,99]
results_scores_g = []
results_best_scores_g = []
results_params_g = []
results_model_g = []
for rs in random_states:
    gss = GroupShuffleSplit(n_splits=1,random_state=rs)
    algo = LogisticRegression(penalty='elasticnet',solver='saga',max_iter=1000)
    pipe = make_pipeline(preprocessor,algo)
    param_grid = {
            'logisticregression__C': [0.01, 0.1, 1, 10], 
            'logisticregression__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
            'logisticregression__class_weight': ['balanced'],
            'logisticregression__random_state': [rs]
            }
    grid = GridSearchCV(
        estimator=pipe, 
        param_grid=param_grid, 
        scoring='f1_macro',
        cv=gss,
        verbose=2,
        n_jobs=-1,
        refit=True,
        return_train_score=True
    )
    grid.fit(X_sub, y_sub, groups=groups_sub)
    results_params_g.append(grid.best_params_)
    results_scores_g.append(grid.cv_results_)
    best_idx = grid.best_index_
    best_val_score = grid.cv_results_['mean_test_score'][best_idx]
    results_best_scores_g.append(best_val_score)
    results_model_g.append(grid.best_estimator_)

Fitting 1 folds for each of 20 candidates, totalling 20 fits
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=1; total time= 1.1min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time= 1.1min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=1; total time= 1.3min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=1; total time= 1.3min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=1; total time= 2.3min
[CV] END logisticregression__C=0.1, logisticregression__class_weight=balanced, logisticregress



[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=1; total time= 7.8min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time= 8.0min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=1; total time= 7.4min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=1; total time= 7.7min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=1; total time= 7.8min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time= 7.6min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=1; total time= 7.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=1; total time= 7.4min
Fitting 1 folds for each of 20 candidates, totalling 20 fits
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 1.2min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 1.3min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=99; total time= 1.4min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=99; total time= 1.6min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregr



[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=99; total time= 6.7min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 7.0min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 7.2min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=99; total time= 7.3min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=99; total time= 6.4min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=99; total time= 6.6min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 6.8min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 6.6min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=99; total time= 6.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=99; total time= 6.4min


In [None]:
print(dumps(results_params_g,indent=4))
print(results_best_scores_g)
# print(results_scores)
# cv_df = pd.DataFrame(results_scores[0])
# mask = (
#     (cv_df['param_logisticregression__C'] == 0.1) &
#     (cv_df['param_logisticregression__l1_ratio'] == 0.3) &
#     (cv_df['param_logisticregression__class_weight'] == 'balanced')
# )
# mean_test_score = cv_df.loc[mask, 'mean_test_score'].iloc[0]
# print(f'C 0.1, l1 0.3: {mean_test_score}')

# cv_df = pd.DataFrame(results_scores[1])
# mask = (
#     (cv_df['param_logisticregression__C'] == 0.1) &
#     (cv_df['param_logisticregression__l1_ratio'] == 0.9) &
#     (cv_df['param_logisticregression__class_weight'] == 'balanced')
# )
# mean_test_score = cv_df.loc[mask, 'mean_test_score'].iloc[0]
# print(f'C 0.1, l1 0.9: {mean_test_score}')

# print(results_model)

# our best hyperparameters are:
# C = 0.1, l1_ratio = 0.7
# class_weight = balanced
# shuffling = group
# score = 0.382426793855901

[
    {
        "logisticregression__C": 0.1,
        "logisticregression__class_weight": "balanced",
        "logisticregression__l1_ratio": 0.7,
        "logisticregression__random_state": 1
    },
    {
        "logisticregression__C": 0.1,
        "logisticregression__class_weight": "balanced",
        "logisticregression__l1_ratio": 0.5,
        "logisticregression__random_state": 99
    }
]
[np.float64(0.382426793855901), np.float64(0.35639615256770846)]


##### cross validation on XGBClassifier

In [37]:
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.preprocessing import LabelEncoder
RANDOM_STATE = 1
# sample 5% for training
SAMPLE_PROP = 0.05
# encode as numbers for xgboost 
y = target[keep_rows]
le = LabelEncoder()
y = pd.DataFrame(le.fit_transform(y))

X = trees[keep_rows].drop(drop_cols_set, axis=1)
groups = trees[keep_rows][group]
## hold out a test set by groups
gss_test = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
train_idx, test_idx = next(gss.split(X, y, groups))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
groups_train, groups_test = groups.iloc[train_idx], groups.iloc[test_idx]
## sample 10% for cross validation
X_sub, _, y_sub, _, groups_sub, _ = train_test_split(
    X_train, y_train, groups_train,
    train_size=SAMPLE_PROP,
    stratify=y_train,
    random_state=RANDOM_STATE
)

# get weights
sample_weights = compute_sample_weight(
    class_weight='balanced',
    y=y_sub
)

In [40]:
# testing using XGBoost
random_states = [1,99]
results_scores_xg = []
results_best_scores_xg = []
results_params_xg = []
results_model_xg = []
for rs in random_states:
    gss = GroupShuffleSplit(n_splits=1,random_state=rs)
    algo = xgb.XGBClassifier()
    pipe = make_pipeline(preprocessor,algo)
    param_grid = {
            'xgbclassifier__n_estimators': [100, 200, 300],
            'xgbclassifier__learning_rate': [0.1],
            'xgbclassifier__max_depth': [3, 5, 7],
            'xgbclassifier__n_jobs': [-1],
            'xgbclassifier__random_state': [rs],
            }
    grid = GridSearchCV(
        estimator=pipe, 
        param_grid=param_grid, 
        scoring='f1_macro',
        cv=gss,
        verbose=2,
        refit=True,
        return_train_score=True
    )
    grid.fit(X_sub, 
             y_sub, 
             groups=groups_sub,
             xgbclassifier__sample_weight=sample_weights)
    results_params_xg.append(grid.best_params_)
    results_scores_xg.append(grid.cv_results_)
    best_idx = grid.best_index_
    best_val_score = grid.cv_results_['mean_test_score'][best_idx]
    results_best_scores_xg.append(best_val_score)
    results_model_xg.append(grid.best_estimator_)

Fitting 1 folds for each of 9 candidates, totalling 9 fits
[CV] END xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100, xgbclassifier__n_jobs=-1, xgbclassifier__random_state=1; total time=   0.6s
[CV] END xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200, xgbclassifier__n_jobs=-1, xgbclassifier__random_state=1; total time=   1.7s
[CV] END xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=300, xgbclassifier__n_jobs=-1, xgbclassifier__random_state=1; total time=   1.5s
[CV] END xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100, xgbclassifier__n_jobs=-1, xgbclassifier__random_state=1; total time=   0.8s
[CV] END xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200, xgbclassifier__n_jobs=-1, xgbclassifier__random_state=1; total time=   1.9s
[CV] END xgbclassifier__learning_rate=0.1

In [None]:
print(dumps(results_params_xg, indent=4))
print(results_best_scores_xg)

# best params are
# max_depth = 7
# n_estimators = 300
# using Group split

[
    {
        "xgbclassifier__learning_rate": 0.1,
        "xgbclassifier__max_depth": 7,
        "xgbclassifier__n_estimators": 300,
        "xgbclassifier__n_jobs": -1,
        "xgbclassifier__random_state": 1
    },
    {
        "xgbclassifier__learning_rate": 0.1,
        "xgbclassifier__max_depth": 7,
        "xgbclassifier__n_estimators": 100,
        "xgbclassifier__n_jobs": -1,
        "xgbclassifier__random_state": 99
    }
]
[np.float64(0.4084642110225578), np.float64(0.3817579157682778)]


In [45]:
# testing using XGBoost WITHOUT Group Splitting 
random_states = [1,99]
results_scores_xg_ss = []
results_best_scores_xg_ss = []
results_params_xg_ss = []
results_model_xg_ss = []
for rs in random_states:
    gss = StratifiedShuffleSplit(n_splits=1,random_state=rs)
    algo = xgb.XGBClassifier()
    pipe = make_pipeline(preprocessor,algo)
    param_grid = {
            'xgbclassifier__n_estimators': [100, 200, 300],
            'xgbclassifier__learning_rate': [0.1],
            'xgbclassifier__max_depth': [3, 5, 7],
            'xgbclassifier__n_jobs': [-1],
            'xgbclassifier__random_state': [rs],
            }
    grid = GridSearchCV(
        estimator=pipe, 
        param_grid=param_grid, 
        scoring='f1_macro',
        cv=gss,
        verbose=2,
        refit=True,
        return_train_score=True
    )
    grid.fit(X_sub, 
             y_sub,
             xgbclassifier__sample_weight=sample_weights)
    results_params_xg_ss.append(grid.best_params_)
    results_scores_xg_ss.append(grid.cv_results_)
    best_idx = grid.best_index_
    best_val_score = grid.cv_results_['mean_test_score'][best_idx]
    results_best_scores_xg_ss.append(best_val_score)
    results_model_xg_ss.append(grid.best_estimator_)

Fitting 1 folds for each of 9 candidates, totalling 9 fits
[CV] END xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=100, xgbclassifier__n_jobs=-1, xgbclassifier__random_state=1; total time=   0.6s
[CV] END xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=200, xgbclassifier__n_jobs=-1, xgbclassifier__random_state=1; total time=   1.1s
[CV] END xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=3, xgbclassifier__n_estimators=300, xgbclassifier__n_jobs=-1, xgbclassifier__random_state=1; total time=   1.6s
[CV] END xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=100, xgbclassifier__n_jobs=-1, xgbclassifier__random_state=1; total time=   0.8s
[CV] END xgbclassifier__learning_rate=0.1, xgbclassifier__max_depth=5, xgbclassifier__n_estimators=200, xgbclassifier__n_jobs=-1, xgbclassifier__random_state=1; total time=   1.5s
[CV] END xgbclassifier__learning_rate=0.1

In [46]:
print(dumps(results_params_xg_ss, indent=4))
print(results_best_scores_xg_ss)

# best params are
# max_depth = 7
# n_estimators = 300
# using Stratified split

[
    {
        "xgbclassifier__learning_rate": 0.1,
        "xgbclassifier__max_depth": 7,
        "xgbclassifier__n_estimators": 300,
        "xgbclassifier__n_jobs": -1,
        "xgbclassifier__random_state": 1
    },
    {
        "xgbclassifier__learning_rate": 0.1,
        "xgbclassifier__max_depth": 7,
        "xgbclassifier__n_estimators": 300,
        "xgbclassifier__n_jobs": -1,
        "xgbclassifier__random_state": 99
    }
]
[np.float64(0.42164081542427595), np.float64(0.4105136125596484)]


##### Try `xgboost` models on test data

In [None]:
# train a bigger model

xg_model = results_model_xg[0]
test_score_xg = fbeta_score(y_test, xg_model.predict(X_test), beta=1, average='macro')
print(f'xg group split: {test_score_xg}')


xg group split: 0.39001143285292533
xg stratify split: 0.39001143285292533


In [None]:

xg_ss_model = results_model_xg_ss[0]
test_score_xg_ss = fbeta_score(y_test, xg_ss_model.predict(X_test), beta=1, average='macro')
print(f'xg stratify split: {test_score_xg_ss}')