##### preprocessing

In [2]:
import numpy as np
import pandas as pd

# Load the raw tree-census CSV (path is relative to the notebook's working directory).
RAW_CENSUS_CSV = '../data/raw/new_york_tree_census_2015.csv'
trees = pd.read_csv(RAW_CENSUS_CSV)

# The prediction target is the `health` column.
target = trees['health']
# Boolean mask: True for rows whose target value is present.
keep_rows = ~target.isna()

# Columns removed before modelling (this list includes the target column `health`).
drop_cols = [
    'health', 'tree_id', 'block_id', 'created_at', 'stump_diam', 'status',
    'spc_common', 'problems', 'address', 'zip_city', 'cb_num', 'borocode',
    'cncldist', 'st_assem', 'st_senate', 'nta', 'boro_ct', 'state',
    'latitude', 'longitude',
]

group = ['nta_name']            # column used as the grouping key for group-aware splits
spatial_geo = ['x_sp', 'y_sp']  # encode numerical, standardize
spatial_fine = ['zipcode']      # encode categorical
spatial_coarse = ['boroname']   # encode categorical

## base ftrs, without spatial ##
numerical_ftrs = ['tree_dbh']
categorical_ftrs = [
    'curb_loc', 'spc_latin', 'user_type',
    'root_stone', 'root_grate', 'root_other',
    'trunk_wire', 'trnk_light', 'trnk_other',
    'brch_light', 'brch_shoe', 'brch_other',
]

# Each ordinal feature is declared next to its ordered levels; the two parallel
# lists consumed by the encoder are derived from this mapping so they cannot
# drift out of alignment.
_ordinal_levels = {
    'steward': ['None', '1or2', '3or4', '4orMore'],
    'guards': ['None', 'Harmful', 'Unsure', 'Helpful'],
    'sidewalk': ['Damage', 'None', 'NoDamage'],
}
ordinal_ftrs = list(_ordinal_levels)
ordinal_cats = list(_ordinal_levels.values())

In [3]:
# Sanity check: every column of `trees` must appear in at least one of the
# declared column lists. Prints True when nothing is left unaccounted for.
column_lists = (drop_cols, group, spatial_geo, spatial_fine, spatial_coarse,
                numerical_ftrs, categorical_ftrs, ordinal_ftrs)
listed = set().union(*column_lists)
unlisted = set(trees.columns) - listed
print(len(unlisted) == 0)

True


In [4]:
# Replace missing values in the ordinal columns with the literal level 'None',
# handling all ordinal columns in a single vectorised assignment.
trees[ordinal_ftrs] = trees[ordinal_ftrs].fillna('None')

In [None]:
from sklearn.model_selection import GroupKFold, GroupShuffleSplit, KFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.metrics import make_scorer, fbeta_score, accuracy_score
from sklearn.linear_model import LogisticRegression
## CROSS VALIDATION PIPELINE ##
RANDOM_STATE = 1

# Feature sets: the spatial columns are folded into the base categorical and
# numerical lists; the grouping column is dropped from the model inputs.
drop_cols_set = [*drop_cols, *group]  # + spatial_fine + spatial_coarse
categorical_ftrs_set = [*categorical_ftrs, *spatial_fine, *spatial_coarse]
numerical_ftrs_set = [*numerical_ftrs, *spatial_geo]

# Preprocessor: one (name, transformer, columns) triple per feature family.
ordinal_step = ('ord', OrdinalEncoder(categories=ordinal_cats), ordinal_ftrs)
onehot_step = ('onehot',
               OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
               categorical_ftrs_set)
scaling_step = ('std', StandardScaler(), numerical_ftrs_set)
preprocessor = ColumnTransformer(transformers=[ordinal_step, onehot_step, scaling_step])

# Splitter: a single group-aware shuffle split.
# gkf = GroupKFold(n_splits=4,shuffle=True,random_state=RANDOM_STATE) # not using gkf due to time constraint
gss = GroupShuffleSplit(n_splits=1, random_state=RANDOM_STATE)

# Estimator and pipeline (make_pipeline names the final step 'logisticregression',
# which is the prefix used by the grid keys below).
algo = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=1000)
pipe = make_pipeline(preprocessor, algo)

# Hyperparameter grid.
param_grid = dict(
    logisticregression__C=[0.01, 0.1, 1, 10],
    logisticregression__l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
    logisticregression__random_state=[RANDOM_STATE],
)

# Grid search scored by macro-averaged F1; no refit of the best candidate.
search_options = dict(scoring='f1_macro', cv=gss, verbose=2, n_jobs=-1, refit=False)
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, **search_options)

In [None]:
# DATA SAMPLING #
# Fraction of the training portion kept for the hyperparameter search.
SAMPLE_PROP = 0.05
y = target[keep_rows]
X = trees[keep_rows].drop(drop_cols_set, axis=1)
groups = trees[keep_rows][group]  # one-column DataFrame holding the grouping key
## hold out a test set by groups
gss_test = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
# Split with the dedicated `gss_test` splitter (explicit test_size=0.2), not the
# notebook-global `gss`: that name is rebound by the experiment loops, so using
# it here would make the hold-out set depend on cell execution order.
train_idx, test_idx = next(gss_test.split(X, y, groups))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
groups_train, groups_test = groups.iloc[train_idx], groups.iloc[test_idx]
## sample SAMPLE_PROP (5%) of the training rows, stratified by label, for cross validation
X_sub, _, y_sub, _, groups_sub, _ = train_test_split(
    X_train, y_train, groups_train,
    train_size=SAMPLE_PROP,
    stratify=y_train,
    random_state=RANDOM_STATE
)

```python
# Repeat the elastic-net grid search under several random seeds and record the
# best parameter combination found for each seed.
random_states = [1,33,42,44,99]
results = []
for rs in random_states:
    # The same seed drives both the group-aware split and the model's random_state.
    gss = GroupShuffleSplit(n_splits=1,random_state=rs)
    algo = LogisticRegression(penalty='elasticnet',solver='saga',max_iter=1000)
    pipe = make_pipeline(preprocessor,algo)
    param_grid = {
            'logisticregression__C': [0.01, 0.1, 1, 10], 
            'logisticregression__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
            'logisticregression__random_state': [rs]
            }
    # refit=False: only the search results are needed, not a refitted best model.
    grid = GridSearchCV(
        estimator=pipe, 
        param_grid=param_grid, 
        scoring='f1_macro',
        cv=gss,
        verbose=2,
        n_jobs=-1,
        refit=False
    )
    grid.fit(X_sub, y_sub, groups=groups_sub)
    results.append(grid.best_params_)
```

```python
# Pretty-print the list of best-parameter dicts collected in `results`.
from json import dumps
print(dumps(results,indent=4))
```

In [9]:
# final hyperparameters
# C = 10
# l1_ratio = 0.1

#### cross validation on logistic regression with elastic net

In [None]:
# testing balanced vs None class_weights
# For each seed: build a fresh group-aware split and pipeline, run the grid
# search (with refit), and collect the best params, the full CV results and the
# refitted best estimator.
random_states = [1, 99]
results_scores, results_params, results_model = [], [], []
for rs in random_states:
    # The seed controls both the split and the model's random_state.
    gss = GroupShuffleSplit(n_splits=1, random_state=rs)
    algo = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=1000)
    pipe = make_pipeline(preprocessor, algo)
    param_grid = dict(
        logisticregression__C=[0.01, 0.1, 1, 10],
        logisticregression__l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
        logisticregression__class_weight=['balanced', None],
        logisticregression__random_state=[rs],
    )
    search_options = dict(
        scoring='f1_macro',
        cv=gss,
        verbose=2,
        n_jobs=-1,
        refit=True,
        return_train_score=True,
    )
    grid = GridSearchCV(estimator=pipe, param_grid=param_grid, **search_options)
    grid.fit(X_sub, y_sub, groups=groups_sub)

    results_params.append(grid.best_params_)
    results_scores.append(grid.cv_results_)
    results_model.append(grid.best_estimator_)

Fitting 1 folds for each of 40 candidates, totalling 40 fits
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=1; total time=  38.2s
[CV] END logisticregression__C=0.01, logisticregression__class_weight=None, logisticregression__l1_ratio=0.3, logisticregression__random_state=1; total time=  39.4s
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time=  42.7s
[CV] END logisticregression__C=0.01, logisticregression__class_weight=None, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time=  44.0s
[CV] END logisticregression__C=0.01, logisticregression__class_weight=None, logisticregression__l1_ratio=0.1, logisticregression__random_state=1; total time=  45.9s
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_rat



[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=1; total time= 9.5min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=1; total time= 9.7min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time= 9.9min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.1, logisticregression__random_state=1; total time=10.0min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time= 9.6min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.3, logisticregression__random_state=1; total time= 9.8min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.7, logisticregression__random_state=1; total time= 9.3min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.9, logisticregression__random_state=1; total time= 9.0min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=1; total time=10.0min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=1; total time= 8.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time= 8.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=1; total time= 8.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=1; total time= 8.4min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.1, logisticregression__random_state=1; total time= 8.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.3, logisticregression__random_state=1; total time= 8.6min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.5, logisticregression__random_state=1; total time= 8.4min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.9, logisticregression__random_state=1; total time= 7.9min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.7, logisticregression__random_state=1; total time= 8.0min
Fitting 1 folds for each of 40 candidates, totalling 40 fits
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 1.6min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=None, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 1.6min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 1.7min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=None, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 1.8min
[CV] END logisticregression__C=0.01, logisticregression__class_weight=balanced, logisticregression__l1_r



[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=99; total time= 6.8min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 7.5min
[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=99; total time= 7.6min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=99; total time= 7.0min




[CV] END logisticregression__C=1, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 7.2min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.1, logisticregression__random_state=99; total time= 7.6min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 7.4min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 7.1min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.7, logisticregression__random_state=99; total time= 6.8min




[CV] END logisticregression__C=1, logisticregression__class_weight=None, logisticregression__l1_ratio=0.9, logisticregression__random_state=99; total time= 6.7min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.1, logisticregression__random_state=99; total time= 7.6min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 7.9min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 7.9min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.7, logisticregression__random_state=99; total time= 7.9min




[CV] END logisticregression__C=10, logisticregression__class_weight=balanced, logisticregression__l1_ratio=0.9, logisticregression__random_state=99; total time= 7.9min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.1, logisticregression__random_state=99; total time= 7.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.3, logisticregression__random_state=99; total time= 7.5min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.5, logisticregression__random_state=99; total time= 7.4min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.7, logisticregression__random_state=99; total time= 7.2min




[CV] END logisticregression__C=10, logisticregression__class_weight=None, logisticregression__l1_ratio=0.9, logisticregression__random_state=99; total time= 6.8min


In [None]:
from json import dumps


def _to_jsonable(obj):
    """Fallback encoder for `json.dumps`: convert numpy-style objects to plain Python.

    `GridSearchCV.cv_results_` stores its columns as numpy arrays / masked
    arrays, which the standard JSON encoder rejects with a TypeError. Anything
    exposing `.tolist()` (arrays, masked arrays, numpy scalars) is converted;
    any other unsupported object still raises TypeError, as json expects.
    """
    if hasattr(obj, 'tolist'):
        return obj.tolist()
    raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')


# Best parameter combination per seed, then the full CV result tables per seed.
print(dumps(results_params, indent=4, default=_to_jsonable))
print(dumps(results_scores, indent=4, default=_to_jsonable))

##### cross validation on logistic regression: `stratified vs group`

In [None]:
# testing using random (stratified) splitting instead of Group
from sklearn.model_selection import StratifiedShuffleSplit
random_states = [1,99]
results_scores = []
results_params = []
results_model = []
for rs in random_states:
    # Bind the stratified splitter to its own name (`sss`). Reusing the
    # notebook-global `gss` here would leave a non-group splitter behind under a
    # name that other cells use for group-aware splitting.
    sss = StratifiedShuffleSplit(n_splits=1,random_state=rs)
    algo = LogisticRegression(penalty='elasticnet',solver='saga',max_iter=1000)
    pipe = make_pipeline(preprocessor,algo)
    param_grid = {
            'logisticregression__C': [0.01, 0.1, 1, 10],
            'logisticregression__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
            'logisticregression__class_weight': ['balanced', None],
            'logisticregression__random_state': [rs]
            }
    grid = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        scoring='f1_macro',
        cv=sss,
        verbose=2,
        n_jobs=-1,
        refit=True,
        return_train_score=True
    )
    # `groups` is still passed for parity with the group-based runs; the
    # stratified splitter does not use it when generating the split.
    grid.fit(X_sub, y_sub, groups=groups_sub)
    results_params.append(grid.best_params_)
    results_scores.append(grid.cv_results_)
    results_model.append(grid.best_estimator_)


##### cross validation on XGBClassifier

In [None]:
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight

# Rebuild the train/test split and the 5% training sub-sample for the XGBoost runs.
SAMPLE_PROP = 0.05
y = target[keep_rows]
X = trees[keep_rows].drop(drop_cols_set, axis=1)
groups = trees[keep_rows][group]  # one-column DataFrame holding the grouping key
## hold out a test set by groups
gss_test = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
# Split with the dedicated `gss_test` splitter (explicit test_size=0.2), not the
# notebook-global `gss`: that name is rebound by the experiment loops (including
# to a non-group splitter), so using it here would make the hold-out set depend
# on which cells ran before this one.
train_idx, test_idx = next(gss_test.split(X, y, groups))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
groups_train, groups_test = groups.iloc[train_idx], groups.iloc[test_idx]
## sample SAMPLE_PROP (5%) of the training rows, stratified by label, for cross validation
X_sub, _, y_sub, _, groups_sub, _ = train_test_split(
    X_train, y_train, groups_train,
    train_size=SAMPLE_PROP,
    stratify=y_train,
    random_state=RANDOM_STATE
)

# Per-row weights that balance the classes of the sub-sample; passed to the
# classifier at fit time.
sample_weights = compute_sample_weight(
    class_weight='balanced',
    y=y_sub
)

In [None]:
# testing using XGBoost
# For each seed: build a group-aware split and an XGBoost pipeline, grid-search
# tree count and depth, and collect best params, CV results and the refitted model.
random_states = [1,99]
results_scores = []
results_params = []
results_model = []
for rs in random_states:
    gss = GroupShuffleSplit(n_splits=1,random_state=rs)
    algo = xgb.XGBClassifier()
    pipe = make_pipeline(preprocessor,algo)
    # Parallelism is delegated to XGBoost (n_jobs=-1 on the estimator), so the
    # grid search itself is left at its default n_jobs.
    param_grid = {
            'xgbclassifier__n_estimators': [100, 200, 300],
            'xgbclassifier__learning_rate': [0.1],
            'xgbclassifier__max_depth': [3, 5, 7],
            'xgbclassifier__n_jobs': [-1],
            'xgbclassifier__random_state': [rs],
            }
    grid = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        scoring='f1_macro',
        cv=gss,
        verbose=2,
        refit=True,
        return_train_score=True
    )
    # A Pipeline only accepts fit parameters in `<step>__<param>` form, so the
    # weights must be addressed to the classifier step; a bare `sample_weight=`
    # is rejected by Pipeline.fit.
    # NOTE(review): y_sub holds the raw `health` values; if these are strings,
    # recent xgboost releases expect integer-encoded class labels — confirm
    # whether the target needs encoding before this fit.
    grid.fit(X_sub, y_sub, groups=groups_sub, xgbclassifier__sample_weight=sample_weights)
    results_params.append(grid.best_params_)
    results_scores.append(grid.cv_results_)
    results_model.append(grid.best_estimator_)