# Preprocessing

## Libraries

In [14]:
import numpy as np
import pandas as pd
from json import dumps

from sklearn.model_selection import GroupKFold, GroupShuffleSplit, KFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import make_scorer, fbeta_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_sample_weight

RANDOM_STATE = 1

## Definitions

In [15]:
trees = pd.read_csv('../data/raw/new_york_tree_census_2015.csv')
# print(trees.columns)

target = trees['health']
keep_rows = target.notna()

drop_cols = ['health','tree_id', 'block_id', 'created_at', 'stump_diam','status','spc_common','problems','address',\
             'zip_city','cb_num', 'borocode', 'cncldist', 'st_assem', 'st_senate', 'nta', 'boro_ct', 'state',\
             'latitude', 'longitude']

group = ['nta_name']
spatial_geo = ['x_sp','y_sp'] # encode numerical, standardize
spatial_fine = ['zipcode'] # encode categorical
spatial_coarse = ['boroname'] # encode categorical

## base ftrs, without spatial ##
numerical_ftrs = ['tree_dbh']
categorical_ftrs = ['curb_loc', 'spc_latin', 'user_type', 'root_stone',
                    'root_grate', 'root_other', 'trunk_wire', 'trnk_light', 'trnk_other',
                    'brch_light', 'brch_shoe', 'brch_other']
ordinal_ftrs = ['steward','guards','sidewalk']
ordinal_cats = [['None','1or2','3or4','4orMore'],
                ['None','Harmful','Unsure','Helpful'],
                ['Damage','None','NoDamage']]

In [16]:
# check we have accounted for all columns
listed = set(drop_cols).union(group,spatial_geo,spatial_fine,spatial_coarse,\
                              numerical_ftrs,categorical_ftrs,ordinal_ftrs)
print(set(trees.columns).difference(listed) == set())

True


In [17]:
# handle ordinal ftrs to fill in na
for ftr in ordinal_ftrs:
    trees[ftr] = trees[ftr].fillna('None')

In [18]:
# define feature sets
drop_cols_set = drop_cols + group # + spatial_fine + spatial_coarse
categorical_ftrs_set = categorical_ftrs + spatial_fine + spatial_coarse
numerical_ftrs_set = numerical_ftrs + spatial_geo

# define a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', OrdinalEncoder(categories=ordinal_cats), ordinal_ftrs),
        ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'), categorical_ftrs_set),
        ('std', StandardScaler(), numerical_ftrs_set)])

## Sampling

In [19]:
SAMPLE_PROP = 0.01
y = target[keep_rows]
X = trees[keep_rows].drop(drop_cols_set, axis=1)
groups = trees[keep_rows][group]

In [20]:
# hold out a test set by groups
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
train_idx, test_idx = next(gss.split(X, y, groups))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
groups_train, groups_test = groups.iloc[train_idx], groups.iloc[test_idx]

In [21]:
# subsample for cross validation
X_sub, _, y_sub, _, groups_sub, _ = train_test_split(
    X_train, y_train, groups_train,
    train_size=SAMPLE_PROP,
    stratify=y_train,
    random_state=RANDOM_STATE
)

# Cross Validation

## Functions

In [22]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("ignore", category=ConvergenceWarning)

In [23]:
random_states = [1,33,42,44,99]

In [31]:
from sklearn.base import BaseEstimator

def run_grid_search(random_states: list[int], model: BaseEstimator, X: pd.DataFrame | np.ndarray, y: pd.DataFrame | np.ndarray, groups: pd.DataFrame, \
                    param_grid: dict, n_jobs: int = -1, fit_params: dict = {}, pipe: Pipeline = None, add_rs: bool = True) \
                    -> tuple[list[np.float64],list[dict], list[BaseEstimator]]:
    scores, params, models = [], [], []
    for rs in random_states:
        gss = GroupShuffleSplit(n_splits=1,random_state=rs) # set to defualt 0.2 test
        pipe = make_pipeline(preprocessor,model) if pipe is None else pipe
        if add_rs:
            name = list(pipe.named_steps.keys())[-1] + '__random_state'
            param_grid[name] = [rs]
        grid = GridSearchCV(
            estimator=pipe, 
            param_grid=param_grid,
            scoring='f1_macro',
            cv=gss,
            verbose=False,
            n_jobs=n_jobs,
            refit=True,
            return_train_score=True
        )
        grid.fit(X, y, groups=groups, **fit_params)

        best_idx = grid.best_index_
        scores.append(grid.cv_results_['mean_test_score'][best_idx])
        params.append(grid.best_params_)
        models.append(grid.best_estimator_)

    return scores, params, models

In [25]:
def print_results(name: str, scores: list[np.float64], params: list[dict], \
                  models: list[BaseEstimator]) -> None:
    print(name.lower(), '----\n')
    indx = np.argmax(scores)

    print('best parameters:')
    print(dumps(params[indx],indent=4))

    print(f'score: {scores[indx]}')
    print(f'mean score: {np.mean(scores)}')
    print(f'std score:  {np.std(scores)}')

    return None

## Logistic Regression, Elastic Net

In [26]:
clf = LogisticRegression(penalty='elasticnet',solver='saga',max_iter=5_000)
param_grid = {
            'logisticregression__C': [0.01, 0.1, 1, 10], 
            'logisticregression__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
            'logisticregression__class_weight': ['balanced']
            }
scores_lr, params_lr, model_lr = \
    run_grid_search(random_states=random_states,
                    model=clf,
                    X=X_sub,
                    y=y_sub,
                    groups=groups_sub,
                    param_grid=param_grid,
                    n_jobs=-1)


In [27]:
print_results('logistic regression', scores_lr, params_lr, model_lr)

logistic regression ----

best parameters:
{
    "logisticregression__C": 1,
    "logisticregression__class_weight": "balanced",
    "logisticregression__l1_ratio": 0.3,
    "logisticregression__random_state": 42
}
score: 0.38616238364007627
mean score: 0.37118256858651005
std score:  0.011786660982739948


## SVM Classifier

In [28]:
from sklearn.svm import SVC
class_weights = {
    'Poor': np.sum(y.to_numpy()=='Poor') / len(y),
    'Fair': np.sum(y.to_numpy()=='Fair') / len(y),
    'Good': np.sum(y.to_numpy()=='Good') / len(y)
}
svc = SVC(class_weight=class_weights)
param_grid = {
            'svc__C': [0.01, 0.1, 1, 10],
            'svc__kernel': ['linear','poly','rbf','sigmoid']
            }
scores_svc, params_svc, model_svc = \
    run_grid_search(random_states=random_states,
                    model=svc,
                    X=X_sub,
                    y=y_sub,
                    groups=groups_sub,
                    param_grid=param_grid)


In [29]:
print_results('svm classifier', scores_svc, params_svc, model_svc)

svm classifier ----

best parameters:
{
    "svc__C": 10,
    "svc__kernel": "sigmoid",
    "svc__random_state": 99
}
score: 0.3198561828532294
mean score: 0.3115233981067871
std score:  0.006411327167816232


## KNN Classifier

In [32]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
param_grid = {
            'kneighborsclassifier__n_neighbors': [10,20,30,40,50],
            'kneighborsclassifier__weights': ['uniform','distance'],
            'kneighborsclassifier__n_jobs': [-1]
            }
scores_knn, params_knn, model_knn = \
    run_grid_search(random_states=random_states,
                    model=knn,
                    X=X_sub,
                    y=y_sub,
                    groups=groups_sub,
                    param_grid=param_grid,
                    n_jobs=1,
                    add_rs=False)

In [33]:
print_results('knn classifier', scores_knn, params_knn, model_knn)

knn classifier ----

best parameters:
{
    "kneighborsclassifier__n_jobs": -1,
    "kneighborsclassifier__n_neighbors": 10,
    "kneighborsclassifier__weights": "distance"
}
score: 0.32166812107523574
mean score: 0.315715937403959
std score:  0.005763146307245319


## XGBoost Classifier

In [34]:
# sample 5% and to get evaluations set
SAMPLE_PROP = 0.05

In [35]:
y = target[keep_rows]
le = LabelEncoder()
y = pd.DataFrame(le.fit_transform(y))
X = trees[keep_rows].drop(drop_cols_set, axis=1)
groups = trees[keep_rows][group]

In [36]:
# set aside test set
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
train_idx, test_idx = next(gss.split(X, y, groups))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
groups_train, groups_test = groups.iloc[train_idx], groups.iloc[test_idx]
# sample eval
gss_eval = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
sub_idx, eval_idx = next(gss_eval.split(X_sub, y_sub, groups_sub))
X_sub, X_eval = X.iloc[sub_idx], X.iloc[eval_idx]
y_sub, y_eval = y.iloc[sub_idx], y.iloc[eval_idx]
groups_sub, groups_eval = groups.iloc[sub_idx], groups.iloc[eval_idx]
# get weights
sample_weights = compute_sample_weight(
    class_weight='balanced',
    y=y_sub
)

In [37]:
from copy import deepcopy
xg_pre = deepcopy(preprocessor)
xg_pre.fit(X)
X_sub_proc = xg_pre.transform(X_sub)
X_eval_proc = xg_pre.transform(X_eval)

In [56]:
import xgboost as xgb
xgb = xgb.XGBClassifier(early_stopping_rounds=10,verbosity=0)
param_grid = {
            'xgbclassifier__learning_rate': [0.01, 0.1],
            'xgbclassifier__max_depth': [3, 5, 7, 10, 13, 15, 20],
            'xgbclassifier__n_jobs': [-1]
            }
fit_params = {
    'xgbclassifier__sample_weight': sample_weights,
    'xgbclassifier__eval_set': [(X_eval_proc,y_eval)]
}
pipe = make_pipeline(xgb)
scores_xg, params_xg, model_xg = \
    run_grid_search(random_states=random_states,
                    model=xgb,
                    X=X_sub_proc,
                    y=y_sub,
                    groups=groups_sub,
                    param_grid=param_grid,
                    n_jobs=1,
                    fit_params=fit_params,
                    pipe=pipe)

[0]	validation_0-mlogloss:1.09792
[1]	validation_0-mlogloss:1.09724
[2]	validation_0-mlogloss:1.09657
[3]	validation_0-mlogloss:1.09584
[4]	validation_0-mlogloss:1.09519
[5]	validation_0-mlogloss:1.09456
[6]	validation_0-mlogloss:1.09394
[7]	validation_0-mlogloss:1.09325
[8]	validation_0-mlogloss:1.09265
[9]	validation_0-mlogloss:1.09205
[10]	validation_0-mlogloss:1.09138
[11]	validation_0-mlogloss:1.09079
[12]	validation_0-mlogloss:1.09014
[13]	validation_0-mlogloss:1.08958
[14]	validation_0-mlogloss:1.08905
[15]	validation_0-mlogloss:1.08842
[16]	validation_0-mlogloss:1.08787
[17]	validation_0-mlogloss:1.08734
[18]	validation_0-mlogloss:1.08673
[19]	validation_0-mlogloss:1.08620
[20]	validation_0-mlogloss:1.08570
[21]	validation_0-mlogloss:1.08513
[22]	validation_0-mlogloss:1.08463
[23]	validation_0-mlogloss:1.08411
[24]	validation_0-mlogloss:1.08361
[25]	validation_0-mlogloss:1.08311
[26]	validation_0-mlogloss:1.08259
[27]	validation_0-mlogloss:1.08211
[28]	validation_0-mlogloss:1.0

In [55]:
print_results('xgb classifier', scores_xg, params_xg, model_xg)
indx = np.argmax(scores_xg)
print(f"best trees: {model_xg[indx].named_steps['xgbclassifier'].best_iteration}")

xgb classifier ----

best parameters:
{
    "xgbclassifier__learning_rate": 0.1,
    "xgbclassifier__max_depth": 15,
    "xgbclassifier__n_jobs": -1,
    "xgbclassifier__random_state": 33
}
score: 0.40694055399937756
mean score: 0.38689860801642983
std score:  0.012621969922706684
best trees: 99


# Final Model

`xgb classifier`->

* `learning_rate`: 0.1
* `max_depth`: 15
* `num_estimators`: 99
* `n_jobs`: -1

In [None]:
final = xgb.XGBClassifier()