# Preprocessing

### Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold, GroupShuffleSplit, KFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import make_scorer, fbeta_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_sample_weight

RANDOM_STATE = 1

### Definitions

In [2]:
trees = pd.read_csv('../data/raw/new_york_tree_census_2015.csv')
# print(trees.columns)

target = trees['health']
keep_rows = target.notna()

drop_cols = ['health','tree_id', 'block_id', 'created_at', 'stump_diam','status','spc_common','problems','address',\
             'zip_city','cb_num', 'borocode', 'cncldist', 'st_assem', 'st_senate', 'nta', 'boro_ct', 'state',\
             'latitude', 'longitude']

group = ['nta_name']
spatial_geo = ['x_sp','y_sp'] # encode numerical, standardize
spatial_fine = ['zipcode'] # encode categorical
spatial_coarse = ['boroname'] # encode categorical

## base ftrs, without spatial ##
numerical_ftrs = ['tree_dbh']
categorical_ftrs = ['curb_loc', 'spc_latin', 'user_type', 'root_stone',
                    'root_grate', 'root_other', 'trunk_wire', 'trnk_light', 'trnk_other',
                    'brch_light', 'brch_shoe', 'brch_other']
ordinal_ftrs = ['steward','guards','sidewalk']
ordinal_cats = [['None','1or2','3or4','4orMore'],
                ['None','Harmful','Unsure','Helpful'],
                ['Damage','None','NoDamage']]

In [3]:
# check we have accounted for all columns
listed = set(drop_cols).union(group,spatial_geo,spatial_fine,spatial_coarse,\
                              numerical_ftrs,categorical_ftrs,ordinal_ftrs)
print(set(trees.columns).difference(listed) == set())

True


In [4]:
# handle ordinal ftrs to fill in na
for ftr in ordinal_ftrs:
    trees[ftr] = trees[ftr].fillna('None')

In [5]:
# define feature sets
drop_cols_set = drop_cols + group # + spatial_fine + spatial_coarse
categorical_ftrs_set = categorical_ftrs + spatial_fine + spatial_coarse
numerical_ftrs_set = numerical_ftrs + spatial_geo

# define a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', OrdinalEncoder(categories=ordinal_cats), ordinal_ftrs),
        ('onehot', OneHotEncoder(sparse_output=False,handle_unknown='ignore'), categorical_ftrs_set),
        ('std', StandardScaler(), numerical_ftrs_set)])

### Sampling

In [6]:
SAMPLE_PROP = 0.01
y = target[keep_rows]
X = trees[keep_rows].drop(drop_cols_set, axis=1)
groups = trees[keep_rows][group]

In [7]:
# hold out a test set by groups
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
train_idx, test_idx = next(gss.split(X, y, groups))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
groups_train, groups_test = groups.iloc[train_idx], groups.iloc[test_idx]

In [8]:
# subsample for cross validation
X_sub, _, y_sub, _, groups_sub, _ = train_test_split(
    X_train, y_train, groups_train,
    train_size=SAMPLE_PROP,
    stratify=y_train,
    random_state=RANDOM_STATE
)

# Cross Validation

In [9]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("ignore", category=ConvergenceWarning)

In [10]:
random_states = [1,33,42,44,99]

In [11]:
from sklearn.base import BaseEstimator

def run_grid_search(random_states: list[int], model: BaseEstimator, X: pd.DataFrame | np.ndarray, y: pd.DataFrame | np.ndarray, groups: pd.DataFrame, \
                    param_grid: dict, n_jobs: int = -1, fit_params: dict = {}, pipe: Pipeline = None) \
                    -> tuple[list[np.float64],list[dict], list[BaseEstimator]]:
    scores, params, models = [], [], []
    for rs in random_states:
        gss = GroupShuffleSplit(n_splits=1,random_state=rs) # set to defualt 0.2 test
        pipe = make_pipeline(preprocessor,model) if pipe is None else pipe
        name = list(pipe.named_steps.keys())[-1] + '__random_state'
        param_grid[name] = [rs]
        grid = GridSearchCV(
            estimator=pipe, 
            param_grid=param_grid,
            scoring='f1_macro',
            cv=gss,
            verbose=False,
            n_jobs=n_jobs,
            refit=True,
            return_train_score=True
        )
        grid.fit(X, y, groups=groups, **fit_params)

        best_idx = grid.best_index_
        scores.append(grid.cv_results_['mean_test_score'][best_idx])
        params.append(grid.best_params_)
        models.append(grid.best_estimator_)

    return scores, params, models

### Logistic Regression, Elastic Net

In [12]:
clf = LogisticRegression(penalty='elasticnet',solver='saga',max_iter=5_000)
param_grid = {
            'logisticregression__C': [0.01, 0.1, 1, 10], 
            'logisticregression__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
            'logisticregression__class_weight': ['balanced']
            }
scores_lr, params_lr, model_lr = \
    run_grid_search(random_states=random_states,
                    model=clf,
                    X=X_sub,
                    y=y_sub,
                    groups=groups_sub,
                    param_grid=param_grid,
                    n_jobs=-1)


KeyboardInterrupt: 

### SVM Classifier

In [None]:
from sklearn.svm import SVC
class_weights = {
    'Poor': np.sum(y.to_numpy()=='Poor') / len(y),
    'Fair': np.sum(y.to_numpy()=='Fair') / len(y),
    'Good': np.sum(y.to_numpy()=='Good') / len(y)
}
svc = SVC(class_weight=class_weights)
param_grid = {
            'svc__C': [0.01, 0.1, 1, 10],
            'svc__kernel': ['linear','poly','rbf','sigmoid']
            }
scores_svc, params_svc, model_svc = \
    run_grid_search(random_states=random_states,
                    model=svc,
                    X=X_sub,
                    y=y_sub,
                    groups=groups_sub,
                    param_grid=param_grid)


### KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
param_grid = {
            'kneighborsclassifier__n_neighbors': [10,20,30,40,50],
            'kneighborsclassifier__weights': ['uniform','distance'],
            'kneighborsclassifier__n_jobs': [-1]
            }
scores_knn, params_knn, model_knn = \
    run_grid_search(random_states=random_states,
                    model=knn,
                    X=X_sub,
                    y=y_sub,
                    groups=groups_sub,
                    param_grid=param_grid,
                    n_jobs=1)

### XGBoost Classifier

In [59]:
# sample 5% and to get evaluations set
SAMPLE_PROP = 0.05

In [60]:
y = target[keep_rows]
le = LabelEncoder()
y = pd.DataFrame(le.fit_transform(y))
X = trees[keep_rows].drop(drop_cols_set, axis=1)
groups = trees[keep_rows][group]

In [61]:
# set aside test set
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
train_idx, test_idx = next(gss.split(X, y, groups))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
groups_train, groups_test = groups.iloc[train_idx], groups.iloc[test_idx]
# sample eval
gss_eval = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
sub_idx, eval_idx = next(gss_eval.split(X_sub, y_sub, groups_sub))
X_sub, X_eval = X.iloc[sub_idx], X.iloc[eval_idx]
y_sub, y_eval = y.iloc[sub_idx], y.iloc[eval_idx]
groups_sub, groups_eval = groups.iloc[sub_idx], groups.iloc[eval_idx]
# get weights
sample_weights = compute_sample_weight(
    class_weight='balanced',
    y=y_sub
)

In [62]:
from copy import deepcopy
xg_pre = deepcopy(preprocessor)
xg_pre.fit(X)
X_sub_proc = xg_pre.transform(X_sub)
X_eval_proc = xg_pre.transform(X_eval)

In [78]:
import xgboost as xgb
xgb = xgb.XGBClassifier(early_stopping_rounds=10,verbosity=0)
param_grid = {
            'xgbclassifier__learning_rate': [0.1],
            'xgbclassifier__max_depth': [3, 5, 7, 10],
            'xgbclassifier__n_jobs': [-1]
            }
fit_params = {
    'xgbclassifier__sample_weight': sample_weights,
    'xgbclassifier__eval_set': [(X_eval_proc,y_eval)]
}
pipe = make_pipeline(xgb)
scores_xg, params_xg, model_xg = \
    run_grid_search(random_states=random_states,
                    model=xgb,
                    X=X_sub_proc,
                    y=y_sub,
                    groups=groups_sub,
                    param_grid=param_grid,
                    n_jobs=1,
                    fit_params=fit_params,
                    pipe=pipe)

[0]	validation_0-mlogloss:1.09108
[1]	validation_0-mlogloss:1.08496
[2]	validation_0-mlogloss:1.07975
[3]	validation_0-mlogloss:1.07396
[4]	validation_0-mlogloss:1.07002
[5]	validation_0-mlogloss:1.06627
[6]	validation_0-mlogloss:1.06276
[7]	validation_0-mlogloss:1.05949
[8]	validation_0-mlogloss:1.05639
[9]	validation_0-mlogloss:1.05444
[10]	validation_0-mlogloss:1.05152
[11]	validation_0-mlogloss:1.04875
[12]	validation_0-mlogloss:1.04647
[13]	validation_0-mlogloss:1.04482
[14]	validation_0-mlogloss:1.04300
[15]	validation_0-mlogloss:1.04099
[16]	validation_0-mlogloss:1.03913
[17]	validation_0-mlogloss:1.03709
[18]	validation_0-mlogloss:1.03607
[19]	validation_0-mlogloss:1.03511
[20]	validation_0-mlogloss:1.03282
[21]	validation_0-mlogloss:1.03162
[22]	validation_0-mlogloss:1.03065
[23]	validation_0-mlogloss:1.02924
[24]	validation_0-mlogloss:1.02825
[25]	validation_0-mlogloss:1.02514
[26]	validation_0-mlogloss:1.02351
[27]	validation_0-mlogloss:1.02297
[28]	validation_0-mlogloss:1.0