## Feature Selection on Tree-Based Models

In [None]:
import time
import json
from tqdm import tqdm
from itertools import compress
from collections import defaultdict
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SequentialFeatureSelector # requires sklearn 0.24 and above
from features import feature_pipeline
from catboost import CatBoostClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

### Data Loading

In [None]:
# Load the pre-computed feature tables for the train / validation / test splits.
with open("../data/train_features.json", "r", encoding="utf-8") as f:
    train = pd.DataFrame(json.load(f))
with open("../data/val_features.json", "r", encoding="utf-8") as f:
    val = pd.DataFrame(json.load(f))
with open("../data/test_features.json", "r", encoding="utf-8") as f:
    test = pd.DataFrame(json.load(f))

# Features are every column except the text column and the target column.
X_train = train.drop(columns=["preprocessed_text", "level"])
y_train = train["level"].tolist()
X_val = val.drop(columns=["preprocessed_text", "level"])
y_val = val["level"].tolist()
X_test = test.drop(columns=["preprocessed_text", "level"])
y_test = test["level"].tolist()

# Coarse labels used by the binary task: the first element of each "level" value.
# Built from the label lists above instead of re-reading the "level" column.
y_train_binary = [lvl[0] for lvl in y_train]
y_val_binary = [lvl[0] for lvl in y_val]
y_test_binary = [lvl[0] for lvl in y_test]

In [None]:
# Inspect the validation labels taken from the full "level" column.
print(y_val)

In [None]:
# Inspect the validation labels reduced to the first element of each "level" value.
print(y_val_binary)

In [None]:
# Names of all candidate feature columns, in the column order of X_train.
feat_names = list(X_train.columns)
print(f"All features:\n{feat_names}\n\nNumber of features: {len(feat_names)}")

# Alias for sklearn's accuracy_score.
# NOTE(review): not referenced by the cells visible in this section — confirm it is still needed.
scoring = accuracy_score

In [None]:
# Preview the first rows of the training feature table.
X_train.head()

### Set up Functions

In [None]:
def cross_validate(pipeline, X_train, y_train, X_val, y_val):
    """
    fit a pipeline on the training split and score it on the training and validation splits

    Despite the name this is a single hold-out evaluation: the pipeline is fit once
    and scored once per split, no k-fold resampling is performed.

    pipeline: (sklearn Pipeline / estimator) object exposing `fit(X, y)` and `score(X, y=...)`
    X_train: training features, passed to `fit` and `score`
    y_train: training labels
    X_val: validation features, passed to `score`
    y_val: validation labels

    return: (dict) with keys
        "Training_score": value of `pipeline.score` on the training split
        "Validation_score": value of `pipeline.score` on the validation split
        "Training_time": seconds spent in `pipeline.fit`
        "Prediction_time": seconds spent in `pipeline.score` on the validation split
    """
    # perf_counter is monotonic, so a system clock adjustment during a long fit
    # cannot distort (or make negative) the measured durations.
    fit_started = time.perf_counter()
    pipeline.fit(X_train, y_train)
    train_time = time.perf_counter() - fit_started

    train_score = pipeline.score(X_train, y=y_train)

    # Only the validation scoring call is timed.
    score_started = time.perf_counter()
    val_score = pipeline.score(X_val, y=y_val)
    val_pred_time = time.perf_counter() - score_started

    return {
        "Training_score": train_score,
        "Validation_score": val_score,
        "Training_time": train_time,
        "Prediction_time": val_pred_time,
    }

In [None]:
def cv_and_display(preprocessor, model, name, results_df, fine_grained=False):
    """
    evaluate one preprocessor + model pipeline on the notebook's train/validation split

    Reads the notebook-level X_train / X_val and label lists. The scores are stored
    in results_df; nothing is displayed by this function itself.

    preprocessor: (sklearn ColumnTransformer) sklearn object for feature transformation
    model: (sklearn Classifier) initialized sklearn classifier
    name: (str) key under which the scores are stored in results_df
    results_df: (dict) the dictionary collecting the results, updated in place
    fine_grained: (bool) True: use the full `level` labels (y_train / y_val), the 3-class
        task of this notebook. False: use y_train_binary / y_val_binary. Default is False

    return: (dict) results_df
    """
    # Pick the label granularity for this run.
    if fine_grained:
        labels_train, labels_val = y_train, y_val
    else:
        labels_train, labels_val = y_train_binary, y_val_binary

    results_df[name] = cross_validate(
        make_pipeline(preprocessor, model),
        X_train,
        labels_train,
        X_val,
        labels_val,
    )
    return results_df

## Forward feature selection - Greedy
Forward feature selection (greedy) is performed with sklearn's `SequentialFeatureSelector` class. Starting from an empty feature set, it adds one feature at a time, at each step choosing the feature that gives the best cross-validated score for the model, until k features have been selected (k is a parameter).

In [None]:
def perform_sfs_cv_and_display(
    preprocessor,
    model,
    name,
    results_df,
    n_features,
    direction,
    fine_grained=False,
):
    """
    select features with SequentialFeatureSelector, then evaluate the model on the selected features only

    Reads the notebook-level X_train / X_val and label lists. The scores are stored
    in results_df; nothing is displayed by this function itself.

    preprocessor: (sklearn ColumnTransformer) sklearn object for feature transformation, applied before selection
    model: (sklearn Classifier) initialized sklearn classifier, used both for selection and for the final evaluation
    name: (str) prefix of the key under which the scores are stored in results_df
    results_df: (dict) the dictionary collecting the results, updated in place
    n_features: (int) argument passed into the `n_features_to_select` argument in SequentialFeatureSelector
    direction: (str) {'forward', 'backward'}, argument passed into the `direction` argument in SequentialFeatureSelector
    fine_grained: (bool) True: use the full `level` labels (y_train / y_val), the 3-class
        task of this notebook. False: use y_train_binary / y_val_binary. Default is False

    return: (tuple) (results_df, feats_selected) where feats_selected is the list of selected column names
    """
    # initialize selector
    sfs = SequentialFeatureSelector(
        model,
        n_features_to_select=n_features,
        scoring="accuracy",
        direction=direction,
        cv=2,
        n_jobs=-1,
    )
    # Only the preprocessor and the selector need fitting here. The model itself is
    # fit on the selected columns by cross_validate below, so adding it as a final
    # pipeline step would just train it an extra time and discard the result.
    sfs_pipeline = make_pipeline(preprocessor, sfs)

    if fine_grained:
        y_t = y_train
        y_v = y_val
    else:
        y_t = y_train_binary
        y_v = y_val_binary

    # fit
    sfs_pipeline.fit(X_train, y_t)

    # features selected
    # The support mask is positional over the preprocessor's output columns. Mapping it
    # onto X_train.columns assumes the preprocessor emits every column of X_train in its
    # original order, as the notebook's `make_column_transformer((StandardScaler(), feat_names))` does.
    feats_selected = list(compress(X_train.columns, sfs_pipeline[1].get_support()))

    # subset data
    cv_X_train = X_train[feats_selected]
    cv_X_val = X_val[feats_selected]

    # evaluate with selected features, using a preprocessor restricted to those columns
    cv_preprocessor = make_column_transformer((StandardScaler(), feats_selected))
    pipeline = make_pipeline(cv_preprocessor, model)

    scores = cross_validate(pipeline, cv_X_train, y_t, cv_X_val, y_v)

    results_df[f"{name} + {n_features} features"] = scores

    return results_df, feats_selected

### Binary Classification

In [None]:
# Baseline: every classifier trained on all features, binary labels.
binary_results = {}

preprocessor = make_column_transformer((StandardScaler(), feat_names))

classifiers = {
    "DecisionTree": DecisionTreeClassifier(random_state=123, max_depth=8),
    "RandomForest": RandomForestClassifier(random_state=123),
    "LightGBM": LGBMClassifier(random_state=123),
    "XGBoost": XGBClassifier(random_state=123),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=123),
}

for (name, model) in classifiers.items():
    if name == "DecisionTree":
        # The decision tree is swept over max_depth 1..8 instead of using the dict entry.
        for d in range(1, 9):
            depth_model = DecisionTreeClassifier(random_state=123, max_depth=d)
            # Build the label from the base name on every pass; rebinding `name`
            # here would chain the suffixes ("... + depth 1 + depth 2 ...").
            depth_name = f"{name} + depth {d}"

            results_df = cv_and_display(
                preprocessor, depth_model, depth_name, binary_results, fine_grained=False
            )
    else:
        results_df = cv_and_display(
            preprocessor, model, name, binary_results, fine_grained=False
        )

display(pd.DataFrame(binary_results))

In [None]:
# loop through models and number of features to find the best combination
classifiers = {
    "DecisionTree": DecisionTreeClassifier(random_state=123, max_depth=8),
    "RandomForest": RandomForestClassifier(random_state=123),
    "LightGBM": LGBMClassifier(random_state=123),
}

# binary_feats[run name] -> list of selected-feature lists, one per feature count tried
binary_feats = defaultdict(list)
for (name, model) in classifiers.items():
    for i in tqdm(
        range(1, len(feat_names)), desc=f"{name} Feature Selection", dynamic_ncols=True
    ):
        if name == "DecisionTree":
            # The decision tree is swept over max_depth 1..8 for every feature count.
            for d in range(1, 9):
                depth_model = DecisionTreeClassifier(random_state=123, max_depth=d)
                # Keep `name` untouched: rebinding it would chain the suffixes and make
                # the `name == "DecisionTree"` check fail from the second feature count on,
                # silently skipping the depth sweep.
                depth_name = f"{name} + depth {d}"

                results_df, feats = perform_sfs_cv_and_display(
                    preprocessor,
                    depth_model,
                    depth_name,
                    binary_results,
                    n_features=i,
                    direction="forward",
                    fine_grained=False,
                )

                binary_feats[depth_name].append(feats)
        else:
            results_df, feats = perform_sfs_cv_and_display(
                preprocessor,
                model,
                name,
                binary_results,
                n_features=i,
                direction="forward",
                fine_grained=False,
            )

            binary_feats[name].append(feats)

In [None]:
# sort by Validation_score and display
# Runs are the columns and metrics are the rows, so the columns are reordered by the
# "Validation_score" row. `axis` must be passed by keyword: pandas >= 2.0 accepts only
# `by` positionally in sort_values.
display(pd.DataFrame(binary_results).sort_values("Validation_score", axis=1, ascending=False))

### 3-Class Classification

In [None]:
# Baseline: every classifier trained on all features, full "level" labels.
multiclass_results = {}

preprocessor = make_column_transformer((StandardScaler(), feat_names))

classifiers = {
    "DecisionTree": DecisionTreeClassifier(random_state=123, max_depth=8),
    "RandomForest": RandomForestClassifier(random_state=123),
    "LightGBM": LGBMClassifier(random_state=123),
    "XGBoost": XGBClassifier(random_state=123),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=123),
}

for (name, model) in classifiers.items():
    if name == "DecisionTree":
        # The decision tree is swept over max_depth 1..8 instead of using the dict entry.
        for d in range(1, 9):
            depth_model = DecisionTreeClassifier(random_state=123, max_depth=d)
            # Build the label from the base name on every pass; rebinding `name`
            # here would chain the suffixes ("... + depth 1 + depth 2 ...").
            depth_name = f"{name} + depth {d}"

            # fine_grained=True selects the full labels; with False this section
            # would re-run the binary task and file it under multiclass_results.
            results_df = cv_and_display(
                preprocessor, depth_model, depth_name, multiclass_results, fine_grained=True
            )
    else:
        results_df = cv_and_display(
            preprocessor, model, name, multiclass_results, fine_grained=True
        )

display(pd.DataFrame(multiclass_results))

In [None]:
# loop through models and number of features to find the best combination
classifiers = {
    "DecisionTree": DecisionTreeClassifier(random_state=123, max_depth=8),
    "RandomForest": RandomForestClassifier(random_state=123),
    "LightGBM": LGBMClassifier(random_state=123),
}

# multiclass_feats[run name] -> list of selected-feature lists, one per feature count tried
multiclass_feats = defaultdict(list)
for (name, model) in classifiers.items():
    for i in tqdm(
        range(1, len(feat_names)), desc=f"{name} Feature Selection", dynamic_ncols=True
    ):
        if name == "DecisionTree":
            # The decision tree is swept over max_depth 1..8 for every feature count.
            for d in range(1, 9):
                depth_model = DecisionTreeClassifier(random_state=123, max_depth=d)
                # Keep `name` untouched: rebinding it would chain the suffixes and make
                # the `name == "DecisionTree"` check fail from the second feature count on,
                # silently skipping the depth sweep.
                depth_name = f"{name} + depth {d}"

                # fine_grained=True selects the full labels for the 3-class task.
                results_df, feats = perform_sfs_cv_and_display(
                    preprocessor,
                    depth_model,
                    depth_name,
                    multiclass_results,
                    n_features=i,
                    direction="forward",
                    fine_grained=True,
                )

                multiclass_feats[depth_name].append(feats)
        else:
            results_df, feats = perform_sfs_cv_and_display(
                preprocessor,
                model,
                name,
                multiclass_results,
                n_features=i,
                direction="forward",
                fine_grained=True,
            )

            multiclass_feats[name].append(feats)

In [None]:
# sort by Validation_score and display
# Runs are the columns and metrics are the rows, so the columns are reordered by the
# "Validation_score" row. `axis` must be passed by keyword: pandas >= 2.0 accepts only
# `by` positionally in sort_values.
display(pd.DataFrame(multiclass_results).sort_values("Validation_score", axis=1, ascending=False))