# WARNING: THIS NOTEBOOK TAKES HOURS TO COMPLETELY RUN. DO NOT RUN UNLESS YOU ABSOLUTELY NECESSARY!!

<br>

# Feature Selection for Tree-Based Models
<br>

### Imports

In [2]:
import time
import json
from tqdm import tqdm
from itertools import compress
from collections import defaultdict
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import (
    SequentialFeatureSelector,
)  # requires sklearn 0.24 and above
from features import feature_pipeline
from catboost import CatBoostClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

### Data Loading

In [3]:
with open("../data/train_features.json", "r", encoding="utf-8") as f:
    train = pd.DataFrame(json.load(f))
with open("../data/val_features.json", "r", encoding="utf-8") as f:
    val = pd.DataFrame(json.load(f))
with open("../data/test_features.json", "r", encoding="utf-8") as f:
    test = pd.DataFrame(json.load(f))

# load X and y DataFrames
X_train = train.drop(["preprocessed_text", "level"], axis=1)
y_train = train["level"].tolist()
X_val = val.drop(["preprocessed_text", "level"], axis=1)
y_val = val["level"].tolist()
X_test = test.drop(["preprocessed_text", "level"], axis=1)
y_test = test["level"].tolist()

# Convert 3-class labels to binary labels
y_train_binary = []
for lvl in train["level"].tolist():
    y_train_binary.append(lvl[0])
y_val_binary = []
for lvl in val["level"].tolist():
    y_val_binary.append(lvl[0])
y_test_binary = []
for lvl in test["level"].tolist():
    y_test_binary.append(lvl[0])

In [4]:
print(y_val)

['B', 'A1', 'B', 'A2', 'A2', 'A1', 'A1', 'A1', 'B', 'A2', 'B', 'B', 'B', 'B', 'A2', 'A1', 'B', 'B', 'A1', 'A1', 'B', 'B', 'B', 'A1', 'A2', 'A2', 'A1', 'A1', 'B', 'A1', 'B', 'B']


In [5]:
print(y_val_binary)

['B', 'A', 'B', 'A', 'A', 'A', 'A', 'A', 'B', 'A', 'B', 'B', 'B', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B', 'B', 'A', 'A', 'A', 'A', 'A', 'B', 'A', 'B', 'B']


In [6]:
feat_names = list(X_train.columns)
print(f"All features:\n{feat_names}\n\nNumber of features: {len(feat_names)}")

scoring = accuracy_score

All features:
['total_tokens', 'total_tokens_w/o_stopwords', 'avg_sent_length', 'proportion_of_A_level_tokens', 'proportion_of_A_level_types', 'num_connectives', 'logical_operator_density', 'pronoun_density', 'type_token_ratio', 'avg_rank_of_lemmas_in_freq_list', 'fernandez_huerta_score', 'syllables_per_sentence', 'avg_degree_of_abstraction', 'min_degree_of_abstraction', 'avg_ambiguation_all_words', 'avg_ambiguation_content_words', 'noun_phrase_density', 'avg_parse_tree_depth', 'Fut', 'Imp', 'Past', 'Pres', 'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'EOL', 'SPACE', 'CONTENT', 'FUNCTION']

Number of features: 44


In [7]:
X_train.head()

Unnamed: 0,total_tokens,total_tokens_w/o_stopwords,avg_sent_length,proportion_of_A_level_tokens,proportion_of_A_level_types,num_connectives,logical_operator_density,pronoun_density,type_token_ratio,avg_rank_of_lemmas_in_freq_list,...,PROPN,PUNCT,SCONJ,SYM,VERB,X,EOL,SPACE,CONTENT,FUNCTION
0,2585,1118,35.410959,0.339893,0.18255,18,0.050386,0.047407,0.360155,684.322631,...,0.011605,0.109865,0.032882,0.0,0.07853,0.0,0.0,0.0,0.635376,0.364624
1,1539,622,14.941748,0.326367,0.214612,18,0.03497,0.067268,0.388564,629.897336,...,0.010396,0.152697,0.038337,0.005198,0.107862,0.0,0.0,0.0,0.621914,0.378086
2,159,70,22.714286,0.557143,0.395833,7,0.039216,0.143885,0.578616,611.383648,...,0.012579,0.075472,0.025157,0.0,0.169811,0.0,0.0,0.0,0.673469,0.326531
3,291,117,22.384615,0.307692,0.25,9,0.043011,0.024648,0.580756,547.487973,...,0.013746,0.189003,0.034364,0.0,0.089347,0.0,0.0,0.003436,0.629787,0.370213
4,401,193,21.105263,0.295337,0.23125,5,0.046997,0.028205,0.551122,528.177057,...,0.01995,0.164589,0.022444,0.0,0.109726,0.0,0.0,0.004988,0.654655,0.345345


### Set up Functions

In [8]:
def cross_validate(pipeline, X_train, y_train, X_val, y_val):
    """
    Fit a pipeline object on training data, and report training score,
    validation score, train time and prediction time.

    pipeline: (sklearn.Pipeline) Pipeline object
    X_train: (pandas.DataFrame) Training feature matrix
    y_train: (list) Training labels
    X_val: (pandas.DataFrame) Validation feature matrix
    y_val: (list) Validation labels

    return: (dict{float}) Dictionary of output results
    """
    start = time.time()
    pipeline.fit(X_train, y_train)
    train_time = time.time() - start

    train_score = pipeline.score(X_train, y=y_train)

    start = time.time()
    val_score = pipeline.score(X_val, y=y_val)
    val_pred_time = time.time() - start

    return {
        "Training_score": train_score,
        "Validation_score": val_score,
        "Training_time": train_time,
        "Prediction_time": val_pred_time,
    }

In [9]:
def cv_and_display(preprocessor, model, name, results_df, fine_grained=False):
    """
    Train a model pipeline and return the cross validation results.

    preprocessor: (sklearn.ColumnTransformer) sklearn object for feature transformation
    model: (sklearn.Classifier) Initialized sklearn classifier
    name: (str) Name that is shown when the result is displayed
    results_df: (dict) Dictionary to store cross-validation results
    fine_grained: (bool) If True, model trains with 3-class classification instead of 2. Default is False

    return: (dict) Dictionary of cross-validation results
    """
    # Create pipeline
    pipeline = make_pipeline(preprocessor, model)

    # Binary or 3-class classification
    if fine_grained:
        y_t = y_train
        y_v = y_val
    else:
        y_t = y_train_binary
        y_v = y_val_binary

    # Run cross-validation
    scores = cross_validate(pipeline, X_train, y_t, X_val, y_v)

    # Store to results dictionary
    results_df[name] = scores

    return results_df

### Forward feature selection
Forward feature selection (greedy) is the feature selection process accomplished through sklearn's `SequentialFeatureSelector` function. It determines the top _k_ features (_k_ is a variable) using feature importance determined by the model.

In [10]:
def perform_sfs_cv_and_display(
    preprocessor,
    model,
    name,
    results_df,
    n_features,
    direction,
    fine_grained=False,
):
    """
    Train a forward feature selection pipeline for the given model and
    return the cross validation results.

    preprocessor: (sklearn.ColumnTransformer) sklearn object for feature transformation
    model: (sklearn.Classifier) Initialized sklearn classifier
    name: (str) Name that is shown when the result is displayed
    results_df: (dict) Dictionary to store cross-validation results
    n_features: (int) Number of features to select with SequentialFeatureSelector
    direction: (str) {'forward', 'backward'} Forward or backward feature selection
    fine_grained: (bool) If True, model trains with 3-class classification instead of 2. Default is False

    return:
        (dict) Dictionary of cross-validation results
        (list) List of selected best features
    """
    # Initialize feature selector
    sfs = SequentialFeatureSelector(
        model,
        n_features_to_select=n_features,
        scoring="accuracy",
        direction=direction,
        cv=2,
        n_jobs=-1,
    )

    # Create feature selection pipeline with preprocessor and model
    sfs_pipeline = make_pipeline(preprocessor, sfs, model)

    # Binary or 3-class classification
    if fine_grained:
        y_t = y_train
        y_v = y_val
    else:
        y_t = y_train_binary
        y_v = y_val_binary

    # Fit pipeline
    sfs_pipeline.fit(X_train, y_t)

    # Features selected
    feats_selected = list(compress(X_train.columns, sfs_pipeline[1].get_support()))

    # Subset data for selected features
    cv_X_train = X_train[feats_selected]
    cv_X_val = X_val[feats_selected]

    # Create CV pipeline with selected features
    cv_preprocessor = make_column_transformer((StandardScaler(), feats_selected))
    pipeline = make_pipeline(cv_preprocessor, model)

    # Run cross-validation
    scores = cross_validate(pipeline, cv_X_train, y_t, cv_X_val, y_v)

    # Store to results dictionary
    results_df[f"{name} + {n_features} features"] = scores

    return results_df, feats_selected

<br>



## Train 5 baseline tree models for binary classification using the full set of features

In [10]:
# Dictionary for storing CV results
binary_results = {}

# Initialize preprocessor
preprocessor = make_column_transformer((StandardScaler(), feat_names))

# Dictionary of all classifiers
classifiers = {
    "DecisionTree": DecisionTreeClassifier(random_state=123, max_depth=8),
    "RandomForest": RandomForestClassifier(random_state=123),
    "LightGBM": LGBMClassifier(random_state=123),
    "XGBoost": XGBClassifier(random_state=123),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=123),
}

for (name, model) in classifiers.items():
    # Iterate over 8 depths if classifier is DecisionTree
    if name == "DecisionTree":
        for d in range(1, 9):
            model = DecisionTreeClassifier(random_state=123, max_depth=d)
            _name = f"{name} + depth {d}"

            results_df = cv_and_display(
                preprocessor, model, _name, binary_results, fine_grained=False
            )
    else:
        results_df = cv_and_display(
            preprocessor, model, name, binary_results, fine_grained=False
        )

display(pd.DataFrame(binary_results))





Unnamed: 0,DecisionTree + depth 1,DecisionTree + depth 2,DecisionTree + depth 3,DecisionTree + depth 4,DecisionTree + depth 5,DecisionTree + depth 6,DecisionTree + depth 7,DecisionTree + depth 8,RandomForest,LightGBM,XGBoost,CatBoost
Training_score,0.836576,0.863813,0.88716,0.922179,0.953307,0.984436,0.992218,0.996109,1.0,1.0,1.0,1.0
Validation_score,0.84375,0.875,0.875,0.75,0.75,0.8125,0.8125,0.75,0.8125,0.84375,0.8125,0.84375
Training_time,0.202458,0.08278,0.076802,0.098733,0.077797,0.061816,0.082778,0.095753,0.543545,0.782898,1.742374,13.19571
Prediction_time,0.023942,0.02194,0.02492,0.032917,0.021973,0.021931,0.027924,0.023925,0.102721,0.033911,0.034905,0.008976


### Loop through the 3 fastest models and the entire set of features to find the best combination of features

In [11]:
# Dictionary of classifiers to run feature selection on
classifiers = {
    "DecisionTree": DecisionTreeClassifier(random_state=123, max_depth=8),
    "RandomForest": RandomForestClassifier(random_state=123),
    "LightGBM": LGBMClassifier(random_state=123),
}

# Dictionary of best selected features
binary_feats = defaultdict(list)

for (name, model) in classifiers.items():
    # Iterate over all features
    for i in tqdm(
        range(1, len(feat_names)), desc=f"{name} Feature Selection", dynamic_ncols=True
    ):
        # Iterate over 8 depths if classifier is DecisionTree
        if name == "DecisionTree":
            for d in range(1, 9):
                model = DecisionTreeClassifier(random_state=123, max_depth=d)
                _name = f"{name} + depth {d}"

                # Run feature selection and CV
                results_df, feats = perform_sfs_cv_and_display(
                    preprocessor,
                    model,
                    _name,
                    binary_results,
                    n_features=i,
                    direction="forward",
                    fine_grained=False,
                )

                # Store best features to dictionary
                binary_feats[name].append(feats)
        else:
            results_df, feats = perform_sfs_cv_and_display(
                preprocessor,
                model,
                name,
                binary_results,
                n_features=i,
                direction="forward",
                fine_grained=False,
            )

            binary_feats[name].append(feats)

DecisionTree Feature Selection: 100%|██████████████████████████████████████| 43/43 [30:22<00:00, 42.38s/it]
RandomForest Feature Selection: 100%|███████████████████████████████████| 43/43 [2:08:04<00:00, 178.70s/it]
LightGBM Feature Selection: 100%|██████████████████████████████████████████| 43/43 [15:03<00:00, 21.02s/it]


In [12]:
# Sort by Validation_score and display
display(
    pd.DataFrame(binary_results).sort_values("Validation_score", 1, ascending=False)
)

Unnamed: 0,DecisionTree + depth 8 + 4 features,DecisionTree + depth 8 + 3 features,DecisionTree + depth 4 + 2 features,DecisionTree + depth 4 + 4 features,DecisionTree + depth 8 + 30 features,DecisionTree + depth 4 + 3 features,DecisionTree + depth 8 + 37 features,DecisionTree + depth 7 + 24 features,RandomForest + 10 features,DecisionTree + depth 2 + 39 features,...,DecisionTree + depth 8 + 20 features,DecisionTree + depth 6 + 13 features,DecisionTree + depth 7 + 35 features,DecisionTree + depth 8 + 18 features,DecisionTree + depth 5 + 15 features,DecisionTree + depth 8 + 17 features,DecisionTree + depth 5 + 17 features,DecisionTree + depth 5 + 19 features,DecisionTree + depth 6 + 14 features,DecisionTree + depth 5 + 40 features
Training_score,0.972763,0.937743,0.88716,0.898833,0.976654,0.891051,1.0,0.992218,1.0,0.863813,...,0.992218,0.976654,0.996109,0.996109,0.949416,0.996109,0.949416,0.953307,0.976654,0.957198
Validation_score,0.9375,0.9375,0.90625,0.90625,0.90625,0.90625,0.90625,0.90625,0.90625,0.875,...,0.71875,0.71875,0.71875,0.71875,0.71875,0.71875,0.6875,0.6875,0.6875,0.6875
Training_time,0.009972,0.011968,0.010971,0.005985,0.009974,0.007977,0.011966,0.008975,0.186503,0.008974,...,0.012964,0.008975,0.016954,0.009973,0.007977,0.006981,0.010971,0.010969,0.007978,0.009973
Prediction_time,0.005985,0.004987,0.005984,0.002995,0.00299,0.005983,0.00299,0.002992,0.013962,0.003989,...,0.003987,0.003989,0.00399,0.004985,0.004987,0.003991,0.002991,0.003991,0.004988,0.00698


### Best model and best features

In [13]:
print("Best model: Decision Tree of depth 8 with 4 features")
print(f"Selected features:\n{binary_feats['DecisionTree'][4*8-1]}")

Best model: Decision Tree of depth 8 with 4 features
Selected features:
['avg_sent_length', 'syllables_per_sentence', 'avg_parse_tree_depth', 'SYM']


### Evaluate best model on test data

In [14]:
selected_features = [
    "avg_sent_length",
    "syllables_per_sentence",
    "avg_parse_tree_depth",
    "SYM",
]
best_model = DecisionTreeClassifier(random_state=123, max_depth=8)
X_train_selected = X_train[selected_features]  # Select train data
X_test_selected = X_test[selected_features]  # Select test data
# cv with selected features
preprocessor = make_column_transformer((StandardScaler(), selected_features))
pipeline = make_pipeline(preprocessor, best_model)
# Train the model
pipeline.fit(X_train_selected, y_train_binary)

predicted_y_test_binary = pipeline.predict(X_test_selected)
print("Test accuracy %0.3f" % (accuracy_score(y_test_binary, predicted_y_test_binary)))
print("Test Set Classification Report:")
print(classification_report(y_test_binary, predicted_y_test_binary))

Test accuracy 0.844
Test Set Classification Report:
              precision    recall  f1-score   support

           A       0.93      0.76      0.84        17
           B       0.78      0.93      0.85        15

    accuracy                           0.84        32
   macro avg       0.85      0.85      0.84        32
weighted avg       0.86      0.84      0.84        32



<br>



## Train 5 baseline tree models for 3-class classification using the full set of features

In [11]:
# Dictionary for storing CV results
multiclass_results = {}

# Initialize preprocessor
preprocessor = make_column_transformer((StandardScaler(), feat_names))

# Dictionary of all classifiers
classifiers = {
    "DecisionTree": DecisionTreeClassifier(random_state=123, max_depth=8),
    "RandomForest": RandomForestClassifier(random_state=123),
    "LightGBM": LGBMClassifier(random_state=123),
    "XGBoost": XGBClassifier(random_state=123),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=123),
}

for (name, model) in classifiers.items():
    # Iterate over 8 depths if classifier is DecisionTree
    if name == "DecisionTree":
        for d in range(1, 9):
            model = DecisionTreeClassifier(random_state=123, max_depth=d)
            _name = f"{name} + depth {d}"

            results_df = cv_and_display(
                preprocessor, model, _name, multiclass_results, fine_grained=True
            )
    else:
        results_df = cv_and_display(
            preprocessor, model, name, multiclass_results, fine_grained=True
        )

display(pd.DataFrame(multiclass_results))





Unnamed: 0,DecisionTree + depth 1,DecisionTree + depth 2,DecisionTree + depth 3,DecisionTree + depth 4,DecisionTree + depth 5,DecisionTree + depth 6,DecisionTree + depth 7,DecisionTree + depth 8,RandomForest,LightGBM,XGBoost,CatBoost
Training_score,0.715953,0.754864,0.785992,0.859922,0.906615,0.964981,0.988327,1.0,1.0,1.0,1.0,1.0
Validation_score,0.75,0.75,0.75,0.71875,0.65625,0.6875,0.65625,0.65625,0.75,0.6875,0.625,0.75
Training_time,0.023933,0.014962,0.010967,0.023936,0.02134,0.037897,0.010868,0.015623,0.27126,0.291236,0.854683,26.44714
Prediction_time,0.006978,0.006984,0.015957,0.013963,0.017633,0.008932,0.0,0.015622,0.015621,0.0,0.015622,0.015621


### Loop through the 3 fastest models and the entire set of features to find the best combination of features

In [12]:
# Dictionary of classifiers to run feature selection on
classifiers = {
    "DecisionTree": DecisionTreeClassifier(random_state=123, max_depth=8),
    "RandomForest": RandomForestClassifier(random_state=123),
    "LightGBM": LGBMClassifier(random_state=123),
}

# Dictionary of best selected features
multiclass_feats = defaultdict(list)

for (name, model) in classifiers.items():
    # Iterate over all features
    for i in tqdm(
        range(1, len(feat_names)), desc=f"{name} Feature Selection", dynamic_ncols=True
    ):
        # Iterate over 8 depths if classifier is DecisionTree
        if name == "DecisionTree":
            for d in range(1, 9):
                model = DecisionTreeClassifier(random_state=123, max_depth=d)
                _name = f"{name} + depth {d}"

                # Run feature selection and CV
                results_df, feats = perform_sfs_cv_and_display(
                    preprocessor,
                    model,
                    _name,
                    multiclass_results,
                    n_features=i,
                    direction="forward",
                    fine_grained=True,
                )

                # Store best features to dictionary
                multiclass_feats[name].append(feats)
        else:
            results_df, feats = perform_sfs_cv_and_display(
                preprocessor,
                model,
                name,
                multiclass_results,
                n_features=i,
                direction="forward",
                fine_grained=True,
            )

            multiclass_feats[name].append(feats)

DecisionTree Feature Selection: 100%|██████████████████████████████████████| 43/43 [37:14<00:00, 51.95s/it]
RandomForest Feature Selection: 100%|███████████████████████████████████| 43/43 [2:11:37<00:00, 183.65s/it]
LightGBM Feature Selection: 100%|██████████████████████████████████████████| 43/43 [34:51<00:00, 48.65s/it]


In [13]:
# Sort by Validation_score and display
display(
    pd.DataFrame(multiclass_results).sort_values("Validation_score", 1, ascending=False)
)

Unnamed: 0,LightGBM + 33 features,LightGBM + 38 features,LightGBM + 37 features,LightGBM + 35 features,LightGBM + 24 features,LightGBM + 29 features,DecisionTree + depth 4 + 5 features,DecisionTree + depth 5 + 26 features,DecisionTree + depth 4 + 6 features,DecisionTree + depth 4 + 13 features,...,DecisionTree + depth 8 + 14 features,DecisionTree + depth 7 + 17 features,DecisionTree + depth 7 + 11 features,DecisionTree + depth 8 + 26 features,DecisionTree + depth 8 + 27 features,DecisionTree + depth 6 + 3 features,DecisionTree + depth 7 + 18 features,DecisionTree + depth 7 + 10 features,DecisionTree + depth 7 + 9 features,DecisionTree + depth 8 + 23 features
Training_score,1.0,1.0,1.0,1.0,1.0,1.0,0.832685,0.875486,0.832685,0.832685,...,0.968872,0.972763,0.968872,1.0,1.0,0.883268,0.972763,0.968872,0.968872,1.0
Validation_score,0.8125,0.8125,0.8125,0.8125,0.8125,0.8125,0.78125,0.78125,0.78125,0.78125,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.46875
Training_time,0.243261,0.269657,0.258646,0.253862,0.225428,0.236346,0.015661,0.0,0.015624,0.0,...,0.004046,0.016853,0.007352,0.007978,0.0,0.015656,0.015697,0.027638,0.006652,0.015622
Prediction_time,0.007247,0.008123,0.008116,0.008235,0.008207,0.009271,0.0,0.015619,0.0,0.007421,...,0.0,0.015655,0.002295,0.002992,0.015622,0.0,0.01694,0.015692,0.001997,0.0


### Best model and best features

In [14]:
print("Best model: LightGBM with 24 features")
print(f"Selected features:\n{multiclass_feats['LightGBM'][24-1]}")

Best model: LightGBM with 24 features
Selected features:
['avg_sent_length', 'pronoun_density', 'fernandez_huerta_score', 'syllables_per_sentence', 'avg_degree_of_abstraction', 'min_degree_of_abstraction', 'avg_ambiguation_all_words', 'avg_ambiguation_content_words', 'avg_parse_tree_depth', 'Past', 'AUX', 'CONJ', 'DET', 'NOUN', 'NUM', 'PART', 'PRON', 'PUNCT', 'SYM', 'VERB', 'X', 'EOL', 'SPACE', 'FUNCTION']


### Evaluate best model on test data

In [15]:
selected_features = [
    "avg_sent_length",
    "pronoun_density",
    "fernandez_huerta_score",
    "syllables_per_sentence",
    "avg_degree_of_abstraction",
    "min_degree_of_abstraction",
    "avg_ambiguation_all_words",
    "avg_ambiguation_content_words",
    "avg_parse_tree_depth",
    "Past",
    "AUX",
    "CONJ",
    "DET",
    "NOUN",
    "NUM",
    "PART",
    "PRON",
    "PUNCT",
    "SYM",
    "VERB",
    "X",
    "EOL",
    "SPACE",
    "FUNCTION",
]
best_model = LGBMClassifier(random_state=123)
X_train_selected = X_train[selected_features]  # Select train data
X_test_selected = X_test[selected_features]  # Select test data
# cv with selected features
preprocessor = make_column_transformer((StandardScaler(), selected_features))
pipeline = make_pipeline(preprocessor, best_model)
# Train the model
pipeline.fit(X_train_selected, y_train)

predicted_y_test = pipeline.predict(X_test_selected)
print("Test accuracy %0.3f" % (accuracy_score(y_test, predicted_y_test)))
print("Test Set Classification Report:")
print(classification_report(y_test, predicted_y_test))

Test accuracy 0.625
Test Set Classification Report:
              precision    recall  f1-score   support

          A1       0.62      0.45      0.53        11
          A2       0.29      0.33      0.31         6
           B       0.76      0.87      0.81        15

    accuracy                           0.62        32
   macro avg       0.56      0.55      0.55        32
weighted avg       0.63      0.62      0.62        32



<br>

## Save the predictions of the best model to a JSON file

In [16]:
output_dict = {"test_text": [], "prediction": [], "gold": []}
for i in range(len(y_test)):
    output_dict["test_text"].append(test["preprocessed_text"][i])
    output_dict["prediction"].append(predicted_y_test[i])
    output_dict["gold"].append(y_test[i])

In [17]:
pd.DataFrame(output_dict).head()

Unnamed: 0,test_text,prediction,gold
0,capítulo —¡paren ya de pelearse! —el hombre al...,A1,A1
1,"¡es con voz de la biblia, o verso de walt whit...",B,B
2,los cuatro hermanos un zapatero tenía cuatro h...,A2,A2
3,una mañana entró un caballero en la tienda de ...,A1,A1
4,había un viejo que tenía una hija muy hermosa....,A2,A1


In [18]:
with open("../predictions/lightgbm_test_pred.json", "w", encoding="utf-8") as fout:
    json.dump(output_dict, fout)