In [None]:
import pandas as pd
from sklearn.metrics import ConfusionMatrixDisplay, f1_score
from lightgbm import LGBMClassifier

In [None]:
# Load the labelled table, keep only the three columns used below, and reduce
# it to one row per id. groupby sorts by id and first() keeps the first
# non-null value of each column within a group; reset_index(drop=True) then
# discards the id and leaves a plain 0..n-1 index.
df = (
    pd.read_parquet("data/california_sits_bert_original.parquet")
    .loc[:, ["id", "label", "use_bert"]]
    .groupby("id")
    .first()
    .reset_index(drop=True)
)
features = pd.read_parquet("data/california_sits_bert_features.parquet")

# Every column of the feature table is treated as a model input.
train_columns = features.columns.to_list()

# Column-wise join; pd.concat(axis=1) aligns rows on index labels.
# NOTE(review): this assumes `features` carries a 0..n-1 index whose row order
# matches the id-sorted rows of `df` -- confirm against how the feature file
# was produced, otherwise labels and features are silently misaligned.
df = pd.concat([df, features], axis=1)

# The standalone feature frame is no longer needed after the join.
del features

In [None]:
# Train on every row whose use_bert flag is not 2 (the rows flagged 2 are the
# ones the evaluation cell predicts on).
train_mask = df.use_bert != 2
X_train = df.loc[train_mask, train_columns].to_numpy()
y_train = df.loc[train_mask, "label"].to_numpy()

# verbosity=-1 keeps LightGBM's training log quiet; n_jobs=-1 asks for all
# available cores.
model = LGBMClassifier(verbosity=-1, n_jobs=-1)
model.fit(X_train, y_train)

In [None]:
# Evaluate on the rows flagged use_bert == 2 (the rows excluded from fitting)
# and plot a confusion matrix whose title reports the macro-averaged F1 score,
# shown with one decimal.
is_test = df.use_bert == 2  # build the mask once instead of re-filtering df for every use
y_true = df.loc[is_test, "label"]
y_pred = model.predict(df.loc[is_test, train_columns].to_numpy())
f1 = f1_score(y_true, y_pred, average="macro")

# normalize="pred" scales each predicted-class column so it sums to 1.
disp = ConfusionMatrixDisplay.from_predictions(y_true, y_pred, normalize="pred")

# Format the score to one decimal as the description above states; without a
# format spec the raw float would be printed at full precision. Raise the
# precision here (e.g. ":.3f") if a finer reading is wanted.
# The trailing ";" keeps the Text(...) repr out of the cell output.
disp.ax_.set_title(f"f1 score: {f1:.1f}");

In [None]:
# Pair each input column with the importance the fitted model assigned to it,
# rank the pairs from most to least important, and display the top 20.
importances = pd.DataFrame(
    {"feature": train_columns, "importance": model.feature_importances_}
).sort_values("importance", ascending=False)

importances.head(20)