In [8]:
# Enable IPython's autoreload extension; mode 2 makes edits to imported
# project modules get picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
from mlflow.models import infer_signature
from mlflow.models.model import ModelInfo
from label_legends.preprocess import holdout, load_test, tfidf_pipeline, token_lists, transform, load_test
from xgboost import XGBClassifier
import logging
import numpy as np
import mlflow
import smac
from smac import HyperparameterOptimizationFacade, Scenario
from ConfigSpace import ConfigurationSpace, Configuration
from label_legends.util import SEED

from label_legends.result import calculate_scores, download_predictions, get_experiment, load_predictions, client, get_current

# Arrays with more than 1000 elements are summarised instead of printed in full.
np.set_printoptions(threshold=1000)
# Configure the root logger at INFO level so library log records show up in cell output.
logging.basicConfig(level=logging.INFO)

# Load dataset

In [10]:
# holdout() is unpacked as (validation, training), in that order.
# NOTE(review): the order is taken from this cell only — confirm against label_legends.preprocess.holdout.
val_raw, tra_raw = holdout()

# Apply the shared preprocessing to each split, training split first.
tra = transform(tra_raw)
val = transform(val_raw)

# The test set is collected before being transformed
# (presumably load_test() returns a lazy frame — confirm).
test_raw = load_test().collect()
test = transform(test_raw)

In [11]:
# Preview the first rows of the transformed training split.
tra.head()

id,text,tokens,token_ids,label
i64,str,list[str],list[i64],i64
26016,"""#VoteDemOut seditionist wanna …","[""#"", ""votedemout"", … ""ven""]","[7, 3000, … 3000]",0
41060,"""Cerno hops from wave to wave: …","[""cerno"", ""hop"", … "".""]","[3000, 3000, … 26]",1
35766,"""think about it.. he´s called J…","[""think"", "".."", … ""elect""]","[2690, 27, … 912]",0
23678,"""Hmm, you could rewrite this wi…","[""hmm"", "","", … ""justsaying""]","[1286, 22, … 3000]",0
52396,"""Every girl in a game, or on si…","[""girl"", ""game"", … "".""]","[1166, 1137, … 26]",0


# Generate Features
Because we are using a traditional machine-learning approach, we have to create the features ourselves. For XGBoost we use tf-idf features: one value for each word of the text that is also in the vocabulary.
We reuse the previously created vocabulary and compute the tf-idf values from our corpus.

In [12]:
# Fit the tf-idf pipeline on the token lists of the training split only.
# NOTE(review): the fitted object is not bound to a name; the cells below call
# tfidf_pipeline() again, so this presumably returns a shared (cached)
# pipeline instance — confirm in label_legends.preprocess.
tfidf_pipeline().fit(token_lists(tra))



In [26]:
def tfidf_features(frame):
    """Return the tf-idf feature matrix for the token lists of ``frame``."""
    return tfidf_pipeline().transform(token_lists(frame))


# Feature matrices for the training and test splits.
X_train = tfidf_features(tra)
X_test = tfidf_features(test)

# Ground-truth test labels as a plain Python list.
y_true = test.select("label").to_series().to_list()

# Fit model and predict

Create an XGBClassifier object, fit it to the training data, and optimize its hyperparameters based on the validation results.
Once we have decided on a parameter configuration, we test the model against the test set.

In [36]:
# Validation features and labels used to score candidate configurations, so
# the test set stays untouched until the final evaluation.
# NOTE(review): like the feature cell, this relies on tfidf_pipeline()
# returning the pipeline that was fitted on the training split — confirm.
X_val = tfidf_pipeline().transform(token_lists(val))
y_val = val["label"].to_list()


def train(config: Configuration, seed: int = 0) -> float:
    """Train one XGBoost candidate and return its cost for SMAC.

    Args:
        config: Sampled configuration providing ``booster``, ``eta``,
            ``gamma`` and ``max_depth``.
        seed: Random seed passed to the classifier.

    Returns:
        ``1 - F-score`` on the validation split. SMAC minimises the value
        returned by the target function, so the F-score has to be turned into
        a cost; returning the raw F-score would make SMAC search for the
        worst-performing configuration.
    """
    clf = XGBClassifier(
        booster=config["booster"],
        # scikit-learn API names for XGBoost's `eta` and `seed` parameters.
        learning_rate=config["eta"],
        gamma=config["gamma"],
        max_depth=config["max_depth"],
        random_state=seed,
        device="cpu",
    )
    clf.fit(X_train, tra.select("label").to_series().to_torch())

    # Score on the validation split: tuning against the test split would leak
    # test information into model selection.
    y_pred = clf.predict(X_val)
    scores = calculate_scores(y_val, y_pred)
    return 1.0 - scores.fscore

In [44]:
# Search space: a list defines a categorical choice, a (low, high) tuple of
# floats a float range, and a (low, high) tuple of ints an integer range.
configspace = ConfigurationSpace({
    "booster": ["gbtree", "gblinear", "dart"],
    "eta": (0.1, 0.5),
    "gamma": (0.0, 1000.0),
    "max_depth": (3, 15),
})

# NOTE(review): walltime_limit is given in seconds, so the search is cut off
# after 2 s regardless of n_trials — confirm this is intended.
# NOTE(review): max_budget presumably only matters for multi-fidelity
# facades — confirm whether it is needed here.
scenario = Scenario(
    configspace,
    deterministic=True,
    n_trials=10,
    seed=SEED,
    walltime_limit=2,
    max_budget=3,
)

# The facade is bound to `optimizer` rather than `smac`, so the imported
# `smac` module is not shadowed for the rest of the session.
# SMAC minimises the value returned by the target function `train`.
optimizer = HyperparameterOptimizationFacade(scenario, train)
incumbent = optimizer.optimize()

[INFO][abstract_initial_design.py:95] Reducing the number of initial configurations from 40 to 2 (max_ratio == 0.25).


[INFO][abstract_initial_design.py:147] Using 2 initial design configurations and 0 additional configurations.


[INFO][abstract_intensifier.py:306] Using only one seed for deterministic scenario.


[INFO][abstract_intensifier.py:516] Added config b5b119 as new incumbent because there are no incumbents yet.


KeyboardInterrupt: 

In [38]:
# Display the incumbent configuration returned by the SMAC optimisation run.
incumbent

Configuration(values={
  'booster': 'gbtree',
  'eta': 0.4338182199746,
  'gamma': 52.7922287583351,
  'max_depth': 3,
})

In [10]:
# XGBoost with its default hyperparameters, fitted on the training features.
y_train = tra.select("label").to_series().to_torch()
clf = XGBClassifier().fit(X_train, y_train)

# Score the predictions for the test features against the true test labels.
y_pred = clf.predict(X_test)
scores = calculate_scores(y_true, y_pred)
print(scores)

precision:	0.7639
recall:		0.3884
fscore:		0.5150
accuracy:	0.8101
tn: 8511	 fp: 374
fn: 1905	 tp: 1210


# Report model to MLflow

In [28]:
# Resolve the target experiment before opening the run.
experiment_id = get_experiment().experiment_id

with mlflow.start_run(experiment_id=experiment_id, run_name="xgboost testing"):
    # Hyperparameters, evaluation metrics and a tag naming the model family.
    mlflow.log_params(clf.get_xgb_params())
    mlflow.log_metrics(scores.asdict())
    mlflow.set_tag("model", "XGBoost")

    # Model signature inferred from the training features and the
    # predictions the model makes for them.
    signature = infer_signature(X_train, clf.predict(X_train))

    # Per-example test predictions, stored as a table artifact of the run.
    predictions = {"id": test["id"], "prediction": y_pred}
    mlflow.log_table(predictions, "predictions.json")

    # Log the model and register it under the name "xgboost".
    model_info: ModelInfo = mlflow.xgboost.log_model(
        clf,
        "",
        signature=signature,
        registered_model_name="xgboost",
    )

# Point the "current" alias at the model version that was just registered.
client().set_registered_model_alias(
    "xgboost",
    "current",
    model_info.registered_model_version,
)



Registered model 'xgboost' already exists. Creating a new version of this model...
Created version '15' of model 'xgboost'.


In [35]:
# Look up the registered "xgboost" model version via get_current
# (presumably the version behind the "current" alias — confirm in label_legends.result).
get_current("xgboost")

<ModelVersion: aliases=['current'], creation_timestamp=1734021320202, current_stage='None', description=None, last_updated_timestamp=1734021320202, name='xgboost', run_id='d70eb3e017dc448ba2892b095b2f9783', run_link=None, source='/home/lukas/Programming/uni/nlp-ie-label-legends/mlruns/label-legends/d70eb3e017dc448ba2892b095b2f9783/artifacts', status='READY', status_message=None, tags={}, user_id=None, version=15>

In [42]:
# Download the most recent predictions of the model
# NOTE(review): presumably fetches the "predictions.json" table logged with
# the model's run — confirm in label_legends.result.
download_predictions("xgboost")

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 1860.83it/s]




In [41]:
# Preview the first rows of the stored predictions for the "xgboost" model.
load_predictions("xgboost").head()

id,prediction
i64,i64
100,0
10005,1
10006,1
10007,1
10008,0
