In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from label_legends.preprocess import load_test, transform, holdout
from label_legends.result import calculate_scores, get_experiment, load_predictions, client, get_current
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AutoConfig, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

import json
import torch
import mlflow
import polars as pl
import logging
import time
import numpy as np

logging.basicConfig(level=logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
test = transform(load_test().collect())
val, tra = holdout()
tra = transform(tra)
val = transform(val)



In [4]:
MODEL_NAME = 'distilbert-base-uncased'
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer =DistilBertTokenizer.from_pretrained(MODEL_NAME)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
class SexistDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    

train_texts = tra['text'].to_list()
train_labels = tra['label'].to_list()

val_texts = val['text'].to_list()
val_labels = val['label'].to_list()

test_texts = test['text'].to_list()
test_labels = test['label'].to_list()

train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt",)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors="pt",)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt",)

train_dataset = SexistDataset(train_encodings, train_labels)
val_dataset = SexistDataset(val_encodings, val_labels)
test_dataset = SexistDataset(test_encodings, test_labels)

In [6]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="micro")
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define the Training Arguments
training_args = TrainingArguments(
    output_dir='./results_distil',
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    # weight_decay=0.01,
    logging_dir='./logs',
    # save_total_limit=2,
    evaluation_strategy="epoch",
    logging_steps=100,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)



In [8]:
start_time = time.time()
train_output = trainer.train()  # Returns a TrainOutput object
training_time_manual = time.time() - start_time
train_metrics = train_output.metrics  # Contains train_runtime, train_loss, etc.

# Assume trainer.evaluate() has been run and returned results
eval_results = trainer.evaluate()  # Contains eval_runtime, eval_accuracy, etc.

# Combine training and evaluation metrics
#all_metrics = {**train_metrics, **eval_results}
metrics = {**eval_results}

# Create a Polars DataFrame
df = pl.DataFrame(metrics)
df

  0%|          | 2/11025 [01:48<165:58:23, 54.21s/it]
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


KeyboardInterrupt: 

In [None]:
predicted_test = trainer.predict(test_dataset)

y_pred = np.argmax(predicted_test.predictions, axis=1)
y_true = predicted_test.label_ids

In [None]:

conf_matrix = confusion_matrix(y_true, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
disp.plot(cmap=plt.cm.Blues)

plt.title("Confusion Matrix")
plt.show()

In [None]:
scores = calculate_scores(y_true, y_pred)
print(scores)

In [None]:
mlflow.end_run()
with mlflow.start_run(
        experiment_id=get_experiment().experiment_id, 
        run_name="distilbert testing"
        ):
    mlflow.log_param("model_name", MODEL_NAME)
    mlflow.log_param("learning_rate", 5e-5)

    mlflow.log_metrics(scores.asdict())
    mlflow.set_tag("model", "distilbert")

    mlflow.log_table({"id": test["id"], "prediction": y_pred}, "predictions_distilbert.json")

    #log the model
    prediction_data = {
        "columns": ["id", "prediction"],
        "data": [[id_, pred.tolist()] for id_, pred in zip(test["id"], y_pred)]
    }
    # Save to a JSON file
    predictions_file = r"../resource/mlflow/distilbert/predictions.json"
    with open(predictions_file, "w") as f:
        json.dump(prediction_data, f, indent=4)
        
    # Log the JSON file as an artifact
    mlflow.log_artifact(predictions_file)
        
    # Simulate model registration
    mlflow.set_tag("registered_model_name", "distilbert")
    mlflow.set_tag("model_alias", "current")

    # Register the model
    model_uri = f"runs:/{mlflow.active_run().info.run_id}/predictions_distilbert.json"
    registered_model = mlflow.register_model(model_uri=model_uri, name="distilbert") 


client().set_registered_model_alias("distilbert", "current", registered_model.version)