In [24]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [25]:
!nvidia-smi

Thu May 16 09:39:02 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [26]:
!pip install datasets evaluate accelerate



In [27]:
import pandas as pd
from datasets import Dataset
import evaluate
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    AutoTokenizer,
    PreTrainedTokenizer,
    DataCollatorWithPadding
)
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
from typing import Dict, Any, Callable, Tuple
from datetime import datetime
import torch

In [28]:
def print_confusion_matrix(actual: np.array, predictions: np.array) -> None:
    """
    Print the confusion matrix along with precision, recall, and accuracy metrics.

    Args:
        actual (np.ndarray): Array of actual target values.
        predictions (np.ndarray): Array of predicted target values.
    """
    cm = confusion_matrix(actual, predictions)

    classes = ['Neg', 'Pos']

    print('            Predicted:')
    print('            | {:<4} | {:<4} |'.format(*classes))
    print('-----------------------------')
    for i, row in enumerate(cm):
        print('Actual {:<4} | {:<4} | {:<4} |'.format(classes[i], *row))
    print()

    precision = precision_score(actual, predictions, average='binary')
    recall = recall_score(actual, predictions, average='binary')
    accuracy = accuracy_score(actual, predictions)

    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"Accuracy: {accuracy:.3f}")



def tokenize(df: pd.DataFrame, tokenizer: PreTrainedTokenizer) -> Dict[str, Any]:
    """Preprocesses the text data in a DataFrame using a specified tokenizer.

    Args:
        df (pd.DataFrame): A DataFrame containing a column 'text' with text data to be tokenized.
        tokenizer (PreTrainedTokenizer): A tokenizer instance from the transformers library.

    Returns:
        Dict[str, Any]: A dictionary containing tokenized representations of the text data.
    """
    return tokenizer(df["text"], truncation=True, max_length=512)


def create_compute_metrics() -> Callable[[Tuple[np.ndarray, np.ndarray]], Dict[str, float]]:
    """
    Creates a compute_metrics function that computes accuracy.

    Returns:
        Callable[[Tuple[np.ndarray, np.ndarray]], Dict[str, float]]: A function that computes accuracy
        for predictions and labels.
    """
    accuracy_metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred: Tuple[np.ndarray, np.ndarray]) -> Dict[str, float]:
        """
        Computes accuracy for predictions and labels.

        Args:
            eval_pred (Tuple[np.ndarray, np.ndarray]): A tuple containing predictions and labels.

        Returns:
            Dict[str, float]: A dictionary with the accuracy score.
        """
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return accuracy_metric.compute(predictions=predictions, references=labels)

    return compute_metrics


def get_current_datetime() -> str:
    """Get the current date and time in the format yyyymmdd-hhmmss."""
    current_datetime = datetime.now()
    formatted_datetime = current_datetime.strftime("%Y%m%d-%H%M")
    return formatted_datetime

In [None]:
import os
os.environ["TRANSFORMERS_VERBOSITY"] = "info"

In [37]:
# ASF 7, AHL and AFS 5 epochs
MISSION = "AHL"
EPOCHS = 5

In [38]:
# Set paths
afs_train_path = '/content/drive/My Drive/Colab Notebooks/afs_train.csv'
afs_val_path = '/content/drive/My Drive/Colab Notebooks/afs_val.csv'
ahl_train_path = '/content/drive/My Drive/Colab Notebooks/ahl_train.csv'
ahl_val_path = '/content/drive/My Drive/Colab Notebooks/ahl_val.csv'
asf_train_path = '/content/drive/My Drive/Colab Notebooks/asf_train.csv'
asf_val_path = '/content/drive/My Drive/Colab Notebooks/asf_val.csv'

In [39]:
# Load datasets
if MISSION == "AHL":
  train_df = pd.read_csv(ahl_train_path)
  val_df = pd.read_csv(ahl_val_path)
elif MISSION == "AFS":
  train_df = pd.read_csv(afs_train_path)
  val_df = pd.read_csv(afs_val_path)
elif MISSION == "ASF":
  train_df = pd.read_csv(asf_train_path)
  val_df = pd.read_csv(asf_val_path)

In [40]:
# Create train and val HF datasets
train_hf_ds = Dataset.from_pandas(train_df.assign(label=lambda x: x.relevant))
val_hf_ds = Dataset.from_pandas(val_df.assign(label=lambda x: x.relevant))

In [41]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")



In [42]:
# Tokenize train and val HF datasets
tokenized_train_hf_ds = train_hf_ds.map(lambda x: tokenize(x, tokenizer), batched=True)
tokenized_val_hf_ds = val_hf_ds.map(lambda x: tokenize(x, tokenizer), batched=True)

Map:   0%|          | 0/2918 [00:00<?, ? examples/s]

Map:   0%|          | 0/516 [00:00<?, ? examples/s]

In [43]:
# Initialize the data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define mappings between label IDs and label names
id2label = {0: "NOT RELEVANT", 1: "RELEVANT"}
label2id = {"NOT RELEVANT": 0, "RELEVANT": 1}

# Load the pre-trained DistilBERT model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

# Retrieve the current date and time
date_time = get_current_datetime()

# Define a function to compute evaluation metrics during training.
compute_metrics = create_compute_metrics()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
# Set training arguments
training_args = TrainingArguments(
    output_dir=f"/content/drive/My Drive/Colab Notebooks/{MISSION}_distillbert_{date_time}",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

In [45]:
# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_hf_ds,
    eval_dataset=tokenized_val_hf_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.063661,0.98062
2,No log,0.035164,0.988372
3,No log,0.030208,0.988372
4,No log,0.027335,0.988372
5,No log,0.030912,0.988372


TrainOutput(global_step=460, training_loss=0.06775602257770041, metrics={'train_runtime': 719.7602, 'train_samples_per_second': 20.271, 'train_steps_per_second': 0.639, 'total_flos': 1932699346391040.0, 'train_loss': 0.06775602257770041, 'epoch': 5.0})

In [46]:
# Store number of steps
global_step = trainer.state.global_step

In [48]:
# Load model for inference classification
model_path = f"/content/drive/My Drive/Colab Notebooks/{MISSION}_distillbert_{date_time}/checkpoint-{global_step}"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, truncation=True, max_length=512)

In [49]:
%%time
# Make predictions on validation set
predictions = classifier(tokenized_val_hf_ds["text"])

CPU times: user 3min 29s, sys: 8.13 s, total: 3min 37s
Wall time: 3min 43s


In [51]:
# Make y_preds
val_y_preds = np.array([label2id[pred["label"]] for pred in predictions])

In [52]:
# Make y_actual
val_y_actual = val_df.relevant.values

In [53]:
# Compare actuals vs. predictions in confusion matrix
print_confusion_matrix(val_y_actual, val_y_preds)

            Predicted:
            | Neg  | Pos  |
-----------------------------
Actual Neg  | 247  | 4    |
Actual Pos  | 2    | 263  |

Precision: 0.985
Recall: 0.992
Accuracy: 0.988
