# Lightweight Fine-Tuning Project

In [21]:
!python --version

Python 3.13.5


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [22]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
import numpy as np
import datetime

In [23]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("rtweera/customer_care_emails")

# Split the train set into train/validation
train_valid = dataset["train"].train_test_split(test_size=0.2, seed=42)

# Inspect the dataset
print("Train sample:", train_valid["train"][0])
print("Validation sample:", train_valid["test"][0])

Train sample: {'subject': 'Re: URGENT: Mercury Language Integration with IAM Failing in Production! - Additional Information', 'sender': 'john.doe@example.com', 'receiver': 'support@aetheros.com', 'timestamp': '2023-10-27T15:22:58Z', 'message_body': 'Hi,\n\nPlease find the information you requested below:\n\n* Mercury Language runtime version: 1.2.3\n* Code snippet: [provided code snippet demonstrating IAM integration]\n* Error log: [provided error log excerpt]\n\nWe are still experiencing this critical issue and our application remains down. Please prioritize this and let us know as soon as you have any updates.\n\nThanks,\nJohn', 'thread_id': 'aa014-886cf4aa-113b-44d2-a288-82e4c47844a0', 'email_types': "['issue']", 'email_status': 'ongoing', 'email_criticality': 'high', 'product_types': "['Mercury Language', 'IAM service']", 'agent_effectivity': 'low', 'agent_efficiency': 'very low', 'customer_satisfaction': 0.014}
Validation sample: {'subject': 'Re: Suggestion for IAM role inheritan

In [24]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(examples):
    return tokenizer(examples["email_types"], truncation=True)

tokenized_train = train_valid["train"].map(tokenize, batched=True)
tokenized_test = train_valid["test"].map(tokenize, batched=True)


Map:   0%|          | 0/1807 [00:00<?, ? examples/s]

Map:   0%|          | 0/452 [00:00<?, ? examples/s]

In [25]:
# Define label mappings
# Preprocess labels: map 'email_types' strings to integers and rename to 'label'
label_list = sorted(set(train_valid['train']['email_types']))
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

def preprocess_labels(example):
    example['label'] = label2id[example['email_types']]
    return example

# Apply to train and test splits
train_valid['train'] = train_valid['train'].map(preprocess_labels)
train_valid['test'] = train_valid['test'].map(preprocess_labels)

# Check the result
print(train_valid['train'][0])

Map:   0%|          | 0/1807 [00:00<?, ? examples/s]

Map:   0%|          | 0/452 [00:00<?, ? examples/s]

{'subject': 'Re: URGENT: Mercury Language Integration with IAM Failing in Production! - Additional Information', 'sender': 'john.doe@example.com', 'receiver': 'support@aetheros.com', 'timestamp': '2023-10-27T15:22:58Z', 'message_body': 'Hi,\n\nPlease find the information you requested below:\n\n* Mercury Language runtime version: 1.2.3\n* Code snippet: [provided code snippet demonstrating IAM integration]\n* Error log: [provided error log excerpt]\n\nWe are still experiencing this critical issue and our application remains down. Please prioritize this and let us know as soon as you have any updates.\n\nThanks,\nJohn', 'thread_id': 'aa014-886cf4aa-113b-44d2-a288-82e4c47844a0', 'email_types': "['issue']", 'email_status': 'ongoing', 'email_criticality': 'high', 'product_types': "['Mercury Language', 'IAM service']", 'agent_effectivity': 'low', 'agent_efficiency': 'very low', 'customer_satisfaction': 0.014, 'label': 1}


In [27]:
# Initialize the model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Verify the model
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [29]:
# Freeze the model parameters
for param in model.base_model.parameters():
    param.requires_grad = False

# Print parameters
total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_params:,} total parameters, including {total_trainable_params:,} trainable parameters.")


66,956,548 total parameters, including 593,668 trainable parameters.


In [30]:
# Prepare for training
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

tokenized_train = train_valid["train"].map(tokenize, batched=True)
tokenized_test = train_valid["test"].map(tokenize, batched=True)
data_collator = DataCollatorWithPadding(tokenizer, padding=True)
training_args = TrainingArguments(
    output_dir=f"./results/{model_name}/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=550,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="epoch",
    eval_strategy="epoch", 
    save_total_limit=1,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print(tokenized_test)

Map:   0%|          | 0/1807 [00:00<?, ? examples/s]

Map:   0%|          | 0/452 [00:00<?, ? examples/s]

Dataset({
    features: ['subject', 'sender', 'receiver', 'timestamp', 'message_body', 'thread_id', 'email_types', 'email_status', 'email_criticality', 'product_types', 'agent_effectivity', 'agent_efficiency', 'customer_satisfaction', 'label', 'input_ids', 'attention_mask'],
    num_rows: 452
})


In [31]:
results = trainer.evaluate()
print(results)



{'eval_loss': 1.4088151454925537, 'eval_model_preparation_time': 0.0011, 'eval_accuracy': 0.0, 'eval_runtime': 1.0078, 'eval_samples_per_second': 448.503, 'eval_steps_per_second': 14.884}


## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

###  ⚠️ IMPORTANT ⚠️

Due to workspace storage constraints, you should not store the model weights in the same directory but rather use `/tmp` to avoid workspace crashes which are irrecoverable.
Ensure you save it in /tmp always.

In [None]:
# Saving the model
model.save("/tmp/your_model_name")

AttributeError: 'DistilBertForSequenceClassification' object has no attribute 'save'

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.