In [1]:
!pip install transformers # supports Transformer-based models
!pip install datasets # datasets for experiments
!pip install evaluate # evaluation metrics for experiments
!pip install transformers[torch] # backend for training

import pandas as pd # data manipulation & storage
from tqdm.auto import tqdm
from transformers.utils import logging
from transformers import  set_seed # fix random seed
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset and submission paths used
# by later cells live under this mount point.
drive.mount('/content/drive')
# Raise the transformers log threshold so that only errors are printed.
logging.set_verbosity_error()
# Fix the random seed once, before any split or model is created.
set_seed(0)

# NOTE(review): output_path is not referenced by any later cell visible here
# (they use '/content/drive/MyDrive/...' and 'tmp/') — confirm it is still needed.
output_path = '/content/drive/My Drive/atd'

Collecting datasets
  Downloading datasets-2.17.0-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-15.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (38.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow, dill, multiprocess, datasets
  Attempting uninstall: pyarrow
    Found exis

In [54]:
from datasets import load_dataset

# Load the labelled development CSV; it is indexed below through its 'train' key.
dev_ds = load_dataset('csv', data_files='/content/drive/MyDrive/Colab Notebooks/generative-ai/Week4/dev.csv')

# Hold out 20% of the rows for validation. The seed is passed explicitly so the
# split does not depend on the global RNG state, i.e. on which cells happened
# to run before this one.
dev_split = dev_ds['train'].train_test_split(test_size=0.2, seed=0)
# Index by name rather than relying on the order of .values().
train_ds, val_ds = dev_split['train'], dev_split['test']

In [57]:
val_ds

Dataset({
    features: ['ID', 'Text', 'Class'],
    num_rows: 400
})

In [38]:
# Load the unlabelled test CSV; later cells read it through the 'train' key.
test_ds = load_dataset('csv', data_files='/content/drive/MyDrive/Colab Notebooks/generative-ai/Week4/test.csv')

In [39]:
test_ds

DatasetDict({
    train: Dataset({
        features: ['ID', 'Text'],
        num_rows: 20000
    })
})

In [58]:
from datasets import DatasetDict

# Collect the three splits under one DatasetDict so they can be processed
# together; the test CSV was loaded under the key 'train', hence the lookup.
ds = DatasetDict()
ds['train'] = train_ds
ds['validation'] = val_ds
ds['test'] = test_ds['train']

In [59]:
ds

DatasetDict({
    train: Dataset({
        features: ['ID', 'Text', 'Class'],
        num_rows: 1600
    })
    validation: Dataset({
        features: ['ID', 'Text', 'Class'],
        num_rows: 400
    })
    test: Dataset({
        features: ['ID', 'Text'],
        num_rows: 20000
    })
})

In [60]:
# class ID -> label string
id2label = dict(enumerate(('H', 'M')))

# label string -> class ID, built as the inverse of id2label
label2id = {label: class_id for class_id, label in id2label.items()}


In [61]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def preprocess(batch):
    """Tokenize a batch of rows and attach integer labels when they exist.

    Args:
        batch: Mapping of column name to a list of values, as handed over by
            ``ds.map(..., batched=True)``. Must contain 'Text'; 'Class' is
            only present for the labelled splits.

    Returns:
        The tokenizer output for ``batch['Text']`` (truncated to at most 128
        tokens), with an extra 'label' list of class IDs when the batch has a
        'Class' column.

    Raises:
        KeyError: If a 'Class' value is not a key of ``label2id``.
    """
    # No padding at this stage: padding here would pad every row to the longest
    # row of the whole map batch and store those pad tokens in the dataset.
    # Sequences keep their own length and are padded per batch by the data collator.
    tokenized_batch = tokenizer(batch['Text'], truncation=True, max_length=128)
    # The test split has no 'Class' column, so labels are added only when present.
    if 'Class' in batch:
        tokenized_batch['label'] = [label2id[label] for label in batch['Class']]
    return tokenized_batch


# Apply preprocessing to every split of the DatasetDict
tokenized_ds = ds.map(preprocess, batched=True)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [62]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['ID', 'Text', 'Class', 'input_ids', 'attention_mask', 'label'],
        num_rows: 1600
    })
    validation: Dataset({
        features: ['ID', 'Text', 'Class', 'input_ids', 'attention_mask', 'label'],
        num_rows: 400
    })
    test: Dataset({
        features: ['ID', 'Text', 'input_ids', 'attention_mask'],
        num_rows: 20000
    })
})

In [63]:
from transformers import DataCollatorWithPadding # import the DataCollatorWithPadding class from the transformers package

# Collator that pads the examples of each batch with the given tokenizer;
# it is handed to the Trainer through its data_collator argument.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [64]:
import evaluate # import the evaluate package

accuracy = evaluate.load('accuracy') # accuracy is the main metric; compute_metrics calls accuracy.compute on each evaluation pass

In [65]:
import numpy as np # import the numpy package

def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one
            row of per-class scores per example; ``labels`` holds the gold
            class IDs.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max predictions
        compared against the gold labels.
    """
    class_scores, gold_labels = eval_pred

    # the highest-scoring class along axis 1 is taken as the predicted class ID
    predicted_ids = np.argmax(class_scores, axis=1)

    return accuracy.compute(references=gold_labels, predictions=predicted_ids)

In [66]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer # import necessary components from the transformers library

# Load the same checkpoint name as the tokenizer ('distilbert-base-uncased') as a
# sequence-classification model with two output labels. The id2label / label2id
# mappings are passed along so class IDs can be related to 'H' / 'M'.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

In [74]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [71]:
# define the training arguments for the model
training_args = TrainingArguments(
    output_dir='tmp/',                            # directory to save the model and results
    learning_rate=2e-5,                           # learning rate for optimization
    per_device_train_batch_size=32,               # batch size per device for training
    per_device_eval_batch_size=32,                # batch size per device for evaluation
    num_train_epochs=10,                          # number of training epochs
    weight_decay=0.01,                            # weight decay for regularization
    evaluation_strategy='epoch',                  # evaluate on the validation set after every epoch
    save_strategy='epoch',                        # save a checkpoint after every epoch (must match the evaluation strategy)
    load_best_model_at_end=True,                  # reload the best checkpoint once training finishes
    # Pick the "best" checkpoint by validation accuracy, the metric reported by
    # compute_metrics. Without this the Trainer selects by evaluation loss, which
    # can favour a different epoch than the one with the highest accuracy.
    metric_for_best_model='accuracy',
    greater_is_better=True,                       # higher accuracy is better
)

# initialize the Trainer with the necessary components and settings
trainer = Trainer(
    model=model,                                  # model to be trained
    args=training_args,                           # training arguments defined above
    train_dataset=tokenized_ds['train'],          # training dataset
    eval_dataset=tokenized_ds['validation'],      # validation dataset
    tokenizer=tokenizer,                          # tokenizer for data processing
    data_collator=data_collator,                  # data collator for padding
    compute_metrics=compute_metrics               # function to compute evaluation metrics
)

In [72]:
# Fine-tune the model; per training_args, evaluation and checkpointing run once
# per epoch and the best checkpoint is reloaded when training ends.
trainer.train()

{'eval_loss': 0.12071572989225388, 'eval_accuracy': 0.9725, 'eval_runtime': 1.5412, 'eval_samples_per_second': 259.538, 'eval_steps_per_second': 8.435, 'epoch': 1.0}
{'eval_loss': 0.11525195837020874, 'eval_accuracy': 0.975, 'eval_runtime': 1.538, 'eval_samples_per_second': 260.083, 'eval_steps_per_second': 8.453, 'epoch': 2.0}
{'eval_loss': 0.12932606041431427, 'eval_accuracy': 0.9725, 'eval_runtime': 1.4878, 'eval_samples_per_second': 268.847, 'eval_steps_per_second': 8.738, 'epoch': 3.0}
{'eval_loss': 0.1426873654127121, 'eval_accuracy': 0.9725, 'eval_runtime': 1.5028, 'eval_samples_per_second': 266.171, 'eval_steps_per_second': 8.651, 'epoch': 4.0}
{'eval_loss': 0.15597018599510193, 'eval_accuracy': 0.975, 'eval_runtime': 1.5357, 'eval_samples_per_second': 260.463, 'eval_steps_per_second': 8.465, 'epoch': 5.0}
{'eval_loss': 0.15083158016204834, 'eval_accuracy': 0.98, 'eval_runtime': 1.5455, 'eval_samples_per_second': 258.821, 'eval_steps_per_second': 8.412, 'epoch': 6.0}
{'eval_los

TrainOutput(global_step=500, training_loss=0.0054072275161743165, metrics={'train_runtime': 233.4097, 'train_samples_per_second': 68.549, 'train_steps_per_second': 2.142, 'train_loss': 0.0054072275161743165, 'epoch': 10.0})

In [73]:
# Run the trained model over the unlabelled test split and keep the raw per-class scores.
test_output = trainer.predict(tokenized_ds['test'])
predictions = test_output.predictions

# Highest-scoring class per row, then class ID -> label string.
predicted_class_indices = np.argmax(predictions, axis=1)
predicted_labels = [id2label[class_id] for class_id in predicted_class_indices]

In [75]:
# One row per test example: its ID next to the predicted label.
submission_columns = {
    'ID': test_ds['train']['ID'],
    'Class': predicted_labels
}
submission_df = pd.DataFrame(submission_columns)

# Write the table to Drive without the pandas index column and report where it went.
submission_path = '/content/drive/MyDrive/Colab Notebooks/generative-ai/Week4/submission_10_epochs.csv'
submission_df.to_csv(submission_path, index=False)

print(f"Submission file saved to {submission_path}")

Submission file saved to /content/drive/MyDrive/Colab Notebooks/generative-ai/Week4/submission_10_epochs.csv
