In [2]:
from trainer_compatible.bert_prefix import BertForSequenceClassification_Prefix
from trainer_compatible.bert_prefix_gated import BertForSequenceClassification_Prefix_Gated
from trainer_compatible.roberta_prefix import RobertaForSequenceClassification_Prefix
from trainer_compatible.roberta_prefix_gated import RobertaForSequenceClassification_Prefix_Gated

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoConfig
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import random

## All

In [4]:
dataset = load_dataset("nyu-mll/multi_nli")
train_ds = dataset["train"]
test_ds = dataset["validation_matched"]

In [5]:
print(len(train_ds))

392702


In [6]:
random_indices = random.sample(range(len(train_ds)), 100000)
train_ds = train_ds.select(random_indices)

random_indices = random.sample(range(len(test_ds)), 2500)
test_ds = test_ds.select(random_indices)

In [7]:
prefix_len = 4
num_labels = 3

In [8]:
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

## BERT Architecture Models

In [9]:
model_name = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [22]:
def join_fields(sample):
    joined_field = sample["premise"] + tokenizer.sep_token + sample["hypothesis"]
    sample["text"] = joined_field
    return sample

train_ds = train_ds.map(join_fields)
test_ds = test_ds.map(join_fields)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [23]:
def preprocess_function(sample):
    return tokenizer(sample["text"], truncation=True, max_length=256)

tokenized_train_ds = train_ds.map(preprocess_function, batched=True)
tokenized_test_ds = test_ds.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

### BERT

In [10]:
config = AutoConfig.from_pretrained(model_name)
config.num_labels = num_labels

In [11]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of parameters: {total_params}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of parameters: 109484547


In [65]:
training_args = TrainingArguments(
    output_dir=f"./results/multinli/bert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    use_mps_device=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/31250 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


: 

: 

### BERT Prefix

In [16]:
config = AutoConfig.from_pretrained(model_name)
config.num_labels = num_labels
config.prefix_len = 64

In [17]:
model = BertForSequenceClassification_Prefix.from_pretrained(model_name, config=config)

for name, param in model.named_parameters():
    if not name in ["bert.encoder.prefix", "bert.pooler.dense.weight", "bert.pooler.dense.bias", "classifier.weight", "classifier.bias"]:
        param.requires_grad = False

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of parameters: {total_params}")

Some weights of BertForSequenceClassification_Prefix were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.weight', 'bert.encoder.prefix', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of parameters: 1182723


In [26]:
training_args = TrainingArguments(
    output_dir=f"./results/multinli/bert_prefix_{prefix_len}",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    use_mps_device=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/12500 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 1.0692, 'learning_rate': 1e-05, 'epoch': 1.0}


  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 1.0236743688583374, 'eval_accuracy': 0.508, 'eval_runtime': 17.542, 'eval_samples_per_second': 142.515, 'eval_steps_per_second': 8.95, 'epoch': 1.0}
{'loss': 1.0391, 'learning_rate': 0.0, 'epoch': 2.0}


  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 1.0021919012069702, 'eval_accuracy': 0.536, 'eval_runtime': 9.1248, 'eval_samples_per_second': 273.978, 'eval_steps_per_second': 17.206, 'epoch': 2.0}
{'train_runtime': 1551.9996, 'train_samples_per_second': 128.866, 'train_steps_per_second': 8.054, 'train_loss': 1.0541569921875, 'epoch': 2.0}


TrainOutput(global_step=12500, training_loss=1.0541569921875, metrics={'train_runtime': 1551.9996, 'train_samples_per_second': 128.866, 'train_steps_per_second': 8.054, 'train_loss': 1.0541569921875, 'epoch': 2.0})

### BERT Prefix Gated

In [22]:
config = AutoConfig.from_pretrained(model_name)
config.num_labels = num_labels
config.prefix_len = 64

In [23]:
model = BertForSequenceClassification_Prefix_Gated.from_pretrained(model_name, config=config)
for name, param in model.named_parameters():
    if not (name in ["bert.encoder.prefix", "bert.pooler.dense.weight", "bert.pooler.dense.bias", "classifier.weight", "classifier.bias"]) and not ("gate_mlps" in name):
        param.requires_grad = False

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of parameters: {total_params}")

Some weights of BertForSequenceClassification_Prefix_Gated were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['bert.encoder.gate_mlps.3.4.bias', 'bert.encoder.gate_mlps.1.4.weight', 'bert.encoder.gate_mlps.5.4.bias', 'bert.encoder.gate_mlps.6.2.weight', 'bert.encoder.gate_mlps.0.4.bias', 'bert.encoder.gate_mlps.4.4.weight', 'bert.encoder.gate_mlps.3.2.bias', 'bert.encoder.gate_mlps.2.2.weight', 'bert.encoder.gate_mlps.6.0.bias', 'bert.encoder.gate_mlps.6.2.bias', 'bert.encoder.gate_mlps.6.4.bias', 'bert.encoder.gate_mlps.10.4.weight', 'bert.encoder.gate_mlps.5.0.bias', 'bert.encoder.gate_mlps.10.2.weight', 'bert.encoder.gate_mlps.2.2.bias', 'bert.encoder.gate_mlps.8.4.weight', 'bert.encoder.gate_mlps.9.0.weight', 'bert.encoder.gate_mlps.11.4.weight', 'bert.encoder.gate_mlps.4.0.bias', 'bert.encoder.gate_mlps.7.4.weight', 'classifier.weight', 'bert.encoder.gate_mlps.11.0.bias', 'bert.encoder.gate_mlps.4.4.bias', 'bert.encoder.gate

Number of parameters: 5615631


In [None]:
training_args = TrainingArguments(
    output_dir=f"./results/multinli/bert_prefix_gated_{prefix_len}",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    use_mps_device=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

## ROBERTA Architecture Models

In [25]:
model_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [26]:
def join_fields(sample):
    joined_field = sample["premise"] + tokenizer.sep_token + sample["hypothesis"]
    sample["text"] = joined_field
    return sample

train_ds = train_ds.map(join_fields)
test_ds = test_ds.map(join_fields)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
def preprocess_function(sample):
    return tokenizer(sample["text"], truncation=True, max_length=256)

tokenized_train_ds = train_ds.map(preprocess_function, batched=True)
tokenized_test_ds = test_ds.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

### ROBERTA

In [27]:
config = AutoConfig.from_pretrained(model_name)

In [28]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of parameters: {total_params}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of parameters: 124647170


In [None]:
training_args = TrainingArguments(
    output_dir=f"./results/multinli/roberta",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    use_mps_device=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

### ROBERTA Prefix

In [33]:
config = AutoConfig.from_pretrained(model_name)
config.num_labels = num_labels
config.prefix_len = 64

In [34]:
model = RobertaForSequenceClassification_Prefix.from_pretrained(model_name, config=config)
for name, param in model.named_parameters():
    if not name in ["roberta.encoder.prefix", "roberta.pooler.dense.weight", "roberta.pooler.dense.bias", "classifier.dense.weight", "classifier.dense.bias", "classifier.out_proj.weight", "classifier.out_proj.bias"]:
        param.requires_grad = False

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of parameters: {total_params}")

Some weights of RobertaForSequenceClassification_Prefix were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'roberta.pooler.dense.bias', 'roberta.encoder.prefix', 'classifier.out_proj.weight', 'roberta.pooler.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of parameters: 1773315


In [None]:
training_args = TrainingArguments(
    output_dir=f"./results/multinli/roberta_prefix_{prefix_len}",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    use_mps_device=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

### ROBERTA Prefix Gated

In [39]:
config = AutoConfig.from_pretrained(model_name)
config.num_labels = num_labels
config.prefix_len = 64

In [40]:
model = RobertaForSequenceClassification_Prefix_Gated.from_pretrained(model_name, config=config)
for name, param in model.named_parameters():
    if not (name in ["roberta.encoder.prefix", "roberta.pooler.dense.weight", "roberta.pooler.dense.bias", "classifier.dense.weight", "classifier.dense.bias", "classifier.out_proj.weight", "classifier.out_proj.bias"]) and not ("gate_mlps" in name):
        param.requires_grad = False

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of parameters: {total_params}")

Some weights of RobertaForSequenceClassification_Prefix_Gated were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.encoder.gate_mlps.10.2.weight', 'roberta.encoder.gate_mlps.3.0.bias', 'roberta.encoder.gate_mlps.7.0.bias', 'roberta.encoder.gate_mlps.5.4.weight', 'roberta.pooler.dense.bias', 'roberta.encoder.gate_mlps.6.2.bias', 'roberta.encoder.gate_mlps.2.4.bias', 'roberta.encoder.gate_mlps.5.0.bias', 'roberta.encoder.gate_mlps.11.0.weight', 'roberta.encoder.gate_mlps.7.2.weight', 'roberta.encoder.prefix', 'roberta.encoder.gate_mlps.4.4.bias', 'roberta.encoder.gate_mlps.9.0.bias', 'roberta.encoder.gate_mlps.8.4.bias', 'roberta.pooler.dense.weight', 'roberta.encoder.gate_mlps.11.2.weight', 'roberta.encoder.gate_mlps.1.4.weight', 'roberta.encoder.gate_mlps.3.2.bias', 'roberta.encoder.gate_mlps.10.4.bias', 'classifier.out_proj.weight', 'roberta.encoder.gate_mlps.0.2.weight', 'roberta.encoder.gate_mlps.6.0.weight', 'classifier.dens

Number of parameters: 6206223


In [None]:
training_args = TrainingArguments(
    output_dir=f"./results/multinli/roberta_prefix_gated_{prefix_len}",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    use_mps_device=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



  0%|          | 0/6250 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 