In [88]:
from trainer_compatible.bert_prefix import BertForSequenceClassification_Prefix
from trainer_compatible.bert_prefix_gated import BertForSequenceClassification_Prefix_Gated
from trainer_compatible.roberta_prefix import RobertaForSequenceClassification_Prefix
from trainer_compatible.roberta_prefix_gated import RobertaForSequenceClassification_Prefix_Gated

In [89]:
import random

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoConfig
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

## Shared Setup (All Models)

In [90]:
# Load the IMDB dataset and keep a handle on each of its two splits.
dataset = load_dataset("stanfordnlp/imdb")
train_ds, test_ds = dataset["train"], dataset["test"]

In [91]:
# Subsample 10,000 training and 5,000 test examples.
# A dedicated, seeded RNG makes the subsets identical on every run, and
# sampling from the untouched `dataset` splits (instead of the already
# subsampled `train_ds` / `test_ds`) keeps this cell idempotent when re-run.
SAMPLE_SEED = 42
subsample_rng = random.Random(SAMPLE_SEED)

random_indices = subsample_rng.sample(range(len(dataset["train"])), 10000)
train_ds = dataset["train"].select(random_indices)

random_indices = subsample_rng.sample(range(len(dataset["test"])), 5000)
test_ds = dataset["test"].select(random_indices)

In [92]:
# Settings shared by the experiments below.
num_labels = 2  # copied onto `config.num_labels`
prefix_len = 4  # copied onto `config.prefix_len` and embedded in the output directory names

In [93]:
# Metric object used by `compute_metrics` below.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is
            reduced to class indices by taking the argmax over axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class
        indices against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

## BERT Architecture Models

In [94]:
# Checkpoint used by the cells of the BERT section, and the tokenizer loaded from it.
model_name = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [95]:
def preprocess_function(sample):
    """Tokenize the ``"text"`` field of ``sample``.

    Called through ``Dataset.map(..., batched=True)`` below. Sequences are
    truncated to at most 256 tokens; padding is left to the data collator.
    """
    return tokenizer(sample["text"], max_length=256, truncation=True)


# Tokenize the train split first, then the test split, and build the padding collator.
tokenized_train_ds, tokenized_test_ds = (
    split.map(preprocess_function, batched=True) for split in (train_ds, test_ds)
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### BERT

In [96]:
# Baseline config. Apply `num_labels` explicitly, as the prefix-model config
# cells do, so the baseline follows the shared setting rather than relying
# on the library default.
config = AutoConfig.from_pretrained(model_name)
config.num_labels = num_labels

In [97]:
# Baseline model: no parameter is frozen in this cell, so the count below
# covers every parameter that currently requires gradients.
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

trainable_params = [p for p in model.parameters() if p.requires_grad]
total_params = sum(p.numel() for p in trainable_params)
print(f"Number of parameters: {total_params}")

In [98]:
# Evaluate, checkpoint and log once per epoch; reload the best checkpoint
# when training finishes.
training_args = TrainingArguments(
    output_dir="./results/sentiment/bert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # Request the Apple MPS backend only when it is actually available;
    # a hard-coded True makes TrainingArguments fail on machines without MPS.
    use_mps_device=torch.backends.mps.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

### BERT Prefix

In [99]:
# Fresh config for the prefix model, extended with the prefix length and label count.
config = AutoConfig.from_pretrained(model_name)
config.prefix_len = prefix_len
config.num_labels = num_labels

In [100]:
model = BertForSequenceClassification_Prefix.from_pretrained(model_name, config=config)

# Only the prefix, the pooler and the classifier head stay trainable;
# every other parameter is frozen.
trainable_names = (
    "bert.encoder.prefix",
    "bert.pooler.dense.weight",
    "bert.pooler.dense.bias",
    "classifier.weight",
    "classifier.bias",
)
for name, param in model.named_parameters():
    if name not in trainable_names:
        param.requires_grad = False

# Report how many parameters are still trainable after freezing.
trainable_params = [p for p in model.parameters() if p.requires_grad]
total_params = sum(p.numel() for p in trainable_params)
print(f"Number of parameters: {total_params}")

Some weights of BertForSequenceClassification_Prefix were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'bert.encoder.prefix']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of parameters: 628994


In [101]:
# Evaluate, checkpoint and log once per epoch; reload the best checkpoint
# when training finishes. The output directory encodes the prefix length.
training_args = TrainingArguments(
    output_dir=f"./results/sentiment/bert_prefix_{prefix_len}",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # Request the Apple MPS backend only when it is actually available;
    # a hard-coded True makes TrainingArguments fail on machines without MPS.
    use_mps_device=torch.backends.mps.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



  0%|          | 0/6250 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.6252, 'learning_rate': 1.8e-05, 'epoch': 1.0}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.5387895107269287, 'eval_accuracy': 0.779, 'eval_runtime': 52.1508, 'eval_samples_per_second': 95.876, 'eval_steps_per_second': 6.002, 'epoch': 1.0}
{'loss': 0.5205, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.0}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.4571014642715454, 'eval_accuracy': 0.7996, 'eval_runtime': 51.216, 'eval_samples_per_second': 97.626, 'eval_steps_per_second': 6.111, 'epoch': 2.0}
{'loss': 0.475, 'learning_rate': 1.4e-05, 'epoch': 3.0}


  0%|          | 0/313 [00:00<?, ?it/s]

{'eval_loss': 0.4244462847709656, 'eval_accuracy': 0.806, 'eval_runtime': 54.5185, 'eval_samples_per_second': 91.712, 'eval_steps_per_second': 5.741, 'epoch': 3.0}


KeyboardInterrupt: 

### BERT Prefix Gated

In [102]:
# Fresh config for the gated prefix model, extended with the prefix length and label count.
config = AutoConfig.from_pretrained(model_name)
config.prefix_len = prefix_len
config.num_labels = num_labels

In [109]:
model = BertForSequenceClassification_Prefix_Gated.from_pretrained(model_name, config=config)

# The prefix, the pooler, the classifier head and every parameter whose name
# contains "gate_mlps" stay trainable; everything else is frozen.
trainable_names = (
    "bert.encoder.prefix",
    "bert.pooler.dense.weight",
    "bert.pooler.dense.bias",
    "classifier.weight",
    "classifier.bias",
)
for name, param in model.named_parameters():
    keep_trainable = name in trainable_names or "gate_mlps" in name
    if not keep_trainable:
        param.requires_grad = False

# Report how many parameters are still trainable after freezing.
total_params = sum(p.numel() for p in filter(lambda p: p.requires_grad, model.parameters()))
print(f"Number of parameters: {total_params}")

Some weights of BertForSequenceClassification_Prefix_Gated were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['bert.encoder.gate_mlps.7.4.weight', 'bert.encoder.gate_mlps.11.4.bias', 'bert.encoder.gate_mlps.10.0.bias', 'bert.encoder.gate_mlps.3.0.bias', 'bert.encoder.gate_mlps.0.0.bias', 'bert.encoder.gate_mlps.8.2.bias', 'bert.encoder.gate_mlps.4.0.weight', 'bert.encoder.gate_mlps.4.4.bias', 'bert.encoder.gate_mlps.11.0.bias', 'bert.encoder.gate_mlps.2.4.bias', 'bert.encoder.gate_mlps.5.4.weight', 'bert.encoder.gate_mlps.2.0.weight', 'bert.encoder.gate_mlps.2.2.bias', 'bert.encoder.gate_mlps.3.2.weight', 'bert.encoder.gate_mlps.5.2.bias', 'bert.encoder.gate_mlps.0.2.weight', 'bert.encoder.gate_mlps.9.0.bias', 'bert.encoder.gate_mlps.10.4.weight', 'bert.encoder.gate_mlps.11.2.weight', 'bert.encoder.gate_mlps.4.0.bias', 'bert.encoder.gate_mlps.11.0.weight', 'bert.encoder.gate_mlps.2.2.weight', 'bert.encoder.gate_mlps.1.2.weight', 

Number of parameters: 5061902


In [None]:
# Evaluate, checkpoint and log once per epoch; reload the best checkpoint
# when training finishes. The output directory encodes the prefix length.
training_args = TrainingArguments(
    output_dir=f"./results/sentiment/bert_prefix_gated_{prefix_len}",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # Request the Apple MPS backend only when it is actually available;
    # a hard-coded True makes TrainingArguments fail on machines without MPS.
    use_mps_device=torch.backends.mps.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

## ROBERTA Architecture Models

In [119]:
# Switch to the RoBERTa checkpoint; this rebinds `model_name` and `tokenizer`
# that were previously set in the BERT section.
model_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [120]:
def preprocess_function(sample):
    """Tokenize the ``"text"`` field of ``sample`` with the current ``tokenizer``.

    Called through ``Dataset.map(..., batched=True)`` below. Sequences are
    truncated to at most 256 tokens; padding is left to the data collator.
    """
    return tokenizer(sample["text"], max_length=256, truncation=True)


# Re-tokenize the train split first, then the test split, and rebuild the
# padding collator around the current tokenizer.
tokenized_train_ds, tokenized_test_ds = (
    split.map(preprocess_function, batched=True) for split in (train_ds, test_ds)
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

### ROBERTA

In [None]:
# Baseline config. Apply `num_labels` explicitly, as the prefix-model config
# cells do, so the baseline follows the shared setting rather than relying
# on the library default.
config = AutoConfig.from_pretrained(model_name)
config.num_labels = num_labels

In [None]:
# Baseline model: no parameter is frozen in this cell, so the count below
# covers every parameter that currently requires gradients.
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

trainable_params = [p for p in model.parameters() if p.requires_grad]
total_params = sum(p.numel() for p in trainable_params)
print(f"Number of parameters: {total_params}")

In [None]:
# Evaluate, checkpoint and log once per epoch; reload the best checkpoint
# when training finishes.
training_args = TrainingArguments(
    output_dir="./results/sentiment/roberta",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # Request the Apple MPS backend only when it is actually available;
    # a hard-coded True makes TrainingArguments fail on machines without MPS.
    use_mps_device=torch.backends.mps.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

### ROBERTA Prefix

In [124]:
# Fresh config for the prefix model, extended with the prefix length and label count.
config = AutoConfig.from_pretrained(model_name)
config.prefix_len = prefix_len
config.num_labels = num_labels

In [125]:
model = RobertaForSequenceClassification_Prefix.from_pretrained(model_name, config=config)

# Only the prefix, the pooler and the classification head stay trainable;
# every other parameter is frozen.
trainable_names = (
    "roberta.encoder.prefix",
    "roberta.pooler.dense.weight",
    "roberta.pooler.dense.bias",
    "classifier.dense.weight",
    "classifier.dense.bias",
    "classifier.out_proj.weight",
    "classifier.out_proj.bias",
)
for name, param in model.named_parameters():
    if name not in trainable_names:
        param.requires_grad = False

# Report how many parameters are still trainable after freezing.
trainable_params = [p for p in model.parameters() if p.requires_grad]
total_params = sum(p.numel() for p in trainable_params)
print(f"Number of parameters: {total_params}")

Some weights of RobertaForSequenceClassification_Prefix were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'roberta.pooler.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'roberta.pooler.dense.weight', 'roberta.encoder.prefix']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of parameters: 1219586


In [None]:
# Evaluate, checkpoint and log once per epoch; reload the best checkpoint
# when training finishes. The output directory encodes the prefix length.
training_args = TrainingArguments(
    output_dir=f"./results/sentiment/roberta_prefix_{prefix_len}",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # Request the Apple MPS backend only when it is actually available;
    # a hard-coded True makes TrainingArguments fail on machines without MPS.
    use_mps_device=torch.backends.mps.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

### ROBERTA Prefix Gated

In [121]:
# Fresh config for the gated prefix model, extended with the prefix length and label count.
config = AutoConfig.from_pretrained(model_name)
config.prefix_len = prefix_len
config.num_labels = num_labels

In [126]:
model = RobertaForSequenceClassification_Prefix_Gated.from_pretrained(model_name, config=config)

# The prefix, the pooler, the classification head and every parameter whose
# name contains "gate_mlps" stay trainable; everything else is frozen.
trainable_names = (
    "roberta.encoder.prefix",
    "roberta.pooler.dense.weight",
    "roberta.pooler.dense.bias",
    "classifier.dense.weight",
    "classifier.dense.bias",
    "classifier.out_proj.weight",
    "classifier.out_proj.bias",
)
for name, param in model.named_parameters():
    keep_trainable = name in trainable_names or "gate_mlps" in name
    if not keep_trainable:
        param.requires_grad = False

# Report how many parameters are still trainable after freezing.
total_params = sum(p.numel() for p in filter(lambda p: p.requires_grad, model.parameters()))
print(f"Number of parameters: {total_params}")

Some weights of RobertaForSequenceClassification_Prefix_Gated were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.encoder.gate_mlps.8.0.weight', 'roberta.encoder.gate_mlps.2.4.weight', 'classifier.out_proj.weight', 'roberta.encoder.gate_mlps.8.4.weight', 'roberta.encoder.gate_mlps.3.4.weight', 'roberta.encoder.gate_mlps.7.4.weight', 'roberta.encoder.gate_mlps.9.0.weight', 'roberta.encoder.gate_mlps.11.0.bias', 'roberta.encoder.gate_mlps.1.2.weight', 'roberta.encoder.gate_mlps.5.4.bias', 'roberta.encoder.gate_mlps.8.2.bias', 'roberta.encoder.gate_mlps.11.4.bias', 'roberta.encoder.gate_mlps.1.4.bias', 'roberta.encoder.gate_mlps.3.2.bias', 'roberta.pooler.dense.bias', 'roberta.encoder.gate_mlps.1.2.bias', 'roberta.encoder.gate_mlps.10.4.weight', 'roberta.encoder.gate_mlps.2.0.bias', 'roberta.pooler.dense.weight', 'roberta.encoder.gate_mlps.0.0.bias', 'roberta.encoder.gate_mlps.5.2.weight', 'roberta.encoder.gate_mlps.3.0.weight', '

Number of parameters: 5652494


In [127]:
# Evaluate, checkpoint and log once per epoch; reload the best checkpoint
# when training finishes. The output directory encodes the prefix length.
training_args = TrainingArguments(
    output_dir=f"./results/sentiment/roberta_prefix_gated_{prefix_len}",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # Request the Apple MPS backend only when it is actually available;
    # a hard-coded True makes TrainingArguments fail on machines without MPS.
    use_mps_device=torch.backends.mps.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



  0%|          | 0/6250 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 