In [1]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model
import evaluate
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_checkpoint = "distilbert-base-uncased"
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative": 0, "Positive": 1}

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
dataset = load_dataset("shawhin/imdb-truncated")
dataset

Downloading readme: 100%|██████████| 592/592 [00:00<?, ?B/s] 
Downloading data: 100%|██████████| 836k/836k [00:10<00:00, 76.5kB/s]
Downloading data: 100%|██████████| 853k/853k [00:02<00:00, 316kB/s]
Generating train split: 100%|██████████| 1000/1000 [00:05<00:00, 179.59 examples/s]
Generating validation split: 100%|██████████| 1000/1000 [00:00<00:00, 1004.24 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

In [7]:
def tokenize_function(examples):
    text = examples["text"]
    tokenizer.truncation_side = 'left'
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        max_length=512,
        truncation=True,
    )
    return tokenized_inputs

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map: 100%|██████████| 1000/1000 [00:00<00:00, 1002.47 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 1367.91 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
accuracy =  evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels) 

In [11]:
text_list = ["This movie was really bad. I hated it.", "This movie was really good. I loved it."
"Better than the first one", "Worse than the first one", "I don't know how I feel about this movie", "I don't know how I feel about this movie. It was really good.", 
"This one is a pass"]

In [14]:
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)
    print(text + ":", id2label[predictions.tolist()])
    print()

This movie was really bad. I hated it.: Negative

This movie was really good. I loved it.Better than the first one: Negative

Worse than the first one: Negative

I don't know how I feel about this movie: Negative

I don't know how I feel about this movie. It was really good.: Negative

This one is a pass: Negative



In [44]:
peft_config = LoraConfig(task_type="SEQ_CLS", 
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules=["q_lin"])

In [45]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [46]:
lr = 1e-3
batch_size = 8
num_epochs = 4

training_args = TrainingArguments(
    output_dir= model_checkpoint + "lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)


In [47]:
trainer  = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

  1%|          | 11/1250 [08:42<16:20:45, 47.49s/it]
 25%|██▌       | 125/500 [54:04<2:32:29, 24.40s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   

[A[A                                           
 25%|██▌       | 125/500 [1:19:27<2:32:29, 24.40s/it]
[A
[A

{'eval_loss': 0.2532152831554413, 'eval_accuracy': 0.901, 'eval_runtime': 1522.1665, 'eval_samples_per_second': 0.657, 'eval_steps_per_second': 0.082, 'epoch': 1.0}


 50%|█████     | 250/500 [2:07:04<1:23:32, 20.05s/it]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                     

[A[A                                           
 50%|█████     | 250/500 [2:32:13<1:23:32, 20.05s/it]
[A
[A

{'eval_loss': 0.41752228140830994, 'eval_accuracy': 0.875, 'eval_runtime': 1508.4496, 'eval_samples_per_second': 0.663, 'eval_steps_per_second': 0.083, 'epoch': 2.0}


 75%|███████▌  | 375/500 [3:23:26<53:37, 25.74s/it]    
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                                 

 75%|███████▌  | 375/500 [3:55:39<53:37, 25.74s/it]
[A
[A

{'eval_loss': 0.38052821159362793, 'eval_accuracy': 0.907, 'eval_runtime': 1932.7795, 'eval_samples_per_second': 0.517, 'eval_steps_per_second': 0.065, 'epoch': 3.0}


                                                       
100%|██████████| 500/500 [4:51:13<00:00, 23.58s/it] 

{'loss': 0.2327, 'grad_norm': 5.989250183105469, 'learning_rate': 0.0, 'epoch': 4.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                                 

100%|██████████| 500/500 [5:17:16<00:00, 23.58s/it]
[A
[A

{'eval_loss': 0.4093029201030731, 'eval_accuracy': 0.903, 'eval_runtime': 1563.4023, 'eval_samples_per_second': 0.64, 'eval_steps_per_second': 0.08, 'epoch': 4.0}


                                                   
100%|██████████| 500/500 [5:17:17<00:00, 38.07s/it] 

{'train_runtime': 19037.2774, 'train_samples_per_second': 0.21, 'train_steps_per_second': 0.026, 'train_loss': 0.23271583557128905, 'epoch': 4.0}





TrainOutput(global_step=500, training_loss=0.23271583557128905, metrics={'train_runtime': 19037.2774, 'train_samples_per_second': 0.21, 'train_steps_per_second': 0.026, 'total_flos': 501434662169664.0, 'train_loss': 0.23271583557128905, 'epoch': 4.0})

In [48]:
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)
    print(text + ":", id2label[predictions.tolist()])
    print()

This movie was really bad. I hated it.: Negative

This movie was really good. I loved it.Better than the first one: Positive

Worse than the first one: Negative

I don't know how I feel about this movie: Negative

I don't know how I feel about this movie. It was really good.: Positive

This one is a pass: Negative

