In [None]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report whether PyTorch can see a CUDA device in this environment.
torch.cuda.is_available()

True

## Sentiment analysis of given sentences

In [3]:
# Checkpoint the classifier is initialised from.
model_checkpoint = "distilbert-base-uncased"

# Class index <-> class name mappings; the reverse map is derived from the
# forward one so the two cannot drift apart.
id2label = {0: "Negative", 1: "Positive"}
label2id = {name: idx for idx, name in id2label.items()}

# Sequence-classification model built on top of the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Load dataset

In [4]:
# Load the truncated IMDB dataset from the Hub.
# Optionally use "imdb" instead for the full dataset.
dataset = load_dataset("shawhin/imdb-truncated")
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

### Create tokenizer

In [None]:
# Tokenizer matching the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    add_prefix_space=True,
)

# If the tokenizer has no padding token, register one and grow the model's
# embedding matrix to the new vocabulary size.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(dict(pad_token="[PAD]"))
    model.resize_token_embeddings(len(tokenizer))

### Define tokenizer function

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch of examples.

    Args:
        examples: Batch handed over by ``Dataset.map(..., batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per text. No padding is requested here.
    """
    # Truncate from the left so the end of an over-long text is kept.
    tokenizer.truncation_side = "left"

    # Deliberately no `return_tensors`: without padding the sequences have
    # different lengths, so they cannot form a rectangular array. Plain
    # Python lists are what `Dataset.map` stores anyway.
    return tokenizer(examples["text"], truncation=True, max_length=512)

In [7]:
# Apply the tokenizer function to every split, one batch of examples at a time.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map: 100%|██████████| 1000/1000 [00:00<00:00, 8129.79 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

### Create data collator for dynamically padding shorter sequences

In [8]:
# Collator that pads the examples of a batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
# Accuracy metric loaded through the `evaluate` library.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from a ``(predictions, labels)`` evaluation pair.

    Args:
        eval_pred: Two-element pair; the first element holds per-class
            scores (argmax is taken over the last axis), the second the
            reference labels.

    Returns:
        The dictionary produced by ``accuracy.compute``, i.e.
        ``{"accuracy": <float>}``.
    """
    scores, labels = eval_pred
    predictions = np.argmax(scores, axis=-1)
    # `accuracy.compute` already returns {"accuracy": value}. Return it
    # as-is: wrapping it in another {"accuracy": ...} would log a nested
    # dict instead of a scalar metric.
    return accuracy.compute(predictions=predictions, references=labels)

In [14]:
# Small hand-written sanity-check set of review-like sentences.
sample_text_list = ["It was good.", "Not a fan, don't recommand.", "Better than first one.", "Not worth the time.", "This one is a pass"]

print("Non fine-tuned model predictions:")
print("-----------------------------------")

# Inference only: switch to eval mode and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    for text in sample_text_list:
        # Tokenize and move the ids to wherever the model currently lives,
        # so the cell also works once the model has been moved to the GPU.
        inputs = tokenizer.encode(text, return_tensors="pt").to(model.device)
        # Compute logits for the single-sentence batch.
        logits = model(inputs).logits
        # Index of the highest-scoring class as a plain int.
        preds = torch.argmax(logits, dim=-1).item()

        print(f"{text} - {id2label[preds]}")

Non fine-tuned model predictions:
-----------------------------------
It was good. - Positive
Not a fan, don't recommand. - Positive
Better than first one. - Positive
Not worth the time. - Positive
This one is a pass - Positive


In [16]:
# LoRA adapter configuration.
peft_config = LoraConfig(
    task_type="SEQ_CLS",       # sequence classification
    target_modules=["q_lin"],  # apply LoRA to the query linear layers
    r=4,                       # rank of the low-rank update matrices
    lora_alpha=32,             # scaling factor for the LoRA update (not a learning rate)
    lora_dropout=0.01,         # dropout applied in the LoRA layers
)

In [17]:
# Wrap the classifier with the LoRA configuration and report how many
# parameters remain trainable.
# NOTE(review): this rebinds `model`, so re-running the cell passes the
# already-wrapped model back in — restart and run from the top instead.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


### hyperparameters

In [18]:
# Learning rate, per-device batch size and number of training epochs.
lr, batch_size, num_epochs = 1e-3, 4, 10

### training arguments

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, then
# reload the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir=f"{model_checkpoint}_lora-text-cls",
    num_train_epochs=num_epochs,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

### trainer class

In [21]:
# Assemble the Trainer from the LoRA-wrapped model, the training arguments
# and the tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # `tokenizer=` is deprecated in favour of `processing_class=`.
    processing_class=tokenizer,
    # Pass the padding collator explicitly instead of relying on the
    # Trainer's implicit default.
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# Run the training loop configured above.
trainer.train()

Epoch,Training Loss,Validation Loss
