# Import Dependencies

In [3]:
from datasets import DatasetDict, Dataset, load_from_disk

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

# Import Base Model

In [4]:
model_checkpoint = "distilbert-base-uncased"

# Define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

# Generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

# Load dataset

In [6]:
# load dataset
dataset = load_from_disk("../data-manipulation/recipe-classification-dataset")
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 28
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 28
    })
})

In [7]:
count = 0
for example in dataset["validation"]:
    print(example)
    
    if count >= 28:
        break

    print()
    count += 1

{'label': 1, 'text': 'Day,Recipe, \nmon,tuna pasta \nsat,lemon pasta \nsun,lentil salad \n,\nIngredient,Quantity\ntuna_tinned,10.00 oz\nbasil,3.00 oz\nlinguine,1.00 lb\nolive,12.00 oz\nmushroom,14.00 oz\npenne,1.00 lb\ncream,8.00 oz\nlentils,8.00 oz\nrocket,5.00 oz\ntomato,5 items\nlemon,1 items\nchives,4 items\naubergine,1 items\npepper,1 items\n'}

{'label': 0, 'text': 'Day,Recipe, \nmon,risotto ai funghi\ntue,pepper pasta\nwed,cous cous\nthu,tuna pasta \nfri,panini paffuti\n,\nIngredient,Quantity\nrice,8.00 oz\nmushroom,8.00 oz\nbutter,8.00 oz\npenne,1.00 lb\nsauce,30.00 oz\ncouscous,1.00 lb\ncumin,1.00 tbs\nturmeric,1.00 tbs\nolive_oil,4.00 tbs\nolive,28.00 oz\ntuna_tinned,10.00 oz\nbasil,3.00 oz\nlinguine,1.00 lb\nprosciutto,8.00 oz\nspinach,8.00 oz\nonion,3 items\nsausage,4 items\npepper,4 items\ntomato,11 items\ncourgette,2 items\naubergine,1 items\nbun,4 items\nprovolone,4 items\n'}

{'label': 1, 'text': 'Day,Recipe, \nmon,pepper pasta\nsat,pesto pasta\nsun,tuna pasta \n,\nIngr

# Preprocess Data

In [8]:
# Create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# Add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [9]:
# Create tokenize function
def tokenize_function(examples):
    # Extract text
    text = examples["text"]

    # Tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [10]:
# Tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 28
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 28
    })
})

In [11]:
count = 0
for example in tokenized_dataset["train"]:
    print(example)
    
    if count >= 1:
        break

    print()
    count += 1

{'label': 1, 'text': 'Day,Recipe, \nmon,broccoli pasta\nsun,caprese pasta\n,\nIngredient,Quantity\npenne,2.00 lb\nbroccoli,10.00 oz\nvegetable_stock,2.00 oz\ncherry_tomatoes,1.00 lb\nolive,14.00 oz\nonion,1 items\nmozzarella_cheese,1 items\nbasil,1 items\n', 'input_ids': [101, 2154, 1010, 17974, 1010, 12256, 1010, 22953, 21408, 3669, 24857, 3103, 1010, 6178, 6072, 2063, 24857, 1010, 21774, 1010, 11712, 9502, 2063, 1010, 1016, 1012, 4002, 6053, 22953, 21408, 3669, 1010, 2184, 1012, 4002, 11472, 15415, 1035, 4518, 1010, 1016, 1012, 4002, 11472, 9115, 1035, 12851, 1010, 1015, 1012, 4002, 6053, 9724, 1010, 2403, 1012, 4002, 11472, 20949, 1010, 1015, 5167, 9587, 20715, 21835, 1035, 8808, 1010, 1015, 5167, 14732, 1010, 1015, 5167, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

{'label': 0, 

In [12]:
# Create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluation metrics

In [13]:
# Import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

# Define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, 
                                          references=labels)}

# Apply Untrained Model to Text

In [14]:
# define list of examples
text_list = [
    "Day,Recipe, \nmon,pepper pasta\nsat,panini paffuti\nsun,lentil salad \n,\nIngredient,Quantity\npenne,1.00 lb\nsauce,30.00 oz\nprosciutto,8.00 oz\nspinach,8.00 oz\nlentils,8.00 oz\nrocket,5.00 oz\npepper,3 items\nonion,1 items\nbun,4 items\ntomato,2 items\nprovolone,4 items\naubergine,1 items\n", 
    "Day,Recipe, \ntue,pizza\nwed,classic pasta\nthu,fish and rice\nfri,panini paffuti\n,\nIngredient,Quantity\npizza,1.00 lb\npotato_wedges,8.00 oz\npenne,1.00 lb\nsauce,30.00 oz\ntuna_fresh,12.00 oz\nrice,6.00 oz\npea,4.00 oz\nolive_oil,4.00 tbs\nprosciutto,8.00 oz\nspinach,8.00 oz\naubergine,1 items\ncourgette,2 items\nonion,1 items\ncarrot,2 items\nlemon,1 items\nbun,4 items\ntomato,2 items\nprovolone,4 items\n", 
    "Day,Recipe, \nmon,pasta con fagioli \nsun,penne amore \n,\nIngredient,Quantity\nrigatoni,1.00 lb\nsauce,30.00 oz\nkidney_bean,14.00 oz\npenne,1.00 lb\nham,8.00 oz\ncream,8.00 oz\npea,8.00 oz\nonion,1 items\npepper,1 items\nchives,3 items\n", 
    "Day,Recipe, \nmon,risotto ai funghi\ntue,pepper pasta\nwed,cous cous\nthu,tuna pasta \nfri,panini paffuti\n,\nIngredient,Quantity\nrice,8.00 oz\nmushroom,8.00 oz\nbutter,8.00 oz\npenne,1.00 lb\nsauce,30.00 oz\ncouscous,1.00 lb\ncumin,1.00 tbs\nturmeric,1.00 tbs\nolive_oil,4.00 tbs\nolive,28.00 oz\ntuna_tinned,10.00 oz\nbasil,3.00 oz\nlinguine,1.00 lb\nprosciutto,8.00 oz\nspinach,8.00 oz\nonion,3 items\nsausage,4 items\npepper,4 items\ntomato,11 items\ncourgette,2 items\naubergine,1 items\nbun,4 items\nprovolone,4 items\n'",
]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # Tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # Compute logits
    logits = model(inputs).logits
    # Convert logits to label
    predictions = torch.argmax(logits)

    print(text[0:50].replace("\n", "") + " - " +id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
Day,Recipe, mon,pepper pastasat,panini paffutis - Negative
Day,Recipe, tue,pizzawed,classic pastathu,fish  - Negative
Day,Recipe, mon,pasta con fagioli sun,penne amor - Negative
Day,Recipe, mon,risotto ai funghitue,pepper past - Negative


# Fine tuning with LoRa

In [15]:
peft_config = LoraConfig(
    task_type="SEQ_CLS", # Sequence classification
    r=4, # Intrinsic rank of trainable weight matrix
    lora_alpha=32, # This is like a learning rate
    lora_dropout=0.01, # Probablity of dropout
    target_modules = ['q_lin'] # We apply lora to query layer only
) 

In [16]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


## Hyperparameters

In [17]:
lr = 1e-3 # Size of optimization step 
batch_size = 5 # Number of examples processed per optimziation step
num_epochs = 20 # Number of times model runs through training data

training_args = TrainingArguments(
    output_dir= model_checkpoint + "-recipe-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size, 
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

## Train

In [18]:
# Create trainer object
trainer = Trainer(
    model=model, # Our peft model
    args=training_args, # Hyperparameters
    train_dataset=tokenized_dataset["train"], # Training data
    eval_dataset=tokenized_dataset["validation"], # Validation data
    tokenizer=tokenizer, # Define tokenizer
    data_collator=data_collator, # This will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics, # Evaluates model using compute_metrics() function from before
)

# Train model
trainer.train()


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.654158,{'accuracy': 0.5714285714285714}
2,No log,0.624799,{'accuracy': 0.5714285714285714}
3,No log,0.605968,{'accuracy': 0.5714285714285714}
4,No log,0.294615,{'accuracy': 0.9285714285714286}
5,No log,0.049015,{'accuracy': 1.0}
6,No log,0.003801,{'accuracy': 1.0}
7,No log,0.00286,{'accuracy': 1.0}
8,No log,0.010376,{'accuracy': 1.0}
9,No log,0.051593,{'accuracy': 0.9642857142857143}
10,No log,0.444209,{'accuracy': 0.8928571428571429}


TrainOutput(global_step=120, training_loss=0.14541549682617189, metrics={'train_runtime': 316.2695, 'train_samples_per_second': 1.771, 'train_steps_per_second': 0.379, 'total_flos': 20965035746184.0, 'train_loss': 0.14541549682617189, 'epoch': 20.0})

In [19]:
model.to("cpu") # Moving to cpu
print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu") # Moving to cpu

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text[:50].replace("\n", "") + " - " + id2label[predictions.tolist()[0]])


Trained model predictions:
--------------------------
Day,Recipe, mon,pepper pastasat,panini paffutis - Positive
Day,Recipe, tue,pizzawed,classic pastathu,fish  - Negative
Day,Recipe, mon,pasta con fagioli sun,penne amor - Positive
Day,Recipe, mon,risotto ai funghitue,pepper past - Negative
