In [1]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U
!pip install transformers

Collecting datasets
  Downloading datasets-2.14.7-py3-none-any.whl (520 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/520.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/520.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m520.4/520.4 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.5-py3-none-any.whl (7.8 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Co

In [2]:
import pandas as pd
import torch
from datasets import Dataset, load_metric
import random


In [3]:
def convert_txt_file_to_datasets(filename, label ,data = {'text': [], 'class': []}):
  with open(filename, 'r') as f:
    for line in f:
        line = line.replace('\n', '')
        line = line.replace('...', ',')
        line = line.lstrip('"').rstrip('"')
        data['text'].append(line)
        data['class'].append(label)
    return data

In [4]:
import datasets
import random
from transformers import AutoTokenizer

data = {'text': [], 'class': []}
data = convert_txt_file_to_datasets('food_requests.txt','food',data)
data = convert_txt_file_to_datasets('not_food_requests.txt','others',data)

your_dataset = datasets.Dataset.from_dict(data)

In [5]:
your_dataset

Dataset({
    features: ['text', 'class'],
    num_rows: 600
})

In [6]:
#Train Test Split
your_dataset = your_dataset.train_test_split(test_size=0.2)
train_ds = your_dataset["train"]
test_ds = your_dataset["test"]

In [7]:
t = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
template = "This example is {}."

(…)-mnli/resolve/main/tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

(…)bart-large-mnli/resolve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

(…)/bart-large-mnli/resolve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

(…)/bart-large-mnli/resolve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)t-large-mnli/resolve/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [8]:
#Listing catogaries to clasify
id2labels = ["food", "others"]

In [9]:
def create_input_sequence(sample):
    text = sample["text"]
    label = sample["class"][0]
    contradiction_label = random.choice([x for x in id2labels if x != label])

    encoded_sequence = t(
        text*2, [template.format(label), template.format(contradiction_label)])
    encoded_sequence["labels"] = [1, 0]
    encoded_sequence["input_sentence"] = t.batch_decode(
        encoded_sequence.input_ids)

    return encoded_sequence

In [10]:
#Preparing data for finetuning
train_dataset = train_ds.map(
    create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "text"])
test_dataset = test_ds.map(create_input_sequence, batched=True,
                           batch_size=1, remove_columns=["class", "text"])

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [11]:
from transformers import BartForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
import numpy as np

In [12]:
from transformers import BartTokenizerFast
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-large-mnli')

In [13]:
def compute_metrics(p: EvalPrediction):
  metric_acc = load_metric("accuracy")
  metric_f1 = load_metric("f1")
  preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
  preds = np.argmax(preds, axis = 1)
  result = {}
  result["accuracy"] = metric_acc.compute(predictions = preds, references = p.label_ids)["accuracy"]
  result["f1"] = metric_f1.compute(predictions = preds, references = p.label_ids, average = 'macro')["f1"]
  return result

In [14]:
model_directory = r'/content'

In [15]:
model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli", num_labels = len(id2labels), ignore_mismatched_sizes = True)

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
training_args = TrainingArguments(
  output_dir = model_directory,
  num_train_epochs = 32,
  per_device_train_batch_size = 16,
  per_device_eval_batch_size = 64,
  warmup_steps = 500,
  weight_decay = 0.01,
)

trainer = Trainer(
  model = model,
  args = training_args,
  compute_metrics = compute_metrics,
  train_dataset = train_dataset,
  eval_dataset = test_dataset,
  tokenizer = tokenizer
)

In [17]:
trainer.evaluate()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  metric_acc = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'eval_loss': 0.6171573400497437,
 'eval_accuracy': 0.6333333333333333,
 'eval_f1': 0.5793163891323401,
 'eval_runtime': 5.9258,
 'eval_samples_per_second': 40.501,
 'eval_steps_per_second': 0.675}

In [18]:
trainer.train()

Step,Training Loss
500,0.045
1000,0.0
1500,0.0


TrainOutput(global_step=1920, training_loss=0.011725087293082954, metrics={'train_runtime': 1068.5822, 'train_samples_per_second': 28.748, 'train_steps_per_second': 1.797, 'total_flos': 1846478227753152.0, 'train_loss': 0.011725087293082954, 'epoch': 32.0})

In [19]:
trainer.evaluate()

{'eval_loss': 6.561477334798838e-07,
 'eval_accuracy': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 3.1397,
 'eval_samples_per_second': 76.442,
 'eval_steps_per_second': 1.274,
 'epoch': 32.0}

In [20]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model = model, tokenizer = tokenizer, device = 0)

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


In [21]:
sequences = 'I am hungry, can I have some food'
classifier(sequences, id2labels, multi_label=False)

{'sequence': 'I am hungry, can I have some food',
 'labels': ['food', 'others'],
 'scores': [0.9999989867210388, 1.0128478606930003e-06]}

In [24]:
import unittest

class TestStringMethods(unittest.TestCase):

    def test_dataset_food(self):
      data = {'text': [], 'class': []}
      data = convert_txt_file_to_datasets('food_requests.txt','food',data)
      data = convert_txt_file_to_datasets('not_food_requests.txt','others',data)
      your_dataset = datasets.Dataset.from_dict(data)
      test_food = len([food for food in your_dataset['class'] if food =='food'])
      self.assertEqual(test_food, 400)

    def test_dataset_others(self):
      data = {'text': [], 'class': []}
      data = convert_txt_file_to_datasets('food_requests.txt','food',data)
      data = convert_txt_file_to_datasets('not_food_requests.txt','others',data)
      your_dataset = datasets.Dataset.from_dict(data)
      test_others = len([others for others in your_dataset['class'] if others =='others'])
      self.assertEqual(test_others, 200)

    def test_positive(self):
        sequences = 'I am feeling hungry.'
        result = classifier(sequences, id2labels, multi_label=False)
        self.assertEqual(result['labels'][0], 'food')

    def test_positive_1(self):
        sequences = 'My hunger is back, can you bring me some food?'
        result = classifier(sequences, id2labels, multi_label=False)
        self.assertEqual(result['labels'][0], 'food')

    def test_negetive_1(self):
        sequences = 'I need help with connecting to a video call.'
        result = classifier(sequences, id2labels, multi_label=False)
        self.assertNotEqual(result['labels'][0], 'food')



unittest.main(argv=[''], exit=False)

.....
----------------------------------------------------------------------
Ran 5 tests in 0.355s

OK


<unittest.main.TestProgram at 0x79bb02329c30>