In [None]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U
!pip install transformers



In [None]:
import pandas as pd
import torch
from datasets import Dataset, load_metric
import random


In [None]:
def convert_txt_file_to_datasets(filename, label ,data = {'text': [], 'class': []}):
  with open(filename, 'r') as f:
    for line in f:
        line = line.replace('\n', '')
        line = line.replace('...', ',')
        line = line.lstrip('"').rstrip('"')
        data['text'].append(line)
        data['class'].append(label)
    return data

In [118]:
import datasets
import random
from transformers import AutoTokenizer

data = {'text': [], 'class': []}
data = convert_txt_file_to_datasets('water_requests.txt','water',data)
data = convert_txt_file_to_datasets('not_water_requests.txt','others',data)

your_dataset = datasets.Dataset.from_dict(data)

In [None]:
#Train Test Split
your_dataset = your_dataset.train_test_split(test_size=0.2)
train_ds = your_dataset["train"]
test_ds = your_dataset["test"]

In [None]:
t = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
template = "This example is {}."

In [None]:
#Listing catogaries to clasify
id2labels = ["water", "others"]

In [None]:
def create_input_sequence(sample):
    text = sample["text"]
    label = sample["class"][0]
    contradiction_label = random.choice([x for x in id2labels if x != label])

    encoded_sequence = t(
        text*2, [template.format(label), template.format(contradiction_label)])
    encoded_sequence["labels"] = [1, 0]
    encoded_sequence["input_sentence"] = t.batch_decode(
        encoded_sequence.input_ids)

    return encoded_sequence

In [None]:
#Preparing data for finetuning
train_dataset = train_ds.map(
    create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "text"])
test_dataset = test_ds.map(create_input_sequence, batched=True,
                           batch_size=1, remove_columns=["class", "text"])

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

Map:   0%|          | 0/58 [00:00<?, ? examples/s]

In [None]:
from transformers import BartForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
import numpy as np

In [None]:
from transformers import BartTokenizerFast
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-large-mnli')

In [None]:
def compute_metrics(p: EvalPrediction):
  metric_acc = load_metric("accuracy")
  metric_f1 = load_metric("f1")
  preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
  preds = np.argmax(preds, axis = 1)
  result = {}
  result["accuracy"] = metric_acc.compute(predictions = preds, references = p.label_ids)["accuracy"]
  result["f1"] = metric_f1.compute(predictions = preds, references = p.label_ids, average = 'macro')["f1"]
  return result

In [None]:
model_directory = r'/content'

In [None]:
model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli", num_labels = len(id2labels), ignore_mismatched_sizes = True)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
  output_dir = model_directory,
  num_train_epochs = 32,
  per_device_train_batch_size = 16,
  per_device_eval_batch_size = 64,
  warmup_steps = 500,
  weight_decay = 0.01,
)

trainer = Trainer(
  model = model,
  args = training_args,
  compute_metrics = compute_metrics,
  train_dataset = train_dataset,
  eval_dataset = test_dataset,
  tokenizer = tokenizer
)

In [None]:
trainer.evaluate()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.658859133720398,
 'eval_accuracy': 0.6551724137931034,
 'eval_f1': 0.6423065063213075,
 'eval_runtime': 2.6244,
 'eval_samples_per_second': 44.2,
 'eval_steps_per_second': 0.762}

In [None]:
trainer.train()

Step,Training Loss
500,0.0399


TrainOutput(global_step=928, training_loss=0.02617238359204654, metrics={'train_runtime': 444.8137, 'train_samples_per_second': 33.093, 'train_steps_per_second': 2.086, 'total_flos': 778000758545664.0, 'train_loss': 0.02617238359204654, 'epoch': 32.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.1838577687740326,
 'eval_accuracy': 0.9827586206896551,
 'eval_f1': 0.9827586206896551,
 'eval_runtime': 2.1301,
 'eval_samples_per_second': 54.457,
 'eval_steps_per_second': 0.939,
 'epoch': 32.0}

In [None]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model = model, tokenizer = tokenizer, device = 0)

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


In [None]:
sequences = 'Is there a specific diet plan I should follow post-surgery?'
classifier(sequences, id2labels, multi_label=False)

{'sequence': 'Is there a specific diet plan I should follow post-surgery?',
 'labels': ['others', 'water'],
 'scores': [0.9999831914901733, 1.674555824138224e-05]}

In [137]:
import unittest

class TestStringMethods(unittest.TestCase):

    def test_dataset_water(self):
      data = {'text': [], 'class': []}
      data = convert_txt_file_to_datasets('water_requests.txt','water',data)
      data = convert_txt_file_to_datasets('not_water_requests.txt','others',data)
      your_dataset = datasets.Dataset.from_dict(data)
      test_water = len([water for water in your_dataset['class'] if water =='water'])
      self.assertEqual(test_water, 188)

    def test_dataset_others(self):
      data = {'text': [], 'class': []}
      data = convert_txt_file_to_datasets('water_requests.txt','water',data)
      data = convert_txt_file_to_datasets('not_water_requests.txt','others',data)
      your_dataset = datasets.Dataset.from_dict(data)
      test_others = len([others for others in your_dataset['class'] if others =='others'])
      self.assertEqual(test_others, 100)

    def test_positive(self):
        sequences = 'I am feeling thirsty. May I have a glass of water?'
        result = classifier(sequences, id2labels, multi_label=False)
        self.assertEqual(result['labels'][0], 'water')

    def test_positive_1(self):
        sequences = 'Glass of water, please.'
        result = classifier(sequences, id2labels, multi_label=False)
        self.assertEqual(result['labels'][0], 'water')

    def test_negetive_1(self):
        sequences = 'Is there a specific diet plan I should follow post-surgery?'
        result = classifier(sequences, id2labels, multi_label=False)
        self.assertNotEquals(result['labels'][0], 'water')



unittest.main(argv=[''], exit=False)

  self.assertNotEquals(result['labels'][0], 'water')
...
----------------------------------------------------------------------
Ran 5 tests in 0.360s

OK


<unittest.main.TestProgram at 0x7c7180960e80>