# Patient Restroom Chatbot Model

In [1]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U
!pip install transformers

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.1

## Importing Libraries

In [2]:
import pandas as pd
import torch
from datasets import Dataset, load_metric
import random


## Data Processing

In [3]:
def shuffle_df(old_df: pd.DataFrame, cycles: int = 1) -> pd.DataFrame:
  for i in range(cycles):
    new_df = old_df.sample(frac=1).reset_index(drop=True)
  return new_df

In [4]:
# **********************Refactored part of code**************************
def processFile(fileName,className):
  with open(fileName, 'r') as f:
    for line in f:
        line = line.replace('\n', '')
        line = line.replace('...', ',')
        line = line.lstrip('"').rstrip('"')
        data['text'].append(line)
        data['class'].append(className)

In [5]:
import datasets
import random
from transformers import AutoTokenizer

data = {'text': [], 'class': []}

restroomFile = 'restroom_requests.txt'
othersFile = 'not_restroom_requests.txt'

processFile(restroomFile, 'restroom')
processFile(othersFile, 'others')

your_dataset = datasets.Dataset.from_dict(data)


In [6]:
t = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
template = "This example is {}."

id2labels = ["restroom", "others"]


def create_input_sequence(sample):
    text = sample["text"]
    label = sample["class"][0]
    contradiction_label = random.choice([x for x in id2labels if x != label])

    encoded_sequence = t(
        text*2, [template.format(label), template.format(contradiction_label)])
    encoded_sequence["labels"] = [1, 0]
    encoded_sequence["input_sentence"] = t.batch_decode(
        encoded_sequence.input_ids)

    return encoded_sequence

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## Splitting the dataset

In [7]:
your_dataset = your_dataset.train_test_split(test_size=0.2)
train_ds = your_dataset["train"]
test_ds = your_dataset["test"]

In [8]:
train_dataset = train_ds.map(
    create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "text"])
test_dataset = test_ds.map(create_input_sequence, batched=True,
                           batch_size=1, remove_columns=["class", "text"])

Map:   0%|          | 0/323 [00:00<?, ? examples/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

## Creating the model and evaluator

In [9]:
from transformers import BartForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
import numpy as np

In [10]:
from transformers import BartTokenizerFast
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-large-mnli')

In [11]:
def compute_metrics(p: EvalPrediction):
  metric_acc = load_metric("accuracy")
  metric_f1 = load_metric("f1")
  preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
  preds = np.argmax(preds, axis = 1)
  result = {}
  result["accuracy"] = metric_acc.compute(predictions = preds, references = p.label_ids)["accuracy"]
  result["f1"] = metric_f1.compute(predictions = preds, references = p.label_ids, average = 'macro')["f1"]
  return result

In [12]:
model_directory = r'/content'

In [13]:
model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli", num_labels = len(id2labels), ignore_mismatched_sizes = True)

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
training_args = TrainingArguments(
  output_dir = model_directory,      # Output directory
  num_train_epochs = 32,             # Total number of training epochs
  per_device_train_batch_size = 16,  # Batch size per device during training
  per_device_eval_batch_size = 64,   # Batch size for evaluation
  warmup_steps = 500,                # Number of warmup steps for learning rate scheduler
  weight_decay = 0.01,               # Strength of weight decay
)

trainer = Trainer(
  model = model,                     # The instantiated model to be trained
  args = training_args,              # Training arguments, defined above
  compute_metrics = compute_metrics, # A function to compute the metrics
  train_dataset = train_dataset,     # Training dataset
  eval_dataset = test_dataset,       # Evaluation dataset
  tokenizer = tokenizer              # The tokenizer that was used
)

## Training and testing the model

In [15]:
trainer.evaluate()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  metric_acc = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'eval_loss': 0.6744452118873596,
 'eval_accuracy': 0.6790123456790124,
 'eval_f1': 0.6718092566619915,
 'eval_runtime': 4.9637,
 'eval_samples_per_second': 32.637,
 'eval_steps_per_second': 0.604}

In [16]:
trainer.train()

Step,Training Loss
500,0.0608
1000,0.0339


TrainOutput(global_step=1312, training_loss=0.03804681377439964, metrics={'train_runtime': 622.1167, 'train_samples_per_second': 33.228, 'train_steps_per_second': 2.109, 'total_flos': 1057393131964152.0, 'train_loss': 0.03804681377439964, 'epoch': 32.0})

In [17]:
trainer.evaluate()

{'eval_loss': 3.771264891838655e-05,
 'eval_accuracy': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 1.8009,
 'eval_samples_per_second': 89.956,
 'eval_steps_per_second': 1.666,
 'epoch': 32.0}

In [18]:
from transformers import pipeline

In [19]:
classifier = pipeline("zero-shot-classification", model = model, tokenizer = tokenizer, device = 0)

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


In [20]:
sequences = 'I need restroom'

In [61]:
sequences2 = 'Hello Where is Office'

In [22]:
classifier(sequences, id2labels, multi_label=False)

{'sequence': 'I need restroom',
 'labels': ['restroom', 'others'],
 'scores': [0.9991323947906494, 0.0008675978751853108]}

In [62]:
classifier(sequences2, id2labels, multi_label=False)

{'sequence': 'Hello Where is Office',
 'labels': ['others', 'restroom'],
 'scores': [0.601777970790863, 0.3982219994068146]}

## Saving the model

In [24]:
model_path = r'/content'

In [25]:
model.save_pretrained(model_path)

In [26]:
!zip -r /content/model.zip /content/pytorch_model.bin /content/config.json

  adding: content/config.json (deflated 58%)


In [27]:
from google.colab import files
files.download("/content/model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
reloadtrainer = BartForSequenceClassification.from_pretrained('/content/')

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


In [29]:
classifier = pipeline("zero-shot-classification", model = reloadtrainer, tokenizer = tokenizer, device = 0)

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


In [30]:
classifier(sequences, id2labels, multi_label=False)

{'sequence': 'I need restroom',
 'labels': ['restroom', 'others'],
 'scores': [0.9991323947906494, 0.0008675978751853108]}