## Chieftains of the Northwind

In [None]:
%pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### Section 1: Clean Data

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('dataset_triage.csv')
df

Unnamed: 0,question,triage
0,"I am 35 years old unmarried , i was diagonized...",non-urgent
1,I have been having abdominal pain and burning ...,non-urgent
2,"sir, Day before yesterday i had an oil fried i...",urgent
3,"friend has a lump where their coccyx is, has b...",urgent
4,Which demographic should raise suspicion of a ...,non-urgent
...,...,...
42508,My wife is having sharp pains in left chest ab...,non-urgent
42509,BACKGROUND: Survivors of critical coronavirus ...,non-urgent
42510,"Hi, My mum had an operation the doctors though...",urgent
42511,My 6 yr old daughter has had a ever for 3 dats...,urgent


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42513 entries, 0 to 42512
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  42513 non-null  object
 1   triage    42513 non-null  object
dtypes: object(2)
memory usage: 664.4+ KB


In [4]:
#NAN row checking
df[df.isnull().T.any().T]

Unnamed: 0,question,triage


In [5]:
df['question'] = df['question'].astype('string')
df['triage'].value_counts()

triage
non-urgent    30799
urgent        11714
Name: count, dtype: int64

In [6]:
id2label = {0: 'non-urgent', 1: 'urgent'}
df['triage'] = df['triage'].apply(lambda x: 0 if x == 'non-urgent' else 1)
df

Unnamed: 0,question,triage
0,"I am 35 years old unmarried , i was diagonized...",0
1,I have been having abdominal pain and burning ...,0
2,"sir, Day before yesterday i had an oil fried i...",1
3,"friend has a lump where their coccyx is, has b...",1
4,Which demographic should raise suspicion of a ...,0
...,...,...
42508,My wife is having sharp pains in left chest ab...,0
42509,BACKGROUND: Survivors of critical coronavirus ...,0
42510,"Hi, My mum had an operation the doctors though...",1
42511,My 6 yr old daughter has had a ever for 3 dats...,1


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42513 entries, 0 to 42512
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  42513 non-null  string
 1   triage    42513 non-null  int64 
dtypes: int64(1), string(1)
memory usage: 664.4 KB


### Section 2: Prepare Data for Training

In [2]:
%pip install accelerate -U


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:
from sklearn.model_selection import train_test_split
import evaluate
from transformers import DistilBertForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch as pt
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Adding the below cell after initial training, trying different models

In [54]:
# from transformers import ElectraTokenizer, ElectraForSequenceClassification
# model_name = "google/electra-small-discriminator"
# tokenizer = ElectraTokenizer.from_pretrained(model_name)
# model = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=2)

from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
model_name = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
train_texts, val_texts, train_labels, val_labels = train_test_split(df["question"], df["triage"], test_size=0.2, shuffle=True)

train_texts = train_texts.reset_index(drop=True)
val_texts = val_texts.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True).array
val_labels = val_labels.reset_index(drop=True).array

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)

class QuestionsDataset(pt.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: pt.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = pt.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_data = QuestionsDataset(train_encodings, train_labels)
val_data = QuestionsDataset(val_encodings, val_labels)

In [53]:
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


def init_trainer(model, epochs = 2, out_dir = "./results"):

    training_args = TrainingArguments(
        output_dir = out_dir,
        num_train_epochs = epochs,
        evaluation_strategy='steps',
        logging_dir='./logs',
        logging_steps = 10,
        logging_first_step = True,
        warmup_steps = 500,
        weight_decay=0.01,
        per_device_train_batch_size = 16,
        per_device_eval_batch_size = 64,
        gradient_accumulation_steps = 8, 
        gradient_checkpointing=True,
        # fp16 = True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        compute_metrics=compute_metrics,
    )

    return trainer

### 3. Fine-tune the Model

In [17]:
trainer1 = init_trainer(model, 2, "./results/train1") # this model is DistilBert

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


<transformers.trainer.Trainer at 0x7b2b764fb640>

In [19]:
trainer1.train()

Step,Training Loss
1,1.3424
100,1.0009
200,0.5546
300,0.5419
400,0.5572
500,0.5233
600,0.5288
700,0.5196
800,0.5261
900,0.4896


TrainOutput(global_step=4252, training_loss=0.4703510521104172, metrics={'train_runtime': 3275.0055, 'train_samples_per_second': 20.769, 'train_steps_per_second': 1.298, 'total_flos': 9010432456581120.0, 'train_loss': 0.4703510521104172, 'epoch': 2.0})

In [21]:
trainer1.save_model()

In [20]:
trainer1.evaluate()

{'eval_loss': 0.5091649889945984,
 'eval_accuracy': 0.7637304480771493,
 'eval_runtime': 148.4871,
 'eval_samples_per_second': 57.264,
 'eval_steps_per_second': 0.896,
 'epoch': 2.0}

In [None]:
trainer2 = init_trainer(model, 2, "./results/train2") # this model is Roberta

In [55]:
trainer2.train()

  0%|          | 0/264 [01:58<?, ?it/s]
                                                 
  0%|          | 1/530 [00:58<8:38:49, 58.85s/it]

{'loss': 0.7174, 'learning_rate': 1.0000000000000001e-07, 'epoch': 0.0}


                                                  
  2%|▏         | 10/530 [09:58<8:38:16, 59.80s/it]

{'loss': 0.7247, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.04}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                  
[A                                    
  2%|▏         | 10/530 [23:59<8:38:16, 59.80s/it]


{'eval_loss': 0.7149145603179932, 'eval_accuracy': 0.2788427613783371, 'eval_runtime': 841.6844, 'eval_samples_per_second': 10.102, 'eval_steps_per_second': 0.158, 'epoch': 0.04}


  2%|▏         | 11/530 [25:02<45:54:19, 318.42s/it]

KeyboardInterrupt: 

In [None]:
trainer2.save_model()
trainer2.evaluate()

### 4. Test the AI

In [1]:
%pip install accelerate -U
from transformers import DistilBertForSequenceClassification, AutoTokenizer
import torch as pt
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
id2label = {0: 'non-urgent', 1: 'urgent'}


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = DistilBertForSequenceClassification.from_pretrained("./results/train1", local_files_only=True)

In [3]:
def test_model(message):
  encodings = tokenizer(message, truncation=True, padding=True, return_tensors="pt")
  output = model(**encodings)
  result = pt.argmax(output.logits).item()
  return id2label[result]

test_model("Please help, what should I do? My father fell and he can't get up.")

'urgent'

In [4]:
test_model("What does it mean if my son gets sick every time we go to swimming lessons?")

'non-urgent'

In [5]:
test_model("Please help!! My baby is crying urgently!") # bad parent, not urgent

'urgent'

In [6]:
test_model("What do I do? My mommy isn't waking up I'm poking her and she won't move.") # mom could be dead, urgent

'urgent'

AI generated examples

In [7]:
examples_urgent = (
    "I'm experiencing severe chest pain and shortness of breath. I think I may be having a heart attack.",
    "My child has a high fever of 104°F and has been vomiting for the past 6 hours. We need to be seen immediately.",
    "I just had a bad fall and I think my arm might be broken. I'm in a lot of pain and need to come in right away.",
    "I'm pregnant and I'm bleeding heavily. I'm very concerned and need to speak with a doctor as soon as possible.",
    "I've been experiencing severe abdominal pain and diarrhea for the past 24 hours. I'm feeling extremely weak and dehydrated.")

for example in examples_urgent:
    print(test_model(example))

urgent
urgent
urgent
urgent
urgent


In [8]:
examples_nonurgent = (
    "Hello, I was wondering if I could schedule an appointment for a routine check-up next week?",
    "I need to refill my prescription for blood pressure medication. Could you please send a refill request to my pharmacy?",
    "I have a question about the side effects of the medication I'm currently taking. Is there a good time I could speak with a nurse?",
    "I'd like to get a flu shot this year. Can you let me know when the clinic has the flu vaccine available?",
    "I need to update my insurance information on file. Could you please provide me with the necessary forms?",)

for example in examples_nonurgent:
    print(test_model(example))
    

non-urgent
non-urgent
non-urgent
non-urgent
non-urgent


In [9]:
examples_trick = (
    "Please help!! My baby is crying urgently!",
    "I have a terrible headache, it's an emergency!",
    "I need to schedule a very urgent appointment for a papercut.",
    "It's an absolute crisis, I broke a nail!",
    "This is an emergency! I ran out of my daily multivitamin.",
    "I'm having an urgent problem with a hangnail, I need to be seen right away!",
    "Emergency! I stubbed my toe and it's a little bruised.",
    "Urgent request: I need a refill on my dandruff shampoo.",
    "This is an urgent matter - I need the phone number for a dog groomer.",
    "I'm having an emergency allergic reaction to pollen. Just kidding, I'm fine!"
)

for example in examples_trick:
    print(test_model(example))

urgent
urgent
urgent
urgent
urgent
urgent
urgent
urgent
urgent
non-urgent


In [12]:
examples_false_neg = (
    "There's some blood in my urine, but I did just start a new medication so that might be causing it.",
    "I'm feeling a little dizzy and lightheaded, but I'm sure it's nothing serious.",
    "I have a sharp pain in my lower abdomen, but it comes and goes so I'm not too worried.",
    "I found a lump in my breast, but I'm sure it's just a harmless cyst.",
    "I was playing basketball and took a hard fall on my knee. It's swollen but I can still walk on it.",
        "My son fell off the monkey bars at the park and hit his head pretty hard. He says he feels fine though.",
    "I've had this really bad headache for 3 days straight now. Could be a migraine or something?",
    "I'm a type 1 diabetic and my blood sugar has been over 300 for most of the day. Should I be concerned?",
    "My Dad is 75 and has been having trouble remembering things lately. Is that just normal aging?", 
    "I'm 32 weeks pregnant and haven't felt the baby move much today. Could you let me know if that's ok?",
    "I've had some tightness in my chest and arm numbness for a few hours now. Maybe I slept weird?",
    "My 3-year-old daughter spiked a fever of 103 this morning but she's acting totally normal otherwise.",
    "There's blood in my stool, maybe it's just hemorrhoids? I've had them before so hopefully that's all it is."
)

for example in examples_false_neg:
    print(test_model(example))

non-urgent
non-urgent
non-urgent
non-urgent
non-urgent
urgent
non-urgent
non-urgent
non-urgent
non-urgent
non-urgent
non-urgent
non-urgent
