## Chieftains of the Northwind

In [None]:
%pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### Section 1: Clean Data

In [1]:
import pandas as pd

In [2]:
# Load the triage dataset: one free-text question per row plus its urgency label.
df = pd.read_csv('triage_dataset.csv')
df

Unnamed: 0,question,triage
0,"I am 35 years old unmarried , i was diagonized...",non-urgent
1,I have been having abdominal pain and burning ...,non-urgent
2,"sir, Day before yesterday i had an oil fried i...",urgent
3,"friend has a lump where their coccyx is, has b...",urgent
4,Which demographic should raise suspicion of a ...,non-urgent
...,...,...
42508,My wife is having sharp pains in left chest ab...,non-urgent
42509,BACKGROUND: Survivors of critical coronavirus ...,non-urgent
42510,"Hi, My mum had an operation the doctors though...",urgent
42511,My 6 yr old daughter has had a ever for 3 dats...,urgent


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42513 entries, 0 to 42512
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  42513 non-null  object
 1   triage    42513 non-null  object
dtypes: object(2)
memory usage: 664.4+ KB


In [4]:
# Show every row containing at least one missing value (an empty frame means no NaNs).
df[df.isna().any(axis=1)]

Unnamed: 0,question,triage


In [5]:
# Store the free-text questions with pandas' dedicated string dtype.
df['question'] = df['question'].astype('string')
# List the distinct label values present before encoding them.
df['triage'].unique()

array(['non-urgent', 'urgent'], dtype=object)

In [6]:
# Encode the triage urgency in place as an integer class label:
# 0 = non-urgent, 1 = urgent.
triage_to_id = {'non-urgent': 0, 'urgent': 1}

# Guard so the cell is idempotent: once the column is integer-coded, re-running
# must not re-encode it (a plain "anything else -> 1" rule would turn every 0
# into 1 on a second run).
if not pd.api.types.is_integer_dtype(df['triage']):
    encoded = df['triage'].map(triage_to_id)
    # Fail loudly on labels outside the known vocabulary instead of silently
    # treating them as urgent.
    unknown = df.loc[encoded.isna(), 'triage'].unique()
    if len(unknown):
        raise ValueError(f"Unexpected triage labels: {list(unknown)}")
    df['triage'] = encoded.astype('int64')
df

Unnamed: 0,question,triage
0,"I am 35 years old unmarried , i was diagonized...",0
1,I have been having abdominal pain and burning ...,0
2,"sir, Day before yesterday i had an oil fried i...",1
3,"friend has a lump where their coccyx is, has b...",1
4,Which demographic should raise suspicion of a ...,0
...,...,...
42508,My wife is having sharp pains in left chest ab...,0
42509,BACKGROUND: Survivors of critical coronavirus ...,0
42510,"Hi, My mum had an operation the doctors though...",1
42511,My 6 yr old daughter has had a ever for 3 dats...,1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42513 entries, 0 to 42512
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  42513 non-null  string
 1   triage    42513 non-null  int64 
dtypes: int64(1), string(1)
memory usage: 664.4 KB


In [8]:
# #Upload CSV
# df_compiled.to_csv('input_refined.csv')

### Section 2: Prepare Data for Model

In [9]:
%pip install accelerate -U



In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import DistilBertForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch as pt
import numpy as np

In [11]:
# Hold out 20% of the questions for validation.
# random_state pins the shuffle so the split (and therefore the reported metrics)
# is reproducible across kernel restarts; stratify keeps the urgent/non-urgent
# ratio the same in both partitions.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["question"],
    df["triage"],
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=df["triage"],
)

# Drop the shuffled original row labels so each partition is indexed 0..n-1.
train_texts = train_texts.reset_index(drop=True)
val_texts = val_texts.reset_index(drop=True)
# Labels are kept as pandas arrays, which are indexed by position only.
train_labels = train_labels.reset_index(drop=True).array
val_labels = val_labels.reset_index(drop=True).array

In [12]:
train_texts

0        Hi, My father, age 68 had done his ultrasound ...
1        In my blood profile ESR is 68 MCV 77.80 MCH 24...
2        I have severe boasts of nausea and diarrhea wi...
3        My mother is 72 yrs old.  she had gamma knife ...
4        Hi there, I have ED and have been on viagra fo...
                               ...                        
34005    BACKGROUND: Ebola virus causes a hemorrhagic f...
34006    Hi I am 23 and I have been on nexaplon since n...
34007    Hi Dr,my problem i har songs running in mind c...
34008    Extra bone growed in both the ankles, due to t...
34009    Following the proven concept, capabilities, an...
Name: question, Length: 34010, dtype: string

In [13]:
# Load the tokenizer and the sequence-classification model from the same checkpoint name.
# NOTE(review): the name suggests this checkpoint was already fine-tuned on SST-2
# (English sentiment) — confirm that starting from it, rather than a base
# checkpoint, is intended for the triage task.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [14]:
# Tokenize each split's questions as a single batch, with truncation and padding enabled.
train_encodings = tokenizer(train_texts.tolist(), padding=True, truncation=True)
val_encodings = tokenizer(val_texts.tolist(), padding=True, truncation=True)

In [15]:
class QuestionsDataset(pt.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional class labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence of
            values; ``encodings[key][i]`` is the value for example ``i``.
        labels: Optional indexable sequence of labels aligned with the
            encodings. Pass ``None`` (the default) for unlabeled data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus ``"labels"`` when labels exist."""
        item = {key: pt.tensor(val[idx]) for key, val in self.encodings.items()}
        # Compare against None instead of relying on truthiness: `if self.labels`
        # raises for multi-element numpy arrays / tensors.
        if self.labels is not None:
            item["labels"] = pt.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, falling back to the encodings when unlabeled."""
        if self.labels is not None:
            return len(self.labels)
        # __getitem__ indexes every feature column by example position, so any
        # column's length gives the dataset size; empty encodings mean size 0.
        first_column = next(iter(self.encodings.values()), ())
        return len(first_column)


# Wrap each split's encodings and labels in the dataset class defined above.
train_data = QuestionsDataset(train_encodings, train_labels)
val_data = QuestionsDataset(val_encodings, val_labels)

In [16]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        dict with a single ``'accuracy'`` entry.
    """
    raw_scores, gold = eval_pred
    # The predicted class is the index of the largest score along the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    acc = accuracy_score(y_true=gold, y_pred=predicted_classes)
    return {'accuracy': acc}

In [17]:
# Settings for the fine-tuning run; outputs go under ./results, logs under ./logs.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_first_step=True,
    logging_steps=100,
)

# Tie together the model, the train/validation datasets and the accuracy metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)
trainer

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


<transformers.trainer.Trainer at 0x7b2b764fb640>

In [18]:
# Sanity check: confirm the container type and shape of the training labels.
print(type(train_labels))
print(train_labels.shape)

<class 'pandas.core.arrays.numpy_.PandasArray'>
(34010,)


### Section 3: Train the Model

In [19]:
# Fine-tune the model on train_data using the training arguments configured earlier.
trainer.train()

Step,Training Loss
1,1.3424
100,1.0009
200,0.5546
300,0.5419
400,0.5572
500,0.5233
600,0.5288
700,0.5196
800,0.5261
900,0.4896


TrainOutput(global_step=4252, training_loss=0.4703510521104172, metrics={'train_runtime': 3275.0055, 'train_samples_per_second': 20.769, 'train_steps_per_second': 1.298, 'total_flos': 9010432456581120.0, 'train_loss': 0.4703510521104172, 'epoch': 2.0})

In [20]:
# Evaluate on the validation dataset; the result includes the accuracy from compute_metrics.
trainer.evaluate()

{'eval_loss': 0.5091649889945984,
 'eval_accuracy': 0.7637304480771493,
 'eval_runtime': 148.4871,
 'eval_samples_per_second': 57.264,
 'eval_steps_per_second': 0.896,
 'epoch': 2.0}

In [21]:
# Persist the fine-tuned model to training_args.output_dir.
trainer.save_model()
# The Trainer was constructed without a tokenizer, so save the tokenizer files
# alongside the model; this lets the output directory be reloaded on its own.
# The trailing semicolon suppresses the returned file-path tuple in the cell output.
tokenizer.save_pretrained(training_args.output_dir);