## Chieftains of the Northwind

In [1]:
%pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### Section 1: Clean Data

In [2]:
import pandas as pd

In [19]:
# Read the raw triage CSV into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='data/triage_dataset.csv')
df

Unnamed: 0,question,triage
0,"I am 35 years old unmarried , i was diagonized...",non-urgent
1,I have been having abdominal pain and burning ...,non-urgent
2,"sir, Day before yesterday i had an oil fried i...",urgent
3,"friend has a lump where their coccyx is, has b...",urgent
4,Which demographic should raise suspicion of a ...,non-urgent
...,...,...
42508,My wife is having sharp pains in left chest ab...,non-urgent
42509,BACKGROUND: Survivors of critical coronavirus ...,non-urgent
42510,"Hi, My mum had an operation the doctors though...",urgent
42511,My 6 yr old daughter has had a ever for 3 dats...,urgent


In [20]:
# Print a summary of the freshly loaded frame: row count, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42513 entries, 0 to 42512
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  42513 non-null  object
 1   triage    42513 non-null  object
dtypes: object(2)
memory usage: 664.4+ KB


In [21]:
# Show every row that has at least one missing value (an empty result means there are none).
rows_with_nan = df.isnull().any(axis=1)
df[rows_with_nan]

Unnamed: 0,question,triage


In [22]:
# Store the free-text questions with pandas' dedicated string dtype,
# then list the distinct triage labels present in the data.
df['question'] = df['question'].astype(dtype='string')
pd.unique(df['triage'])

array(['non-urgent', 'urgent'], dtype=object)

In [24]:
# Encode the triage label in place as an integer class: 0 = non-urgent, 1 = urgent.
# Values that are already 0/1 pass through unchanged, so re-running this cell is harmless
# (the previous lambda turned every already-encoded value into 1 on a second run).
triage_codes = {'non-urgent': 0, 'urgent': 1}
encoded_triage = df['triage'].map(lambda label: triage_codes.get(label, label))

# Fail loudly on anything unexpected instead of silently treating it as urgent.
unexpected_triage = encoded_triage[~encoded_triage.isin([0, 1])]
if not unexpected_triage.empty:
    raise ValueError(f"Unexpected triage labels: {unexpected_triage.unique().tolist()}")

df['triage'] = encoded_triage.astype('int64')
df

Unnamed: 0,question,triage
0,"I am 35 years old unmarried , i was diagonized...",0
1,I have been having abdominal pain and burning ...,0
2,"sir, Day before yesterday i had an oil fried i...",1
3,"friend has a lump where their coccyx is, has b...",1
4,Which demographic should raise suspicion of a ...,0
...,...,...
42508,My wife is having sharp pains in left chest ab...,0
42509,BACKGROUND: Survivors of critical coronavirus ...,0
42510,"Hi, My mum had an operation the doctors though...",1
42511,My 6 yr old daughter has had a ever for 3 dats...,1


In [27]:
# Re-print the frame summary after cleaning to check the resulting column dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42513 entries, 0 to 42512
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  42513 non-null  string
 1   triage    42513 non-null  int64 
dtypes: int64(1), string(1)
memory usage: 664.4 KB


In [38]:
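# Export the cleaned data to CSV (currently disabled).
# NOTE: `df_compiled` is not defined in any cell shown above; substitute the cleaned frame before re-enabling.
# df_compiled.to_csv('input_refined.csv')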

### Section 2: Prepare Data for Model

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import DistilBertForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch as pt
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
# Split the data into training and validation sets (80% / 20%).
# A fixed random_state makes the split, and therefore every downstream metric, reproducible
# across kernel restarts. Stratifying on the label keeps the urgent / non-urgent ratio the
# same in both subsets.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["question"],
    df["triage"],
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=df["triage"],
)

# Reset the index so positions run 0..n-1 in each subset; labels are kept as
# positional arrays so they can be indexed by dataset position.
train_texts = train_texts.reset_index(drop=True)
val_texts = val_texts.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True).array
val_labels = val_labels.reset_index(drop=True).array

In [30]:
# Display the training questions to sanity-check the split.
train_texts

0        hi.kindly pls help with the result of my urina...
1        I recently had a 5 day stay in the hospital du...
2        My daughter is having excruciating pain on the...
3        What genetic mutation is associated with Beckw...
4        I had an (untreated by antibiotics) sinus infe...
                               ...                        
34005    Hello doctor, I have been struggling a lot wit...
34006    This paper aims to examine the impact of Covid...
34007    What is Erythema nodosum and where does it typ...
34008    Hello Dr.I have query regarding bacterial infe...
34009    After a night of drinking on an empty stomach,...
Name: question, Length: 34010, dtype: string

In [31]:
# Load DistilBERT model and tokenizer.
# NOTE(review): the checkpoint name indicates it was already fine-tuned on SST-2
# (presumably a sentiment task); confirm this is the intended starting point rather than
# the plain `distilbert-base-uncased` weights.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Override the label names inherited from the checkpoint so the saved config (and any
# later inference output) reports the triage classes used in this notebook.
id2label = {0: "non-urgent", 1: "urgent"}
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    id2label=id2label,
    label2id={label: idx for idx, label in id2label.items()},
)

In [32]:
# Tokenize both splits with identical settings: truncate over-long inputs and pad each
# split to a common length.
tokenizer_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_texts.tolist(), **tokenizer_options)
val_encodings = tokenizer(val_texts.tolist(), **tokenizer_options)

In [33]:
# Format the data into a Dataset class
class QuestionsDataset(pt.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional integer labels.

    Args:
        encodings: Mapping from feature name to a sequence of per-example values
            (anything exposing ``.items()`` / ``.values()``, indexable by position).
        labels: Sequence of per-example labels aligned with ``encodings``, or
            ``None`` for unlabeled data.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus ``"labels"`` when labels exist."""
        item = {key: pt.tensor(val[idx]) for key, val in self.encodings.items()}
        # Compare against None explicitly: truth-testing a NumPy array raises, and an
        # empty list would wrongly look like "no labels".
        if self.labels is not None:
            item["labels"] = pt.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, falling back to the encodings when unlabeled."""
        if self.labels is not None:
            return len(self.labels)
        return len(next(iter(self.encodings.values())))


# Wrap each tokenized split together with its labels so it can be indexed per example.
train_data, val_data = (
    QuestionsDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

In [34]:
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    The predicted class for each example is the index of its largest logit
    along the last axis.

    Returns:
        dict with a single key ``'accuracy'``.
    """
    raw_scores, true_labels = eval_pred
    predicted_classes = np.asarray(raw_scores).argmax(axis=-1)
    accuracy = accuracy_score(y_true=true_labels, y_pred=predicted_classes)
    return {'accuracy': accuracy}

In [35]:
# Training configuration. Batch sizes scale with the split size (one per 1000 examples);
# max(1, ...) keeps them valid when a split has fewer than 1000 rows, where the plain
# integer division would produce a batch size of 0.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=max(1, len(train_texts) // 1000),
    per_device_eval_batch_size=max(1, len(val_texts) // 1000),
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# compute_metrics is passed so that evaluation reports accuracy.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)
trainer

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


<transformers.trainer.Trainer at 0x1407256c0>

In [36]:
# Sanity check: container type and shape of the training labels, one per line.
print(type(train_labels), train_labels.shape, sep='\n')

<class 'pandas.core.arrays.numpy_.PandasArray'>
(34010,)


### Section 3: Train the Model

In [None]:
# Fine-tune the model on train_data using the settings in training_args.
trainer.train()

In [None]:
# Evaluate on the eval_dataset given to the Trainer (val_data); compute_metrics adds accuracy.
trainer.evaluate()

In [None]:
# Persist the fine-tuned model; with no argument it is written to training_args.output_dir.
trainer.save_model()
# The tokenizer was never passed to the Trainer, so save it explicitly next to the model;
# otherwise the output directory cannot be reloaded for inference on its own.
tokenizer.save_pretrained(training_args.output_dir);