# Game Plan: Base model
1. Tokenize Data: Convert raw text to a format that BERT understands (input IDs, attention masks, and token type IDs). The tokenizer will handle converting text to numbers.

2. Preprocess Data:
a. Input IDs: Token indices from the tokenizer.
b. Attention Masks: Differentiates real tokens from padding tokens.
c. Truncation & Padding: Ensure all sequences are of the same length for batching.

3. DataLoaders: Wrap the processed data into a TensorDataset. Use DataLoader to create iterable data for training and validation.

4. Model Initialization: Initialize BertForSequenceClassification with the number of expected labels.

5. Training Loop: Define an optimizer (like AdamW) and learning rate scheduler.
Train the model on your data while saving checkpoints.

6. Evaluation: After training, evaluate the model on a validation set to check performance.

In [1]:
import torch
from transformers import (
    BertForSequenceClassification,
    AutoTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
)
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import pandas as pd


from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score



#### Why not include positional embeddings?
> ClinicalBERT already maintains position of tokens in its transformer architecture

In [12]:
# Load the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")

In [13]:
data = pd.read_csv(
    "/Users/robintitus/Desktop/nlp_final/NLP_final/01_intermediate-files/smokers_train_all_separated.csv"
)

In [14]:
data["Smoking Status"] = data["Smoking Status"].replace("SMOKER", "PAST SMOKER")

In [15]:
le = LabelEncoder()
data["Smoking_enc"] = le.fit_transform(data["Smoking Status"])
display(data.sample(6))

Unnamed: 0,Smoking Status,Text,Smoking_enc
258,PAST SMOKER,admission date: 12/13/2002 report status: dis...,2
337,UNKNOWN,report status: signed discharge summary name: ...,3
215,UNKNOWN,report status: unsigned admission date: 02/22/...,3
305,PAST SMOKER,report status: unsigned\ndischarge summary nam...,2
188,UNKNOWN,admission date: 08/23/1999 report status: sign...,3
34,NON-SMOKER,admission date: 06/19/1991 report status: sign...,1


In [16]:
data["Smoking_enc"] = data["Smoking_enc"].astype("int64")
print(data.dtypes)

Smoking Status    object
Text              object
Smoking_enc        int64
dtype: object


In [17]:
print(data["Smoking Status"].value_counts())

Smoking Status
UNKNOWN           252
NON-SMOKER         66
PAST SMOKER        45
CURRENT SMOKER     35
Name: count, dtype: int64


In [18]:
import re

clean_messages = data["Text"].str.lower()
clean_text = []
for message in clean_messages:
    pattern = r"(\S+\s){0,5}\S*(smok|tobacco|cigar|pack|ppd)\S*(\s\S+){0,5}"
    match = re.search(pattern, message, re.IGNORECASE)

    if match:
        matched_text = match.group(0)
        clean_text.append(matched_text)
    else:
        # sentence = "no information"
        # clean_text.append(sentence)
        clean_text.append(message)

data["Text"] = clean_text
print(data["Text"])

0      to excess , pipe and cigar smoker for many yea...
1      the patient has a 20 pack-year smoking history...
2      alcohol use .\nhas been smoking approximately ...
3      and vomiting . social history: smoker for grea...
4      times per week .\n1-2 packs per day . hospital...
                             ...                        
393    report status: unsigned\ned discharge notifica...
394    report status: unsigned\ned discharge notifica...
395          a / p repair vag pack / foley , ebl minimal
396    admission date: 05/25/2002 report status:  dis...
397    admission date: 11/27/2003 report status:  dis...
Name: Text, Length: 398, dtype: object


In [19]:
data_unknown = data[data["Smoking Status"] == "UNKNOWN"]
data_non = data[data["Smoking Status"] == "NON-SMOKER"]
data_past = data[data["Smoking Status"] == "PAST SMOKER"]
data_current = data[data["Smoking Status"] == "CURRENT SMOKER"]

In [20]:
print(data_unknown.shape)
print(data_non.shape)
print(data_past.shape)
print(data_current.shape)

(252, 3)
(66, 3)
(45, 3)
(35, 3)


In [21]:
data_unknown_downsampled = data_unknown.sample(data_non.shape[0])
print(data_unknown_downsampled.shape)

(66, 3)


In [22]:
df = pd.concat([data_unknown_downsampled, data_non, data_past, data_current])
df["Smoking_enc"].value_counts()

Smoking_enc
3    66
1    66
2    45
0    35
Name: count, dtype: int64

In [23]:
# sentences is a series of sentences where each row is a sentence.
# tokenizer.encode returns a list of token ids for that sentence
# list comprehension applies to tokenizer.encode to each sentence. creating a list of lists
# out list corresponds to all sentences and inner list corresponds to list of token ids of one sentence


def get_sentence_embedding(sentences):
    indexed_tokens = [
        tokenizer.encode(
            sentence, add_special_tokens=True, truncation=True, max_length=512
        )
        for sentence in sentences
    ]
    return indexed_tokens


sentences = df["Text"]

indexed_tokens = get_sentence_embedding(sentences)
print(indexed_tokens)

[[101, 84153, 13664, 131, 10150, 120, 11035, 120, 10442, 17553, 14042, 131, 15826, 76585, 27224, 92555, 13664, 131, 10150, 120, 10193, 120, 10442, 48300, 80375, 131, 11572, 22451, 64576, 10161, 10840, 16818, 15983, 38333, 10369, 119, 11486, 10108, 12254, 56507, 131, 10105, 38607, 10124, 169, 11824, 118, 10924, 118, 12898, 91878, 10229, 109680, 117, 10220, 109680, 35394, 10454, 17339, 13565, 15263, 16762, 10169, 15365, 59139, 118, 169, 10840, 16818, 15983, 38333, 10369, 10108, 10105, 11572, 22451, 21570, 10465, 10479, 10134, 40345, 10142, 99110, 10108, 169, 10321, 103393, 56473, 119, 10106, 10814, 120, 12519, 117, 10105, 38607, 14628, 11841, 20260, 11359, 62966, 10718, 56520, 119, 10135, 129, 120, 128, 120, 12519, 117, 169, 107433, 10415, 36031, 11952, 10822, 10111, 77298, 69255, 10134, 15282, 10160, 171, 13020, 10269, 13135, 17004, 15630, 10319, 48201, 70190, 14045, 10797, 177, 120, 109680, 10840, 16818, 15983, 38333, 10369, 10108, 10105, 11572, 22451, 21570, 10465, 119, 22899, 65548, 

In [24]:
# all sentences don't have the same length so we need to pad all sequences to same length
# so that the dimensions of tensor remain the same.

# Pad the sequences
# for each token find the length keep doing until you find max length
max_length = max(len(tokens) for tokens in indexed_tokens)
# lets say max length is 10 and the sentence has 8. subtract to get 2. so add 2 zeros to orignal tokens
padded_tokens = [tokens + [0] * (max_length - len(tokens)) for tokens in indexed_tokens]

# Create the tensor
input_ids_tensor = torch.tensor(padded_tokens)
print(input_ids_tensor)

tensor([[  101, 84153, 13664,  ...,   119, 94614,   102],
        [  101, 84153, 13664,  ..., 39236, 10870,   102],
        [  101, 84153, 13664,  ...,   171,   119,   102],
        ...,
        [  101,   119, 10105,  ...,     0,     0,     0],
        [  101,   119, 12142,  ...,     0,     0,     0],
        [  101, 10134, 37241,  ...,     0,     0,     0]])


In [25]:
# attention masks
# int(token != 0) is a boolean expression which says when the token is not 0 render it True
# True in python is the integer 1 and by doing int we get 1.
attention_masks = [[int(token != 0) for token in tokens] for tokens in padded_tokens]
attention_masks_tensor = torch.tensor(attention_masks)
print(attention_masks_tensor)

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])


In [26]:
# the .values returns the numpy representation of the data so it converst teh column into a numpy array
labels = torch.tensor(df["Smoking_enc"].values)

# Check input shapes
print(f"Input IDs shape: {input_ids_tensor.shape}")
print(f"Attention Masks shape: {attention_masks_tensor.shape}")
print(f"Labels shape: {labels.shape}")

Input IDs shape: torch.Size([212, 512])
Attention Masks shape: torch.Size([212, 512])
Labels shape: torch.Size([212])


In [27]:
# Create a TensorDataset
# (tensor(input_ids), tensor(attention_mask), tensor(1))
dataset = TensorDataset(input_ids_tensor, attention_masks_tensor, labels)
print(dataset[3])

(tensor([   101,  17553,  14042,    131,  15826,  76585,  27224,  92555,  70585,
         11324,    131,  45805,    117,  14382,  11964,  10112,    172,  16511,
         11487,    131,  14048,    118,  10150,    118,  12224,  84153,  13664,
           131,  10814,    120,  10878,    120,  12328,  27224,  92555,  13664,
           131,  10814,    120,  11052,    120,  12328,  11652,  80375,    131,
         31206,  10908,  22544,  10157,  21911,    119,  18107,  80375,    131,
         10124,  34884,  11130,  23050,  10638,  14996,  49482,  54047,    117,
         25461,  55788,  13362,  11130,  21736,  37241,  91136,    117,  15165,
         12713,  69553,  13315,  15684,    117,  15165,  52368,  80236,    117,
         31206,  10908,  26194,  10123,  21911,    117,  19436,  10446,  22530,
        107826,    119,  11652,  48244,    131,  23050,  46917,    173,  16575,
         33414,  10638,  31253,    119,  10684,  70176,    131,  46638,    119,
         11486,  10108,  12254,  56507,

In [28]:
# Dataloading
seed = 42
torch.manual_seed(seed)
batch_size = 32

# Create DataLoader for training with a random sampler
train_dataloader = DataLoader(
    dataset,
    sampler=RandomSampler(dataset),  # Random sampler for training data
    batch_size=batch_size,
)


# Create DataLoader for validation with a sequential sampler (no need for a seed here)
val_dataloader = DataLoader(
    dataset,
    sampler=SequentialSampler(dataset),  # Sequential sampler for validation data
    batch_size=batch_size,
)

# Model

In [None]:
model = BertForSequenceClassification()
model.load_state_dict()

In [30]:
# Initialize the BERT-based model
model_test = BertForSequenceClassification.from_pretrained(
    "medicalai/ClinicalBERT",
    num_labels=4,
)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at medicalai/ClinicalBERT and are newly initialized: ['encoder.layer.5.attention.self.query.bias', 'embeddings.word_embeddings.weight', 'encoder.layer.8.attention.output.dense.bias', 'encoder.layer.5.output.LayerNorm.bias', 'encoder.layer.1.attention.self.value.weight', 'encoder.layer.6.attention.self.query.bias', 'encoder.layer.8.attention.output.dense.weight', 'encoder.layer.7.attention.self.value.weight', 'encoder.layer.10.attention.self.value.weight', 'encoder.layer.11.attention.self.value.weight', 'encoder.layer.7.attention.self.query.weight', 'encoder.layer.9.attention.self.key.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.3.attention.self.key.bias', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.3.outp

In [None]:
# Ensure GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model_test.to(device)

In [None]:
num_epochs = 10

# Training Loop with Evaluation
for epoch in range(num_epochs):
    model_test.train()  # Set model to training mode

    # Monitor:
    total_train_loss = 0
    total_train_accuracy = 0
    all_preds = []
    all_labels = []

    for batch in train_dataloader:
        input_ids_batch, attention_masks_batch, labels_batch = batch
        input_ids_batch = input_ids_batch.to(device)
        attention_masks_batch = attention_masks_batch.to(device)
        labels_batch = labels_batch.to(device)

        model_test.zero_grad()  # Clear any previously calculated gradients

        outputs = model_test(
            input_ids_batch, attention_mask=attention_masks_batch, labels=labels_batch
        )
        logits = outputs.logits
        loss = outputs.loss

        total_train_loss += loss.item()
        preds = torch.argmax(logits, dim=1).flatten()
        total_train_accuracy += accuracy_score(labels_batch.cpu(), preds.cpu())
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels_batch.cpu().numpy())

        loss.backward()  # Perform a backward pass to calculate the gradients
        optimizer.step()  # Update params
        scheduler.step()  # Adjust learning rate

    avg_train_loss = total_train_loss / len(train_dataloader)
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    train_precision = precision_score(
        all_labels, all_preds, average="weighted"
    )  # considering imbalance
    train_recall = recall_score(all_labels, all_preds, average="weighted")
    train_f1 = f1_score(all_labels, all_preds, average="weighted")

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Training Loss: {avg_train_loss:.3f}, Accuracy: {avg_train_accuracy:.3f}")
    print(
        f"Precision: {train_precision:.3f}, Recall: {train_recall:.3f}, F1 Score: {train_f1:.3f}"
    )

    # Validation Step
    model_test.eval()
    total_val_accuracy = 0
    total_val_loss = 0
    val_all_preds = []
    val_all_labels = []

    for batch in val_dataloader:
        input_ids_batch, attention_masks_batch, labels_batch = batch
        input_ids_batch = input_ids_batch.to(device)
        attention_masks_batch = attention_masks_batch.to(device)
        labels_batch = labels_batch.to(device)

        with torch.no_grad():
            outputs = model_test(
                input_ids_batch,
                attention_mask=attention_masks_batch,
                labels=labels_batch,
            )
            loss = outputs.loss
            logits = outputs.logits

        total_val_loss += loss.item()
        preds = torch.argmax(logits, dim=1).flatten()
        total_val_accuracy += accuracy_score(labels_batch.cpu(), preds.cpu())
        val_all_preds.extend(preds.cpu().numpy())
        val_all_labels.extend(labels_batch.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_dataloader)
    avg_val_accuracy = total_val_accuracy / len(val_dataloader)
    val_precision = precision_score(val_all_labels, val_all_preds, average="weighted")
    val_recall = recall_score(val_all_labels, val_all_preds, average="weighted")
    val_f1 = f1_score(val_all_labels, val_all_preds, average="weighted")

    print(f"Validation Loss: {avg_val_loss:.3f}, Accuracy: {avg_val_accuracy:.3f}")
    print(
        f"Precision: {val_precision:.3f}, Recall: {val_recall:.3f}, F1 Score: {val_f1:.3f}"
    )

# Save the model
model_save_path = (
    "/Users/robintitus/Desktop/nlp_final/NLP_final/01_intermediate-files/model_test.pth"
)
torch.save(model_test.state_dict(), model_save_path)
print("Model saved successfully.")

The reason zero_grad() is called at the beginning of processing each batch is not to get rid of gradients permanently but to reset them. Without resetting gradients, they would accumulate across batches, leading to incorrect updates to the model weights.