In [1]:
import torch
from transformers import (
    BertForSequenceClassification,
    AutoTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
)
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import pandas as pd


from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")

In [3]:
data = pd.read_csv(
    "/workspaces/NLP_Proj2/01_intermediate-files/smokers_train_all_separated.csv"
)

In [4]:
data["Smoking Status"] = data["Smoking Status"].replace("SMOKER", "PAST SMOKER")

In [5]:
le = LabelEncoder()
data["Smoking_enc"] = le.fit_transform(data["Smoking Status"])
display(data.sample(6))

Unnamed: 0,Smoking Status,Text,Smoking_enc
375,UNKNOWN,admission date: 08/22/2002 report status: dis...,3
78,PAST SMOKER,report status: unsigned admission date: 10/20/...,2
348,UNKNOWN,admission date: 12/29/2000 report status: dis...,3
343,UNKNOWN,report status: unsigned\ned discharge notifica...,3
379,UNKNOWN,report status: unsigned\ned discharge notifica...,3
34,NON-SMOKER,admission date: 06/19/1991 report status: sign...,1


In [6]:
data["Smoking_enc"] = data["Smoking_enc"].astype("int64")
print(data.dtypes)

Smoking Status    object
Text              object
Smoking_enc        int64
dtype: object


In [7]:
print(data["Smoking Status"].value_counts())

UNKNOWN           252
NON-SMOKER         66
PAST SMOKER        45
CURRENT SMOKER     35
Name: Smoking Status, dtype: int64


In [8]:
import re

clean_messages = data["Text"].str.lower()
clean_text = []
for message in clean_messages:
    pattern = r"(\S+\s){0,5}\S*(smok|tobacco|cigar|pack|ppd)\S*(\s\S+){0,5}"
    match = re.search(pattern, message, re.IGNORECASE)

    if match:
        matched_text = match.group(0)
        clean_text.append(matched_text)
    else:
        # sentence = "no information"
        # clean_text.append(sentence)
        clean_text.append(message)

data["Text"] = clean_text
print(data["Text"])


In [13]:
def get_sentence_embedding(sentences):
    indexed_tokens = [
        tokenizer.encode(
            sentence, add_special_tokens=True, truncation=True, max_length=512
        )
        for sentence in sentences
    ]
    return indexed_tokens


sentences = data["Text"]

indexed_tokens = get_sentence_embedding(sentences)
#print(indexed_tokens)
len(indexed_tokens[10])

512

In [15]:
# Pad the sequences
# for each token find the length keep doing until you find max length
max_length = max(len(tokens) for tokens in indexed_tokens)
# lets say max length is 10 and the sentence has 8. subtract to get 2. so add 2 zeros to orignal tokens
padded_tokens = [tokens + [0] * (max_length - len(tokens)) for tokens in indexed_tokens]

# Create the tensor
input_ids_tensor = torch.tensor(padded_tokens)
print(input_ids_tensor)

tensor([[  101, 17553, 14042,  ..., 11324,   117,   102],
        [  101, 84153, 13664,  ...,   119, 10361,   102],
        [  101, 17553, 14042,  ..., 11823, 10111,   102],
        ...,
        [  101, 84153, 13664,  ..., 10230,   119,   102],
        [  101, 84153, 13664,  ...,   186,   114,   102],
        [  101, 84153, 13664,  ..., 10109, 10162,   102]])


In [16]:
# True in python is the integer 1 and by doing int we get 1.
attention_masks = [[int(token != 0) for token in tokens] for tokens in padded_tokens]
attention_masks_tensor = torch.tensor(attention_masks)
print(attention_masks_tensor)

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])


In [17]:
# the .values returns the numpy representation of the data so it converst teh column into a numpy array
labels = torch.tensor(data["Smoking_enc"].values)

# Check input shapes
print(f"Input IDs shape: {input_ids_tensor.shape}")
print(f"Attention Masks shape: {attention_masks_tensor.shape}")
print(f"Labels shape: {labels.shape}")

Input IDs shape: torch.Size([398, 512])
Attention Masks shape: torch.Size([398, 512])
Labels shape: torch.Size([398])


In [18]:
# Create a TensorDataset
# (tensor(input_ids), tensor(attention_mask), tensor(1))
dataset = TensorDataset(input_ids_tensor, attention_masks_tensor, labels)
print(dataset[3])

(tensor([   101,  84153,  13664,    131,  10907,    120,  10193,    120,  10436,
         17553,  14042,    131,  14775,  27224,  92555,  13664,    131,  10907,
           120,  10233,    120,  10436,  27949,  10142,  84153,    131,    181,
         10129,    119,  20944,  10124,    169,  10843,    118,  10924,    118,
         12898,    117,  15263,  17416,    117,  14042,  11841,  15127,  44207,
         45157,  10161,  10106,  29731,  17530,    192,  10551,    117,  14042,
         11841,  31206,  10908,  22544,  10157,  10155,  36388,  26194,  10123,
           192,  11003,    117,  10169,  10151,  86923,  17530, 107433,  10108,
         10197,    110,  10135,  37241,  65921,  11424,  14590,    117,  10479,
         41175,  10169,  11639,  12352,  46111,  10123,  13716,  14010,  10108,
         33989,  54006,    117,  94230,  38576,    117,  10111,  15765,    118,
         33336,  14010,    119,  11486,  10108,  12254,  56507,    131,  10105,
         38607,  10393,    169,  11486,

In [21]:
len(input_ids_tensor)

398

In [22]:
def create_dataset(input_ids, attention_masks, labels, start_idx, end_idx):
    return TensorDataset(
        input_ids[start_idx:end_idx], 
        attention_masks[start_idx:end_idx], 
        labels[start_idx:end_idx]
    )

def get_data_from_dataset(dataset, idx):
    return dataset.tensors[0][idx], dataset.tensors[1][idx], dataset.tensors[2][idx]

# Assuming input_ids_tensor, attention_masks_tensor, and labels are already defined
train_dataset = create_dataset(input_ids_tensor, attention_masks_tensor, labels, 0, 350)
val_dataset = create_dataset(input_ids_tensor, attention_masks_tensor, labels, 350, len(input_ids_tensor))

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)

# Example of accessing an item
item = get_data_from_dataset(train_dataset, 0)  # Get the first item of the training dataset


# Model

In [23]:
# Initialize the BERT-based model
model_test = BertForSequenceClassification.from_pretrained(
    "medicalai/ClinicalBERT",
    num_labels=4,
)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at medicalai/ClinicalBERT and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.3.output.LayerNorm.bias', 'encoder.layer.9.output.LayerNorm.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.10.attention.self.value.weight', 'classifier.weight', 'encoder.layer.11.attention.self.value.bias', 'encoder.layer.9.output.LayerNorm.weight', 'encoder.layer.10.attention.output.dense.bias', 'encoder.layer.2.attention.self.value.weight', 'encoder.layer.2.attention.self.key.weight', 'encoder.layer.8.output.dense.weight', 'encoder.layer.6.attention.self.query.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.10.attention.output.LayerNorm.weight', 'e

In [24]:
# Ensure GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model_test.to(device)

Using device: cuda


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [25]:
import torch
torch.cuda.empty_cache()

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
import matplotlib.pyplot as plt

num_epochs = 20

total_steps = len(train_dataloader) * num_epochs
num_warmup_steps = total_steps * 0.1  # 10% of total steps
# print(len(train_dataloader)) # its 7
# print(num_warmup_steps)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model_test.parameters(), lr=2e-5, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=70, num_training_steps=700
)

training_loss_values = []
validation_loss_values = []


# Training Loop with Evaluation
for epoch in range(num_epochs):
    model_test.train()  # Set model to training mode

    # Monitor:
    total_train_loss = 0
    total_train_accuracy = 0
    all_preds = []
    all_labels = []

    for batch in train_dataloader:
        input_ids_batch, attention_masks_batch, labels_batch = batch
        input_ids_batch = input_ids_batch.to(device)
        attention_masks_batch = attention_masks_batch.to(device)
        labels_batch = labels_batch.to(device)

        model_test.zero_grad()  # Clear any previously calculated gradients

        outputs = model_test(
            input_ids_batch, attention_mask=attention_masks_batch, labels=labels_batch
        )
        logits = outputs.logits
        loss = outputs.loss

        total_train_loss += loss.item()
        preds = torch.argmax(logits, dim=1).flatten()
        total_train_accuracy += accuracy_score(labels_batch.cpu(), preds.cpu())
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels_batch.cpu().numpy())

        loss.backward()  # Perform a backward pass to calculate the gradients
        optimizer.step()  # Update params
        scheduler.step()  # Adjust learning rate

    avg_train_loss = total_train_loss / len(train_dataloader)
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    training_loss_values.append(avg_train_loss)

    train_precision = precision_score(
        all_labels, all_preds, average="weighted"
    )  # considering imbalance
    train_recall = recall_score(all_labels, all_preds, average="weighted")
    train_f1 = f1_score(all_labels, all_preds, average="weighted")

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Training Loss: {avg_train_loss:.3f}, Accuracy: {avg_train_accuracy:.3f}")
    print(
        f"Precision: {train_precision:.3f}, Recall: {train_recall:.3f}, F1 Score: {train_f1:.3f}"
    )

    # Validation Step
    model_test.eval()
    total_val_accuracy = 0
    total_val_loss = 0
    val_all_preds = []
    val_all_labels = []

    for batch in val_dataloader:
        input_ids_batch, attention_masks_batch, labels_batch = batch
        input_ids_batch = input_ids_batch.to(device)
        attention_masks_batch = attention_masks_batch.to(device)
        labels_batch = labels_batch.to(device)

        with torch.no_grad():
            outputs = model_test(
                input_ids_batch,
                attention_mask=attention_masks_batch,
                labels=labels_batch,
            )
            loss = outputs.loss
            logits = outputs.logits

        total_val_loss += loss.item()
        preds = torch.argmax(logits, dim=1).flatten()
        total_val_accuracy += accuracy_score(labels_batch.cpu(), preds.cpu())
        val_all_preds.extend(preds.cpu().numpy())
        val_all_labels.extend(labels_batch.cpu().numpy())

    avg_val_loss = total_val_loss / len(val_dataloader)
    avg_val_accuracy = total_val_accuracy / len(val_dataloader)
    validation_loss_values.append(avg_val_loss)

    val_precision = precision_score(val_all_labels, val_all_preds, average="weighted")
    val_recall = recall_score(val_all_labels, val_all_preds, average="weighted")
    val_f1 = f1_score(val_all_labels, val_all_preds, average="weighted")

    print(f"Validation Loss: {avg_val_loss:.3f}, Accuracy: {avg_val_accuracy:.3f}")
    print(
        f"Precision: {val_precision:.3f}, Recall: {val_recall:.3f}, F1 Score: {val_f1:.3f}"
    )

# Plotting the training and validation loss after the training loop
plt.plot(range(1, num_epochs + 1), training_loss_values, 'b-', label='Training loss')
plt.plot(range(1, num_epochs + 1), validation_loss_values, 'r-', label='Validation loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [27]:
import os

# Save the model
model_save_path = os.path.expanduser("/workspaces/NLP_Proj2/model_test_syn_23.pth")

# Create the directory if it does not exist
model_save_directory = os.path.dirname(model_save_path)
os.makedirs(model_save_directory, exist_ok=True)

torch.save(model_test.state_dict(), model_save_path)
print("Model saved successfully.")

Model saved successfully.


The reason zero_grad() is called at the beginning of processing each batch is not to get rid of gradients permanently but to reset them. Without resetting gradients, they would accumulate across batches, leading to incorrect updates to the model weights.