# Game Plan: Base model
1. Tokenize Data: Convert raw text to a format that BERT understands (input IDs, attention masks, and token type IDs). The tokenizer will handle converting text to numbers.

2. Preprocess Data:
a. Input IDs: Token indices from the tokenizer.
b. Attention Masks: Differentiates real tokens from padding tokens.
c. Truncation & Padding: Ensure all sequences are of the same length for batching.

3. DataLoaders: Wrap the processed data into a TensorDataset. Use DataLoader to create iterable data for training and validation.

4. Model Initialization: Initialize BertForSequenceClassification with the number of expected labels.

5. Training Loop: Define an optimizer (like AdamW) and learning rate scheduler.
Train the model on your data while saving checkpoints.

6. Evaluation: After training, evaluate the model on a validation set to check performance.

In [2]:
# Import all packages and stuff
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch
import pandas as pd

# Import the Label Encoder
from sklearn.preprocessing import LabelEncoder

#### Why not include positional embeddings?
> ClinicalBERT already maintains position of tokens in its transformer architecture

In [3]:
# Load the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")

In [4]:
data = pd.read_csv(
    r"C:\Users\Simrun Sharma\Desktop\NLP\NLP_final\01_intermediate-files\smokers_train_all_separated.csv"
)

In [5]:
data["Smoking Status"] = data["Smoking Status"].replace("SMOKER", "PAST SMOKER")

In [6]:
le = LabelEncoder()
data["Smoking_enc"] = le.fit_transform(data["Smoking Status"])
display(data.sample(6))

Unnamed: 0,Smoking Status,Text,Smoking_enc
215,UNKNOWN,report status: unsigned admission date: 02/22/...,3
192,UNKNOWN,report status: signed discharge summary name: ...,3
176,UNKNOWN,admission date: 12/19/2004 report status: dis...,3
131,UNKNOWN,report status: unsigned\r\ndischarge summary n...,3
5,CURRENT SMOKER,admission date: 01/13/1990 report status: sign...,0
289,NON-SMOKER,admission date: 11/09/2003 report status: sign...,1


In [7]:
data["Smoking_enc"] = data["Smoking_enc"].astype("int64")
print(data.dtypes)

Smoking Status    object
Text              object
Smoking_enc        int64
dtype: object


In [8]:
print(data["Smoking Status"].value_counts())

UNKNOWN           252
NON-SMOKER         66
PAST SMOKER        45
CURRENT SMOKER     35
Name: Smoking Status, dtype: int64


In [9]:
import re

clean_messages = data["Text"].str.lower()
clean_text = []
for message in clean_messages:
    pattern = r"(\S+\s){0,5}\S*(smok|tobacco|cigar|pack|ppd)\S*(\s\S+){0,5}"
    match = re.search(pattern, message, re.IGNORECASE)

    if match:
        matched_text = match.group(0)
        clean_text.append(matched_text)
    else:
        # sentence = "no information"
        # clean_text.append(sentence)
        clean_text.append(message)

data["Text"] = clean_text
print(data["Text"])

0      to excess , pipe and cigar smoker for many yea...
1       the patient has a 20 pack-year smoking history .
2      has been smoking approximately 10 cigarettes a...
3      and vomiting . social history: smoker for grea...
4                   1-2 packs per day . hospital course:
                             ...                        
393    report status: unsigned\r\ned discharge notifi...
394    report status: unsigned\r\ned discharge notifi...
395          a / p repair vag pack / foley , ebl minimal
396    admission date: 05/25/2002 report status:  dis...
397    admission date: 11/27/2003 report status:  dis...
Name: Text, Length: 398, dtype: object


In [10]:
data_unknown = data[data["Smoking Status"] == "UNKNOWN"]
data_non = data[data["Smoking Status"] == "NON-SMOKER"]
data_past = data[data["Smoking Status"] == "PAST SMOKER"]
data_current = data[data["Smoking Status"] == "CURRENT SMOKER"]

In [11]:
print(data_unknown.shape)
print(data_non.shape)
print(data_past.shape)
print(data_current.shape)

(252, 3)
(66, 3)
(45, 3)
(35, 3)


In [12]:
data_unknown_downsampled = data_unknown.sample(data_non.shape[0])
print(data_unknown_downsampled.shape)

(66, 3)


In [13]:
df = pd.concat([data_unknown_downsampled, data_non, data_past, data_current])
df["Smoking_enc"].value_counts()

3    66
1    66
2    45
0    35
Name: Smoking_enc, dtype: int64

In [14]:
# sentences is a series of sentences where each row is a sentence.
# tokenizer.encode returns a list of token ids for that sentence
# list comprehension applies to tokenizer.encode to each sentence. creating a list of lists
# out list corresponds to all sentences and inner list corresponds to list of token ids of one sentence


def get_sentence_embedding(sentences):
    indexed_tokens = [
        tokenizer.encode(
            sentence, add_special_tokens=True, truncation=True, max_length=512
        )
        for sentence in sentences
    ]
    return indexed_tokens


sentences = df["Text"]

indexed_tokens = get_sentence_embedding(sentences)
print(indexed_tokens)

[[101, 17553, 14042, 131, 15826, 76585, 84153, 13664, 131, 10150, 120, 10273, 120, 11978, 27224, 92555, 13664, 131, 10150, 120, 10197, 120, 11978, 11486, 10108, 12254, 56507, 131, 10531, 10124, 169, 12545, 118, 10924, 118, 12898, 10817, 10169, 27164, 11908, 110106, 20748, 13000, 38333, 10369, 10108, 10105, 66041, 117, 14042, 11841, 10406, 79478, 25032, 10157, 10111, 39429, 71510, 10108, 12153, 10794, 67646, 15794, 11639, 12352, 46111, 10419, 117, 40345, 19353, 10169, 105142, 10111, 14861, 14239, 12708, 10280, 119, 23746, 10261, 10374, 10590, 31391, 169, 35103, 10108, 10950, 33399, 11405, 10262, 34899, 73995, 16740, 10169, 10105, 16700, 12483, 119, 10684, 19436, 20390, 12363, 15165, 37604, 19275, 56859, 17442, 10147, 117, 15165, 12713, 16575, 11268, 36751, 19343, 10280, 117, 15165, 52368, 80236, 10111, 14861, 52530, 54047, 10188, 25468, 11481, 119, 22899, 65548, 131, 10261, 10134, 169, 11206, 118, 14628, 117, 11206, 118, 17899, 41835, 10336, 10817, 10479, 10134, 33989, 54006, 10230, 102

In [15]:
# all sentences don't have the same length so we need to pad all sequences to same length
# so that the dimensions of tensor remain the same.

# Pad the sequences
# for each token find the length keep doing until you find max length
max_length = max(len(tokens) for tokens in indexed_tokens)
# lets say max length is 10 and the sentence has 8. subtract to get 2. so add 2 zeros to orignal tokens
padded_tokens = [tokens + [0] * (max_length - len(tokens)) for tokens in indexed_tokens]

# Create the tensor
input_ids_tensor = torch.tensor(padded_tokens)
print(input_ids_tensor)

tensor([[  101, 17553, 14042,  ...,     0,     0,     0],
        [  101, 17553, 14042,  ...,   169, 12153,   102],
        [  101, 84153, 13664,  ..., 58054, 28849,   102],
        ...,
        [  101, 10105, 38607,  ...,     0,     0,     0],
        [  101,   119, 12142,  ...,     0,     0,     0],
        [  101, 10134, 37241,  ...,     0,     0,     0]])


#### Why does each tensor begin with 101 
> becuase [CLS] token or start token has been added to the beginning of each tokenized sequence as a part of BERT model when we do  add_special_tokens=True in the tokenizer.encode.

#### What are attention masks?
> it is a list that indicates which tokens are words and which are padding. Same length as the input sequences. 1 is for actual words and 0 is for the padding. 

In [16]:
# attention masks
# int(token != 0) is a boolean expression which says when the token is not 0 render it True
# True in python is the integer 1 and by doing int we get 1.
attention_masks = [[int(token != 0) for token in tokens] for tokens in padded_tokens]
attention_masks_tensor = torch.tensor(attention_masks)
print(attention_masks_tensor)

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])


In [17]:
# # Tokenize all texts in the dataframe
# tokenized_data = tokenizer(
#     df["Text"].tolist(),
#     add_special_tokens=True,
#     return_tensors="pt",
#     padding=True,
#     truncation=True,
#     max_length=1000,  # Choose the sequence length appropriately
# )

# print(tokenized_data)

In [18]:
# # Separating out the input_ids and attention_masks
# input_ids = tokenized_data["input_ids"]
# attention_masks = tokenized_data["attention_mask"]

In [19]:
# # Define the label to id mapping
# label_to_id = {
#     "CURRENT SMOKER": 1,
#     "PAST SMOKER": 2,
#     "SMOKER": 3,
#     "NON-SMOKER": 0,
#     "UNKNOWN": 4,
# }

# # Replace the textual labels in the dataframe with the corresponding numeric ids
# df["Encoded Labels"] = df["Smoking Status"].map(label_to_id)

# # Convert the label series to a tensor
# labels = torch.tensor(df["Encoded Labels"].values)

In [20]:
# the .values returns the numpy representation of the data so it converst teh column into a numpy array

labels = torch.tensor(df["Smoking_enc"].values)


# Check input shapes

print(f"Input IDs shape: {input_ids_tensor.shape}")

print(f"Attention Masks shape: {attention_masks_tensor.shape}")

print(f"Labels shape: {labels.shape}")

Input IDs shape: torch.Size([212, 512])
Attention Masks shape: torch.Size([212, 512])
Labels shape: torch.Size([212])


In [21]:
# Create a TensorDataset
# (tensor(input_ids), tensor(attention_mask), tensor(1))

dataset = TensorDataset(input_ids_tensor, attention_masks_tensor, labels)
print(dataset[3])

(tensor([   101,  84153,  13664,    131,  10669,    120,  10249,    120,  10420,
         17553,  14042,    131,  15826,  76585,  27224,  92555,  13664,    131,
         10669,    120,  10296,    120,  10420,  11486,  10108,  12254,  56507,
           131,  10105,  38607,  10124,    169,  11817,    118,  10924,    118,
         12898,  10817,  10479,  10374,    169,  15127,  44207,  45157,  10161,
         10106,  29731,  17530,  10106,  10724,  10111,    169,    171,  10362,
         10106,  10677,  10111,  91347,  17339,  42461,  36223,  34597,  55183,
         44320,  10157,  21911,  10479,  10393,  10590,  11419,  69692,  10136,
         19113,  13987,  78780,  10415,  94230,  38576,  10169,  13906,  54609,
         32239,  10135,  11419, 107373,  11764,  11035,    120,  12642,    119,
         10105,  38607,  10374,    169,  23050,  46917,  41163,  33926,  19980,
         20972,  10114,  84153,  10319,  27463,  10832,    110,  28780,  58286,
         10291,  10108,  10105,  10109,

#### Why random sampler and Sequnetial Sampler?
> we use the random seed for reproducibility. Random sampler, samples elements randomly from the dataset (training). Sequential sampler samples elements sequentially from the dataset. in validation, you want to evaluate the enire dataset in a consistent order without shuffling. 

In [22]:
seed = 42

torch.manual_seed(seed)


batch_size = 32


# Create DataLoader for training with a random sampler

train_dataloader = DataLoader(
    dataset,
    sampler=RandomSampler(dataset),  # Random sampler for training data
    batch_size=batch_size,
)


# Create DataLoader for validation with a sequential sampler (no need for a seed here)

val_dataloader = DataLoader(
    dataset,
    sampler=SequentialSampler(dataset),  # Sequential sampler for validation data
    batch_size=batch_size,
)

# Model

In [23]:
# Initialize the BERT-based model
model_test = BertForSequenceClassification.from_pretrained(
    "medicalai/ClinicalBERT",
    num_labels=4,
)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at medicalai/ClinicalBERT and are newly initialized: ['encoder.layer.3.output.LayerNorm.bias', 'encoder.layer.9.attention.self.key.bias', 'encoder.layer.2.attention.self.key.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.4.attention.self.query.weight', 'encoder.layer.5.attention.self.value.weight', 'encoder.layer.6.output.dense.bias', 'encoder.layer.9.attention.self.key.weight', 'encoder.layer.11.intermediate.dense.bias', 'encoder.layer.11.output.LayerNorm.bias', 'encoder.layer.4.attention.output.dense.bias', 'encoder.layer.8.attention.output.dense.bias', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.1.output.dense.bias', 'encoder.layer.3.intermediate.dense.weight', 'encoder.layer.2.attention.self.key.bias', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.8.output.LayerNorm.weight', 'encoder.layer.8.attention.self.key.bias', 'encoder.layer.9.attention.self.quer

In [26]:
with torch.no_grad():
    seq_classif_logits = model_test(
        input_ids_tensor, attention_mask=attention_masks_tensor
    )

predicted_labels = torch.argmax(seq_classif_logits[0]).item()
print(predicted_labels)

In [25]:
# from transformers import AdamW, get_linear_schedule_with_warmup
# num_epochs = 10
# # Define optimizer and learning rate scheduler
# optimizer = AdamW(model_test.parameters(), lr=2e-5)
# scheduler = get_linear_schedule_with_warmup(
#     optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * num_epochs
# )
# # Define a directory to save checkpoints
# checkpoint_dir = "10_code/checkouts-models"
# # Create the directory if it doesn't exist
# import os
# os.makedirs(checkpoint_dir, exist_ok=True)
# # Training loop
# for epoch in range(10):
#     model_test.train()
#     total_train_loss = 0
#     for batch in train_dataloader:
#         input_ids_batch, attention_masks_batch, labels_batch = batch
#         # Forward pass
#         outputs = model_test(
#             input_ids=input_ids_batch,
#             attention_mask=attention_masks_batch,
#             labels=labels_batch,
#         )
#         # Calculate loss and perform backpropagation
#         loss = outputs.loss
#         loss.backward()
#         # Update parameters and learning rate scheduler
#         optimizer.step()
#         scheduler.step()
#         # Accumulate training loss
#         total_train_loss += loss.item()
#     # Calculate average training loss for this epoch
#     avg_train_loss = total_train_loss / len(train_dataloader)
#     # Print or log training loss for this epoch
#     print(
#         f"Epoch {epoch + 1} / {num_epochs}, Average Training Loss: {avg_train_loss:.4f}"
#     )
#     # Save model checkpoint at the end of each epoch
#     checkpoint_filename = f"model_epoch_{epoch + 1}.pt"
#     checkpoint_path = os.path.join(checkpoint_dir, checkpoint_filename)
#     torch.save(model_test.state_dict(), checkpoint_path)
# # Save the final trained model
# final_model_path = "10_code/checkouts-models/model_final.pt"
# torch.save(model_test.state_dict(), final_model_path)