<b> Import Libraries </b>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split

In [2]:
import torch #PyTorch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW 
from torch.utils.data import DataLoader, Dataset
import evaluate #Calculate metrics like accuracy

<b> Read File & EDA </b>

In [3]:
#Load Data
data = pd.read_csv("F:\\New\\5. Sentiment Analysis\\Patient Feedback Data - Kaggle\\doctorReviews.csv")

In [4]:
#Renaming the columns
data.columns = ["PatientID", "Review", "Label", "Tag"]

In [5]:
#Read data
data.sample(n=5)

Unnamed: 0,PatientID,Review,Label,Tag
69,74,this radiologist is always late where many pat...,0,negative
9,114,not available on time need to improve availabi...,0,negative
45,136,the doctor was not available at the time when ...,0,negative
79,66,very cordial explained indepth for me to under...,1,positive
28,6,the appointment was for my family member and w...,1,positive


In [6]:
#Shape
data.shape

(143, 4)

In [7]:
#Check for null values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   PatientID  143 non-null    int64 
 1   Review     143 non-null    object
 2   Label      143 non-null    int64 
 3   Tag        143 non-null    object
dtypes: int64(2), object(2)
memory usage: 4.6+ KB


In [8]:
#Convert Review to String Datatype
data["Review"] = data["Review"].astype(str)

<b> Split the data into Train and Test data</b>

In [9]:
train, test = train_test_split(data, test_size=0.2, random_state=40)

In [10]:
#Shape of Train and Test
print("Train:", train.shape)
print("Test:", test.shape)

Train: (114, 4)
Test: (29, 4)


In [11]:
#Extracts text and labels for training and testing splits
train_reviews = train["Review"].astype(str)
train_labels = train["Label"]
test_reviews = test["Review"].astype(str)
test_labels = test["Label"]

In [12]:
train_reviews = train_reviews.astype(str).tolist()
test_reviews = test_reviews.astype(str).tolist()

<b> BERT Tokenizer and Model </b>

In [13]:
#Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') #Loads a pre-trained BERT tokenizer (bert-base-uncased), which splits text into tokens compatible with the BERT model.

input_ids: Tokenized IDs of the text, including special tokens:
- [101] = [CLS] (start of the sentence)
- [102] = [SEP] (end of the sentence)

attention_mask: Indicates which tokens are real (1) and which are padding (0)

In [14]:
#Loads a pre-trained BERT model for binary classification (num_labels=2)
model_pr = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


bert-base-uncased:

- Pre-trained BERT model with 12 transformer layers.
- Processes input_ids and computes contextual embeddings for each token.
- The [CLS] token’s embedding is used as the representation of the entire input sequence.

Classification Head:

- A fully connected layer is applied to the [CLS] token’s embedding
- Outputs logits: raw scores for each class (positive and negative sentiment)

<b> Custom Dataset Class </b>

In [31]:
class PatientReviewDataset(Dataset):
    #Initializes the dataset with tokenized encodings and corresponding labels
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    #Returns the number of examples in the dataset
    def __len__(self):
        #Use the minimum length to avoid indexing issues
        return min(len(self.encodings["input_ids"]), len(self.labels))

    #Retrieves a single example at a given index as a dictionary containing: input_ids, attention_mask, and labels
    def __getitem__(self, idx):
        # Ensure the index is within bounds
        if idx >= len(self.labels):
            raise IndexError(f"Index {idx} out of bounds for labels of size {len(self.labels)}")
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels.iloc[idx] if isinstance(self.labels, pd.Series) else self.labels[idx])
        return item

<b> Tokenization </b>

In [32]:
#Tokenizes the text data
train_encodings = tokenizer(train_reviews, truncation=True, padding=True, max_length=128) 
test_encodings = tokenizer(test_reviews, truncation=True, padding=True, max_length=128)

truncation=True: Truncates text longer than 128 tokens

padding=True: Pads shorter text to 128 tokens

<b> Dataset and DataLoader

In [33]:
#Creates IMDbDataset objects for training and testing data
train_dataset = PatientReviewDataset(train_encodings, train_labels)
test_dataset = PatientReviewDataset(test_encodings, test_labels)

In [34]:
#Wraps datasets in DataLoader for batch processing
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

batch_size=16: Each batch contains 16 examples

shuffle=True: Shuffles training data

<b> Optimizer </b>

In [35]:
#Configures the optimizer with model parameters and a learning rate of 2e-5
optimizer = AdamW(model_pr.parameters(), lr=2e-5)

<b> Device Configuration </b>

In [36]:
#Moves the model to GPU (cuda) if available; otherwise, uses CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model_pr.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

<b> Training Loop </b>

In [37]:
#Trains the model for 3 epochs
for epoch in range(3): 
    model_pr.train()
    total_loss = 0

In [38]:
print(f"Train Encodings: {len(train_encodings['input_ids'])}")
print(f"Train Labels: {len(train_labels)}")

Train Encodings: 114
Train Labels: 114


In [39]:
train_labels = train_labels.reset_index(drop=True)

In [40]:
for batch in train_loader:
    optimizer.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    outputs = model_pr(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    total_loss += loss.item()
    loss.backward()
    optimizer.step()

For each batch:

- Clears gradients: optimizer.zero_grad()

- Processes inputs: input_ids, attention_mask, and labels

- Computes loss

- Backpropagates gradients: loss.backward()

- Updates model parameters: optimizer.step()

In [41]:
#Prints the average loss after each epoch
print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

Epoch 3, Loss: 0.7051668912172318


<b> Evaluation </b>

In [42]:
#Loads the accuracy matrix
accuracy_metric = evaluate.load("accuracy")

In [44]:
#Sets the model to evaluation mode and initializes storage for predictions and labels
model_pr.eval()
predictions = []
references = []

In [46]:
for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    with torch.no_grad():
        outputs = model_pr(input_ids, attention_mask=attention_mask)
    preds = torch.argmax(outputs.logits, dim=-1)
    predictions.extend(preds.cpu().numpy())
    references.extend(labels.cpu().numpy())

For each batch in the test set:

- Moves inputs to the appropriate device.

- Predicts logits without computing gradients: torch.no_grad().

- Converts logits to predictions: torch.argmax().

- Stores predictions and labels.

In [47]:
#Computes and prints the test set accuracy
accuracy = accuracy_metric.compute(predictions=predictions, references=references)
print(f"Test Accuracy: {accuracy['accuracy']}")

Test Accuracy: 0.7586206896551724


<b> Save Model </b>

In [49]:
model_pr.save_pretrained('sentiment_model_patient_reviews')
tokenizer.save_pretrained('sentiment_model_patient_reviews')

('sentiment_model_patient_reviews\\tokenizer_config.json',
 'sentiment_model_patient_reviews\\special_tokens_map.json',
 'sentiment_model_patient_reviews\\vocab.txt',
 'sentiment_model_patient_reviews\\added_tokens.json')