In [11]:
import pandas as pd
import numpy as np

# Train a Bert Model
### First load the combined dataset. Includes questions from Yaha, Hadifar and a Kaggle Dataset

In [139]:
combined_df= pd.read_excel('../data/combined_questions.xlsx')

In [140]:
combined_df

Unnamed: 0,question,label
0,can you list the ingredients needed for a clas...,0
1,what are the top five tourist attractions in p...,0
2,list five common programming languages.,0
3,name three types of renewable energy sources.,0
4,can you list the planets in our solar system?,0
...,...,...
2431,what are thrombocytes more accurately called?,0
2432,what is the first step in hemostasis?,0
2433,during which is prothrombin converted to throm...,0
2434,what is the process called in which antibodies...,0


### Load Questions from the first and second surveys, to be used after for prediction

In [141]:
secondary_questions = pd.read_excel('../data/secondaryquestionsonly.xlsx')
pka_questions = pd.read_excel('../data/pKa_blooms.xlsx')

In [142]:
pka_questions = pka_questions.drop_duplicates(subset='question_ID', keep='first')


In [143]:
secondary_questions.columns
pka_questions.columns

Index(['Unnamed: 0', 'question', 'label', 'question_ID'], dtype='object')

In [144]:
pka_questions.head()

Unnamed: 0.1,Unnamed: 0,question,label,question_ID
0,0,What is the acid dissociation constant Ka an...,1,1
3,3,Given that the dissociation constant (Ka) of t...,2,4
6,6,Given that the acid dissociation constant (Ka)...,3,5
9,9,Which of the following represents the acid dis...,0,7
12,12,Which statement correctly describes the relati...,0,9


In [145]:
# Drop the unnamed column
secondary_questions = secondary_questions.drop(columns=['Unnamed: 0'])
pka_questions = pka_questions.drop(columns=['Unnamed: 0'])

In [146]:
secondary_questions

Unnamed: 0,question,label,question_ID
0,In this reaction which compound acts as the r...,2,34
1,Consider a scenario where Compound C is oxidiz...,1,32
2,In the context of biological redox reactions ...,0,27
3,Which assumption is necessary for biologists' ...,3,36
4,Which of the following best describes the role...,2,35
...,...,...,...
59,Which of the following molecules is directly p...,0,18
60,Which of the following lists the correct overa...,0,69
61,Consider a scenario where a cell's concentrati...,1,70
62,Which of the following best explains why the m...,2,73


In [147]:
# Make sure all questions have been lower cased and stripped of white space
combined_df['question'] = combined_df['question'].str.strip().str.lower()
secondary_questions['question'] = secondary_questions['question'].str.strip().str.lower()
pka_questions['question']=pka_questions['question'].str.strip().str.lower()

In [148]:
questions = combined_df['question'].tolist()
labels = combined_df['label'].tolist()

In [149]:
s_question_list =secondary_questions['question'].tolist()
secondary_labels = secondary_questions['label'].tolist()
pka_questions_list = pka_questions['question'].tolist()
pka_labels = pka_questions['label'].tolist()

## Training Bert Model

In [150]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the questions
encodings = tokenizer(questions, truncation=True, padding=True, max_length=128, return_tensors='pt')
input_ids = encodings['input_ids']
attention_masks = encodings['attention_mask']

In [151]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch

# Convert labels to tensors
labels = torch.tensor(labels)

# Create the dataset
dataset = TensorDataset(input_ids, attention_masks, labels)

# Split into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create the DataLoaders
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=32)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=32)



In [152]:
from transformers import BertForSequenceClassification, AdamW
#!pip install tensorboard
from torch.utils.tensorboard import SummaryWriter
# Load BERT with a classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [153]:
# Device setup
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
writer = SummaryWriter('runs/bert_training_experiment')



In [154]:
import torch
from transformers import AdamW
from sklearn.metrics import accuracy_score
from torch.utils.tensorboard import SummaryWriter

# Training parameters
epochs = 5
learning_rates = [2e-5]  # List of learning rates to test
best_val_accuracy = 0
best_learning_rate = None
best_model_state_dict = None

# Loop over each learning rate
for learning_rate in learning_rates:
    print(f"\nStarting training with learning rate: {learning_rate}")

    # Initialize model and optimizer
    # Load BERT with a classification head
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4) # Replace with your model class
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    
    # Training loop
    for epoch in range(epochs):
        print(f"Starting epoch {epoch+1}")
        model.train()  # Set model to training mode
        total_loss = 0
        for step, batch in enumerate(train_dataloader):
            b_input_ids, b_input_mask, b_labels = [item.to(device) for item in batch]
            model.zero_grad()

            # Forward pass
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()

            # Print progress every 10 steps
            if step % 10 == 0 and step != 0:
                avg_loss = total_loss / (step + 1)
                print(f'Epoch: {epoch + 1}, Step: {step}, Loss: {avg_loss:.4f}')

        # Average loss for this epoch
        avg_loss = total_loss / len(train_dataloader)
        print(f'Epoch {epoch + 1} average loss: {avg_loss:.4f}')

        # Evaluate on validation set
        model.eval()  # Set model to evaluation mode
        val_labels = []
        val_preds = []
        for batch in val_dataloader:
            b_input_ids, b_input_mask, b_labels = [item.to(device) for item in batch]
            with torch.no_grad():
                outputs = model(b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            val_labels.extend(b_labels.cpu().numpy())
            val_preds.extend(predictions.cpu().numpy())

        # Calculate accuracy
        val_accuracy = accuracy_score(val_labels, val_preds)
        print(f'Epoch {epoch + 1} Validation Accuracy: {val_accuracy:.4f}')

        # Track the best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_learning_rate = learning_rate
            best_model_state_dict = model.state_dict()

# Save the best model's state_dict
torch.save(best_model_state_dict, 'best_model.pth')
print(f"Training complete! Best learning rate: {best_learning_rate}. Best validation accuracy: {best_val_accuracy:.4f}")

# To load the best model later, you can use:
# model.load_state_dict(torch.load('best_model.pth'))



Starting training with learning rate: 2e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1
Epoch: 1, Step: 10, Loss: 1.3107
Epoch: 1, Step: 20, Loss: 1.2501
Epoch: 1, Step: 30, Loss: 1.2290
Epoch: 1, Step: 40, Loss: 1.1774
Epoch: 1, Step: 50, Loss: 1.1319
Epoch: 1, Step: 60, Loss: 1.0900
Epoch 1 average loss: 1.0900
Epoch 1 Validation Accuracy: 0.7275
Starting epoch 2
Epoch: 2, Step: 10, Loss: 0.7682
Epoch: 2, Step: 20, Loss: 0.7941
Epoch: 2, Step: 30, Loss: 0.7417
Epoch: 2, Step: 40, Loss: 0.7208
Epoch: 2, Step: 50, Loss: 0.6802
Epoch: 2, Step: 60, Loss: 0.6785
Epoch 2 average loss: 0.6785
Epoch 2 Validation Accuracy: 0.8258
Starting epoch 3
Epoch: 3, Step: 10, Loss: 0.5336
Epoch: 3, Step: 20, Loss: 0.4712
Epoch: 3, Step: 30, Loss: 0.4405
Epoch: 3, Step: 40, Loss: 0.4499
Epoch: 3, Step: 50, Loss: 0.4353
Epoch: 3, Step: 60, Loss: 0.4150
Epoch 3 average loss: 0.4150
Epoch 3 Validation Accuracy: 0.8648
Starting epoch 4
Epoch: 4, Step: 10, Loss: 0.2606
Epoch: 4, Step: 20, Loss: 0.2752
Epoch: 4, Step: 30, Loss: 0.2908
Epoch: 4, Step: 40, Loss: 0.2893
Epoch: 4, S

In [77]:
model.eval()
eval_loss = 0
eval_accuracy = 0
for batch in val_dataloader:
    b_input_ids, b_input_mask, b_labels = [item.to(device) for item in batch]
    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1).flatten()
    accuracy = (preds == b_labels).cpu().numpy().mean() * 100
    eval_accuracy += accuracy

avg_eval_accuracy = eval_accuracy / len(val_dataloader)
print(f'Final validation accuracy: {avg_eval_accuracy}')

Final validation accuracy: 86.328125


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## Predicting on a new dataset


In [78]:
import pandas as pd
from transformers import BertTokenizer
import tensorflow as tf

In [88]:


#Only need the questions column 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the questions
def encode_examples(examples):
    return tokenizer(examples, truncation=True, padding='max_length', max_length=128, return_tensors='tf')

encoded_data = encode_examples(s_question_list)
input_ids = encoded_data['input_ids']
attention_mask = encoded_data['attention_mask']

# Convert TensorFlow tensors to NumPy arrays first, then to PyTorch tensors
input_ids_pt = torch.tensor(input_ids.numpy())
attention_mask_pt = torch.tensor(attention_mask.numpy())

In [91]:
predicted_classes

tensor([1, 2, 0, 1, 0, 0, 0, 3, 0, 1, 0, 1, 0, 2, 2, 3, 0, 0, 2, 2, 2, 1, 2, 0,
        2, 2, 0, 0, 0, 3, 2, 1, 2, 0, 0, 2, 0, 3, 1, 1, 3, 2, 0, 0, 0, 0, 2, 2,
        0, 0, 2, 2, 0, 0, 3, 2, 0, 0, 0, 0, 0, 2, 2, 2])

In [93]:
import torch

with torch.no_grad():  # Disable gradient calculation for inference
    predictions = model(input_ids=input_ids_pt, attention_mask=attention_mask_pt)
    logits = predictions.logits  # Extract the logits from the output
    predicted_classes = torch.argmax(logits, axis=1)  # Get the index of the max logit for each sample

# Map predicted classes back to labels
predicted_labels = [i for i in predicted_classes.numpy()]

# Add predictions to the DataFrame
secondary_questions['predicted_label'] = predicted_labels

secondary_questions

Unnamed: 0,question,label,question_ID,predicted_label
0,In this reaction which compound acts as the r...,2,34,1
1,Consider a scenario where Compound C is oxidiz...,1,32,2
2,In the context of biological redox reactions ...,0,27,0
3,Which assumption is necessary for biologists' ...,3,36,1
4,Which of the following best describes the role...,2,35,0
...,...,...,...,...
59,Which of the following molecules is directly p...,0,18,0
60,Which of the following lists the correct overa...,0,69,0
61,Consider a scenario where a cell's concentrati...,1,70,2
62,Which of the following best explains why the m...,2,73,2


In [95]:


#Only need the questions column 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the questions
def encode_examples(examples):
    return tokenizer(examples, truncation=True, padding='max_length', max_length=128, return_tensors='tf')

encoded_data = encode_examples(pka_questions_list)
input_ids = encoded_data['input_ids']
attention_mask = encoded_data['attention_mask']

# Convert TensorFlow tensors to NumPy arrays first, then to PyTorch tensors
input_ids_pt = torch.tensor(input_ids.numpy())
attention_mask_pt = torch.tensor(attention_mask.numpy())

In [96]:


#Only need the questions column 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the questions
def encode_examples(examples):
    return tokenizer(examples, truncation=True, padding='max_length', max_length=128, return_tensors='tf')

encoded_data = encode_examples(pka_questions_list)
input_ids = encoded_data['input_ids']
attention_mask = encoded_data['attention_mask']

# Convert TensorFlow tensors to NumPy arrays first, then to PyTorch tensors
input_ids_pt = torch.tensor(input_ids.numpy())
attention_mask_pt = torch.tensor(attention_mask.numpy())
import torch

with torch.no_grad():  # Disable gradient calculation for inference
    predictions = model(input_ids=input_ids_pt, attention_mask=attention_mask_pt)
    logits = predictions.logits  # Extract the logits from the output
    predicted_classes = torch.argmax(logits, axis=1)  # Get the index of the max logit for each sample

# Map predicted classes back to labels
predicted_labels = [i for i in predicted_classes.numpy()]

# Add predictions to the DataFrame
pka_questions['predicted_label'] = predicted_labels

pka_questions

Unnamed: 0,question,label,question_ID,predicted_label
0,What is the acid dissociation constant Ka an...,1,1,0
1,What is the acid dissociation constant Ka an...,1,1,0
2,What is the acid dissociation constant Ka an...,1,1,0
3,Given that the dissociation constant (Ka) of t...,2,4,2
4,Given that the dissociation constant (Ka) of t...,2,4,2
5,Given that the dissociation constant (Ka) of t...,2,4,2
6,Given that the acid dissociation constant (Ka)...,3,5,2
7,Given that the acid dissociation constant (Ka)...,3,5,2
8,Given that the acid dissociation constant (Ka)...,3,5,2
9,Which of the following represents the acid dis...,0,7,0


In [97]:
pka_questions.head()

Unnamed: 0,question,label,question_ID,predicted_label
0,What is the acid dissociation constant Ka an...,1,1,0
1,What is the acid dissociation constant Ka an...,1,1,0
2,What is the acid dissociation constant Ka an...,1,1,0
3,Given that the dissociation constant (Ka) of t...,2,4,2
4,Given that the dissociation constant (Ka) of t...,2,4,2


In [98]:
from sklearn.metrics import accuracy_score
from scipy.stats import pearsonr

In [100]:
accuracy = accuracy_score(pka_questions['label'], pka_questions['predicted_label'])
print(f'Accuracy on pKa: {accuracy}')


Accuracy on pKa: 0.25


In [118]:

# Calculate correlation (only makes sense if labels are numeric)
# If the labels are categorical, you should first encode them as integers
if pka_questions['label'].dtype == 'object' or pka_questions['predicted_label'].dtype == 'object':
    pka_questions['label'] = pka_questions['label'].astype('category').cat.codes
    pka_questions['predicted_label'] = pka_questions['predicted_label'].astype('category').cat.codes

# Pearson correlation
correlation, _ = pearsonr(pka_questions['label'], pka_questions['predicted_label'])
print(f'Correlation on pKa: {correlation}')

kappa = cohen_kappa_score(pka_questions['label'], pka_questions['predicted_label'])
print(f"Cohen's Kappa on pka: {kappa}")


Correlation on pKa: 0.31919710595935896
Cohen's Kappa on pka: 0.0


In [103]:
accuracy = accuracy_score(secondary_questions['label'], secondary_questions['predicted_label'])
print(f'Accuracy on first four sections: {accuracy}')


Accuracy on first four sections: 0.484375


In [116]:

# Calculate correlation (only makes sense if labels are numeric)
# If the labels are categorical, you should first encode them as integers
if secondary_questions['label'].dtype == 'object' or secondary_questions['predicted_label'].dtype == 'object':
    secondary_questions['label'] = secondary_questions['label'].astype('category').cat.codes
    secondary_questions['predicted_label'] = secondary_questions['predicted_label'].astype('category').cat.codes

# Pearson correlation
correlation, _ = pearsonr(secondary_questions['label'], secondary_questions['predicted_label'])
print(f'Correlation on first four sections: {correlation}')


from sklearn.metrics import cohen_kappa_score
kappa = cohen_kappa_score(secondary_questions['label'], secondary_questions['predicted_label'])
print(f"Cohen's Kappa on first four sections: {kappa}")

Correlation on first four sections: 0.4426422973348012
Cohen's Kappa on first four sections: 0.3125


In [115]:
# Now a concatenated set


In [106]:
total_predictions = pd.concat([secondary_questions, pka_questions], axis=0, ignore_index=True)

In [107]:
total_predictions

Unnamed: 0,question,label,question_ID,predicted_label
0,In this reaction which compound acts as the r...,2,34,1
1,Consider a scenario where Compound C is oxidiz...,1,32,2
2,In the context of biological redox reactions ...,0,27,0
3,Which assumption is necessary for biologists' ...,3,36,1
4,Which of the following best describes the role...,2,35,0
...,...,...,...,...
107,If the enzyme is placed in a solution with a p...,2,26,2
108,If the enzyme is placed in a solution with a p...,2,26,2
109,A protein contains a carboxylic acid functiona...,3,27,2
110,A protein contains a carboxylic acid functiona...,3,27,2


In [112]:
accuracy = accuracy_score(total_predictions['label'], total_predictions['predicted_label'])
print(f'Accuracy on total: {accuracy}')


Accuracy on total: 0.38392857142857145


In [113]:

# Calculate correlation (only makes sense if labels are numeric)
# If the labels are categorical, you should first encode them as integers
if total_predictions['label'].dtype == 'object' or total_predictions['predicted_label'].dtype == 'object':
    total_predictions['label'] = total_predictions['label'].astype('category').cat.codes
    total_predictions['predicted_label'] = total_predictions['predicted_label'].astype('category').cat.codes

# Pearson correlation
correlation, _ = pearsonr(total_predictions['label'], total_predictions['predicted_label'])
print(f'Correlation on totals: {correlation}')

Correlation on totals: 0.3873358897824858


In [114]:
from sklearn.metrics import cohen_kappa_score
kappa = cohen_kappa_score(total_predictions['label'], total_predictions['predicted_label'])
print(f"Cohen's Kappa: {kappa}")

Cohen's Kappa: 0.1785714285714286


In [120]:
total_predictions.to_excel('total_predictions_bert.xlsx')

## Train only on hadifar dataset

In [121]:
hadifar = pd.read_excel('../data/hadifar_questions.xlsx')

Unnamed: 0,question,label
0,"In a study, a group of 10-year-old boys are fe...",3
1,Which materials are considered secondary data?,0
2,What method did researchers John Mihelich and ...,0
3,Why is choosing a random sample an effective w...,1
4,Which research approach is best suited to the ...,0
...,...,...
898,What are thrombocytes more accurately called?,0
899,What is the first step in hemostasis?,0
900,During which is prothrombin converted to throm...,0
901,What is the process called in which antibodies...,0


In [123]:
hadifar['question'] = hadifar['question'].str.strip().str.lower()

In [125]:
h_questions = hadifar['question'].tolist()
h_labels = hadifar['label'].tolist()

In [126]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the questions
encodings = tokenizer(h_questions, truncation=True, padding=True, max_length=128, return_tensors='pt')
input_ids = encodings['input_ids']
attention_masks = encodings['attention_mask']

In [128]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch

# Convert labels to tensors
labels = torch.tensor(h_labels)

# Create the dataset
dataset = TensorDataset(input_ids, attention_masks, labels)

# Split into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create the DataLoaders
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=32)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=32)



In [129]:
# Device setup
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
writer = SummaryWriter('runs/bert_training_experiment')



In [130]:
import torch
from transformers import AdamW
from sklearn.metrics import accuracy_score
from torch.utils.tensorboard import SummaryWriter

# Training parameters
epochs = 5
learning_rates = [2e-5]  # List of learning rates to test
best_val_accuracy = 0
best_learning_rate = None
best_model_state_dict = None

# Loop over each learning rate
for learning_rate in learning_rates:
    print(f"\nStarting training with learning rate: {learning_rate}")

    # Initialize model and optimizer
    # Load BERT with a classification head
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4) # Replace with your model class
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    
    # Training loop
    for epoch in range(epochs):
        print(f"Starting epoch {epoch+1}")
        model.train()  # Set model to training mode
        total_loss = 0
        for step, batch in enumerate(train_dataloader):
            b_input_ids, b_input_mask, b_labels = [item.to(device) for item in batch]
            model.zero_grad()

            # Forward pass
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()

            # Print progress every 10 steps
            if step % 10 == 0 and step != 0:
                avg_loss = total_loss / (step + 1)
                print(f'Epoch: {epoch + 1}, Step: {step}, Loss: {avg_loss:.4f}')

        # Average loss for this epoch
        avg_loss = total_loss / len(train_dataloader)
        print(f'Epoch {epoch + 1} average loss: {avg_loss:.4f}')

        # Evaluate on validation set
        model.eval()  # Set model to evaluation mode
        val_labels = []
        val_preds = []
        for batch in val_dataloader:
            b_input_ids, b_input_mask, b_labels = [item.to(device) for item in batch]
            with torch.no_grad():
                outputs = model(b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            val_labels.extend(b_labels.cpu().numpy())
            val_preds.extend(predictions.cpu().numpy())

        # Calculate accuracy
        val_accuracy = accuracy_score(val_labels, val_preds)
        print(f'Epoch {epoch + 1} Validation Accuracy: {val_accuracy:.4f}')

        # Track the best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_learning_rate = learning_rate
            best_model_state_dict = model.state_dict()

# Save the best model's state_dict
torch.save(best_model_state_dict, 'best_model.pth')
print(f"Training complete! Best learning rate: {best_learning_rate}. Best validation accuracy: {best_val_accuracy:.4f}")

# To load the best model later, you can use:
# model.load_state_dict(torch.load('best_model.pth'))



Starting training with learning rate: 2e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1
Epoch: 1, Step: 10, Loss: 1.1515
Epoch: 1, Step: 20, Loss: 1.0225
Epoch 1 average loss: 1.0111
Epoch 1 Validation Accuracy: 0.7403
Starting epoch 2
Epoch: 2, Step: 10, Loss: 0.7977
Epoch: 2, Step: 20, Loss: 0.8319
Epoch 2 average loss: 0.8295
Epoch 2 Validation Accuracy: 0.7403
Starting epoch 3
Epoch: 3, Step: 10, Loss: 0.7782
Epoch: 3, Step: 20, Loss: 0.7675
Epoch 3 average loss: 0.7916
Epoch 3 Validation Accuracy: 0.7403
Starting epoch 4
Epoch: 4, Step: 10, Loss: 0.7611
Epoch: 4, Step: 20, Loss: 0.7229
Epoch 4 average loss: 0.7306
Epoch 4 Validation Accuracy: 0.7403
Starting epoch 5
Epoch: 5, Step: 10, Loss: 0.6622
Epoch: 5, Step: 20, Loss: 0.6284
Epoch 5 average loss: 0.6286
Epoch 5 Validation Accuracy: 0.7514
Training complete! Best learning rate: 2e-05. Best validation accuracy: 0.7514


In [132]:
## Now add h_predictions


#Only need the questions column 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the questions
def encode_examples(examples):
    return tokenizer(examples, truncation=True, padding='max_length', max_length=128, return_tensors='tf')

encoded_data = encode_examples(s_question_list)
input_ids = encoded_data['input_ids']
attention_mask = encoded_data['attention_mask']

# Convert TensorFlow tensors to NumPy arrays first, then to PyTorch tensors
input_ids_pt = torch.tensor(input_ids.numpy())
attention_mask_pt = torch.tensor(attention_mask.numpy())



In [133]:
import torch

with torch.no_grad():  # Disable gradient calculation for inference
    predictions = model(input_ids=input_ids_pt, attention_mask=attention_mask_pt)
    logits = predictions.logits  # Extract the logits from the output
    predicted_classes = torch.argmax(logits, axis=1)  # Get the index of the max logit for each sample

# Map predicted classes back to labels
predicted_labels = [i for i in predicted_classes.numpy()]

# Add predictions to the DataFrame
secondary_questions['h_predicted_label'] = predicted_labels

secondary_questions

Unnamed: 0,question,label,question_ID,predicted_label,h_predicted_label
0,In this reaction which compound acts as the r...,2,34,1,1
1,Consider a scenario where Compound C is oxidiz...,1,32,2,0
2,In the context of biological redox reactions ...,0,27,0,0
3,Which assumption is necessary for biologists' ...,3,36,1,0
4,Which of the following best describes the role...,2,35,0,1
...,...,...,...,...,...
59,Which of the following molecules is directly p...,0,18,0,0
60,Which of the following lists the correct overa...,0,69,0,0
61,Consider a scenario where a cell's concentrati...,1,70,2,0
62,Which of the following best explains why the m...,2,73,2,1


In [134]:


#Only need the questions column 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the questions
def encode_examples(examples):
    return tokenizer(examples, truncation=True, padding='max_length', max_length=128, return_tensors='tf')

encoded_data = encode_examples(pka_questions_list)
input_ids = encoded_data['input_ids']
attention_mask = encoded_data['attention_mask']

# Convert TensorFlow tensors to NumPy arrays first, then to PyTorch tensors
input_ids_pt = torch.tensor(input_ids.numpy())
attention_mask_pt = torch.tensor(attention_mask.numpy())
import torch

with torch.no_grad():  # Disable gradient calculation for inference
    predictions = model(input_ids=input_ids_pt, attention_mask=attention_mask_pt)
    logits = predictions.logits  # Extract the logits from the output
    predicted_classes = torch.argmax(logits, axis=1)  # Get the index of the max logit for each sample

# Map predicted classes back to labels
predicted_labels = [i for i in predicted_classes.numpy()]

# Add predictions to the DataFrame
pka_questions['h_predicted_label'] = predicted_labels

pka_questions

Unnamed: 0,question,label,question_ID,predicted_label,h_predicted_label
0,What is the acid dissociation constant Ka an...,1,1,0,0
1,What is the acid dissociation constant Ka an...,1,1,0,0
2,What is the acid dissociation constant Ka an...,1,1,0,0
3,Given that the dissociation constant (Ka) of t...,2,4,2,0
4,Given that the dissociation constant (Ka) of t...,2,4,2,0
5,Given that the dissociation constant (Ka) of t...,2,4,2,0
6,Given that the acid dissociation constant (Ka)...,3,5,2,0
7,Given that the acid dissociation constant (Ka)...,3,5,2,0
8,Given that the acid dissociation constant (Ka)...,3,5,2,0
9,Which of the following represents the acid dis...,0,7,0,1


In [135]:
accuracy = accuracy_score(pka_questions['label'], pka_questions['h_predicted_label'])
print(f'Accuracy on pKa: {accuracy}')

# Calculate correlation (only makes sense if labels are numeric)
# If the labels are categorical, you should first encode them as integers
if pka_questions['label'].dtype == 'object' or pka_questions['predicted_label'].dtype == 'object':
    pka_questions['label'] = pka_questions['label'].astype('category').cat.codes
    pka_questions['h_predicted_label'] = pka_questions['h_predicted_label'].astype('category').cat.codes

# Pearson correlation
correlation, _ = pearsonr(pka_questions['label'], pka_questions['h_predicted_label'])
print(f'Correlation on pKa: {correlation}')

kappa = cohen_kappa_score(pka_questions['label'], pka_questions['h_predicted_label'])
print(f"Cohen's Kappa on pka: {kappa}")


Accuracy on pKa: 0.125
Correlation on pKa: -0.21483446221182984
Cohen's Kappa on pka: -0.16666666666666674


In [136]:
accuracy = accuracy_score(pka_questions['label'], pka_questions['h_predicted_label'])
print(f'Accuracy on pKa: {accuracy}')

# Calculate correlation (only makes sense if labels are numeric)
# If the labels are categorical, you should first encode them as integers
if secondary_questions['label'].dtype == 'object' or secondary_questions['predicted_label'].dtype == 'object':
    secondary_questions['label'] = secondary_questions['label'].astype('category').cat.codes
    secondary_questions['h_predicted_label'] = secondary_questions['h_predicted_label'].astype('category').cat.codes

# Pearson correlation
correlation, _ = pearsonr(secondary_questions['label'], secondary_questions['h_predicted_label'])
print(f'Correlation on pKa: {correlation}')

kappa = cohen_kappa_score(secondary_questions['label'], secondary_questions['h_predicted_label'])
print(f"Cohen's Kappa on pka: {kappa}")


Accuracy on pKa: 0.125
Correlation on pKa: 0.08451542547285168
Cohen's Kappa on pka: 0.0


In [137]:
total_predictions = pd.concat([secondary_questions, pka_questions], axis=0, ignore_index=True)
accuracy = accuracy_score(total_predictions['label'], total_predictions['h_predicted_label'])
print(f'Accuracy on total: {accuracy}')

# Calculate correlation (only makes sense if labels are numeric)
# If the labels are categorical, you should first encode them as integers
if total_predictions['label'].dtype == 'object' or total_predictions['predicted_label'].dtype == 'object':
    total_predictions['label'] = total_predictions['label'].astype('category').cat.codes
    total_predictions['h_predicted_label'] = total_predictions['h_predicted_label'].astype('category').cat.codes

# Pearson correlation
correlation, _ = pearsonr(total_predictions['label'], total_predictions['h_predicted_label'])
print(f'Correlation on total: {correlation}')

kappa = cohen_kappa_score(total_predictions['label'], total_predictions['h_predicted_label'])
print(f"Cohen's Kappa on total: {kappa}")

Accuracy on total: 0.19642857142857142
Correlation on total: -0.05564148840746572
Cohen's Kappa on total: -0.0714285714285714


In [138]:
total_predictions.to_excel('total_predictions_with_h.xlsx')