In [22]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import accuracy_score

In [23]:
# Read both spreadsheets from the working directory. The training sheet
# supplies the "text" and "blf_value" columns used further down; the test
# sheet supplies the "text" column that is scored at the end.
train_data = pd.read_excel(io="traindata.xlsx")
test_data = pd.read_excel(io="testdata.xlsx")

In [24]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint so
# that the vocabulary ids produced by the tokenizer match the model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,  # two output classes, one per belief label
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Label name -> integer class id. Ids are assigned by position, so "pcb" is
# class 0 and "dcb" is class 1.
belief_mapping = {label: class_id for class_id, label in enumerate(("pcb", "dcb"))}

In [26]:
# Encode the string labels in "blf_value" as the integer class ids defined in
# belief_mapping. Series.map yields NaN for every value that is not a key of
# the mapping, and a NaN label would later be cast to a meaningless integer
# when the label tensor is built, so stop here with a clear message instead.
encoded_beliefs = train_data["blf_value"].map(belief_mapping)
unmapped_labels = train_data.loc[encoded_beliefs.isna(), "blf_value"].unique().tolist()
if unmapped_labels:
    raise ValueError(
        f"blf_value contains labels missing from belief_mapping: {unmapped_labels}"
    )
train_data["belief"] = encoded_beliefs

In [27]:
# Hold out 20% of the training rows for validation. The fixed random_state
# makes the shuffle, and therefore the split, repeatable between runs.
train_set, val_set = train_test_split(
    train_data,
    test_size=0.2,
    random_state=42,
)

In [38]:
def tokenize_text(text, max_length=128):
    """Encode one cell of the "text" column into fixed-length model inputs.

    Missing cells (NaN/None) are encoded as the empty string. Any other
    non-string cell (for example a number that the spreadsheet stored as a
    numeric value) is converted with ``str`` first, because the tokenizer
    rejects inputs that are not text.

    Args:
        text: Raw cell value; may be a string, a missing value, or another
            scalar type.
        max_length: Sequence length that every encoding is truncated or
            padded to. Defaults to 128.

    Returns:
        The result of ``tokenizer.encode_plus`` with PyTorch tensors
        (``return_tensors='pt'``), truncated/padded to ``max_length``.
    """
    text = "" if pd.isnull(text) else str(text)
    return tokenizer.encode_plus(
        text,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )

In [39]:
def create_data_loader(df, batch_size, is_test=False):
    """Tokenize the "text" column of ``df`` and wrap the result in a DataLoader.

    Args:
        df: DataFrame with a "text" column and, unless ``is_test`` is set,
            a "belief" column holding integer class ids.
        batch_size: Number of rows per batch.
        is_test: When True the "belief" column is not read, the dataset holds
            only (input_ids, attention_mask), and rows are not shuffled.

    Returns:
        A DataLoader over a TensorDataset of (input_ids, attention_mask) plus
        a long label tensor when ``is_test`` is False. Shuffling is enabled
        only when ``is_test`` is False.
    """
    encodings = []
    for sample in tqdm(df["text"].tolist(), desc="Tokenizing"):
        encodings.append(tokenize_text(sample))

    def _stack(field):
        # Join the per-row tensors for one field along the first dimension.
        return torch.cat([encoding[field] for encoding in encodings], dim=0)

    tensors = [_stack("input_ids"), _stack("attention_mask")]
    if not is_test:
        tensors.append(torch.tensor(df["belief"].values, dtype=torch.long))

    return DataLoader(
        TensorDataset(*tensors),
        batch_size=batch_size,
        shuffle=not is_test,
    )


In [40]:
# Training hyperparameters: rows per batch, optimizer learning rate, and the
# number of full passes over the training set.
batch_size, lr, epochs = 8, 2e-5, 3

In [41]:
# Build one loader per split. The train and validation loaders carry labels
# and are shuffled; the test loader has no labels and keeps the row order, so
# its predictions line up with the rows of test_data.
train_loader = create_data_loader(df=train_set, batch_size=batch_size)
val_loader = create_data_loader(df=val_set, batch_size=batch_size)
test_loader = create_data_loader(df=test_data, batch_size=batch_size, is_test=True)

Tokenizing: 100%|███████████████████████████| 552/552 [00:00<00:00, 3253.18it/s]
Tokenizing: 100%|███████████████████████████| 138/138 [00:00<00:00, 3924.56it/s]
Tokenizing: 100%|███████████████████████████| 690/690 [00:00<00:00, 7217.06it/s]


In [42]:
# Use PyTorch's own AdamW: the AdamW class exported by transformers is
# deprecated in favour of torch.optim.AdamW. eps and weight_decay are pinned
# to the defaults of the transformers class (1e-6 and 0.0) because
# torch.optim.AdamW defaults to 1e-8 and 0.01, which would otherwise change
# the update rule.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)
# The training loop reads the loss computed by the model itself
# (outputs.loss), so this criterion is not consumed there; it is kept so the
# name stays available to any cell that references it.
loss_fn = torch.nn.CrossEntropyLoss()



In [44]:
# Fine-tune the classifier for `epochs` passes over train_loader, evaluating
# on val_loader after every pass and printing loss/accuracy for the epoch.
for epoch in range(epochs):
    # --- training pass ---
    model.train()
    total_train_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/Training"):
        input_ids, attention_mask, labels = batch
        # Clear gradients left over from the previous step before the forward pass.
        optimizer.zero_grad()
        # Passing labels makes the model compute the loss itself (outputs.loss).
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    # Mean of the per-batch losses (divided by the number of batches).
    avg_train_loss = total_train_loss / len(train_loader)
    
    # --- validation pass: eval mode, no gradient tracking ---
    model.eval()
    val_predictions = []
    val_labels = []
    total_val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch + 1}/Validation"):
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            total_val_loss += outputs.loss.item()
            # Predicted class = index of the largest logit along axis 1.
            val_predictions.extend(logits.argmax(axis=1).cpu().numpy())
            val_labels.extend(labels.cpu().numpy())
    
    avg_val_loss = total_val_loss / len(val_loader)
    
    # Calculate accuracy on validation set
    val_accuracy = accuracy_score(val_labels, val_predictions)
    
    print(f"Epoch {epoch + 1}/{epochs}, Avg Train Loss: {avg_train_loss:.4f}, Avg Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

Epoch 1/Training: 100%|█████████████████████████| 69/69 [01:19<00:00,  1.15s/it]
Epoch 1/Validation: 100%|███████████████████████| 18/18 [00:05<00:00,  3.39it/s]


Epoch 1/3, Avg Train Loss: 0.1380, Avg Val Loss: 0.2006, Val Accuracy: 0.9348


Epoch 2/Training: 100%|█████████████████████████| 69/69 [01:19<00:00,  1.15s/it]
Epoch 2/Validation: 100%|███████████████████████| 18/18 [00:05<00:00,  3.35it/s]


Epoch 2/3, Avg Train Loss: 0.0929, Avg Val Loss: 0.2802, Val Accuracy: 0.9058


Epoch 3/Training: 100%|█████████████████████████| 69/69 [01:19<00:00,  1.15s/it]
Epoch 3/Validation: 100%|███████████████████████| 18/18 [00:05<00:00,  3.39it/s]

Epoch 3/3, Avg Train Loss: 0.0827, Avg Val Loss: 0.3890, Val Accuracy: 0.9130





In [45]:
# Run the fine-tuned model over the test loader and collect one predicted
# class id per row, in loader order.
model.eval()
test_predictions = []
# No gradients are needed for inference.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        # Test batches carry no labels, only the two input tensors.
        input_ids, attention_mask = batch
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit along axis 1.
        test_predictions.extend(logits.argmax(axis=1).cpu().numpy())

Testing: 100%|██████████████████████████████████| 87/87 [00:26<00:00,  3.28it/s]


In [46]:
# Translate predicted class ids back into belief label names. The inverse
# lookup table is built once instead of rebuilding the key and value lists and
# scanning them for every prediction. This relies on the class ids in
# belief_mapping being unique.
id_to_belief = {class_id: label for label, class_id in belief_mapping.items()}
# int() normalises the NumPy integer scalars stored in test_predictions.
predicted_beliefs = [id_to_belief[int(class_id)] for class_id in test_predictions]

In [47]:
# Pair every test text with its predicted label. The predictions are a plain
# list, so they are attached by position, in the same row order as test_data.
test_results = test_data[["text"]].assign(predicted_belief=predicted_beliefs)

In [48]:
# Write the predictions to an Excel file in the working directory, leaving
# out the DataFrame index column.
test_results.to_excel(
    excel_writer="test_results_beliefs.xlsx",
    index=False,
)