In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import numpy as np
import time
import os
import time



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define a custom dataset class to load and preprocess data
class CustomDataset(Dataset):
    """Map-style dataset pairing one tokenized text with its label.

    Args:
        texts: pandas Series (or any object offering ``reset_index``) of raw texts.
        labels: pandas Series of labels, positionally aligned with ``texts``.
        tokenizer: object exposing ``encode_plus`` and returning a mapping with
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Discard the incoming index so that positions 0..n-1 are valid lookups
        # in __getitem__, whatever index the caller's Series carried.
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of ``torch.long`` tensors.

        Keys: ``input_ids``, ``attention_mask``, ``token_type_ids``, ``label``.
        """
        encoded = self.tokenizer.encode_plus(
            str(self.texts[idx]),  # str() so non-string cells are still tokenizable
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        # Convert each tokenizer field to a tensor, then attach the label last.
        sample = {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample



In [3]:
# Read the dataset; only its 'body' (text) and 'label' columns are used below
data = pd.read_csv("filtered_ceas.csv")

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['body'],
    data['label'],
    test_size=0.2,
    random_state=42,
)


In [4]:

# Hyperparameters used by the dataset, dataloader and training cells
MAX_LEN = 128
BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 2e-5
EPS = 1e-8

# Time how long loading the pre-trained tokenizer and model takes
start_time = time.time()

print("Loading tokenizer...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print("Tokenizer loaded.")

# One output unit per distinct value found in the 'label' column
print("Loading model...")
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(set(data['label'])),
)
print("Model loaded.")

end_time = time.time()
print(f"Total time taken: {end_time - start_time} seconds")



Loading tokenizer...
Tokenizer loaded.
Loading model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded.
Total time taken: 3.69016432762146 seconds


In [5]:
# Wrap each split in a CustomDataset so samples are tokenized on access
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, MAX_LEN)

# Report how many samples each split holds
print(f"Training dataset length: {len(train_dataset)}")
print(f"Validation dataset length: {len(val_dataset)}")

# Only the training loader shuffles; validation batches keep dataset order
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Report how many batches each loader yields per pass
print(f"Training dataloader length: {len(train_loader)}")
print(f"Validation dataloader length: {len(val_loader)}")



Training dataset length: 25153
Validation dataset length: 6289
Training dataloader length: 787
Validation dataloader length: 197


In [6]:
# Empty accumulators for training-set embeddings and their labels.
# Nothing is computed or fine-tuned in this cell; it only prepares state.
train_embeddings, train_labels_list = [], []

# Put the model in evaluation mode (as the last expression, this also
# displays the module repr as the cell output)
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [13]:
# Train SVM classifier
# NOTE(review): in the cells visible here `train_embeddings` is only ever
# initialised to []; presumably a cell that is not shown fills it before this
# one runs — confirm, otherwise fit() receives an empty feature list.
# NOTE(review): `train_loader` is built with shuffle=True, so if the embeddings
# were gathered from it, `train_labels` must be in that same (shuffled) order —
# verify it is not still the Series returned by train_test_split.
svm_model = SVC()
svm_model.fit(train_embeddings, train_labels)


In [14]:

# Empty accumulators for validation-set embeddings and their labels.
# No evaluation happens in this cell; it only resets state.
val_embeddings, val_labels_list = [], []

# Put the model in evaluation mode (as the last expression, this also
# displays the module repr as the cell output)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [16]:

CHECKPOINT_PATH = 'val_checkpoint.pth'
CHECKPOINT_INTERVAL = 10  # Save checkpoint every 10 batches

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The batches below are moved to `device`, so the model must live there too;
# otherwise the forward pass fails with a device mismatch whenever CUDA is available.
model.to(device)
model.eval()

# Load checkpoint if it exists
if os.path.exists(CHECKPOINT_PATH):
    # Accumulated chunks are kept on the CPU, so map everything there on load.
    checkpoint = torch.load(CHECKPOINT_PATH, map_location="cpu")
    start_batch = checkpoint['batch']
    val_embeddings = list(checkpoint['embeddings'])
    val_labels_list = list(checkpoint['labels'])
    print(f"Resuming from batch {start_batch}")
else:
    start_batch = 0
    val_embeddings = []
    val_labels_list = []

# When resuming, iterate only over the samples that are still missing instead of
# tokenizing and then discarding the batches that were already processed.
# This relies on `val_loader` being sequential (it is created without shuffle).
resume_subset = torch.utils.data.Subset(
    val_loader.dataset,
    range(start_batch * val_loader.batch_size, len(val_loader.dataset)),
)
resume_loader = DataLoader(
    resume_subset,
    batch_size=val_loader.batch_size,
    collate_fn=val_loader.collate_fn,
)

# Generate validation embeddings
with torch.no_grad():
    progress = tqdm(
        resume_loader,
        desc="Generating validation embeddings",
        initial=start_batch,       # batches already covered by the checkpoint
        total=len(val_loader),     # so the bar ends at the true batch count
    )
    for i, batch in enumerate(progress, start=start_batch):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch.get('token_type_ids', None)
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(device)

        # Enable output_hidden_states to get embeddings
        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True)

        # Extract the last hidden state
        last_hidden_state = outputs.hidden_states[-1]

        # Mean pooling over the sequence length.
        # NOTE(review): this average includes padding positions (the dataset pads
        # every sample to max_length). It is left unchanged so validation features
        # stay consistent with however the training embeddings were pooled; if
        # that code uses a masked mean, switch both together.
        pooled_output = torch.mean(last_hidden_state, dim=1)

        # Keep CPU tensors (not numpy arrays) so the checkpoint stays loadable
        # with torch.load's restricted `weights_only` unpickler.
        val_embeddings.append(pooled_output.cpu())
        val_labels_list.append(batch['label'].cpu())

        # Save checkpoint at intervals
        if (i + 1) % CHECKPOINT_INTERVAL == 0:
            checkpoint = {
                'batch': i + 1,
                'embeddings': val_embeddings,
                'labels': val_labels_list
            }
            # Write to a temp file and rename, so an interruption mid-write
            # cannot leave a truncated checkpoint behind.
            tmp_checkpoint_path = CHECKPOINT_PATH + '.tmp'
            torch.save(checkpoint, tmp_checkpoint_path)
            os.replace(tmp_checkpoint_path, CHECKPOINT_PATH)
            print(f"Checkpoint saved at batch {i + 1}")

# Concatenate the per-batch chunks; as_tensor also accepts numpy chunks coming
# from a checkpoint written by the earlier version of this cell.
# NOTE: `val_labels` is rebound here from the label Series to a tensor.
val_embeddings = torch.cat([torch.as_tensor(chunk) for chunk in val_embeddings], dim=0).to(torch.float)
val_labels = torch.cat([torch.as_tensor(chunk) for chunk in val_labels_list], dim=0).to(torch.long)

print("Validation embeddings shape:", val_embeddings.shape)
print("Validation labels shape:", val_labels.shape)

# Remove checkpoint after successful completion
if os.path.exists(CHECKPOINT_PATH):
    os.remove(CHECKPOINT_PATH)
    print("Checkpoint removed")


Generating validation embeddings:   5%|▌         | 10/197 [01:39<32:18, 10.36s/it]

Checkpoint saved at batch 10


Generating validation embeddings:  10%|█         | 20/197 [03:22<30:15, 10.26s/it]

Checkpoint saved at batch 20


Generating validation embeddings:  15%|█▌        | 30/197 [05:06<29:42, 10.67s/it]

Checkpoint saved at batch 30


Generating validation embeddings:  20%|██        | 40/197 [06:58<28:08, 10.75s/it]

Checkpoint saved at batch 40


Generating validation embeddings:  25%|██▌       | 50/197 [08:42<25:25, 10.38s/it]

Checkpoint saved at batch 50


Generating validation embeddings:  30%|███       | 60/197 [10:24<23:29, 10.29s/it]

Checkpoint saved at batch 60


Generating validation embeddings:  36%|███▌      | 70/197 [12:08<21:56, 10.37s/it]

Checkpoint saved at batch 70


Generating validation embeddings:  41%|████      | 80/197 [13:50<20:07, 10.32s/it]

Checkpoint saved at batch 80


Generating validation embeddings:  46%|████▌     | 90/197 [15:35<18:43, 10.50s/it]

Checkpoint saved at batch 90


Generating validation embeddings:  51%|█████     | 100/197 [17:21<17:03, 10.55s/it]

Checkpoint saved at batch 100


Generating validation embeddings:  56%|█████▌    | 110/197 [19:06<15:24, 10.62s/it]

Checkpoint saved at batch 110


Generating validation embeddings:  61%|██████    | 120/197 [20:50<13:37, 10.62s/it]

Checkpoint saved at batch 120


Generating validation embeddings:  66%|██████▌   | 130/197 [22:52<13:35, 12.17s/it]

Checkpoint saved at batch 130


Generating validation embeddings:  71%|███████   | 140/197 [25:02<11:09, 11.75s/it]

Checkpoint saved at batch 140


Generating validation embeddings:  76%|███████▌  | 150/197 [26:46<08:20, 10.65s/it]

Checkpoint saved at batch 150


Generating validation embeddings:  81%|████████  | 160/197 [28:31<06:26, 10.45s/it]

Checkpoint saved at batch 160


Generating validation embeddings:  86%|████████▋ | 170/197 [30:13<04:40, 10.39s/it]

Checkpoint saved at batch 170


Generating validation embeddings:  91%|█████████▏| 180/197 [31:56<02:58, 10.51s/it]

Checkpoint saved at batch 180


Generating validation embeddings:  96%|█████████▋| 190/197 [33:37<01:12, 10.30s/it]

Checkpoint saved at batch 190


Generating validation embeddings: 100%|██████████| 197/197 [34:44<00:00, 10.58s/it]

Validation embeddings shape: torch.Size([6289, 768])
Validation labels shape: torch.Size([6289])





In [17]:

# Device: defined here as well so this cell does not depend on an earlier one.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The batches are moved to `device` below, so the model must be there too.
# Do this before building the optimizer and before loading any saved state.
model.to(device)

# Optimizer, scheduler, and criterion
# NOTE(review): transformers' AdamW is deprecated upstream in favour of
# torch.optim.AdamW; it is kept here so existing checkpoints stay loadable.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=EPS)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
criterion = nn.CrossEntropyLoss()  # unused below: the model returns its own loss when given labels

# Checkpoint setup
CHECKPOINT_PATH = 'bert_fine_tuned_checkpoint.pth'
# max(1, ...) avoids a modulo-by-zero when the loader has a single batch
CHECKPOINT_INTERVAL = max(1, len(train_loader) // 2)  # Example, adjust as needed

# Load checkpoint if it exists
start_epoch = 0
start_batch = 0  # batches of `start_epoch` that were already trained
if os.path.exists(CHECKPOINT_PATH):
    checkpoint = torch.load(CHECKPOINT_PATH, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # Older checkpoints carry neither scheduler state nor a batch position;
    # they fall back to restarting the stored epoch with a fresh schedule.
    if 'scheduler_state_dict' in checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    start_epoch = checkpoint['epoch']
    start_batch = checkpoint.get('batch', 0)
    if start_batch >= len(train_loader):
        # The stored epoch had finished: continue with the next one.
        start_epoch, start_batch = start_epoch + 1, 0
    if start_epoch >= EPOCHS:
        print("Checkpoint loaded, training already complete")
    else:
        print(f"Checkpoint loaded, starting from epoch {start_epoch + 1}")

# Training loop
for epoch in range(start_epoch, EPOCHS):
    model.train()
    total_train_loss = 0.0
    # Only the epoch being resumed starts part-way through.
    first_batch = start_batch if epoch == start_epoch else 0
    batches_run = 0

    with tqdm(total=len(train_loader), initial=first_batch,
              desc=f"Training Epoch {epoch + 1}/{EPOCHS}") as progress:
        # A resumed epoch draws a fresh shuffle and trains only the remaining
        # number of batches, keeping the step count in line with the scheduler.
        for batch_index, batch in enumerate(train_loader, start=first_batch + 1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            model.zero_grad()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_train_loss += loss.item()
            loss.backward()

            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

            batches_run += 1
            progress.update(1)

            # Save checkpoint at intervals and at the end of every epoch
            if batch_index % CHECKPOINT_INTERVAL == 0 or batch_index == len(train_loader):
                checkpoint = {
                    'epoch': epoch,
                    'batch': batch_index,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                }
                torch.save(checkpoint, CHECKPOINT_PATH)
                print(f"Checkpoint saved at epoch {epoch + 1} and batch {batch_index}")

            if batch_index >= len(train_loader):
                break

    # Average over the batches actually trained in this run of the epoch
    avg_train_loss = total_train_loss / max(batches_run, 1)
    print(f"Training loss for epoch {epoch + 1}: {avg_train_loss}")

# Validation loop
model.eval()
val_preds = []
val_labels_list = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Validating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        val_preds.extend(preds.cpu().numpy())
        val_labels_list.extend(labels.cpu().numpy())

# Calculate evaluation metrics
val_labels = np.array(val_labels_list)
accuracy = accuracy_score(val_labels, val_preds)
precision = precision_score(val_labels, val_preds, average='weighted')
recall = recall_score(val_labels, val_preds, average='weighted')
f1 = f1_score(val_labels, val_preds, average='weighted')

print("Validation Accuracy:", accuracy)
print("Validation Precision:", precision)
print("Validation Recall:", recall)
print("Validation F1-score:", f1)

# Calculate and print confusion matrix.
# Passing the label set explicitly keeps the matrix shape and the tick labels
# in agreement even if a class is absent from the validation split.
class_labels = sorted(set(data['label']))
conf_matrix = confusion_matrix(val_labels, val_preds, labels=class_labels)
print("Confusion Matrix:\n", conf_matrix)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

Training Epoch 1/3:  27%|██▋       | 210/787 [6:34:40<7:37:11, 47.54s/it]    

In [None]:

# Persist the fine-tuned weights
final_model_path = 'fine_tuned_bert_model.pth'
torch.save(model.state_dict(), final_model_path)

# Redundant with the imports cell at the top; a harmless re-import
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Define the device (fall back to CPU when CUDA is not available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read the saved weights, remapping them onto whatever device is present so a
# GPU-trained file can still be opened on a CPU-only machine.
state_dict = torch.load(final_model_path, map_location=device)

# Size the classification head from the saved weights rather than hard-coding
# it: the classifier weight has one row per label.
num_labels = state_dict['classifier.weight'].shape[0]

# Load the model architecture, then the fine-tuned weights
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
model.load_state_dict(state_dict)
model.to(device)
model.eval()

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

