In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim
from tqdm.auto import tqdm

# Custom BERT Classifier
class CustomBERTClassifier(nn.Module):
    """Linear classifier on top of the first-token vector of one BERT layer.

    Rather than using the encoder's final output, the hidden state of the
    token at position 0 (the ``[CLS]`` token when inputs come from a BERT
    tokenizer) is read from a selectable entry of the encoder's
    ``hidden_states`` tuple and passed through a single linear layer.

    Args:
        num_labels: Number of output classes, i.e. the size of the logits.
        layer_num: Index into the ``hidden_states`` tuple returned by the
            encoder. Per the transformers documentation, index 0 is the
            embedding output and index ``k`` is the output of the k-th
            encoder layer; negative indices count from the end.
        model_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-uncased'``.

    Raises:
        ValueError: If ``layer_num`` does not address an existing hidden state.
    """

    def __init__(self, num_labels: int, layer_num: int,
                 model_name: str = 'bert-base-uncased') -> None:
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)

        # Fail at construction time instead of with an IndexError in the
        # first forward pass: there is one hidden state per encoder layer
        # plus one for the embedding output.
        num_states = self.bert.config.num_hidden_layers + 1
        if not -num_states <= layer_num < num_states:
            raise ValueError(
                f"layer_num must be in [{-num_states}, {num_states - 1}], "
                f"got {layer_num}"
            )

        self.layer_num = layer_num
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids: torch.Tensor,
                attention_mask: torch.Tensor) -> torch.Tensor:
        """Return unnormalised class logits for a batch of token ids.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Mask passed to the encoder alongside the ids.

        Returns:
            The output of the linear head applied to the position-0 vector
            of the selected hidden state (one row of logits per sequence).
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)
        # Select the requested layer, then keep only the first token's vector.
        first_token_vector = outputs.hidden_states[self.layer_num][:, 0, :]
        return self.classifier(first_token_vector)

# Load CSV data
# Read the dataset from disk, then pull the two columns used below out as
# plain Python lists: 'text' is what gets tokenized, 'hs' is the target.
csv_path = '/home/pgajo/working/incels/data/datasets/English/Incels.is/IFD-EN-5203.csv'  # Replace with your CSV file path
df = pd.read_csv(csv_path)
texts, labels = df['text'].to_list(), df['hs'].to_list()

# Tokenize data
# The whole corpus is encoded in a single call and returned as PyTorch
# tensors ('pt'), with padding and truncation enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoding = tokenizer(
    texts,
    return_tensors='pt',
    truncation=True,
    padding=True,
)
# Unpack the two tensors the model's forward pass consumes.
input_ids, attention_mask = (encoding[key] for key in ('input_ids', 'attention_mask'))

# Create DataLoader
# Bundle ids, masks and labels so each sample is an (ids, mask, label) triple.
label_tensor = torch.tensor(labels)
dataset = TensorDataset(input_ids, attention_mask, label_tensor)

# 80/20 random train/validation split; validation takes whatever remains
# after the training share is rounded down.
num_samples = len(dataset)
train_size = int(0.8 * num_samples)
val_size = num_samples - train_size
train_dataset, val_dataset = random_split(dataset, lengths=[train_size, val_size])

# Only the training batches are reshuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=16)

# Initialize model
# Two-class classifier that reads its features from hidden state `layer_num`.
layer_num = 5  # Replace with the layer number you want
model = CustomBERTClassifier(layer_num=layer_num, num_labels=2)

# Define loss and optimizer
# Cross-entropy over the logits; Adam updates every model parameter
# (encoder and linear head alike) with a learning rate of 1e-5.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)

# Training loop
# Batches are moved to whatever device the model's parameters live on, so the
# loop runs unchanged on CPU and keeps working if the model is moved to a GPU.
device = next(model.parameters()).device

for epoch in range(3):  # Number of epochs
    model.train()
    # Batch tensors get their own names so they do not overwrite the
    # module-level `input_ids`, `attention_mask` and `labels`.
    for batch_ids, batch_mask, batch_labels in tqdm(train_loader):
        batch_ids = batch_ids.to(device)
        batch_mask = batch_mask.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(batch_ids, batch_mask)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

    # Validation loop
    model.eval()
    val_loss = 0.0
    correct = 0
    num_val_samples = 0
    with torch.no_grad():
        for batch_ids, batch_mask, batch_labels in val_loader:
            batch_ids = batch_ids.to(device)
            batch_mask = batch_mask.to(device)
            batch_labels = batch_labels.to(device)

            outputs = model(batch_ids, batch_mask)
            batch_size = batch_labels.size(0)
            # The criterion returns the mean loss of the batch; weight it by
            # the batch size so a smaller final batch is not over-represented
            # in the epoch average.
            val_loss += criterion(outputs, batch_labels).item() * batch_size
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == batch_labels).sum().item()
            num_val_samples += batch_size

    # Both metrics are averaged over the samples actually evaluated.
    val_loss /= num_val_samples
    val_acc = correct / num_val_samples
    print(f"Validation Loss: {val_loss}, Validation Accuracy: {val_acc}")


  0%|          | 0/261 [00:00<?, ?it/s]

Epoch: 0, Loss: 0.8520885705947876
Epoch: 0, Loss: 0.6315106749534607
Epoch: 0, Loss: 0.6109839081764221
Epoch: 0, Loss: 0.6071915626525879
Epoch: 0, Loss: 0.6629741787910461
Epoch: 0, Loss: 0.6000370979309082
Epoch: 0, Loss: 0.6230219602584839
Epoch: 0, Loss: 0.6811110377311707
Epoch: 0, Loss: 0.647298276424408
Epoch: 0, Loss: 0.6276841759681702
Epoch: 0, Loss: 0.7067749500274658
Epoch: 0, Loss: 0.6282312273979187
Epoch: 0, Loss: 0.7409138679504395
Epoch: 0, Loss: 0.672477126121521
Epoch: 0, Loss: 0.7697005271911621
Epoch: 0, Loss: 0.8096311092376709
Epoch: 0, Loss: 0.7744305729866028
Epoch: 0, Loss: 0.6385104656219482
Epoch: 0, Loss: 0.6484099626541138
Epoch: 0, Loss: 0.6389594674110413
Epoch: 0, Loss: 0.5553198456764221
Epoch: 0, Loss: 0.6336961388587952
Epoch: 0, Loss: 0.6712646484375
Epoch: 0, Loss: 0.6729068160057068
Epoch: 0, Loss: 0.5955810546875
Epoch: 0, Loss: 0.6686503291130066
Epoch: 0, Loss: 0.618746817111969
Epoch: 0, Loss: 0.6928828358650208
Epoch: 0, Loss: 0.72892636060

KeyboardInterrupt: 