In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [None]:
# Define your dataset class
class ParaphraseDataset(Dataset):
    """Sentence-pair classification dataset backed by a CSV file.

    The CSV must provide ``sentence1``, ``sentence2`` and ``label`` columns.
    Rows whose label is missing are discarded; every remaining pair is
    tokenized once in the constructor and kept in memory as an
    ``(encoding, label)`` tuple.
    """

    def __init__(self, file_path, tokenizer, max_length):
        """Read ``file_path`` and tokenize every labelled sentence pair.

        Args:
            file_path: Location of the CSV, handed straight to ``pd.read_csv``.
            tokenizer: Callable invoked as ``tokenizer(sentence1, sentence2, ...)``
                with truncation and ``max_length`` padding enabled; whatever
                it returns is stored unchanged.
            max_length: Target length forwarded to the tokenizer.
        """
        self.tokenizer = tokenizer
        # Only the label column is checked for missing values.
        labelled = pd.read_csv(file_path).dropna(subset=['label'])
        self.data = [
            (
                tokenizer(
                    record['sentence1'],
                    record['sentence2'],
                    truncation=True,
                    padding='max_length',
                    max_length=max_length,
                    return_tensors='pt',
                ),
                int(record['label']),  # labels are stored as plain Python ints
            )
            for _, record in labelled.iterrows()
        ]

    def __len__(self):
        """Return the number of labelled pairs that were loaded."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(encoding, label)`` tuple stored at position ``idx``."""
        return self.data[idx]

In [None]:
# Reproducibility: seed torch's global RNG *before* the model is created.
# The classification head is freshly initialised (it is not part of the
# pre-trained checkpoint) and the training dataloader shuffles, so without a
# seed every run starts from different weights and sees a different batch order.
SEED = 42
torch.manual_seed(SEED)

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# num_labels=2: binary classification head on top of the encoder.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Load datasets (each CSV is tokenized eagerly inside ParaphraseDataset)
train_dataset = ParaphraseDataset('train.csv', tokenizer, max_length=128)
dev_dataset = ParaphraseDataset('dev.csv', tokenizer, max_length=128)
test_dataset = ParaphraseDataset('test.csv', tokenizer, max_length=128)

# Define training parameters
batch_size = 32
epochs = 3
learning_rate = 2e-5

# Define dataloaders; only the training split is shuffled so that dev/test
# predictions stay aligned with the order of their CSV rows.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Set device: use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# NOTE: the training loop in this notebook reads the loss from the model
# output (labels are passed to the forward call), so loss_fn is not used
# there; it is kept for any code that may rely on the name.
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# Training loop, followed by evaluation on the dev and test splits.


def _unpack_batch(batch):
    """Move one collated batch to `device` and return (input_ids, attention_mask, labels).

    `batch[0]` is the collated tokenizer output and `batch[1]` the label tensor.
    The dataset tokenizes one pair at a time with return_tensors='pt', so each
    stored tensor has a leading singleton dimension that ends up at dim 1 after
    collation. Only that dimension is squeezed: a bare squeeze() would also
    remove the batch dimension whenever the final batch holds a single example,
    handing the model 1-D inputs.
    """
    input_ids = batch[0]['input_ids'].squeeze(1).to(device)
    attention_mask = batch[0]['attention_mask'].squeeze(1).to(device)
    labels = batch[1].to(device)
    return input_ids, attention_mask, labels


def _collect_predictions(dataloader, desc):
    """Run the model over `dataloader` without gradients.

    Args:
        dataloader: Iterable yielding batches in the format `_unpack_batch` expects.
        desc: Label shown on the tqdm progress bar.

    Returns:
        A `(predictions, true_labels)` pair of lists, aligned by position.
    """
    model.eval()  # disable dropout for inference
    predicted, expected = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=desc):
            input_ids, attention_mask, labels = _unpack_batch(batch)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # The predicted class is the index of the largest logit per example.
            predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
            expected.extend(labels.cpu().numpy())
    return predicted, expected


for epoch in range(epochs):
    model.train()
    for batch in tqdm(train_dataloader, desc="Epoch {}".format(epoch + 1)):
        input_ids, attention_mask, labels = _unpack_batch(batch)

        optimizer.zero_grad()
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation loop
predictions, true_labels = _collect_predictions(dev_dataloader, "Evaluation")
accuracy = accuracy_score(true_labels, predictions)
print("Dev Accuracy:", accuracy)

# Testing loop
predictions, true_labels = _collect_predictions(test_dataloader, "Testing")
accuracy = accuracy_score(true_labels, predictions)
print("Test Accuracy:", accuracy)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens 

Dev Accuracy: 0.9195


Testing: 100%|██████████| 63/63 [00:13<00:00,  4.67it/s]

Test Accuracy: 0.934





In [None]:
# Save the trained model in the current directory.
# The model and the tokenizer are written to the same folder, one after the other.
output_model_path = "./bert_paraphrase_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_model_path)
print("Model saved successfully at:", output_model_path)

Model saved successfully at: ./bert_paraphrase_model
