In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bnpc-a-gold-standard-bangla-paraphrase-corpus/BnPC_val.csv
/kaggle/input/bnpc-a-gold-standard-bangla-paraphrase-corpus/BnPC_train.csv
/kaggle/input/bnpc-a-gold-standard-bangla-paraphrase-corpus/BnPC_test.csv


In [2]:
!pip install torch transformers pandas scikit-learn



In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

In [4]:
df = pd.read_csv('/kaggle/input/bnpc-a-gold-standard-bangla-paraphrase-corpus/BnPC_train.csv')

In [5]:
# Define the model
class BERTBiLSTMAttention(nn.Module):
    def __init__(self, bert_model_name='csebuetnlp/banglabert', hidden_size=128, num_layers=2):
        super(BERTBiLSTMAttention, self).__init__()
        
        # Load the pre-trained BERT model
        self.bert = BertModel.from_pretrained(bert_model_name)
        
        # BiLSTM layer
        self.bilstm = nn.LSTM(input_size=self.bert.config.hidden_size, hidden_size=hidden_size, 
                              num_layers=num_layers, batch_first=True, bidirectional=True)
        
        # Attention mechanism
        self.attention = nn.Linear(hidden_size * 2, 1)
        
        # Fully connected layer for final classification
        self.fc = nn.Linear(hidden_size * 4, 1)  # 2*hidden_size for each sentence, hence *4 for concatenation
        self.sigmoid = nn.Sigmoid()  # Sigmoid for binary classification

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        # BERT encoding for the first sentence
        bert_out1 = self.bert(input_ids1, attention_mask=attention_mask1).last_hidden_state
        lstm_out1, _ = self.bilstm(bert_out1)
        
        # Attention mechanism for the first sentence
        attn_weights1 = F.softmax(self.attention(lstm_out1), dim=1)
        attn_output1 = torch.sum(attn_weights1 * lstm_out1, dim=1)
        
        # BERT encoding for the second sentence
        bert_out2 = self.bert(input_ids2, attention_mask=attention_mask2).last_hidden_state
        lstm_out2, _ = self.bilstm(bert_out2)
        
        # Attention mechanism for the second sentence
        attn_weights2 = F.softmax(self.attention(lstm_out2), dim=1)
        attn_output2 = torch.sum(attn_weights2 * lstm_out2, dim=1)
        
        # Concatenate the attended outputs of both sentences
        combined = torch.cat((attn_output1, attn_output2), dim=1)
        
        # Pass through the fully connected layer
        output = self.fc(combined)
        output = self.sigmoid(output)
        
        return output

In [6]:
# Dataset class
class ParaphraseDataset(Dataset):
    def __init__(self, sentence_pairs, labels, tokenizer, max_len=128):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentence_pairs)

    def __getitem__(self, idx):
        sentence1, sentence2 = self.sentence_pairs[idx]
        
        # Tokenize and encode sentences
        encoding1 = self.tokenizer(sentence1, max_length=self.max_len, padding='max_length', 
                                   truncation=True, return_tensors='pt')
        encoding2 = self.tokenizer(sentence2, max_length=self.max_len, padding='max_length', 
                                   truncation=True, return_tensors='pt')

        input_ids1 = encoding1['input_ids'].squeeze(0)
        attention_mask1 = encoding1['attention_mask'].squeeze(0)
        input_ids2 = encoding2['input_ids'].squeeze(0)
        attention_mask2 = encoding2['attention_mask'].squeeze(0)

        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return input_ids1, attention_mask1, input_ids2, attention_mask2, label


In [7]:
# Extract sentence pairs and labels
sentence_pairs = list(zip(df['sentence1'], df['sentence2']))
labels = df['label'].tolist()

In [8]:
# Instantiate model, tokenizer, and data loader
tokenizer = BertTokenizer.from_pretrained('csebuetnlp/banglabert')
dataset = ParaphraseDataset(sentence_pairs, labels, tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/528k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ElectraTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [9]:
# Initialize the model, loss function, and optimizer
model = BERTBiLSTMAttention(bert_model_name='csebuetnlp/banglabert')
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

You are using a model of type electra to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.lay

In [10]:
from tqdm import tqdm

In [17]:
# # Set the device
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# # Move the model to the selected device
# model = model.to(device)

In [14]:
# Training loop with tqdm for progress display
model.train()
for epoch in range(5):  # Adjust the number of epochs as needed
    epoch_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    # Wrap the dataloader with tqdm for progress tracking
    with tqdm(dataloader, desc=f'Epoch {epoch + 1}', unit='batch') as tepoch:
        for input_ids1, attention_mask1, input_ids2, attention_mask2, labels in tepoch:
            # Move data to the selected device
            input_ids1 = input_ids1.to(device)
            attention_mask1 = attention_mask1.to(device)
            input_ids2 = input_ids2.to(device)
            attention_mask2 = attention_mask2.to(device)
            labels = labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_ids1, attention_mask1, input_ids2, attention_mask2).squeeze()
            
            # Calculate loss
            loss = criterion(outputs, labels.float())

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Accumulate loss
            epoch_loss += loss.item()

            # Convert outputs to binary predictions using threshold 0.5
            predictions = (outputs > 0.5).float()

            # Calculate number of correct predictions
            correct_predictions += (predictions == labels).sum().item()
            total_samples += labels.size(0)

            # Calculate running accuracy
            running_accuracy = correct_predictions / total_samples

            # Update tqdm description with current loss and accuracy
            tepoch.set_postfix(loss=loss.item(), accuracy=running_accuracy)

    # Average the loss over the number of batches
    avg_epoch_loss = epoch_loss / len(dataloader)
    avg_epoch_accuracy = correct_predictions / total_samples
    print(f"Epoch {epoch + 1} finished with average loss: {avg_epoch_loss:.4f} and accuracy: {avg_epoch_accuracy:.4f}")

Epoch 1: 100%|██████████| 740/740 [04:43<00:00,  2.61batch/s, accuracy=0.619, loss=0.683]


Epoch 1 finished with average loss: 0.6658 and accuracy: 0.6192


Epoch 2: 100%|██████████| 740/740 [04:50<00:00,  2.55batch/s, accuracy=0.634, loss=0.533]


Epoch 2 finished with average loss: 0.6494 and accuracy: 0.6337


Epoch 3: 100%|██████████| 740/740 [04:50<00:00,  2.55batch/s, accuracy=0.757, loss=0.366]


Epoch 3 finished with average loss: 0.5235 and accuracy: 0.7565


Epoch 4: 100%|██████████| 740/740 [04:50<00:00,  2.55batch/s, accuracy=0.813, loss=0.441]


Epoch 4 finished with average loss: 0.4415 and accuracy: 0.8126


Epoch 5: 100%|██████████| 740/740 [04:50<00:00,  2.55batch/s, accuracy=0.849, loss=0.339] 

Epoch 5 finished with average loss: 0.3890 and accuracy: 0.8495





In [18]:
# Evaluation
model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    for input_ids1, attention_mask1, input_ids2, attention_mask2, labels in dataloader:
        outputs = model(input_ids1, attention_mask1, input_ids2, attention_mask2).squeeze()
        predictions = (outputs > 0.5).float()
        y_true.extend(labels.numpy())
        y_pred.extend(predictions.numpy())


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [19]:
# Calculate accuracy and F1-score
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"F1 Score: {f1:.2f}")

Accuracy: nan%
F1 Score: 0.00


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
