In [2]:
import os
import random
import itertools
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, LongformerModel, LongformerConfig, LongformerTokenizer

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True

class SlidingWindowTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length, stride):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride

    def tokenize_and_chunk(self, text):
        tokens = self.tokenizer(text, return_tensors="pt", truncation=False)["input_ids"].squeeze()
        chunks = []
        for i in range(0, len(tokens), self.stride):
            chunk = tokens[i:i+self.max_length]
            if len(chunk) < self.max_length:
                chunk = torch.cat([chunk, torch.zeros(self.max_length - len(chunk), dtype=torch.long)])
            chunks.append(chunk)
        return torch.stack(chunks)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text1 = self.texts.iloc[idx]['source_content']
        text2 = self.texts.iloc[idx]['suspicious_content']
        label = self.labels[idx]
        
        chunks1 = self.tokenize_and_chunk(text1)
        chunks2 = self.tokenize_and_chunk(text2)
        return {
     'chunks1': torch.tensor(chunks1),
    'chunks2': torch.tensor(chunks2),
    'label': torch.tensor(label)}



from transformers import T5Tokenizer, T5EncoderModel

class DualAraT5SlidingWindow(nn.Module):
    def __init__(self, model_name, num_labels):
        super(DualAraT5SlidingWindow, self).__init__()
        self.encoder = T5EncoderModel.from_pretrained(model_name)
        hidden_size = self.encoder.config.d_model
        self.dropout = nn.Dropout(0.2)
        self.dense = nn.Linear(hidden_size * 2, 64)
        self.classifier = nn.Linear(64, num_labels)
        self.relu = nn.ReLU()
    
    def forward(self, chunks1, chunks2):
        embeddings1 = []
        embeddings2 = []
        
        # Process chunks for the first and second documents
        for chunk in chunks1:
            output = self.encoder(input_ids=chunk)["last_hidden_state"].mean(dim=1)
            embeddings1.append(output)
        
        for chunk in chunks2:
            output = self.encoder(input_ids=chunk)["last_hidden_state"].mean(dim=1)
            embeddings2.append(output)
        
        # Aggregate chunk embeddings
        doc_embedding1 = torch.mean(torch.stack(embeddings1), dim=0)
        doc_embedding2 = torch.mean(torch.stack(embeddings2), dim=0)
        
        # Concatenate and classify
        concatenated = torch.cat((doc_embedding1, doc_embedding2), dim=1)
        x = self.dense(concatenated)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.classifier(x)
        return logits
    
from sklearn.metrics import classification_report, accuracy_score

def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs, device, map_label=None):
    best_val_loss = float('inf')
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        all_train_preds = []
        all_train_labels = []
        
        # Training loop
        for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
            chunks1 = torch.stack(batch['chunks1']).to(device)
            chunks2 = torch.stack(batch['chunks2']).to(device)
            labels = torch.tensor(batch['label']).to(device)

            
            optimizer.zero_grad()
            
            outputs = model(chunks1, chunks2)
            
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            
            # Collect predictions and labels
            _, predicted = torch.max(torch.softmax(outputs, dim=1), 1)
            all_train_preds.extend(predicted.cpu().numpy())
            all_train_labels.extend(labels.cpu().numpy())
        scheduler.step()
        
        # Print Training Metrics
        print(f'\nEpoch {epoch + 1}:')
        print(f'Training Loss: {train_loss/len(train_loader):.4f}')
        
        if map_label:
            train_report = classification_report(
                all_train_labels, all_train_preds, target_names=list(map_label.values())
            )
            print("Training Metrics:")
            print(train_report)
        
        # Validation loop
        model.eval()
        val_loss = 0
        all_val_preds = []
        all_val_labels = []
        
        with torch.no_grad():
            for batch in val_loader:
                chunks1 = batch['chunks1'].to(device)
                chunks2 = batch['chunks2'].to(device)
                labels = batch['label'].to(device)
                
                outputs = model(chunks1, chunks2)
                
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                
                # Collect predictions and labels
                _, predicted = torch.max(torch.softmax(outputs, dim=1), 1)
                all_val_preds.extend(predicted.cpu().numpy())
                all_val_labels.extend(labels.cpu().numpy())
        
        # Print Validation Metrics
        print(f'Validation Loss: {val_loss/len(val_loader):.4f}')
        
        if map_label:
            val_report = classification_report(
                all_val_labels, all_val_preds, target_names=list(map_label.values())
            )
            print("Validation Metrics:")
            print(val_report)

        # Save best model
        if val_loss/len(val_loader) < best_val_loss:
            best_val_loss = val_loss/len(val_loader)
            torch.save(model.state_dict(), 'best_model.pt')
    
    return all_val_preds, all_val_labels

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=25)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90, fontsize=15)
    plt.yticks(tick_marks, classes, fontsize=15)
    
    fmt = '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black", fontsize=14)
    
    plt.ylabel('True label', fontsize=20)
    plt.xlabel('Predicted label', fontsize=20)

def main():
    # Set constants
    MAX_SEQUENCE_LENGTH = 16000  # Maximum sequence length for AraT5
    STRIDE = 512  # Overlap between chunks
    MODEL_NAME = "UBC-NLP/AraT5v2-base-1024"  # AraT5 model
    BATCH_SIZE = 2
    NUM_EPOCHS = 10
    LEARNING_RATE = 2e-5
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    set_seed(33)
    
    # Load and preprocess data
    df = pd.read_csv('FinalDatasetBalanced.csv')
    df['plagiarism_type'] = df['plagiarism_type'].factorize()[0]
    map_label = dict(enumerate(df['plagiarism_type'].factorize()[1]))
    
    X_train, X_test, y_train, y_test = train_test_split(
        df[['source_content', 'suspicious_content']], 
        df['plagiarism_type'].values,
        random_state=33,
        test_size=0.3
    )
    
    tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
    
    # Create datasets with sliding window
    train_dataset = SlidingWindowTextDataset(X_train, y_train, tokenizer, MAX_SEQUENCE_LENGTH, STRIDE)
    test_dataset = SlidingWindowTextDataset(X_test, y_test, tokenizer, MAX_SEQUENCE_LENGTH, STRIDE)
    

    import torch.nn.functional as F

    def custom_collate_fn(batch):
        padded_chunks1 = [F.pad(item['chunks1'], (0, 16000 - item['chunks1'].size(0))) for item in batch]
        padded_chunks2 = [F.pad(item['chunks2'], (0, 16000 - item['chunks2'].size(0))) for item in batch]
        return {
        'chunks1': torch.stack(padded_chunks1),
        'chunks2': torch.stack(padded_chunks2),
        'label': torch.tensor([item['label'] for item in batch])}



    train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=32,
    collate_fn=custom_collate_fn)


    # train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=lambda x: x)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=custom_collate_fn)
    
    model = DualAraT5SlidingWindow(MODEL_NAME, len(map_label)).to(DEVICE)
    
    class_weights = torch.tensor([0.1305, 0.1552, 0.1556, 0.5587]).to(DEVICE)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)

    all_preds, all_labels = train_model(
        model, train_loader, test_loader, 
        criterion, optimizer, scheduler, NUM_EPOCHS, DEVICE, map_label
    )
    
    cnf_matrix = confusion_matrix(all_labels, all_preds)
    plt.figure(figsize=(7, 7))
    plot_confusion_matrix(cnf_matrix, classes=list(map_label.values()))
    plt.show()

if __name__ == "__main__":
    main()


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Epoch 1/10:   0%|          | 0/54 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (3534 > 1024). Running this sequence through the model will result in indexing errors
  'chunks1': torch.tensor(chunks1),
  'chunks2': torch.tensor(chunks2),
Epoch 1/10:   0%|          | 0/54 [00:00<?, ?it/s]


RuntimeError: stack expects each tensor to be equal size, but got [7, 31993] at entry 0 and [4, 31996] at entry 1