In [None]:
import os
import random
import itertools
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, LongformerConfig, LongformerModel



In [6]:
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):

    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=25)
    #plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90, fontsize=15)
    plt.yticks(tick_marks, classes, fontsize=15)

    fmt = '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black", fontsize = 14)

    plt.ylabel('True label', fontsize=20)
    plt.xlabel('Predicted label', fontsize=20)


In [7]:

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

df = pd.read_csv('/content/FinalDatasetBalanced.csv')
df['plagiarism_type'], uniques = pd.factorize(df['plagiarism_type'])
map_label = dict(enumerate(uniques))



In [8]:
def convert_to_transformer_inputs(str1, str2, tokenizer, max_sequence_length, double=True):
    def return_id(str1, str2, length):
        inputs = tokenizer.encode_plus(
            str1, str2,
            add_special_tokens=True,
            max_length=length,
            truncation=True,
            padding='max_length',  # Let the tokenizer handle padding
            return_token_type_ids=True,
            return_attention_mask=True
        )
        input_ids = inputs["input_ids"]
        input_masks = inputs["attention_mask"]
        input_segments = inputs["token_type_ids"]
        return [input_ids, input_masks, input_segments]

    if double:
        input_ids_1, input_masks_1, input_segments_1 = return_id(str1, None, max_sequence_length)
        input_ids_2, input_masks_2, input_segments_2 = return_id(str2, None, max_sequence_length)

        return [input_ids_1, input_masks_1, input_segments_1,
                input_ids_2, input_masks_2, input_segments_2]
    else:
        input_ids, input_masks, input_segments = return_id(str1, str2, max_sequence_length)

        return [input_ids, input_masks, input_segments, None, None, None]



In [9]:
def compute_input_arrays(df, columns, tokenizer, max_sequence_length, double=True):
    input_ids_1, input_masks_1, input_segments_1 = [], [], []
    input_ids_2, input_masks_2, input_segments_2 = [], [], []

    for _, instance in df[columns].iterrows():
        str1, str2 = instance[columns[0]], instance[columns[1]]
        ids_1, masks_1, segments_1, ids_2, masks_2, segments_2 = \
            convert_to_transformer_inputs(str1, str2, tokenizer, max_sequence_length, double=double)

        input_ids_1.append(ids_1)
        input_masks_1.append(masks_1)
        input_segments_1.append(segments_1)
        input_ids_2.append(ids_2)
        input_masks_2.append(masks_2)
        input_segments_2.append(segments_2)

    if double:
        return [np.asarray(input_ids_1, dtype=np.int32),
                np.asarray(input_masks_1, dtype=np.int32),
                np.asarray(input_segments_1, dtype=np.int32),
                np.asarray(input_ids_2, dtype=np.int32),
                np.asarray(input_masks_2, dtype=np.int32),
                np.asarray(input_segments_2, dtype=np.int32)]
    else:
        return [np.asarray(input_ids_1, dtype=np.int32),
                np.asarray(input_masks_1, dtype=np.int32),
                np.asarray(input_segments_1, dtype=np.int32)]



In [18]:
### TRAIN TEST SPLIT ###

X_train, X_test, y_train, y_test = train_test_split(df[['source_content','suspicious_content']], df['plagiarism_type'].values,
                                                    random_state=33, test_size = 0.3)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)



(1702, 2) (730, 2)
(1702,) (730,)


In [12]:
!wget https://ai2-s2-research.s3-us-west-2.amazonaws.com/longformer/longformer-encdec-large-16384.tar.gz

--2024-11-14 11:56:30--  https://ai2-s2-research.s3-us-west-2.amazonaws.com/longformer/longformer-encdec-large-16384.tar.gz
Resolving ai2-s2-research.s3-us-west-2.amazonaws.com (ai2-s2-research.s3-us-west-2.amazonaws.com)... 3.5.86.128, 52.92.176.210, 3.5.77.154, ...
Connecting to ai2-s2-research.s3-us-west-2.amazonaws.com (ai2-s2-research.s3-us-west-2.amazonaws.com)|3.5.86.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 995275976 (949M) [application/x-tar]
Saving to: ‘longformer-encdec-large-16384.tar.gz’


2024-11-14 11:57:04 (28.2 MB/s) - ‘longformer-encdec-large-16384.tar.gz’ saved [995275976/995275976]



In [14]:
import tarfile

# Extract the archive
with tarfile.open('/content/longformer-encdec-large-16384.tar.gz', 'r:gz') as tar:
    tar.extractall('/content/longformer_model')  # Specify the path where to extract


In [25]:
### IMPORT TOKENIZER ###

MAX_SEQUENCE_LENGTH = 16000
MODEL_NAME = "/content/longformer_model/longformer-encdec-large-16384"  # Adjusted to a valid model name

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

### CREATE SEQUENCES (id, mask, segments) FOR TRAIN AND TEST ###

input_train = compute_input_arrays(X_train,['source_content','suspicious_content'], tokenizer, MAX_SEQUENCE_LENGTH)
input_test = compute_input_arrays(X_test, ['source_content','suspicious_content'], tokenizer, MAX_SEQUENCE_LENGTH)





In [28]:
class PlagiarismDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs[0])

    def __getitem__(self, idx):
        item = {
            'input_ids1': torch.tensor(self.input_ids1[idx], dtype=torch.long),
            'attention_mask1': torch.tensor(self.attention_mask1[idx], dtype=torch.long),
            'token_type_ids1': torch.tensor(self.token_type_ids1[idx], dtype=torch.long),
            'input_ids2': torch.tensor(self.input_ids2[idx], dtype=torch.long),
            'attention_mask2': torch.tensor(self.attention_mask2[idx], dtype=torch.long),
            'token_type_ids2': torch.tensor(self.token_type_ids2[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
        return item

In [29]:
train_dataset = PlagiarismDataset(input_train, y_train)
test_dataset = PlagiarismDataset(input_test, y_test)

# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)
# test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=2, shuffle=False)



In [9]:
class DualLongformer(nn.Module):
    def __init__(self, num_labels):
        super(DualLongformer, self).__init__()
        set_seed(33)
        self.config = LongformerConfig.from_pretrained(MODEL_NAME,ignore_mismatched_sizes=True)
        self.config.max_position_embeddings = MAX_SEQUENCE_LENGTH
        self.config.attention_window = [256] * self.config.num_hidden_layers

        self.longformer_model1 = LongformerModel.from_pretrained(MODEL_NAME, config=self.config,ignore_mismatched_sizes=True)
        self.longformer_model2 = LongformerModel.from_pretrained(MODEL_NAME, config=self.config,ignore_mismatched_sizes=True)

        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.dense = nn.Linear(2 * self.config.hidden_size, 64)
        self.classifier = nn.Linear(64, len(map_label))

    def forward(self, id1, mask1, atn1, id2, mask2, atn2):
        embedding1 = self.longformer_model1(input_ids=id1, attention_mask=mask1, token_type_ids=atn1)[0]
        embedding2 = self.longformer_model2(input_ids=id2, attention_mask=mask2, token_type_ids=atn2)[0]

        x1 = torch.mean(embedding1, dim=1)  # GlobalAveragePooling1D
        x2 = torch.mean(embedding2, dim=1)  # GlobalAveragePooling1D

        x = torch.cat((x1, x2), dim=1)
        x = self.dense(x)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.classifier(x)
        return logits



In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DualLongformer(len(map_label))
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()



Some weights of LongformerModel were not initialized from the model checkpoint at allenai/longformer-large-4096 and are newly initialized because the shapes did not match:
- longformer.embeddings.position_embeddings.weight: found shape torch.Size([4098, 1024]) in the checkpoint and torch.Size([16000, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of LongformerModel were not initialized from the model checkpoint at allenai/longformer-large-4096 and are newly initialized because the shapes did not match:
- longformer.embeddings.position_embeddings.weight: found shape torch.Size([4098, 1024]) in the checkpoint and torch.Size([16000, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from tqdm import tqdm

def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for batch in tqdm(loader, desc='Training'):
        optimizer.zero_grad()
        id1 = batch['id1'].to(device)
        mask1 = batch['mask1'].to(device)
        atn1 = batch['atn1'].to(device)
        id2 = batch['id2'].to(device)
        mask2 = batch['mask2'].to(device)
        atn2 = batch['atn2'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(id1, mask1, atn1, id2, mask2, atn2)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(loader)
    return avg_loss



In [12]:
def eval_epoch(model, loader, criterion):
    model.eval()
    total_loss = 0
    preds = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(loader, desc='Evaluating'):
            id1 = batch['id1'].to(device)
            mask1 = batch['mask1'].to(device)
            atn1 = batch['atn1'].to(device)
            id2 = batch['id2'].to(device)
            mask2 = batch['mask2'].to(device)
            atn2 = batch['atn2'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(id1, mask1, atn1, id2, mask2, atn2)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    avg_loss = total_loss / len(loader)
    return avg_loss, preds, true_labels



In [14]:
num_epochs = 3
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, val_preds, val_labels = eval_epoch(model, test_loader, criterion)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    print(f'Train Loss: {train_loss:.4f}')
    print(f'Val Loss: {val_loss:.4f}')

    # Save the model
    save_path = f'longformer_checkpoints/epoch_{epoch+1}'
    os.makedirs(save_path, exist_ok=True)
    model_to_save = model.module if hasattr(model, 'module') else model
    torch.save(model_to_save.state_dict(), os.path.join(save_path, 'pytorch_model.bin'))
    print(f'Saved model to {save_path}')



Epoch 1/3


Training:   0%|          | 0/851 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 250.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 191.06 MiB is free. Process 46480 has 14.56 GiB memory in use. Of the allocated memory 13.75 GiB is allocated by PyTorch, and 703.46 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Plot training history
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

### PREDICT TEST ###

from sklearn.metrics import classification_report, confusion_matrix

pred_test = val_preds
true_test = val_labels

class_names = list(map_label.values())

true_class_names = [map_label[i] for i in true_test]
pred_class_names = [map_label[i] for i in pred_test]

print(classification_report(true_class_names, pred_class_names))

cnf_matrix = confusion_matrix(true_class_names, pred_class_names)

plt.figure(figsize=(7,7))
plot_confusion_matrix(cnf_matrix, classes=class_names)
plt.show()


In [20]:
import torch
import torch.nn as nn
from transformers import LongformerModel, LongformerConfig
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Set random seeds for reproducibility
torch.manual_seed(33)
np.random.seed(33)



In [22]:
# Define your model name and other constants
# MODEL_NAME = 'allenai/longformer-base-4096'  # Replace with your model name
# MAX_SEQUENCE_LENGTH = 512  # Replace with your sequence length
map_label = {0: 'classsource_content0', 1: 'suspicious_content'}  # Replace with your label mapping
# Define the Siamese Dataset
class SiameseDataset(Dataset):
    def __init__(self, input_ids1, attention_mask1, token_type_ids1,
                       input_ids2, attention_mask2, token_type_ids2,
                       labels):
        self.input_ids1 = input_ids1
        self.attention_mask1 = attention_mask1
        self.token_type_ids1 = token_type_ids1
        self.input_ids2 = input_ids2
        self.attention_mask2 = attention_mask2
        self.token_type_ids2 = token_type_ids2
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids1': torch.tensor(self.input_ids1[idx], dtype=torch.long),
            'attention_mask1': torch.tensor(self.attention_mask1[idx], dtype=torch.long),
            'token_type_ids1': torch.tensor(self.token_type_ids1[idx], dtype=torch.long),
            'input_ids2': torch.tensor(self.input_ids2[idx], dtype=torch.long),
            'attention_mask2': torch.tensor(self.attention_mask2[idx], dtype=torch.long),
            'token_type_ids2': torch.tensor(self.token_type_ids2[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Define the Siamese Longformer model
class SiameseLongformer(nn.Module):
    def __init__(self, model_name, num_labels):
        super(SiameseLongformer, self).__init__()
        config = LongformerConfig.from_pretrained(model_name)
        config.output_hidden_states = False
        self.longformer = LongformerModel.from_pretrained(model_name, config=config)
        self.pooling = nn.AdaptiveAvgPool1d(1)  # Global average pooling
        self.fc1 = nn.Linear(self.longformer.config.hidden_size * 2, 64)
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(64, num_labels)

    def forward(self, input_ids1, attention_mask1, token_type_ids1,
                      input_ids2, attention_mask2, token_type_ids2):
        output1 = self.longformer(input_ids=input_ids1, attention_mask=attention_mask1, token_type_ids=token_type_ids1)
        output2 = self.longformer(input_ids=input_ids2, attention_mask=attention_mask2, token_type_ids=token_type_ids2)

        # Apply global average pooling
        x1 = output1.last_hidden_state.permute(0, 2, 1)
        x1 = self.pooling(x1).squeeze(2)

        x2 = output2.last_hidden_state.permute(0, 2, 1)
        x2 = self.pooling(x2).squeeze(2)

        # Concatenate and pass through fully connected layers
        x = torch.cat([x1, x2], dim=1)
        x = nn.functional.relu(self.fc1(x))
        x = self.dropout(x)
        logits = self.fc2(x)

        return logits



In [31]:
# Instantiate the model
num_labels = len(map_label)
model = SiameseLongformer(MODEL_NAME, num_labels)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define optimizer and loss function
optimizer = Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Prepare your data (replace with your actual data)
# Assuming input_train and input_test are lists of numpy arrays as per the original code
# Each element corresponds to id1, mask1, atn1, id2, mask2, atn2 respectively







# Create datasets and dataloaders
train_dataset = SiameseDataset(input_train, y_train)
test_dataset = SiameseDataset(input_test, y_test)
batch_size = 6
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Training loop
num_epochs = 3
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids1 = batch['input_ids1'].to(device)
        attention_mask1 = batch['attention_mask1'].to(device)
        token_type_ids1 = batch['token_type_ids1'].to(device)
        input_ids2 = batch['input_ids2'].to(device)
        attention_mask2 = batch['attention_mask2'].to(device)
        token_type_ids2 = batch['token_type_ids2'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids1, attention_mask1, token_type_ids1,
                        input_ids2, attention_mask2, token_type_ids2)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids1 = batch['input_ids1'].to(device)
            attention_mask1 = batch['attention_mask1'].to(device)
            token_type_ids1 = batch['token_type_ids1'].to(device)
            input_ids2 = batch['input_ids2'].to(device)
            attention_mask2 = batch['attention_mask2'].to(device)
            token_type_ids2 = batch['token_type_ids2'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids1, attention_mask1, token_type_ids1,
                            input_ids2, attention_mask2, token_type_ids2)

            loss = criterion(outputs, labels)
            val_loss += loss.item()
    avg_val_loss = val_loss / len(test_loader)
    val_losses.append(avg_val_loss)

    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

# Plot training history
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Evaluate on test data
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids1 = batch['input_ids1'].to(device)
        attention_mask1 = batch['attention_mask1'].to(device)
        token_type_ids1 = batch['token_type_ids1'].to(device)
        input_ids2 = batch['input_ids2'].to(device)
        attention_mask2 = batch['attention_mask2'].to(device)
        token_type_ids2 = batch['token_type_ids2'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids1, attention_mask1, token_type_ids1,
                        input_ids2, attention_mask2, token_type_ids2)

        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Print classification report
all_pred_labels = [map_label[i] for i in all_preds]
all_true_labels = [map_label[i] for i in all_labels]
print(classification_report(all_true_labels, all_pred_labels))

# Plot confusion matrix
cnf_matrix = confusion_matrix(all_true_labels, all_pred_labels)
plt.figure(figsize=(7,7))
sns.heatmap(cnf_matrix, annot=True, fmt='d', xticklabels=list(map_label.values()), yticklabels=list(map_label.values()))
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


You are using a model of type bart to instantiate a model of type longformer. This is not supported for all configurations of models and can yield errors.
Some weights of LongformerModel were not initialized from the model checkpoint at /content/longformer_model/longformer-encdec-large-16384 and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.key_global.bias', 'encoder.layer.0.attention.self.key_global.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0

TypeError: SiameseDataset.__init__() missing 5 required positional arguments: 'token_type_ids1', 'input_ids2', 'attention_mask2', 'token_type_ids2', and 'labels'

In [1]:
import os
import random
import itertools
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, LongformerModel, LongformerConfig

2024-11-14 15:12:51.157884: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731589971.168238  495751 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731589971.171445  495751 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-14 15:12:51.183211: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

# Function to plot confusion matrix (remains the same)
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):

    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=25)
    # plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90, fontsize=15)
    plt.yticks(tick_marks, classes, fontsize=15)

    fmt = '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(
            j,
            i,
            format(cm[i, j], fmt),
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black",
            fontsize=14,
        )

    plt.ylabel('True label', fontsize=20)
    plt.xlabel('Predicted label', fontsize=20)


In [3]:
# Set random seeds for reproducibility
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


set_seed(33)

# Load and preprocess the dataset
df = pd.read_csv('FinalDatasetBalanced.csv')
df['plagiarism_type'], uniques = pd.factorize(df['plagiarism_type'])
map_label = dict(enumerate(uniques))

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    df[['source_content', 'suspicious_content']],
    df['plagiarism_type'].values,
    random_state=33,
    test_size=0.3,
)

# Create DataFrames for train and test
train_df = X_train.copy()
train_df['plagiarism_type'] = y_train

test_df = X_test.copy()
test_df['plagiarism_type'] = y_test

# Define constants
MAX_SEQUENCE_LENGTH = 16000  # Adjust based on your GPU memory
# MODEL_NAME = "longformer-encdec-large-16384"
MODEL_NAME = "allenai/longformer-base-4096"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [4]:
df["plagiarism_type"].value_counts()

plagiarism_type
1    835
2    702
0    700
3    195
Name: count, dtype: int64

In [5]:
# Define the Dataset class
class PlagiarismDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.source_texts = df['source_content'].values
        self.suspicious_texts = df['suspicious_content'].values
        self.labels = df['plagiarism_type'].values
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        source_text = str(self.source_texts[idx])
        suspicious_text = str(self.suspicious_texts[idx])
        label = self.labels[idx]

        # Tokenize source_text
        encoding1 = self.tokenizer.encode_plus(
            source_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        # Tokenize suspicious_text
        encoding2 = self.tokenizer.encode_plus(
            suspicious_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        item = {
            'input_ids1': encoding1['input_ids'].squeeze(),
            'attention_mask1': encoding1['attention_mask'].squeeze(),
            'input_ids2': encoding2['input_ids'].squeeze(),
            'attention_mask2': encoding2['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long),
        }
        return item


In [6]:
# Create datasets
train_dataset = PlagiarismDataset(
    train_df.reset_index(drop=True), tokenizer, MAX_SEQUENCE_LENGTH
)
test_dataset = PlagiarismDataset(
    test_df.reset_index(drop=True), tokenizer, MAX_SEQUENCE_LENGTH
)

# Create DataLoaders
batch_size = 6
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [7]:
# Define the model
class SiameseLongformer(nn.Module):
    def __init__(self, model_name, num_labels):
        super(SiameseLongformer, self).__init__()
        self.config = LongformerConfig.from_pretrained(model_name)
        # self.config.output_hidden_states = False
        self.config.max_position_embeddings = MAX_SEQUENCE_LENGTH
        self.config.attention_window = [512] * self.config.num_hidden_layers
        
        self.longformer = LongformerModel.from_pretrained(
            model_name, config=self.config
        )
        self.dropout = nn.Dropout(0.2)
        self.dense = nn.Linear(2 * self.config.hidden_size, 64)
        self.classifier = nn.Linear(64, num_labels)

    def forward(
        self, input_ids1, attention_mask1, input_ids2, attention_mask2
    ):
        outputs1 = self.longformer(
            input_ids=input_ids1, attention_mask=attention_mask1
        )
        outputs2 = self.longformer(
            input_ids=input_ids2, attention_mask=attention_mask2
        )
        # Take the mean over the sequence length
        x1 = torch.mean(outputs1.last_hidden_state, dim=1)
        x2 = torch.mean(outputs2.last_hidden_state, dim=1)
        x = torch.cat((x1, x2), dim=1)  # Concatenate along the feature dimension
        x = torch.relu(self.dense(x))
        x = self.dropout(x)
        logits = self.classifier(x)
        return logits


In [8]:
map_label

{0: 'No Plagiarism',
 1: 'Artificial Obfuscation',
 2: 'No Obfuscation',
 3: 'Simulated Obfuscation'}

In [9]:
df

Unnamed: 0,source_content,suspicious_content,plagiarism_type
0,بين عالم يعمل على أن يجعل الحياة أكثر رفاهية، ...,إن الاختلاج هو الحالة التي يحدث فيها تفريغ فجا...,0
1,لماذا نحتاج لطرح صيغة جديدة من صيغ الإجابة على...,اً كانوا يؤدونه للنبي صلى الله عليه وسلم لقاتل...,0
2,* اختارتني الأمم المتحدة سفيرا لانحيازي للبسطا...,"المخرجة الفلسطينية ""مي المصري"" في مجمل أعمالها...",0
3,"الحياة محطات.\nوحياتي تزخر بمحطات كثيرة, في ال...","وة إلى ""أمة الإسلام""، وكان في دعوته يميل إلى ا...",0
4,لمحات من سيرة عذبة لواحد من عشاق التراث العربي...,ن الإسلام.. يقدمه - عن فهم وعقيدة - على أنه نظ...,0
...,...,...,...
2427,أواجه على هذه الصفحات واحدة من أصعب المهام: أو...,خير لسان للعروبة والإسلام\nذكرى العقاد مازالت ...,3
2428,"كنا جميعا نتوجه إلى مكة, ولكنها لا تتبدى لنا ب...",منذ أكثر من عامين قامت الولايات المتحدة بإطلاق...,3
2429,عاشت فرنسا منذ نهاية الصيف حملة كبيرة وتجاذبا ...,الحديث عن الأسرة هو حديث عن شبكة من العلاقات و...,3
2430,في القرن التاسع عشر بدأت في الولايات المتحدة ا...,هكذا صرخ كريستوفر كولمبس مكتشف أمريكا الشهير و...,3


In [10]:
# Instantiate the model
num_labels = len(set(df['plagiarism_type']))
model = SiameseLongformer(MODEL_NAME, num_labels)

# Move model to device
device = torch.device('cpu')
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.to(device)

# Define optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Training loop
num_epochs = 3
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        optimizer.zero_grad()
        input_ids1 = batch['input_ids1'].to(device)
        attention_mask1 = batch['attention_mask1'].to(device)
        input_ids2 = batch['input_ids2'].to(device)
        attention_mask2 = batch['attention_mask2'].to(device)
        labels = batch['labels'].to(device)

        logits = model(
            input_ids1, attention_mask1, input_ids2, attention_mask2
        )

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch+1}, Training loss: {avg_train_loss}")

    # Validation
    model.eval()
    total_eval_loss = 0
    preds = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f"Validation Epoch {epoch+1}"):
            input_ids1 = batch['input_ids1'].to(device)
            attention_mask1 = batch['attention_mask1'].to(device)
            input_ids2 = batch['input_ids2'].to(device)
            attention_mask2 = batch['attention_mask2'].to(device)
            labels = batch['labels'].to(device)

            logits = model(
                input_ids1, attention_mask1, input_ids2, attention_mask2
            )
            loss = criterion(logits, labels)
            total_eval_loss += loss.item()

            pred = torch.argmax(logits, dim=1)
            preds.extend(pred.detach().cpu().numpy())
            true_labels.extend(labels.detach().cpu().numpy())
    avg_val_loss = total_eval_loss / len(test_loader)
    val_losses.append(avg_val_loss)
    print(f"Epoch {epoch+1}, Validation loss: {avg_val_loss}")

    # Compute classification metrics
    print(
        classification_report(
            [map_label[i] for i in true_labels],
            [map_label[i] for i in preds],
        )
    )

# Plot training history
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Compute confusion matrix
cnf_matrix = confusion_matrix(
    [map_label[i] for i in true_labels], [map_label[i] for i in preds]
)

plt.figure(figsize=(7, 7))
plot_confusion_matrix(cnf_matrix, classes=list(map_label.values()))
plt.show()


RuntimeError: Error(s) in loading state_dict for LongformerModel:
	size mismatch for longformer.embeddings.position_embeddings.weight: copying a param with shape torch.Size([4098, 768]) from checkpoint, the shape in current model is torch.Size([16000, 768]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

model.safetensors:   0%|          | 0.00/597M [00:00<?, ?B/s]