In [1]:
import os
import random
import itertools
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, LongformerConfig, LongformerModel



2024-11-14 15:06:33.992359: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731589594.002527  494758 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731589594.005686  494758 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-14 15:06:34.016966: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# get index of currently selected device
torch.cuda.current_device() # returns 0 in my case


0

In [3]:
# get number of GPUs available
torch.cuda.device_count()

1

In [4]:
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):

    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=25)
    #plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90, fontsize=15)
    plt.yticks(tick_marks, classes, fontsize=15)

    fmt = '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black", fontsize = 14)

    plt.ylabel('True label', fontsize=20)
    plt.xlabel('Predicted label', fontsize=20)


In [5]:

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

df = pd.read_csv('FinalDatasetBalanced.csv')
df['plagiarism_type'], uniques = pd.factorize(df['plagiarism_type'])
map_label = dict(enumerate(uniques))



In [6]:
def convert_to_transformer_inputs(str1, str2, tokenizer, max_sequence_length, double=True):
    def return_id(str1, str2, length):
        inputs = tokenizer.encode_plus(
            str1, str2,
            add_special_tokens=True,
            max_length=length,
            truncation=True,
            padding='max_length',  # Let the tokenizer handle padding
            return_token_type_ids=True,
            return_attention_mask=True
        )
        input_ids = inputs["input_ids"]
        input_masks = inputs["attention_mask"]
        input_segments = inputs["token_type_ids"]
        return [input_ids, input_masks, input_segments]

    if double:
        input_ids_1, input_masks_1, input_segments_1 = return_id(str1, None, max_sequence_length)
        input_ids_2, input_masks_2, input_segments_2 = return_id(str2, None, max_sequence_length)

        return [input_ids_1, input_masks_1, input_segments_1,
                input_ids_2, input_masks_2, input_segments_2]
    else:
        input_ids, input_masks, input_segments = return_id(str1, str2, max_sequence_length)

        return [input_ids, input_masks, input_segments, None, None, None]



In [7]:
def compute_input_arrays(df, columns, tokenizer, max_sequence_length, double=True):
    input_ids_1, input_masks_1, input_segments_1 = [], [], []
    input_ids_2, input_masks_2, input_segments_2 = [], [], []

    for _, instance in df[columns].iterrows():
        str1, str2 = instance[columns[0]], instance[columns[1]]
        ids_1, masks_1, segments_1, ids_2, masks_2, segments_2 = \
            convert_to_transformer_inputs(str1, str2, tokenizer, max_sequence_length, double=double)

        input_ids_1.append(ids_1)
        input_masks_1.append(masks_1)
        input_segments_1.append(segments_1)
        input_ids_2.append(ids_2)
        input_masks_2.append(masks_2)
        input_segments_2.append(segments_2)

    if double:
        return [np.asarray(input_ids_1, dtype=np.int32),
                np.asarray(input_masks_1, dtype=np.int32),
                np.asarray(input_segments_1, dtype=np.int32),
                np.asarray(input_ids_2, dtype=np.int32),
                np.asarray(input_masks_2, dtype=np.int32),
                np.asarray(input_segments_2, dtype=np.int32)]
    else:
        return [np.asarray(input_ids_1, dtype=np.int32),
                np.asarray(input_masks_1, dtype=np.int32),
                np.asarray(input_segments_1, dtype=np.int32)]



In [8]:
### TRAIN TEST SPLIT ###

X_train, X_test, y_train, y_test = train_test_split(df[['source_content','suspicious_content']], df['plagiarism_type'].values,
                                                    random_state=33, test_size = 0.3)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)



(1702, 2) (730, 2)
(1702,) (730,)


In [9]:
### IMPORT TOKENIZER ###

MAX_SEQUENCE_LENGTH = 16000
MODEL_NAME = "longformer-encdec-large-16384"  # Adjusted to a valid model name

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

### CREATE SEQUENCES (id, mask, segments) FOR TRAIN AND TEST ###

input_train = compute_input_arrays(X_train,['source_content','suspicious_content'], tokenizer, MAX_SEQUENCE_LENGTH)
input_test = compute_input_arrays(X_test, ['source_content','suspicious_content'], tokenizer, MAX_SEQUENCE_LENGTH)

class PlagiarismDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs[0])

    def __getitem__(self, idx):
        item = {
            'id1': torch.tensor(self.inputs[0][idx], dtype=torch.long),
            'mask1': torch.tensor(self.inputs[1][idx], dtype=torch.long),
            'atn1': torch.tensor(self.inputs[2][idx], dtype=torch.long),
            'id2': torch.tensor(self.inputs[3][idx], dtype=torch.long),
            'mask2': torch.tensor(self.inputs[4][idx], dtype=torch.long),
            'atn2': torch.tensor(self.inputs[5][idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
        return item



In [10]:
train_dataset = PlagiarismDataset(input_train, y_train)
test_dataset = PlagiarismDataset(input_test, y_test)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=2, shuffle=False)



In [None]:
# class DualLongformer(nn.Module):
#     def __init__(self, num_labels):
#         super(DualLongformer, self).__init__()
#         set_seed(33)
#         self.config = LongformerConfig.from_pretrained(MODEL_NAME,ignore_mismatched_sizes=True)
#         self.config.max_position_embeddings = MAX_SEQUENCE_LENGTH
#         self.config.attention_window = [512] * self.config.num_hidden_layers

#         self.longformer_model1 = LongformerModel.from_pretrained(MODEL_NAME, config=self.config,ignore_mismatched_sizes=True)
#         self.longformer_model2 = LongformerModel.from_pretrained(MODEL_NAME, config=self.config,ignore_mismatched_sizes=True)

#         self.dropout = nn.Dropout(0.2)
#         self.relu = nn.ReLU()
#         self.dense = nn.Linear(2 * self.config.hidden_size, 64)
#         self.classifier = nn.Linear(64, len(map_label))

#     def forward(self, id1, mask1, atn1, id2, mask2, atn2):
#         embedding1 = self.longformer_model1(input_ids=id1, attention_mask=mask1, token_type_ids=atn1)[0]
#         embedding2 = self.longformer_model2(input_ids=id2, attention_mask=mask2, token_type_ids=atn2)[0]

#         x1 = torch.mean(embedding1, dim=1)  # GlobalAveragePooling1D
#         x2 = torch.mean(embedding2, dim=1)  # GlobalAveragePooling1D

#         x = torch.cat((x1, x2), dim=1)
#         x = self.dense(x)
#         x = self.relu(x)
#         x = self.dropout(x)
#         logits = self.classifier(x)
#         return logits



In [13]:
# Define the model
class SiameseLongformer(nn.Module):
    def __init__(self, num_labels):
        super(SiameseLongformer, self).__init__()
        self.config = LongformerConfig.from_pretrained(MODEL_NAME)
        # self.config.output_hidden_states = False
        self.config.max_position_embeddings = MAX_SEQUENCE_LENGTH
        self.config.attention_window = [512] * self.config.num_hidden_layers
        
        self.longformer = LongformerModel.from_pretrained(
            MODEL_NAME, config=self.config
        )
        self.dropout = nn.Dropout(0.2)
        self.dense = nn.Linear(2 * self.config.hidden_size, 64)
        self.classifier = nn.Linear(64, num_labels)

    def forward(
        self, input_ids1, attention_mask1, input_ids2, attention_mask2
    ):
        outputs1 = self.longformer(
            input_ids=input_ids1, attention_mask=attention_mask1
        )
        outputs2 = self.longformer(
            input_ids=input_ids2, attention_mask=attention_mask2
        )
        # Take the mean over the sequence length
        x1 = torch.mean(outputs1.last_hidden_state, dim=1)
        x2 = torch.mean(outputs2.last_hidden_state, dim=1)
        x = torch.cat((x1, x2), dim=1)  # Concatenate along the feature dimension
        x = torch.relu(self.dense(x))
        x = self.dropout(x)
        logits = self.classifier(x)
        return logits


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SiameseLongformer(len(map_label))
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()



You are using a model of type bart to instantiate a model of type longformer. This is not supported for all configurations of models and can yield errors.
Some weights of LongformerModel were not initialized from the model checkpoint at longformer-encdec-large-16384 and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.key_global.bias', 'encoder.layer.0.attention.self.key_global.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.query_glob

In [15]:
from tqdm import tqdm

def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for batch in tqdm(loader, desc='Training'):
        optimizer.zero_grad()
        id1 = batch['id1'].to(device)
        mask1 = batch['mask1'].to(device)
        atn1 = batch['atn1'].to(device)
        id2 = batch['id2'].to(device)
        mask2 = batch['mask2'].to(device)
        atn2 = batch['atn2'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(id1, mask1, atn1, id2, mask2, atn2)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(loader)
    return avg_loss



In [16]:
def eval_epoch(model, loader, criterion):
    model.eval()
    total_loss = 0
    preds = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(loader, desc='Evaluating'):
            id1 = batch['id1'].to(device)
            mask1 = batch['mask1'].to(device)
            atn1 = batch['atn1'].to(device)
            id2 = batch['id2'].to(device)
            mask2 = batch['mask2'].to(device)
            atn2 = batch['atn2'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(id1, mask1, atn1, id2, mask2, atn2)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    avg_loss = total_loss / len(loader)
    return avg_loss, preds, true_labels



In [17]:
num_epochs = 3
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, val_preds, val_labels = eval_epoch(model, test_loader, criterion)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    print(f'Train Loss: {train_loss:.4f}')
    print(f'Val Loss: {val_loss:.4f}')

    # Save the model
    save_path = f'longformer_checkpoints/epoch_{epoch+1}'
    os.makedirs(save_path, exist_ok=True)
    model_to_save = model.module if hasattr(model, 'module') else model
    torch.save(model_to_save.state_dict(), os.path.join(save_path, 'pytorch_model.bin'))
    print(f'Saved model to {save_path}')



Epoch 1/3


Training:   0%|          | 0/851 [00:00<?, ?it/s]


TypeError: SiameseLongformer.forward() takes 5 positional arguments but 7 were given

In [None]:
# Plot training history
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

### PREDICT TEST ###

from sklearn.metrics import classification_report, confusion_matrix

pred_test = val_preds
true_test = val_labels

class_names = list(map_label.values())

true_class_names = [map_label[i] for i in true_test]
pred_class_names = [map_label[i] for i in pred_test]

print(classification_report(true_class_names, pred_class_names))

cnf_matrix = confusion_matrix(true_class_names, pred_class_names)

plt.figure(figsize=(7,7))
plot_confusion_matrix(cnf_matrix, classes=class_names)
plt.show()
