In [1]:
import os
os.environ["KERAS_BACKEND"] = "jax"
import keras


In [2]:
import os
import random
import itertools
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import keras
from keras.layers import Input, GlobalAveragePooling1D, Concatenate, Dropout, Dense
from keras.optimizers import Adam
from keras.callbacks import Callback

from transformers import (
    AutoTokenizer,
    TFLongformerModel,
    LongformerConfig,
)

2024-11-14 14:50:28.418535: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731588628.429055  489330 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731588628.432265  489330 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:


def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=25)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90, fontsize=15)
    plt.yticks(tick_marks, classes, fontsize=15)

    fmt = '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(
            j, i, format(cm[i, j], fmt),
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black", fontsize=14
        )

    plt.ylabel('True label', fontsize=20)
    plt.xlabel('Predicted label', fontsize=20)


In [4]:

# Load and preprocess data
df = pd.read_csv('FinalDatasetBalanced.csv')
df['plagiarism_type'] = df['plagiarism_type'].factorize()[0]
map_label = dict(enumerate(df['plagiarism_type'].factorize()[1]))


In [5]:

def set_seed(seed):
    # Use keras.utils.set_random_seed for setting the seed
    keras.utils.set_random_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    
def convert_to_transformer_inputs(str1, str2, tokenizer, max_sequence_length, double=True):
    def return_id(str1, str2, length):
        inputs = tokenizer.encode_plus(
            str1, str2,
            add_special_tokens=True,
            max_length=length,
            truncation=True,
            return_token_type_ids=True
        )
        input_ids = inputs["input_ids"]
        input_masks = inputs["attention_mask"]
        input_segments = inputs["token_type_ids"]
        
        padding_length = length - len(input_ids)
        padding_id = tokenizer.pad_token_id
        
        input_ids = input_ids + ([padding_id] * padding_length)
        input_masks = input_masks + ([0] * padding_length)
        input_segments = input_segments + ([0] * padding_length)
        
        return [input_ids, input_masks, input_segments]

    if double:
        input_ids_1, input_masks_1, input_segments_1 = return_id(str1, None, max_sequence_length)
        input_ids_2, input_masks_2, input_segments_2 = return_id(str2, None, max_sequence_length)

        return [
            input_ids_1, input_masks_1, input_segments_1,
            input_ids_2, input_masks_2, input_segments_2
        ]
    else:
        input_ids, input_masks, input_segments = return_id(str1, str2, max_sequence_length)
        return [input_ids, input_masks, input_segments, None, None, None]

def compute_input_arrays(df, columns, tokenizer, max_sequence_length, double=True):
    input_ids_1, input_masks_1, input_segments_1 = [], [], []
    input_ids_2, input_masks_2, input_segments_2 = [], [], []
    
    for _, instance in df[columns].iterrows():
        str1, str2 = instance[columns[0]], instance[columns[1]]
        ids_1, masks_1, segments_1, ids_2, masks_2, segments_2 = \
            convert_to_transformer_inputs(str1, str2, tokenizer, max_sequence_length, double=double)
        
        input_ids_1.append(ids_1)
        input_masks_1.append(masks_1)
        input_segments_1.append(segments_1)
        input_ids_2.append(ids_2)
        input_masks_2.append(masks_2)
        input_segments_2.append(segments_2)

    if double:
        return [
            np.asarray(input_ids_1, dtype=np.int32), 
            np.asarray(input_masks_1, dtype=np.int32), 
            np.asarray(input_segments_1, dtype=np.int32),
            np.asarray(input_ids_2, dtype=np.int32), 
            np.asarray(input_masks_2, dtype=np.int32), 
            np.asarray(input_segments_2, dtype=np.int32)
        ]
    else:
        return [
            np.asarray(input_ids_1, dtype=np.int32), 
            np.asarray(input_masks_1, dtype=np.int32), 
            np.asarray(input_segments_1, dtype=np.int32)
        ]


In [6]:

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df[['source_content', 'suspicious_content']], df['plagiarism_type'].values, 
    random_state=33, test_size=0.3
)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

# Import tokenizer
MAX_SEQUENCE_LENGTH = 16000
MODEL_NAME = "longformer-encdec-large-16384"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Create sequences for train and test
input_train = compute_input_arrays(
    X_train, ['source_content', 'suspicious_content'], tokenizer, MAX_SEQUENCE_LENGTH
)
input_test = compute_input_arrays(
    X_test, ['source_content', 'suspicious_content'], tokenizer, MAX_SEQUENCE_LENGTH
)


(1702, 2) (730, 2)
(1702,) (730,)


In [7]:
def dual_longformer():
    set_seed(33)
    opt = Adam(learning_rate=2e-5)
    
    config = LongformerConfig.from_pretrained(MODEL_NAME)
    config.max_position_embeddings = 16000
    config.attention_window = [256] * config.num_hidden_layers

    class DualLongformerModel(keras.Model):
        def __init__(self, config, model_name, num_labels):
            super(DualLongformerModel, self).__init__()
            self.longformer_model1 = TFLongformerModel.from_pretrained(
                model_name, config=config, from_pt=True
            )
            self.longformer_model2 = TFLongformerModel.from_pretrained(
                model_name, config=config, from_pt=True
            )
            self.global_pool = GlobalAveragePooling1D()
            self.concat = Concatenate()
            self.dense1 = Dense(64, activation='relu')
            self.dropout = Dropout(0.2)
            self.classifier = Dense(num_labels, activation='softmax')

        def call(self, inputs):
            id1, mask1, atn1, id2, mask2, atn2 = inputs
            outputs1 = self.longformer_model1(
                input_ids=id1,
                attention_mask=mask1,
                token_type_ids=atn1,
                training=False
            )
            outputs2 = self.longformer_model2(
                input_ids=id2,
                attention_mask=mask2,
                token_type_ids=atn2,
                training=False
            )
            x1 = self.global_pool(outputs1.last_hidden_state)
            x2 = self.global_pool(outputs2.last_hidden_state)
            x = self.concat([x1, x2])
            x = self.dense1(x)
            x = self.dropout(x)
            out = self.classifier(x)
            return out

    model = DualLongformerModel(config, MODEL_NAME, num_labels=len(map_label))
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=opt,
        run_eagerly=True,    # Add this line
        jit_compile=False    # Add this line
    )
    return model


In [8]:


# Adjusted the callback to save the model in .keras format
class SaveModelCallback(Callback): 
    def __init__(self, save_path='longformer_checkpoints'):
        super().__init__()
        self.save_path = save_path

    def on_epoch_end(self, epoch, logs=None):
        epoch_path = f'{self.save_path}/epoch_{epoch+1}'
        os.makedirs(epoch_path, exist_ok=True)

        # Save the model in the new .keras format
        self.model.save(f'{epoch_path}/model_{epoch+1}.keras') 
        print(f'\nSaved model to {epoch_path}')

In [9]:

model = dual_longformer()
history = model.fit(
    x=input_train,
    y=y_train,
    epochs=3,
    batch_size=2,
    validation_data=(input_test, y_test),
    verbose=1,
    callbacks=[SaveModelCallback()]
)

You are using a model of type bart to instantiate a model of type longformer. This is not supported for all configurations of models and can yield errors.
2024-11-14 14:51:08.700763: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
I0000 00:00:1731588668.700885  489330 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2538 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:2d:00.0, compute capability: 8.6
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFLongformerModel: ['model.decoder.layers.4.self_attn.q_proj.weight', 'model.encoder.layers.8.fc1.weight', 'model.decoder.layers.9.fc1.bias', 'model.decoder.layers.11.encoder_attn.k_proj.bias', 'model.encoder.layers.3.self_attn.longformer_self_attn.value.weight', 'model.decoder.layers.8.fc2.weight', 'model.enc

RuntimeError: Unable to automatically build the model. Please build it yourself before calling fit/evaluate/predict. A model is 'built' when its variables have been created and its `self.built` attribute is True. Usually, calling the model on a batch of data is the right way to build it.
Exception encountered:
'Exception encountered when calling layer 'tf_longformer_model' (type TFLongformerModel).

Data of type <class 'jax._src.interpreters.partial_eval.DynamicJaxprTracer'> is not allowed only (<class 'tensorflow.python.framework.tensor.Tensor'>, <class 'bool'>, <class 'int'>, <class 'transformers.utils.generic.ModelOutput'>, <class 'tuple'>, <class 'list'>, <class 'dict'>, <class 'numpy.ndarray'>) is accepted for attention_mask.

Call arguments received by layer 'tf_longformer_model' (type TFLongformerModel):
  • input_ids=Traced<ShapedArray(int32[2,16000])>with<DynamicJaxprTrace(level=1/0)>
  • attention_mask=Traced<ShapedArray(int32[2,16000])>with<DynamicJaxprTrace(level=1/0)>
  • head_mask=None
  • global_attention_mask=None
  • token_type_ids=Traced<ShapedArray(int32[2,16000])>with<DynamicJaxprTrace(level=1/0)>
  • position_ids=None
  • inputs_embeds=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • training=False'

In [None]:



# Plot training history
plt.plot(history.history['loss'], label='Training Loss')
if 'val_loss' in history.history:
    plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Predict test data
pred_test = np.argmax(model.predict(input_test), axis=1)

print(classification_report(
    [map_label[i] for i in y_test], 
    [map_label[i] for i in pred_test]
))

cnf_matrix = confusion_matrix(
    [map_label[i] for i in y_test], 
    [map_label[i] for i in pred_test]
)

plt.figure(figsize=(7, 7))
plot_confusion_matrix(cnf_matrix, classes=list(map_label.values()))
plt.show()

NameError: name 'history' is not defined