In [None]:
from tensorflow.keras.layers import MultiHeadAttention, Input, Dense
from tensorflow.keras.layers import LayerNormalization, Layer
from tensorflow.keras.layers import TextVectorization, Embedding, GlobalAveragePooling1D
from tensorflow.data import Dataset
from tensorflow import convert_to_tensor, string, float32, shape, range, reshape
from tensorflow.keras import utils
from tensorflow.keras import Model, Sequential
from sklearn import metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import pandas as pd
# Read the combined dataset and store the `Is_Equal` target as integers.
df = pd.read_csv('/content/combined_df.csv')
df = df.assign(Is_Equal=df['Is_Equal'].astype(int))
# NOTE(review): an earlier revision dropped an 'Unnamed: 0' column here; that
# step is disabled, so such a column (if the CSV has one) stays in `df`.
df.shape

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding, MultiHeadAttention, Dense, Flatten, Dropout, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import LearningRateScheduler
import numpy as np
from tensorflow.keras.callbacks import ReduceLROnPlateau


# Seed TensorFlow and NumPy with the same value so runs start from the same RNG state.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)

# Store the target as integer class ids (0 / 1).
df['Is_Equal'] = df['Is_Equal'].astype(int)

# Positional split: the first 400 rows are used for training, the remainder for testing.
train_dataset, test_dataset = df.iloc[:400], df.iloc[400:]

# One-hot encode the labels into two columns, one per class.
train_Y, test_Y = (
    utils.to_categorical(split['Is_Equal'], num_classes=2)
    for split in (train_dataset, test_dataset)
)

# Both vectorizers share one configuration: a vocabulary capped at 300 tokens
# and every sample padded/truncated to 150 token ids.
_vectorizer_config = dict(max_tokens=300, output_sequence_length=150)
original_code_vectorizer = TextVectorization(**_vectorizer_config)
java_code_vectorizer = TextVectorization(**_vectorizer_config)

# Learn each vocabulary from the training rows only.
original_code_vectorizer.adapt(train_dataset['original_code'])
java_code_vectorizer.adapt(train_dataset['java_translation'])

# Turn the raw source text of both splits into integer token sequences.
train_X_original, test_X_original = (
    original_code_vectorizer(split['original_code'])
    for split in (train_dataset, test_dataset)
)
train_X_java, test_X_java = (
    java_code_vectorizer(split['java_translation'])
    for split in (train_dataset, test_dataset)
)



def _build_embedding(vectorizer):
    """Return a 32-dimensional, L2-regularised Embedding sized to `vectorizer`'s vocabulary."""
    return Embedding(
        input_dim=vectorizer.vocabulary_size(),
        output_dim=32,
        embeddings_regularizer=tf.keras.regularizers.l2(0.0001),
    )


# One embedding table per language.
original_code_embedding = _build_embedding(original_code_vectorizer)
java_code_embedding = _build_embedding(java_code_vectorizer)

# NOTE(review): the embedding layers are applied here, outside the Keras model,
# and only their output tensors are handed to `model.fit`. The layers are not
# part of the model built by `create_model`, so their weights are not updated
# during training and the regulariser configured above does not contribute to
# the training loss.
train_X_original_embedded = original_code_embedding(train_X_original)
train_X_java_embedded = java_code_embedding(train_X_java)
test_X_original_embedded = original_code_embedding(test_X_original)
test_X_java_embedded = java_code_embedding(test_X_java)

# Join the two embedded sequences along axis 1, original code first, then the Java translation.
train_X_embedded = tf.concat(
    values=[train_X_original_embedded, train_X_java_embedded],
    axis=1,
)
test_X_embedded = tf.concat(
    values=[test_X_original_embedded, test_X_java_embedded],
    axis=1,
)


# Model architecture
def create_model(input_shape, num_heads=2, key_dim=32, dense_units=64,
                 dropout_rate=0.2, l2_factor=0.0001, num_classes=2):
    """Build the (uncompiled) self-attention classifier.

    The network applies multi-head self-attention to the input sequence, adds a
    residual connection followed by layer normalisation, flattens the result and
    passes it through two identical Dense -> LayerNormalization -> Dropout
    blocks before a softmax output layer.

    Args:
        input_shape: Shape of a single sample, without the batch dimension;
            forwarded to `Input(shape=...)`.
        num_heads: Number of attention heads.
        key_dim: Size of each attention head for query and key.
        dense_units: Width of each of the two hidden Dense layers.
        dropout_rate: Dropout rate applied after each hidden block.
        l2_factor: L2 regularisation factor for the hidden Dense kernels.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A `Model` mapping `inputs` to the softmax class probabilities. With the
        default arguments the architecture is the same as the previously
        hard-coded one.
    """
    inputs = Input(shape=input_shape)

    # Self-attention: the sequence attends to itself (query == value).
    attention = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(inputs, inputs)
    # Residual connection around the attention output, then layer normalisation.
    x = LayerNormalization()(attention + inputs)
    x = Flatten()(x)

    # First hidden block.
    x = Dense(dense_units, activation='relu',
              kernel_regularizer=tf.keras.regularizers.l2(l2_factor))(x)
    x = LayerNormalization()(x)
    x = Dropout(dropout_rate)(x)

    # Second hidden block (same configuration).
    x = Dense(dense_units, activation='relu',
              kernel_regularizer=tf.keras.regularizers.l2(l2_factor))(x)
    x = LayerNormalization()(x)
    x = Dropout(dropout_rate)(x)

    outputs = Dense(num_classes, activation='softmax')(x)

    return Model(inputs=inputs, outputs=outputs)

# Build the network for one embedded sample (batch dimension stripped).
model = create_model(input_shape=train_X_embedded.shape[1:])

# NOTE(review): the targets are one-hot over two classes and the output is a
# 2-unit softmax; 'categorical_crossentropy' is the conventional loss for that
# setup. Confirm that 'binary_crossentropy' is intentional.
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()],
)



# Multiply the learning rate by 0.7 whenever validation accuracy has not
# improved for 3 consecutive epochs, never going below 1e-6.
# The keyword was previously misspelled as `moniter`; it was not recognised as
# the `monitor` argument, so the callback fell back to its default monitored
# quantity instead of 'val_accuracy'.
reduce_lr = ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.7,
    patience=3,
    min_lr=1e-6
)

# Fit on the pre-embedded training tensors, holding out 10% of them for
# validation and letting `reduce_lr` adjust the learning rate along the way.
fit_options = dict(
    epochs=30,
    batch_size=64,
    validation_split=0.1,
    callbacks=[reduce_lr],
    shuffle=True,
)
history = model.fit(train_X_embedded, train_Y, **fit_options)

In [None]:
# Plot the confusion matrix for the test split.
y_pred = model.predict(test_X_embedded)

# Collapse one-hot targets and predicted probabilities to class indices.
true_classes = np.argmax(test_Y, axis=1)
predicted_classes = np.argmax(y_pred, axis=1)

cm = confusion_matrix(true_classes, predicted_classes)
disp = ConfusionMatrixDisplay(cm, display_labels=['Not Equal', 'Equal'])
disp.plot(cmap=plt.cm.Blues)


In [None]:
import pickle

# Serialise the trained model. The file is opened in a `with` block so the
# handle is flushed and closed even if pickling raises.
# NOTE(review): Keras also provides `model.save(...)` as its native format;
# pickle is kept here so the output file name and format stay unchanged.
with open('Transformermodel.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Persist each fitted vectorizer to its own file. 'java_vectorizer.pkl' was
# previously written from `original_code_vectorizer`, so both files held the
# same object; it now stores `java_code_vectorizer`.
# `with` blocks make sure each file handle is closed after writing.
with open('py_vectorizer.pkl', 'wb') as py_vectorizer_file:
    pickle.dump(original_code_vectorizer, py_vectorizer_file)
with open('java_vectorizer.pkl', 'wb') as java_vectorizer_file:
    pickle.dump(java_code_vectorizer, java_vectorizer_file)