In [1]:
import pandas as pd
import pickle as pk

In [2]:
!pip install transformers



In [4]:
import os
import pandas as pd
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel

df = pd.read_csv('/kaggle/input/combined-df-csv/combined_df.csv')

tokenizer = RobertaTokenizer.from_pretrained('/kaggle/input/codesynapsedataset/new_codebert_tokenizer/new_codebert_tokenizer')
model = TFRobertaModel.from_pretrained('/kaggle/input/codesynapsedataset/new_codebert_model/new_codebert_model')

def generate_embeddings(code_list):
    embeddings = []
    for snippet in code_list:
        inputs = tokenizer(snippet, return_tensors="tf", padding="max_length", truncation=True, max_length=512)
        outputs = model(**inputs)
        embedding = tf.reduce_mean(outputs.last_hidden_state, axis=1)
        embeddings.append(embedding.numpy())  
    return embeddings  

if 'original_code' not in df.columns or 'java_translation' not in df.columns:
    raise ValueError("Ensure 'original_code' and 'java_translation' columns exist in the dataset!")

original_code_embeddings = generate_embeddings(df['original_code'].tolist())
java_code_embeddings = generate_embeddings(df['java_translation'].tolist())

df['original_code_embeddings'] = [embedding.flatten().tolist() for embedding in original_code_embeddings]
df['java_code_embeddings'] = [embedding.flatten().tolist() for embedding in java_code_embeddings]

embedding_size = len(original_code_embeddings[0]) 
for i in range(embedding_size):
    df[f'original_code_emb_{i}'] = [embedding[i] for embedding in original_code_embeddings]
    df[f'java_code_emb_{i}'] = [embedding[i] for embedding in java_code_embeddings]

df.drop(columns=['original_code_embeddings', 'java_code_embeddings'], inplace=True)

output_directory = "./data" 

os.makedirs(output_directory, exist_ok=True)

csv_path = os.path.join(output_directory, "code_embeddings_flat1.csv")
pkl_path = os.path.join(output_directory, "code_embeddings_flat1.pkl")

df.to_csv(csv_path, index=False) 
df.to_pickle(pkl_path)  

print(f"Embeddings generated, flattened, and saved to {csv_path} and {pkl_path}!") 

All PyTorch model weights were used when initializing TFRobertaModel.

All the weights of TFRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Embeddings generated, flattened, and saved to ./data/code_embeddings_flat1.csv and ./data/code_embeddings_flat1.pkl!


In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Bidirectional
import re

def fix_array_format(array_str):
    array_str = re.sub(r'(?<=\d)\s+(?=\d)', ', ', array_str)
    return np.array(eval(array_str))

file_path = './data/code_embeddings_flat1.csv'
df = pd.read_csv(file_path)

original_embeddings = df['original_code_emb_0'].apply(fix_array_format)
java_embeddings = df['java_code_emb_0'].apply(fix_array_format)
labels = df['Is_Equal'].values

max_seq_length = max(original_embeddings.apply(len).max(), java_embeddings.apply(len).max())
original_embeddings = pad_sequences(original_embeddings, maxlen=max_seq_length, dtype='float32', padding='post', truncating='post')
java_embeddings = pad_sequences(java_embeddings, maxlen=max_seq_length, dtype='float32', padding='post', truncating='post')

label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

original_embeddings = original_embeddings.reshape(original_embeddings.shape[0], original_embeddings.shape[1], 1)
java_embeddings = java_embeddings.reshape(java_embeddings.shape[0], java_embeddings.shape[1], 1)

X_train_orig, X_test_orig, X_train_java, X_test_java, y_train, y_test = train_test_split(
    original_embeddings, java_embeddings, labels, test_size=0.2, random_state=42
)


lstm_layer = Bidirectional(LSTM(128, return_sequences=False, dropout=0.3, recurrent_dropout=0.3), name='shared_bidirectional_lstm')

input_original = Input(shape=(original_embeddings.shape[1], 1), name='original_input')
input_java = Input(shape=(java_embeddings.shape[1], 1), name='java_input')

encoded_original = lstm_layer(input_original)
encoded_java = lstm_layer(input_java)

combined = Concatenate()([encoded_original, encoded_java])

dense = Dense(64, activation='relu')(combined)
dropout = Dropout(0.3)(dense)
output = Dense(1, activation='sigmoid')(dropout)

model = Model(inputs=[input_original, input_java], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    [X_train_orig, X_train_java], y_train,
    validation_data=([X_test_orig, X_test_java], y_test),
    epochs=30,
    batch_size=64,
    verbose=2,
    callbacks=[early_stopping]
)

train_loss, train_accuracy = model.evaluate([X_train_orig, X_train_java], y_train, verbose=2)
test_loss, test_accuracy = model.evaluate([X_test_orig, X_test_java], y_test, verbose=2)
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

model.save('lstm_model.h5')

python_embedding_model = Model(inputs=input_original, outputs=encoded_original)
python_refined_embeddings = python_embedding_model.predict(original_embeddings)

java_embedding_model = Model(inputs=input_java, outputs=encoded_java)
java_refined_embeddings = java_embedding_model.predict(java_embeddings)

refined_embeddings_df = pd.DataFrame({
    'Python_Refined': list(python_refined_embeddings),
    'Java_Refined': list(java_refined_embeddings),
    'Is_Equal': labels
})

refined_embeddings_df.to_csv('refined_embeddings_with_labels.csv', index=False)

Epoch 1/30
6/6 - 12s - 2s/step - accuracy: 0.4714 - loss: 0.6934 - val_accuracy: 0.5208 - val_loss: 0.6929
Epoch 2/30
6/6 - 5s - 846ms/step - accuracy: 0.5026 - loss: 0.6933 - val_accuracy: 0.5208 - val_loss: 0.6925
Epoch 3/30
6/6 - 5s - 836ms/step - accuracy: 0.5208 - loss: 0.6920 - val_accuracy: 0.5208 - val_loss: 0.6925
Epoch 4/30
6/6 - 5s - 840ms/step - accuracy: 0.5286 - loss: 0.6918 - val_accuracy: 0.5208 - val_loss: 0.6925
Epoch 5/30
6/6 - 5s - 830ms/step - accuracy: 0.5078 - loss: 0.6944 - val_accuracy: 0.5208 - val_loss: 0.6929
Epoch 6/30
6/6 - 5s - 871ms/step - accuracy: 0.5286 - loss: 0.6915 - val_accuracy: 0.5208 - val_loss: 0.6926
Epoch 7/30
6/6 - 5s - 844ms/step - accuracy: 0.5234 - loss: 0.6915 - val_accuracy: 0.5208 - val_loss: 0.6924
Epoch 8/30
6/6 - 5s - 840ms/step - accuracy: 0.5208 - loss: 0.6917 - val_accuracy: 0.5208 - val_loss: 0.6925
Epoch 9/30
6/6 - 6s - 927ms/step - accuracy: 0.5078 - loss: 0.6929 - val_accuracy: 0.5208 - val_loss: 0.6925
Epoch 10/30
6/6 - 5s 

In [7]:
import pandas as pd
import numpy as np
import ast
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

lstm_output_file = 'refined_embeddings_with_labels.csv'
lstm_embeddings_df = pd.read_csv(lstm_output_file)

def parse_embeddings(embedding_str):
    try:
        return np.array(ast.literal_eval(embedding_str))
    except (ValueError, SyntaxError):
        return np.array([])

py_embeddings = lstm_embeddings_df['Python_Refined'].apply(parse_embeddings)
java_embeddings = lstm_embeddings_df['Java_Refined'].apply(parse_embeddings)

embedding_length = py_embeddings.iloc[0].shape[0] 
py_embeddings = np.array([emb if emb.shape[0] == embedding_length else np.zeros(embedding_length) for emb in py_embeddings])
java_embeddings = np.array([emb if emb.shape[0] == embedding_length else np.zeros(embedding_length) for emb in java_embeddings])

def create_simplified_siamese_network(input_dim):
    input_a = Input(shape=(input_dim,))
    input_b = Input(shape=(input_dim,))
    
    l1_distance = Lambda(lambda tensors: tf.abs(tensors[0] - tensors[1]))
    l1_dist = l1_distance([input_a, input_b])
    
    output = Dense(1, activation='sigmoid')(l1_dist)

    siamese_model = Model(inputs=[input_a, input_b], outputs=output)
    siamese_model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return siamese_model

input_dim = py_embeddings.shape[1]
siamese_model = create_simplified_siamese_network(input_dim)

similarity_scores = siamese_model.predict([py_embeddings, java_embeddings], verbose=0).flatten()


similarity_df = pd.DataFrame({
    'Python_Refined': lstm_embeddings_df['Python_Refined'],
    'Java_Refined': lstm_embeddings_df['Java_Refined'],
    'Is_Equal': lstm_embeddings_df['Is_Equal'],
    'Similarity_Score': similarity_scores
})
similarity_df.to_csv('siamese_similarity_scores.csv', index=False)

print("Similarity calculations completed. Results saved to siamese_similarity_scores.csv")


Similarity calculations completed. Results saved to siamese_similarity_scores.csv


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

df = pd.read_csv('siamese_similarity_scores.csv')

features = df['Similarity_Score'].values.reshape(-1, 1)

labels = df['Is_Equal'].values

print(f"Features shape: {features.shape}")  
print(f"Labels shape: {labels.shape}")      

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)

def build_fcnn():
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')  # Binary classification output
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

fcnn_model = build_fcnn()

epochs = 30
batch_size = 16

history = fcnn_model.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    verbose=1
)

y_pred = fcnn_model.predict(X_test).flatten()
y_pred_binary = (y_pred > 0.5).astype(int)  

test_accuracy = accuracy_score(y_test, y_pred_binary)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

y_train_pred = fcnn_model.predict(X_train).flatten()
y_train_pred_binary = (y_train_pred > 0.5).astype(int)

train_accuracy = accuracy_score(y_train, y_train_pred_binary)
print(f"Train Accuracy: {train_accuracy * 100:.2f}%")

fcnn_model.save('fcnn_model.h5')

predictions_df = pd.DataFrame({
    'True_Label': y_test,
    'Predicted_Probability': y_pred,
    'Predicted_Label': y_pred_binary
})
predictions_df.to_csv('fcnn_predictions.csv', index=False)

print("FCNN model and predictions saved.")


Features shape: (480, 1)
Labels shape: (480,)
Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 163ms/step - accuracy: 0.5814 - loss: 0.6841 - val_accuracy: 0.4559 - val_loss: 0.7058
Epoch 2/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5420 - loss: 0.6924 - val_accuracy: 0.4559 - val_loss: 0.7017
Epoch 3/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5319 - loss: 0.6915 - val_accuracy: 0.4559 - val_loss: 0.7014
Epoch 4/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5564 - loss: 0.6903 - val_accuracy: 0.4559 - val_loss: 0.7020
Epoch 5/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5385 - loss: 0.6911 - val_accuracy: 0.4559 - val_loss: 0.7013
Epoch 6/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5283 - loss: 0.6978 - val_accuracy: 0.4559 - val_loss: 0.7028
Epoch 7/30
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━