## 1. Import Library

In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize
import nltk
from sklearn.model_selection import train_test_split
import os

# Download the NLTK sentence-tokenizer data used by sent_tokenize.
# 'punkt' serves older NLTK releases; newer releases look up 'punkt_tab'
# instead, so fetch both to keep sent_tokenize working on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Seed TensorFlow and NumPy for reproducibility.
tf.random.set_seed(42)
np.random.seed(42)

## 2. Load Dataset

The datasets are expected to have the following structure:

Dataset Student_ChatGPT:

col 1 - `student`: essays written by students (covering 1 to 2 problem topics)

col 2 - `chatgpt`: ChatGPT answers on the same 1 to 2 problem topics as the student essays

---

Dataset Only_ChatGPT

col 1 - `chatgpt`: ChatGPT answers covering the material of one full year (2 semesters)

In [None]:
# Read the paired (student, ChatGPT) data and the ChatGPT-only data.
student_chatgpt = pd.read_csv("student_chatgpt.csv")
only_chatgpt = pd.read_csv("only_chatgpt.csv")

# For each dataset, report the row count followed by a two-row preview.
for heading, sample_title, frame in (
    ("Student-ChatGPT Dataset Info:", "\nSample of student_chatgpt data:", student_chatgpt),
    ("\nOnly-ChatGPT Dataset Info:", "\nSample of only_chatgpt data:", only_chatgpt),
):
    print(heading)
    print(f"Number of rows: {len(frame)}")
    print(sample_title)
    print(frame.head(2))

## 3. Preprocessing Data

In [None]:
# Function to preprocess text
def preprocess_text(text):
    """Lowercase *text* and split it into sentences.

    Args:
        text: A single document. Non-string values are converted with
            ``str()``; missing values (``None``, ``NaN``, ``pd.NA``) are
            treated as an empty document.

    Returns:
        A list of lowercase sentence strings produced by NLTK's
        ``sent_tokenize``; an empty list when the input is missing.
    """
    # pandas represents empty CSV cells as NaN. str(NaN) is the literal
    # "nan", which would otherwise be emitted as a bogus one-word sentence.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return []
    return sent_tokenize(str(text).lower())

# Student essays: flatten every essay into sentences, each labelled 0.
student_sentences = [
    sentence
    for essay in student_chatgpt['student']
    for sentence in preprocess_text(essay)
]
student_labels = [0] * len(student_sentences)

# ChatGPT responses from the student_chatgpt dataset, each labelled 1.
chatgpt_sentences_1 = [
    sentence
    for response in student_chatgpt['chatgpt']
    for sentence in preprocess_text(response)
]
chatgpt_labels_1 = [1] * len(chatgpt_sentences_1)

# ChatGPT responses from the only_chatgpt dataset, each labelled 1.
chatgpt_sentences_2 = [
    sentence
    for response in only_chatgpt['chatgpt']
    for sentence in preprocess_text(response)
]
chatgpt_labels_2 = [1] * len(chatgpt_sentences_2)

print(f"Number of student sentences: {len(student_sentences)}")
print(f"Number of ChatGPT sentences (from student_chatgpt): {len(chatgpt_sentences_1)}")
print(f"Number of ChatGPT sentences (from only_chatgpt): {len(chatgpt_sentences_2)}")

## 4. Inisialisasi BERT Tokenizer

In [None]:
# Load the IndoBERT tokenizer once; tokenize_text below relies on it.
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p2")


def tokenize_text(texts, max_length=512):
    """Encode *texts* with the module-level IndoBERT tokenizer.

    Every sequence is padded to, and truncated at, ``max_length`` tokens and
    the encoding is returned as TensorFlow tensors.
    """
    encode_options = dict(
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )
    return tokenizer(texts, **encode_options)

## 5. Membuat Model BERT Bi-Encoder

In [None]:
# Load the base IndoBERT model.
# NOTE(review): this same bert_model instance is also called when the student
# Bi-Encoder and the Cross-Encoder are built further down, so all three models
# appear to share one set of BERT weights; confirm that is intended.
bert_model = TFBertModel.from_pretrained("indobenchmark/indobert-base-p2")

# Architecture of the ChatGPT Bi-Encoder: two int32 inputs of length 512,
# matching the default max_length used by tokenize_text.
input_ids_chatgpt = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name="input_ids")
attention_mask_chatgpt = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name="attention_mask")

# Get the embeddings from BERT (first element of the model output)
bert_outputs_chatgpt = bert_model(input_ids_chatgpt, attention_mask=attention_mask_chatgpt)[0]
cls_token_chatgpt = bert_outputs_chatgpt[:, 0, :]  # Take the [CLS] token (sequence position 0)

# Additional layers for fine-tuning: 256 -> 128 -> 128 projection head
dense1_chatgpt = tf.keras.layers.Dense(256, activation="relu")(cls_token_chatgpt)
dropout1_chatgpt = tf.keras.layers.Dropout(0.1)(dense1_chatgpt)
dense2_chatgpt = tf.keras.layers.Dense(128, activation="relu")(dropout1_chatgpt)
dropout2_chatgpt = tf.keras.layers.Dropout(0.1)(dense2_chatgpt)
output_chatgpt = tf.keras.layers.Dense(128)(dropout2_chatgpt)

# Normalize the output to unit L2 norm so dot products are cosine similarities
normalized_output_chatgpt = tf.nn.l2_normalize(output_chatgpt, axis=1)

# Build the Bi-Encoder model for ChatGPT; it outputs one 128-d vector per input row
bi_encoder_chatgpt = tf.keras.Model(
    inputs=[input_ids_chatgpt, attention_mask_chatgpt],
    outputs=normalized_output_chatgpt,
    name="bi_encoder_chatgpt"
)

In [None]:
# Architecture of the student Bi-Encoder (same structure, different variables).
# NOTE(review): bert_model is the instance already used by bi_encoder_chatgpt,
# so fine-tuning one model appears to change the BERT weights of the other;
# confirm whether a separate TFBertModel was intended.
input_ids_student = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name="input_ids")
attention_mask_student = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name="attention_mask")

# First element of the BERT output, then the [CLS] position (index 0)
bert_outputs_student = bert_model(input_ids_student, attention_mask=attention_mask_student)[0]
cls_token_student = bert_outputs_student[:, 0, :]

# Projection head: 256 -> 128 -> 128 with dropout in between
dense1_student = tf.keras.layers.Dense(256, activation="relu")(cls_token_student)
dropout1_student = tf.keras.layers.Dropout(0.1)(dense1_student)
dense2_student = tf.keras.layers.Dense(128, activation="relu")(dropout1_student)
dropout2_student = tf.keras.layers.Dropout(0.1)(dense2_student)
output_student = tf.keras.layers.Dense(128)(dropout2_student)

# Unit-normalize the embedding
normalized_output_student = tf.nn.l2_normalize(output_student, axis=1)

bi_encoder_student = tf.keras.Model(
    inputs=[input_ids_student, attention_mask_student],
    outputs=normalized_output_student,
    name="bi_encoder_student"
)

# Single Adam instance that the compile() calls below pass to three models.
# NOTE(review): some Keras optimizer implementations bind to the variables of
# the first model they train; confirm reuse across models works on the
# installed version.
optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5)

## 6. Train Bi-Encoder

Problem resolved

In [None]:
# Contrastive loss implementation
def contrastive_loss(margin=1.0):
    """Build a contrastive loss over pair similarity scores.

    Args:
        margin: Distance below which dissimilar pairs are still penalized.

    Returns:
        A ``loss(y_true, y_pred)`` callable. ``y_true`` is 1 for similar
        pairs and 0 for dissimilar pairs; ``y_pred`` is the cosine similarity
        of each pair. The callable returns the mean loss over all elements.
    """
    def loss(y_true, y_pred):
        # Integer labels cannot be multiplied with float predictions, so
        # bring the targets to the prediction dtype first.
        y_true = tf.cast(y_true, y_pred.dtype)

        # Convert similarity to distance
        distance = 1 - y_pred

        # Similar pairs are pulled together: penalize any remaining distance.
        positive_loss = y_true * tf.square(distance)

        # Dissimilar pairs are pushed apart until they are `margin` away.
        negative_loss = (1 - y_true) * tf.square(tf.maximum(margin - distance, 0))

        return tf.reduce_mean(positive_loss + negative_loss)
    return loss

In [None]:
# Stage 1: train bi_encoder_chatgpt on the only_chatgpt sentences

# Prepare the training data
print("Mempersiapkan data training untuk ChatGPT Bi-Encoder...")

# Tokenize the ChatGPT sentences
chatgpt2_train_encodings = tokenize_text(chatgpt_sentences_2)

batch_size = 32
train_steps = len(chatgpt_sentences_2) // batch_size  # informational only; fit() derives its own steps


def _in_batch_pair_loss(margin=1.0):
    """Adapt contrastive_loss to a model that outputs one embedding per row.

    contrastive_loss expects pair similarities, while the Bi-Encoder outputs a
    (batch, 128) embedding matrix. The returned loss forms every pair inside
    the batch: the prediction is the embedding dot product, and a pair is a
    positive when both rows carry the same value in ``y_true``.
    """
    pair_loss = contrastive_loss(margin=margin)

    def loss(y_true, y_pred):
        ids = tf.reshape(y_true, (-1, 1))
        pair_targets = tf.cast(tf.equal(ids, tf.transpose(ids)), y_pred.dtype)
        # The model output is L2-normalized, so the dot product is the
        # cosine similarity.
        similarities = tf.matmul(y_pred, y_pred, transpose_b=True)
        return pair_loss(pair_targets, similarities)

    return loss


# Every sentence gets its own id, which reproduces the original target
# ("diagonal is 1, everything else 0") per batch. The previous N x N identity
# matrix could not be compared with the (batch, 128) model output and needed
# O(N^2) memory.
# NOTE(review): with unique ids the only positive pair is a sentence with
# itself, so training only pushes different sentences apart. Positive pairs
# from adjacent sentences, mentioned in an earlier comment, are not
# implemented; confirm the intended objective.
sentence_ids = tf.range(len(chatgpt_sentences_2))

# Compile the model with the in-batch contrastive loss
bi_encoder_chatgpt.compile(
    optimizer=optimizer,
    loss=_in_batch_pair_loss(margin=1.0)
)

# Training
print("Training Bi-Encoder ChatGPT...")
bi_encoder_chatgpt.fit(
    {
        "input_ids": chatgpt2_train_encodings['input_ids'],
        "attention_mask": chatgpt2_train_encodings['attention_mask']
    },
    sentence_ids,
    epochs=3,
    batch_size=batch_size,
    validation_split=0.2
)

In [None]:
# Stage 2: train bi_encoder_student on the student_chatgpt sentences

# Prepare the training data
print("Mempersiapkan data training untuk Student Bi-Encoder...")

# Tokenize the student and ChatGPT sentences
student_train_encodings = tokenize_text(student_sentences)
chatgpt1_train_encodings = tokenize_text(chatgpt_sentences_1)

# Stack both sources: student rows first, ChatGPT rows after them
combined_input_ids = tf.concat([
    student_train_encodings['input_ids'],
    chatgpt1_train_encodings['input_ids']
], axis=0)

combined_attention_mask = tf.concat([
    student_train_encodings['attention_mask'],
    chatgpt1_train_encodings['attention_mask']
], axis=0)

total_samples = len(student_sentences) + len(chatgpt_sentences_1)
student_size = len(student_sentences)
chatgpt_start = student_size
chatgpt_size = len(chatgpt_sentences_1)


def _in_batch_pair_loss(margin=1.0):
    """Adapt contrastive_loss to a model that outputs one embedding per row.

    contrastive_loss expects pair similarities, while the Bi-Encoder outputs a
    (batch, 128) embedding matrix. The returned loss forms every pair inside
    the batch: the prediction is the embedding dot product, and a pair is a
    positive when both rows carry the same value in ``y_true``.
    """
    pair_loss = contrastive_loss(margin=margin)

    def loss(y_true, y_pred):
        ids = tf.reshape(y_true, (-1, 1))
        pair_targets = tf.cast(tf.equal(ids, tf.transpose(ids)), y_pred.dtype)
        # The model output is L2-normalized, so the dot product is the
        # cosine similarity.
        similarities = tf.matmul(y_pred, y_pred, transpose_b=True)
        return pair_loss(pair_targets, similarities)

    return loss


# One source label per row (0 = student, 1 = ChatGPT), in the same order as
# the concatenated inputs. Two rows form a similar pair when they come from
# the same source. The previous total_samples x total_samples matrix only had
# its diagonal set, could not be compared with the (batch, 128) model output,
# and needed O(N^2) memory.
source_labels = tf.constant(student_labels + chatgpt_labels_1, dtype=tf.int32)

# NOTE(review): validation_split takes the last 20% of the rows, which are
# ChatGPT sentences in this ordering; consider shuffling the rows first.

# Compile the model. A fresh optimizer is used because an optimizer instance
# that has already trained another model may be bound to that model's
# variables.
bi_encoder_student.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=_in_batch_pair_loss(margin=1.0)
)

# Training
print("Training Bi-Encoder Student...")
bi_encoder_student.fit(
    {
        "input_ids": combined_input_ids,
        "attention_mask": combined_attention_mask
    },
    source_labels,
    epochs=3,
    batch_size=32,
    validation_split=0.2
)

In [None]:
# Report how many sentences were passed to each Bi-Encoder's fit() call.
chatgpt_encoder_size = len(chatgpt_sentences_2)
student_encoder_size = len(student_sentences) + len(chatgpt_sentences_1)

print("\nTraining Summary:")
print(f"ChatGPT Bi-Encoder training data size: {chatgpt_encoder_size} sentences")
print(f"Student Bi-Encoder training data size: {student_encoder_size} sentences")

## 8. Evaluasi Bi-Encoder

In [None]:
# Evaluation samples: the last 100 sentences of each sentence list.
# NOTE(review): these are slices of the same lists that were passed to fit()
# above, so they are not a separately held-out test set.
test_student = student_sentences[-100:]
test_chatgpt_1 = chatgpt_sentences_1[-100:]  # from the student_chatgpt dataset
test_chatgpt_2 = chatgpt_sentences_2[-100:]  # from the only_chatgpt dataset

# Tokenize the three samples (student, ChatGPT 1, ChatGPT 2, in that order).
test_student_encodings, test_chatgpt1_encodings, test_chatgpt2_encodings = (
    tokenize_text(sample) for sample in (test_student, test_chatgpt_1, test_chatgpt_2)
)

# Embed the samples: both student_chatgpt samples go through the student
# Bi-Encoder, the only_chatgpt sample through the ChatGPT Bi-Encoder.
print("Generating test embeddings...")
model_input_keys = ("input_ids", "attention_mask")

test_student_embeddings = bi_encoder_student.predict(
    {key: test_student_encodings[key] for key in model_input_keys}
)

test_chatgpt1_embeddings = bi_encoder_student.predict(
    {key: test_chatgpt1_encodings[key] for key in model_input_keys}
)

test_chatgpt2_embeddings = bi_encoder_chatgpt.predict(
    {key: test_chatgpt2_encodings[key] for key in model_input_keys}
)

# Compute similarity scores
def calculate_similarity_metrics(embeddings1, embeddings2):
    """Summarize the pairwise cosine similarity between two embedding sets.

    Both inputs are L2-normalized along axis 1, every row of ``embeddings1``
    is compared with every row of ``embeddings2``, and the resulting
    similarity matrix is reduced to three numbers.

    Returns:
        A ``(mean, min, max)`` tuple of NumPy values.
    """
    unit_rows_a, unit_rows_b = (
        tf.nn.l2_normalize(rows, axis=1) for rows in (embeddings1, embeddings2)
    )

    # Entry (i, j) is the cosine similarity of row i (set 1) and row j (set 2).
    pairwise = tf.matmul(unit_rows_a, unit_rows_b, transpose_b=True)

    summary = (
        tf.reduce_mean(pairwise),
        tf.reduce_min(pairwise),
        tf.reduce_max(pairwise),
    )
    return tuple(value.numpy() for value in summary)

# Compare the embedding groups and report mean / min / max cosine similarity.
print("\nEvaluasi Similarity Scores:")

similarity_reports = (
    # Student vs student: first 50 test embeddings against the remaining ones
    ("\nStudent vs Student Similarity:",
     test_student_embeddings[:50], test_student_embeddings[50:]),
    # Student vs ChatGPT, both taken from the student_chatgpt dataset
    ("\nStudent vs ChatGPT (student_chatgpt) Similarity:",
     test_student_embeddings, test_chatgpt1_embeddings),
    # ChatGPT vs ChatGPT across the two datasets
    ("\nChatGPT vs ChatGPT (across datasets) Similarity:",
     test_chatgpt1_embeddings, test_chatgpt2_embeddings),
)

for report_title, left_embeddings, right_embeddings in similarity_reports:
    mean_sim, min_sim, max_sim = calculate_similarity_metrics(
        left_embeddings, right_embeddings)
    print(report_title)
    print(f"Mean: {mean_sim:.3f}, Min: {min_sim:.3f}, Max: {max_sim:.3f}")

## 9. Simpan Model Bi-Encoder

In [None]:
# Create the output directory. exist_ok=True replaces the separate
# exists()/makedirs() pair, so there is no window between check and creation
# and the call is a no-op when the directory is already there.
os.makedirs('saved_models', exist_ok=True)

# Save the Bi-Encoder trained on student_chatgpt
print("Menyimpan model Bi-Encoder student_chatgpt...")
bi_encoder_student.save('saved_models/bi_encoder_student_chatgpt.h5')

# Save the Bi-Encoder trained on only_chatgpt
print("Menyimpan model Bi-Encoder only_chatgpt...")
bi_encoder_chatgpt.save('saved_models/bi_encoder_only_chatgpt.h5')

# Save the tokenizer configuration next to the models
tokenizer.save_pretrained('saved_models/tokenizer')

print("Model dan tokenizer berhasil disimpan!")

## 10. Generate Embeddings

In [None]:
print("Generating embeddings for all sentences...")

encoder_input_keys = ("input_ids", "attention_mask")

# Student sentences -> student Bi-Encoder
print("\nGenerating student embeddings...")
student_encodings = tokenize_text(student_sentences)
student_embeddings = bi_encoder_student.predict(
    {key: student_encodings[key] for key in encoder_input_keys}
)

# ChatGPT sentences from student_chatgpt -> student Bi-Encoder
print("Generating ChatGPT (student_chatgpt) embeddings...")
chatgpt1_encodings = tokenize_text(chatgpt_sentences_1)
chatgpt1_embeddings = bi_encoder_student.predict(
    {key: chatgpt1_encodings[key] for key in encoder_input_keys}
)

# ChatGPT sentences from only_chatgpt -> ChatGPT Bi-Encoder
print("Generating ChatGPT (only_chatgpt) embeddings...")
chatgpt2_encodings = tokenize_text(chatgpt_sentences_2)
chatgpt2_embeddings = bi_encoder_chatgpt.predict(
    {key: chatgpt2_encodings[key] for key in encoder_input_keys}
)

# Persist the embeddings for later use
print("\nMenyimpan embeddings...")
for file_stem, embedding_matrix in (
    ("student_embeddings", student_embeddings),
    ("chatgpt1_embeddings", chatgpt1_embeddings),
    ("chatgpt2_embeddings", chatgpt2_embeddings),
):
    np.save(f'saved_models/{file_stem}.npy', embedding_matrix)

print("\nSummary of generated embeddings:")
print(f"Student embeddings shape: {student_embeddings.shape}")
print(f"ChatGPT (student_chatgpt) embeddings shape: {chatgpt1_embeddings.shape}")
print(f"ChatGPT (only_chatgpt) embeddings shape: {chatgpt2_embeddings.shape}")

In [None]:
print("Mencari top 50 kalimat yang mirip untuk setiap kalimat...")

def get_top_k_similar(query_embedding, target_embeddings, k=50):
    """Find the k targets most similar to a query by cosine similarity.

    Args:
        query_embedding: embedding of the input sentence (1 x embedding_size)
        target_embeddings: candidate embeddings (n x embedding_size)
        k: number of similar sentences wanted; when fewer than k candidates
            exist, all n candidates are returned

    Returns:
        indices: indices of the most similar targets, best match first
        scores: cosine similarity of each returned target
    """
    # Normalize both sides so the dot product is the cosine similarity
    query_normalized = tf.nn.l2_normalize(query_embedding, axis=1)
    target_normalized = tf.nn.l2_normalize(target_embeddings, axis=1)

    # Similarity of the query against every target, shape (1, n)
    similarity_scores = tf.matmul(query_normalized, target_normalized, transpose_b=True)

    # tf.nn.top_k raises when k exceeds the number of candidates, so cap k
    # for small datasets.
    k = min(k, int(similarity_scores.shape[-1]))
    scores, indices = tf.nn.top_k(similarity_scores[0], k=k)

    return indices.numpy(), scores.numpy()

# Mined sentence pairs and, in the same order, their labels
# (0 = student-student pair, 1 = student-ChatGPT pair).
similar_pairs = []
similar_labels = []

# The ChatGPT search space is identical for every query, so stack the
# embeddings of both ChatGPT datasets once instead of once per query.
# NOTE(review): chatgpt1_embeddings come from bi_encoder_student while
# chatgpt2_embeddings come from bi_encoder_chatgpt; confirm that similarities
# across the two encoders are meant to be comparable.
combined_chatgpt_embeddings = np.vstack([chatgpt1_embeddings, chatgpt2_embeddings])
num_chatgpt_1 = len(chatgpt_sentences_1)

# Process every student sentence
for idx, student_embedding in enumerate(student_embeddings):
    print(f"\rMemproses kalimat ke-{idx+1}/{len(student_embeddings)}", end="")

    # Reshape the embedding to (1, embedding_size) for the matmul
    query_emb = tf.reshape(student_embedding, (1, -1))

    # 1. Top 50 most similar student sentences (the query itself is among
    #    the candidates and is dropped below)
    student_indices, student_scores = get_top_k_similar(
        query_emb, student_embeddings, k=50
    )

    # 2. Top 50 most similar ChatGPT sentences across both datasets
    chatgpt_indices, chatgpt_scores = get_top_k_similar(
        query_emb, combined_chatgpt_embeddings, k=50
    )

    # Pairs with other student sentences
    for i, score in zip(student_indices, student_scores):
        if i == idx:  # skip the pair of a sentence with itself
            continue
        similar_pairs.append({
            'input_sentence': student_sentences[idx],
            'similar_sentence': student_sentences[i],
            'similarity_score': score,
            'label': 0  # 0 for student-student pairs
        })
        similar_labels.append(0)

    # Pairs with ChatGPT sentences
    for i, score in zip(chatgpt_indices, chatgpt_scores):
        # Map the stacked index back to its source list (dataset 1 or 2)
        if i < num_chatgpt_1:
            similar_sentence = chatgpt_sentences_1[i]
        else:
            similar_sentence = chatgpt_sentences_2[i - num_chatgpt_1]

        similar_pairs.append({
            'input_sentence': student_sentences[idx],
            'similar_sentence': similar_sentence,
            'similarity_score': score,
            'label': 1  # 1 for student-ChatGPT pairs
        })
        similar_labels.append(1)

print("\n\nStatistik hasil pencarian kalimat mirip:")
print(f"Total pasangan kalimat yang dihasilkan: {len(similar_pairs)}")
print(f"Jumlah pasangan student-student: {sum(1 for pair in similar_pairs if pair['label'] == 0)}")
print(f"Jumlah pasangan student-chatgpt: {sum(1 for pair in similar_pairs if pair['label'] == 1)}")

# Save the pairs for the Cross-Encoder stage
import pickle
with open('saved_models/similar_pairs.pkl', 'wb') as f:
    pickle.dump(similar_pairs, f)

print("\nHasil pencarian kalimat mirip telah disimpan!")

## 11. Membuat Model BERT Cross-Encoder

In [None]:
# Input layers for the Cross-Encoder (token ids and attention mask, length 512)
input_ids = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name="attention_mask")

# BERT embeddings.
# NOTE(review): bert_model is the instance the Bi-Encoders were built on, so
# the Cross-Encoder starts from whatever weights their training left behind;
# confirm that is intended.
bert_outputs = bert_model(input_ids, attention_mask=attention_mask)[0]
cls_token = bert_outputs[:, 0, :]  # representation at the [CLS] position

# Dense layers for classification, ending in a single sigmoid unit
dense1 = tf.keras.layers.Dense(256, activation="relu")(cls_token)
dropout1 = tf.keras.layers.Dropout(0.1)(dense1)
dense2 = tf.keras.layers.Dense(64, activation="relu")(dropout1)
dropout2 = tf.keras.layers.Dropout(0.1)(dense2)
output = tf.keras.layers.Dense(1, activation="sigmoid")(dropout2)

# Cross-Encoder model
cross_encoder = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile the Cross-Encoder.
# - A fresh optimizer is used because an optimizer instance that has already
#   trained another model may be bound to that model's variables.
# - Precision/Recall/AUC are passed as metric objects: the lowercase strings
#   "precision" and "recall" are not resolvable metric identifiers in
#   tf.keras 2.x.
cross_encoder.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss="binary_crossentropy",
    metrics=[
        "accuracy",
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.AUC(),
    ]
)

## 12. Membuat Pasangan Data untuk Cross-Encoder

In [None]:
# Prepare cross-encoder training data: every mined pair is encoded as one
# two-segment sequence (input sentence, similar sentence).
# NOTE(review): only input_ids and attention_mask are kept; the tokenizer's
# token_type_ids for the second segment are not passed to the model.
cross_encoder_inputs = []
for pair in similar_pairs:
    encoding = tokenizer(
        pair['input_sentence'],
        pair['similar_sentence'],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="tf"
    )
    cross_encoder_inputs.append({
        "input_ids": encoding['input_ids'][0],
        "attention_mask": encoding['attention_mask'][0]
    })

# Convert to tensors
input_ids = tf.stack([x["input_ids"] for x in cross_encoder_inputs])
attention_masks = tf.stack([x["attention_mask"] for x in cross_encoder_inputs])

# Take the labels from the pairs themselves so they always line up with the
# encoded inputs. Converting similar_labels here produced an empty tensor,
# because nothing ever appended to that list.
labels = tf.convert_to_tensor([pair['label'] for pair in similar_pairs])

## 13. Train Cross-Encoder

In [None]:
# Train the Cross-Encoder on the encoded pairs; Keras holds out the last 20%
# of the rows for validation.
cross_encoder_features = {
    "input_ids": input_ids,
    "attention_mask": attention_masks,
}
cross_encoder.fit(
    cross_encoder_features,
    labels,
    epochs=3,
    batch_size=16,
    validation_split=0.2,
)

## 14. Evaluasi Model

In [None]:
# Evaluate on the last 1000 mined pairs.
# NOTE(review): these pairs come from the same similar_pairs list the
# Cross-Encoder was fitted on, so this is not a separately held-out test set.
test_pairs = similar_pairs[-1000:]

# Encode each pair as a two-segment sequence, padded/truncated to 512 tokens.
test_encodings = []
for pair in test_pairs:
    encoded_pair = tokenizer(
        pair['input_sentence'],
        pair['similar_sentence'],
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="tf"
    )
    test_encodings.append({
        "input_ids": encoded_pair['input_ids'][0],
        "attention_mask": encoded_pair['attention_mask'][0]
    })
test_labels = [pair['label'] for pair in test_pairs]

# Stack the per-pair tensors into batch tensors.
test_input_ids = tf.stack([item["input_ids"] for item in test_encodings])
test_attention_masks = tf.stack([item["attention_mask"] for item in test_encodings])
test_labels = tf.convert_to_tensor(test_labels)

test_features = {"input_ids": test_input_ids, "attention_mask": test_attention_masks}
results = cross_encoder.evaluate(test_features, test_labels)

## 15. Simpan Model

In [None]:
# Write each trained model to its own HDF5 file in the working directory.
for trained_model, target_path in (
    (bi_encoder_student, "bi_encoder_student_model.h5"),
    (bi_encoder_chatgpt, "bi_encoder_chatgpt_model.h5"),
    (cross_encoder, "cross_encoder_model.h5"),
):
    trained_model.save(target_path)

print("All models have been saved successfully!")