## 1. Import Library

In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize
import nltk
from sklearn.model_selection import train_test_split
import os

# Download the NLTK sentence-tokenizer data needed by sent_tokenize.
# Recent NLTK releases load the tokenizer from the "punkt_tab" resource while
# older ones use "punkt"; requesting both keeps the notebook runnable on
# either (an already-present or unknown resource does not raise).
nltk.download('punkt')
nltk.download('punkt_tab')

# Seed TensorFlow and NumPy for reproducibility
tf.random.set_seed(42)
np.random.seed(42)

## 2. Load Dataset

In [None]:
# Read the two source corpora from CSV files in the working directory.
student_chatgpt = pd.read_csv("student_chatgpt.csv")
only_chatgpt = pd.read_csv("only_chatgpt.csv")

# Report the row count and the first two rows of each dataset.
print(
    "Student-ChatGPT Dataset Info:",
    f"Number of rows: {len(student_chatgpt)}",
    "\nSample of student_chatgpt data:",
    student_chatgpt.head(2),
    sep="\n",
)

print(
    "\nOnly-ChatGPT Dataset Info:",
    f"Number of rows: {len(only_chatgpt)}",
    "\nSample of only_chatgpt data:",
    only_chatgpt.head(2),
    sep="\n",
)

## 3. Preprocessing Data

In [None]:
# Function to preprocess text
def preprocess_text(text):
    """Lowercase *text* and split it into sentences.

    Args:
        text: Raw text of one document. Non-string values are converted
            with ``str()`` before processing.

    Returns:
        A list of lowercase sentence strings as produced by
        ``sent_tokenize``. A missing value (``None`` or a float NaN, which
        is how pandas represents an empty CSV cell) yields an empty list
        instead of the literal sentence "none" / "nan".
    """
    # NaN is the only float value that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    text = str(text).lower()  # Convert to lowercase
    return sent_tokenize(text)  # Split into sentences

# Sentence-level samples from the student essays (label 0 = student).
student_sentences = [
    sentence
    for essay in student_chatgpt['student']
    for sentence in preprocess_text(essay)
]
student_labels = [0] * len(student_sentences)

# ChatGPT responses stored alongside the student essays (label 1 = ChatGPT).
chatgpt_sentences_1 = [
    sentence
    for response in student_chatgpt['chatgpt']
    for sentence in preprocess_text(response)
]
chatgpt_labels_1 = [1] * len(chatgpt_sentences_1)

# ChatGPT responses from the ChatGPT-only dataset (also label 1).
chatgpt_sentences_2 = [
    sentence
    for response in only_chatgpt['chatgpt']
    for sentence in preprocess_text(response)
]
chatgpt_labels_2 = [1] * len(chatgpt_sentences_2)

# Report how many sentences each source contributed.
print(f"Number of student sentences: {len(student_sentences)}")
print(f"Number of ChatGPT sentences (from student_chatgpt): {len(chatgpt_sentences_1)}")
print(f"Number of ChatGPT sentences (from only_chatgpt): {len(chatgpt_sentences_2)}")

## 4. Inisialisasi BERT Tokenizer

In [None]:
# Initialize tokenizer for IndoBERT
# Loaded from the same "indobenchmark/indobert-base-p2" checkpoint name that
# the BERT model cell passes to TFBertModel.from_pretrained. This module-level
# `tokenizer` is called by `tokenize_text` and by the cross-encoder data
# preparation cell.
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p2")

# Function to tokenize text
def tokenize_text(texts, max_length=512):
    """Encode *texts* with the module-level ``tokenizer``.

    Truncation is enabled and padding is set to ``'max_length'``, with
    ``max_length`` as the target length; the encodings are requested as
    TensorFlow tensors.

    Args:
        texts: Text input forwarded unchanged to the tokenizer.
        max_length: Sequence length used for padding and truncation.

    Returns:
        Whatever the tokenizer call returns for these options.
    """
    encode_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "tf",
    }
    return tokenizer(texts, **encode_options)

## 5. Membuat Model BERT Bi-Encoder

In [None]:
# Checkpoint name used for every BERT encoder created in this cell.
BERT_CHECKPOINT = "indobenchmark/indobert-base-p2"

# Load base BERT model
# Kept under this name because the cross-encoder cell calls `bert_model`.
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)


def build_bi_encoder(max_length=512, embedding_dim=128):
    """Build and compile one bi-encoder tower that owns its weights.

    Every call loads a fresh BERT encoder and creates new Dense layers, so
    towers built by separate calls share no variables. Wrapping one output
    tensor in two ``tf.keras.Model`` objects would instead give two handles
    to a single set of weights, and training either would change both.

    NOTE: loading the checkpoint once per tower costs more memory than
    sharing a single encoder.

    Args:
        max_length: Number of token positions expected by both inputs.
        embedding_dim: Size of the output sentence embedding.

    Returns:
        A compiled ``tf.keras.Model`` whose inputs are named ``input_ids``
        and ``attention_mask`` and whose output is an L2-normalized
        embedding of size ``embedding_dim``.
    """
    encoder = TFBertModel.from_pretrained(BERT_CHECKPOINT)

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

    # First element of the BERT output, sliced at position 0 ([CLS] token).
    sequence_output = encoder(input_ids, attention_mask=attention_mask)[0]
    cls_token = sequence_output[:, 0, :]

    # Projection head that maps the [CLS] vector to the embedding space.
    hidden = tf.keras.layers.Dense(256, activation="relu")(cls_token)
    hidden = tf.keras.layers.Dropout(0.1)(hidden)
    embedding = tf.keras.layers.Dense(embedding_dim)(hidden)

    # Normalize output embedding
    normalized_output = tf.nn.l2_normalize(embedding, axis=1)

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=normalized_output)
    # Each tower gets its own optimizer instance: an optimizer keeps state
    # for the variables of the model it trains.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss=tf.keras.losses.CosineSimilarity(),
    )
    return model


# Create the models: one independent tower per data source.
bi_encoder_student = build_bi_encoder()
bi_encoder_chatgpt = build_bi_encoder()

## 6. Train Bi-Encoder

Catatan: logika training pada bagian ini masih salah dan perlu diperbaiki.

In [None]:
# Tokenize the three sentence collections once; later cells reuse these
# encodings when generating embeddings.
student_train_encodings = tokenize_text(student_sentences)
chatgpt1_train_encodings = tokenize_text(chatgpt_sentences_1)
chatgpt2_train_encodings = tokenize_text(chatgpt_sentences_2)

# NOTE(review): the targets passed to both fit calls are all-zero
# placeholders, and the models are compiled with a CosineSimilarity loss.
# Confirm this objective yields a useful training signal; the notebook's own
# note marks the training logic as still incorrect.

# Train the student bi-encoder on the student sentences.
print("Training Student Bi-Encoder...")
student_inputs = {
    "input_ids": student_train_encodings['input_ids'],
    "attention_mask": student_train_encodings['attention_mask'],
}
student_targets = np.zeros(len(student_sentences))
bi_encoder_student.fit(
    student_inputs,
    student_targets,
    epochs=3,
    batch_size=16,
    validation_split=0.2
)

# Train the ChatGPT bi-encoder on both ChatGPT collections, concatenated
# along the batch axis (student_chatgpt rows first, then only_chatgpt rows).
print("\nTraining ChatGPT Bi-Encoder...")
chatgpt_inputs = {
    key: tf.concat([chatgpt1_train_encodings[key], chatgpt2_train_encodings[key]], axis=0)
    for key in ("input_ids", "attention_mask")
}
chatgpt_targets = np.zeros(len(chatgpt_sentences_1) + len(chatgpt_sentences_2))
bi_encoder_chatgpt.fit(
    chatgpt_inputs,
    chatgpt_targets,
    epochs=3,
    batch_size=16,
    validation_split=0.2
)

## 8. Evaluasi Bi-Encoder

In [None]:
# NOTE(review): `student_test_encodings` and `chatgpt_test_encodings` are not
# assigned in any cell visible here; confirm they are created before this
# cell runs.

# Embed the student test encodings with the student bi-encoder.
student_test_inputs = {
    "input_ids": student_test_encodings['input_ids'],
    "attention_mask": student_test_encodings['attention_mask'],
}
student_test_embeddings = bi_encoder_student.predict(student_test_inputs)
print(f"Shape embeddings student_chatgpt: {student_test_embeddings.shape}")

# Embed the ChatGPT test encodings with the ChatGPT bi-encoder.
chatgpt_test_inputs = {
    "input_ids": chatgpt_test_encodings['input_ids'],
    "attention_mask": chatgpt_test_encodings['attention_mask'],
}
chatgpt_test_embeddings = bi_encoder_chatgpt.predict(chatgpt_test_inputs)
print(f"Shape embeddings only_chatgpt: {chatgpt_test_embeddings.shape}")

## 9. Simpan Model Bi-Encoder

In [None]:
# Save each bi-encoder under its own path (the paths carry no file
# extension): first the student model, then the ChatGPT model.
for model, export_path in (
    (bi_encoder_student, "bi_encoder_student_chatgpt_model"),
    (bi_encoder_chatgpt, "bi_encoder_only_chatgpt_model"),
):
    model.save(export_path)

print("Model Bi-Encoder telah disimpan.")

## 10. Generate Embeddings

In [None]:
# Generate embeddings
# Both bi-encoders are built with two named inputs ("input_ids" and
# "attention_mask"), so every predict call supplies exactly those two
# tensors rather than the raw tokenizer output or the token ids alone.
student_embeddings = bi_encoder_student.predict({
    "input_ids": student_train_encodings['input_ids'],
    "attention_mask": student_train_encodings['attention_mask'],
})

# Row order is student_chatgpt sentences first, then only_chatgpt sentences;
# the pairing loop relies on this order to map row indices back to text.
chatgpt_embeddings = bi_encoder_chatgpt.predict({
    "input_ids": tf.concat([chatgpt1_train_encodings['input_ids'],
                            chatgpt2_train_encodings['input_ids']], axis=0),
    "attention_mask": tf.concat([chatgpt1_train_encodings['attention_mask'],
                                 chatgpt2_train_encodings['attention_mask']], axis=0),
})

# Function to compute similarity and get top-k
def compute_similarity_and_top_k(embeddings1, embeddings2, k=50):
    """Find the most cosine-similar rows of ``embeddings2`` for each query row.

    Both inputs are L2-normalized along axis 1, so the matrix product of
    ``embeddings1`` with the transpose of ``embeddings2`` holds cosine
    similarities.

    Args:
        embeddings1: 2-D query embeddings, one row per query.
        embeddings2: 2-D candidate embeddings, one row per candidate.
        k: Maximum number of matches per query. It is clamped to the number
            of candidates, so a small candidate set returns fewer matches
            instead of making ``tf.nn.top_k`` fail.

    Returns:
        The result of ``tf.nn.top_k`` on the similarity matrix: a
        ``(values, indices)`` pair taken along the last axis.
    """
    normalized_emb1 = tf.nn.l2_normalize(embeddings1, axis=1)
    normalized_emb2 = tf.nn.l2_normalize(embeddings2, axis=1)
    similarity_matrix = tf.matmul(normalized_emb1, normalized_emb2, transpose_b=True)

    # The static last dimension is the candidate count; it is None only when
    # the shape is unknown, in which case k is left untouched.
    num_candidates = similarity_matrix.shape[-1]
    if num_candidates is not None:
        k = min(k, num_candidates)
    return tf.nn.top_k(similarity_matrix, k=k)

# Sentence pairs and their labels for the cross-encoder:
# label 0 = (student, student) pair, label 1 = (student, ChatGPT) pair.
similar_pairs = []
similar_labels = []

# chatgpt_embeddings rows follow chatgpt_sentences_1 and then
# chatgpt_sentences_2, so one combined list maps a row index back to its text.
chatgpt_sentences_all = chatgpt_sentences_1 + chatgpt_sentences_2

for i, student_sent in enumerate(student_sentences):
    # Slice (not index) so the query keeps its leading batch axis.
    query = student_embeddings[i:i+1]

    # Top-25 nearest student sentences and top-25 nearest ChatGPT sentences.
    _, nearest_student = compute_similarity_and_top_k(query, student_embeddings, k=25)
    _, nearest_chatgpt = compute_similarity_and_top_k(query, chatgpt_embeddings, k=25)

    student_matches = [(student_sent, student_sentences[j]) for j in nearest_student[0]]
    similar_pairs.extend(student_matches)
    similar_labels.extend([0] * len(student_matches))

    chatgpt_matches = [(student_sent, chatgpt_sentences_all[j]) for j in nearest_chatgpt[0]]
    similar_pairs.extend(chatgpt_matches)
    similar_labels.extend([1] * len(chatgpt_matches))

## 11. Membuat Model BERT Cross-Encoder

In [None]:
# Cross-encoder inputs: token ids and attention mask, 512 positions each.
input_ids = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name="attention_mask")

# First element of the BERT output, sliced at position 0 ([CLS] token).
# NOTE(review): this reuses the `bert_model` instance created in the
# bi-encoder cell; confirm that sharing it is intended.
cls_token = bert_model(input_ids, attention_mask=attention_mask)[0][:, 0, :]

# Classification head: two ReLU layers (256 and 64 units), each followed by
# dropout 0.1, then a single sigmoid unit.
hidden = cls_token
for units in (256, 64):
    hidden = tf.keras.layers.Dense(units, activation="relu")(hidden)
    hidden = tf.keras.layers.Dropout(0.1)(hidden)
output = tf.keras.layers.Dense(1, activation="sigmoid")(hidden)

# Assemble the cross-encoder and compile it for binary classification.
cross_encoder = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
cross_encoder.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

## 12. Membuat Pasangan Data untuk Cross-Encoder

In [None]:
# Prepare cross-encoder training data
# Encode every (first, second) sentence pair as one BERT sequence pair in a
# single batched tokenizer call. This gives the same padded and truncated
# tensors as encoding pair by pair and stacking the results, without running
# the tokenizer once per pair.
first_sentences = [pair[0] for pair in similar_pairs]
second_sentences = [pair[1] for pair in similar_pairs]
pair_encodings = tokenizer(
    first_sentences,
    second_sentences,
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="tf"
)

# Tensors consumed by the cross-encoder training cell.
input_ids = pair_encodings['input_ids']
attention_masks = pair_encodings['attention_mask']
labels = tf.convert_to_tensor(similar_labels)

## 13. Train Cross-Encoder

In [None]:
# Fit the cross-encoder on the tokenized sentence pairs, passing
# validation_split=0.2 so that Keras validates on part of the supplied data.
cross_encoder_features = {"input_ids": input_ids, "attention_mask": attention_masks}
fit_options = dict(epochs=3, batch_size=16, validation_split=0.2)
cross_encoder.fit(cross_encoder_features, labels, **fit_options)

## 14. Evaluasi Model

In [None]:
# NOTE(review): `test_encodings` and `y_test` are not assigned in any cell
# visible here; confirm they are created before this cell runs.
test_inputs = {
    "input_ids": test_encodings['input_ids'],
    "attention_mask": test_encodings['attention_mask'],
}
results = cross_encoder.evaluate(test_inputs, y_test)

# The model is compiled with metrics=["accuracy"], so index 1 of the
# evaluation results is the accuracy (index 0 is the loss).
print(f"Cross-Encoder Accuracy: {results[1]:.4f}")

## 15. Simpan Model

In [None]:
# Save all three models to .h5 files, in the order student bi-encoder,
# ChatGPT bi-encoder, cross-encoder.
for model, filename in (
    (bi_encoder_student, "bi_encoder_student_model.h5"),
    (bi_encoder_chatgpt, "bi_encoder_chatgpt_model.h5"),
    (cross_encoder, "cross_encoder_model.h5"),
):
    model.save(filename)

print("All models have been saved successfully!")