In [None]:
#Import Library

import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize
import nltk
import pickle
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from collections import Counter
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load both datasets from the Excel workbooks under example_datasets/.
# student_chatgpt is read from the "examples" workbook and only_chatgpt from the
# "knowledge" workbook; later cells read the 'Pelajar' and 'GPT' columns from them.
student_chatgpt = pd.read_excel("example_datasets/examples-datasets-mar25.xlsx")
only_chatgpt = pd.read_excel("example_datasets/knowledge-datasets-mar25.xlsx")

In [None]:
# Show column/dtype information for both datasets
print("Dataset Student_ChatGPT:")
student_chatgpt.info()
print("\nDataset Only_ChatGPT:")
only_chatgpt.info()

# Show the first rows of each dataset.
# A notebook only renders the LAST bare expression of a cell, so a bare
# `student_chatgpt.head()` in the middle of the cell is silently discarded.
# display() renders each preview explicitly.
print("\nContoh Data Student_ChatGPT:")
display(student_chatgpt.head())

print("\nContoh Data Only_ChatGPT:")
display(only_chatgpt.head())

In [None]:
def preprocess_text(text):
    """
    Normalize one text while keeping it as a single paragraph.

    The text is lower-cased, stripped of leading/trailing whitespace, and every
    run of whitespace (including newlines and tabs) is collapsed to one space.

    Args:
        text (str): Input text.

    Returns:
        str: The normalized text, or None when the input is not a string or
        contains only whitespace.
    """
    usable = isinstance(text, str) and text.strip() != ""
    if not usable:
        return None

    lowered = text.lower().strip()
    # Collapse every whitespace run into a single space
    return re.sub(r'\s+', ' ', lowered)

In [None]:
# Preprocess the Student_ChatGPT dataset.
# Each column is cleaned independently; entries that preprocess_text rejects
# (returns None for) are dropped.

# Student texts
std_par = [cleaned for cleaned in map(preprocess_text, student_chatgpt['Pelajar']) if cleaned]

# ChatGPT texts from the Student_ChatGPT dataset
gpt_par_1 = [cleaned for cleaned in map(preprocess_text, student_chatgpt['GPT']) if cleaned]

# Preprocess the Only_ChatGPT dataset (ChatGPT texts only)
gpt_par_2 = [cleaned for cleaned in map(preprocess_text, only_chatgpt['GPT']) if cleaned]

# Report how many paragraphs survived preprocessing
print(f"Total paragraf Student: {len(std_par)}")
print(f"Total paragraf ChatGPT (Student_ChatGPT): {len(gpt_par_1)}")
print(f"Total paragraf ChatGPT (Only_ChatGPT): {len(gpt_par_2)}")

In [None]:
# Initialize the IndoBERT tokenizer (pretrained checkpoint "indobenchmark/indobert-base-p2").
# The same checkpoint name is used later when loading TFBertModel.
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p2")

# Helper that tokenizes a batch of texts
def tokenize_text(texts, max_length=256):
    """
    Tokenize texts with the module-level IndoBERT tokenizer.

    Every sequence is padded to exactly `max_length` tokens and longer
    sequences are truncated to it; the result holds TensorFlow tensors.

    Args:
        texts (list): Texts to tokenize.
        max_length (int): Fixed token length of every returned sequence.

    Returns:
        dict: Tokenizer output, including input_ids and attention_mask.
    """
    encode_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "tf",
    }
    return tokenizer(texts, **encode_options)

In [None]:
# Tokenize the three preprocessed corpora with the default max_length of tokenize_text
print("Tokenisasi kalimat Student...")
std_tokens = tokenize_text(std_par)

print("Tokenisasi kalimat ChatGPT (Student_ChatGPT)...")
gpt_tokens_1 = tokenize_text(gpt_par_1)

print("Tokenisasi kalimat ChatGPT (Only_ChatGPT)...")
gpt_tokens_2 = tokenize_text(gpt_par_2)

# Show a sample of the tokenization output (first Student text)
print("\nContoh hasil tokenisasi:")
print(std_tokens['input_ids'][:1])  # Token IDs of the first text
print(std_tokens['attention_mask'][:1])  # Attention mask of the first text

In [None]:
# Check the token-length distribution to verify that max_length is large enough.
# The number of active (non-padding) tokens per text is the row sum of the attention mask.
student_attention = std_tokens['attention_mask'].numpy()
student_lengths = student_attention.sum(axis=1).tolist()

# Padded sequence length actually used by the tokenizer. Reading it from the
# data avoids a hard-coded limit: the previous check compared against 128
# although the texts are padded/truncated to 256, so it could not detect
# truncated texts.
max_token_length = student_attention.shape[1]

plt.figure(figsize=(10, 6))
plt.hist(student_lengths, bins=30)
plt.title('Distribusi Panjang Token (Student)')
plt.xlabel('Jumlah Token Aktif')
plt.ylabel('Frekuensi')
plt.show()
print(f"Rata-rata panjang token: {np.mean(student_lengths):.2f}")
# A text that fills the whole sequence has (most likely) been truncated
print(f"Persentase terpotong: {sum(l == max_token_length for l in student_lengths) / len(student_lengths) * 100:.2f}%")

In [None]:
# Round-trip the first Student text through the tokenizer (encode, then decode)
# to check visually that tokenization works as expected.
# Note: tokenizer.encode is called without max_length here, so this sample is
# not padded or truncated the way tokenize_text does it.
sample_text = std_par[0]
sample_tokens = tokenizer.encode(sample_text)
print(f"Teks asli: {sample_text}")
print(f"Token ID: {sample_tokens}")
print(f"Token dekode: {tokenizer.decode(sample_tokens)}")

In [None]:
# Tokenization can be slow, so keep the outputs together and cache them on disk.
# Keys are plain identifiers without stray whitespace: the previous keys
# 'chatgpt_1 ' and 'chatgpt ' had a trailing space and did not match each other,
# which makes lookups after loading the pickle fail in a hard-to-spot way.
tokenized_data = {
    'student': std_tokens,
    'chatgpt_1': gpt_tokens_1,
    'chatgpt_2': gpt_tokens_2
}

# Store input_ids and attention_mask as numpy arrays (picklable without TensorFlow).
# Deriving this dict from tokenized_data keeps the keys of both dicts in sync.
tokenized_numpy = {
    name: {
        'input_ids': tokens['input_ids'].numpy(),
        'attention_mask': tokens['attention_mask'].numpy()
    }
    for name, tokens in tokenized_data.items()
}

with open('tokenized_data.pkl', 'wb') as f:
    pickle.dump(tokenized_numpy, f)

In [None]:
# Load the pretrained IndoBERT model.
# NOTE(review): the following cell loads the same checkpoint into bert_model again,
# so this load looks redundant — confirm before removing.
bert_model = TFBertModel.from_pretrained("indobenchmark/indobert-base-p2")

# Outline: build the Bi-Encoder model for Student_ChatGPT

# Outline: build the Bi-Encoder model for Only_ChatGPT

In [None]:
# Load the pretrained IndoBERT model
bert_model = TFBertModel.from_pretrained("indobenchmark/indobert-base-p2")

# Freeze the BERT layers: every layer of bert_model is marked non-trainable,
# so only layers added on top of it can be updated during training.
for layer in bert_model.layers:
    layer.trainable = False


# Builder for the Bi-Encoder
def build_bi_encoder(bert_model, max_length=256):
    """
    Build a Bi-Encoder on top of IndoBERT.

    The encoder takes token IDs and an attention mask, extracts the embedding
    of the first ([CLS]) token from the BERT output, passes it through a small
    dense projection head and L2-normalizes the result. Because the output is
    unit length, the dot product of two outputs is their cosine similarity.

    Args:
        bert_model (TFBertModel): Base IndoBERT model.
        max_length (int): Token length of the inputs. Must match the length
            the texts were tokenized with (tokenize_text uses 256 by default).

    Returns:
        tf.keras.Model: Bi-Encoder mapping [input_ids, attention_mask] to a
        128-dimensional unit-length embedding.
    """
    # Input layers for token IDs and attention mask
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

    # First element of the BERT output is the per-token hidden state;
    # position 0 along the sequence axis is the [CLS] token.
    bert_output = bert_model(input_ids, attention_mask=attention_mask)[0][:, 0, :]

    # Dense projection head
    dense1 = tf.keras.layers.Dense(256, activation="relu")(bert_output)
    dropout1 = tf.keras.layers.Dropout(0.1)(dense1)
    dense2 = tf.keras.layers.Dense(128, activation="relu")(dropout1)
    dropout2 = tf.keras.layers.Dropout(0.1)(dense2)
    dense3 = tf.keras.layers.Dense(128)(dropout2)

    # L2-normalize the output embedding
    normalized_output = tf.nn.l2_normalize(dense3, axis=1)

    # Bi-Encoder model
    return tf.keras.Model(inputs=[input_ids, attention_mask], outputs=normalized_output)

# Build the two Bi-Encoders.
# Both are built from the same bert_model object, so they share that BERT instance;
# each call creates its own dense projection layers.
bi_encoder_student_chatgpt = build_bi_encoder(bert_model)
bi_encoder_only_chatgpt = build_bi_encoder(bert_model)

# Show the architecture (only the Student_ChatGPT encoder is printed)
print("Bi-Encoder untuk Student_ChatGPT:")
bi_encoder_student_chatgpt.summary()


In [None]:
#Training Datasets Student_ChatGPT (Gunakan Model Fine-Tuning IndoBERT di atas)

In [None]:
# Build triplets for contrastive learning
def create_contrastive_pairs(student_tokens, chatgpt_tokens, n_pairs=5000, seed=None):
    """
    Build (anchor, positive, negative) triplets for contrastive learning.

    Three groups of triplets are generated:
    - n_pairs // 4: student anchor, different student positive, ChatGPT negative (label 1)
    - n_pairs // 4: ChatGPT anchor, different ChatGPT positive, student negative (label 1)
    - n_pairs // 2: student anchor, different student positive, ChatGPT negative (label 0)

    The triplets are returned in random order. This matters because Keras
    `validation_split` takes the LAST fraction of the samples before shuffling;
    with group-ordered output the validation set would contain only the last group.

    Args:
        student_tokens: Tokenized student texts ('input_ids', 'attention_mask').
        chatgpt_tokens: Tokenized ChatGPT texts ('input_ids', 'attention_mask').
        n_pairs: Number of triplets to create (rounded down as described above).
        seed: Optional seed for reproducible sampling. With None the global
            numpy random state is used.

    Returns:
        dict: Keys 'anchor', 'positive' and 'negative', each a dict of int32
        tensors 'input_ids' and 'attention_mask', plus 'labels' (float32 tensor).

    Raises:
        ValueError: If either corpus has fewer than two texts, so that no pair
            of distinct texts can be drawn.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Corpus sizes
    n_student = int(student_tokens['input_ids'].shape[0])
    n_chatgpt = int(chatgpt_tokens['input_ids'].shape[0])
    if n_student < 2 or n_chatgpt < 2:
        raise ValueError(
            "Need at least two student texts and two ChatGPT texts to build triplets "
            f"(got {n_student} and {n_chatgpt})."
        )

    def _distinct_index_pairs(population, count):
        """Draw `count` ordered pairs of two different indices below `population`."""
        first = rng.randint(0, population, size=count)
        # A non-zero offset modulo the population size can never land on `first`
        second = (first + rng.randint(1, population, size=count)) % population
        return first, second

    def _gather(tokens, key, indices):
        """Select rows of tokens[key] in one vectorized lookup."""
        return tf.cast(tf.gather(tokens[key], indices), tf.int32)

    n_quarter = n_pairs // 4
    n_half = n_pairs // 2

    # Group 1: student anchor/positive, ChatGPT negative
    ss_anchor, ss_positive = _distinct_index_pairs(n_student, n_quarter)
    ss_negative = rng.randint(0, n_chatgpt, size=n_quarter)

    # Group 2: ChatGPT anchor/positive, student negative
    cc_anchor, cc_positive = _distinct_index_pairs(n_chatgpt, n_quarter)
    cc_negative = rng.randint(0, n_student, size=n_quarter)

    # Group 3: student anchor/positive, ChatGPT negative (labelled 0)
    sc_anchor, sc_positive = _distinct_index_pairs(n_student, n_half)
    sc_negative = rng.randint(0, n_chatgpt, size=n_half)

    labels = np.concatenate([
        np.ones(2 * n_quarter, dtype=np.float32),
        np.zeros(n_half, dtype=np.float32),
    ])

    # One shared permutation keeps anchor/positive/negative/labels aligned
    order = rng.permutation(len(labels))

    def _assemble(sources):
        """Concatenate the three groups for one role and apply the shared shuffle."""
        assembled = {}
        for key in ('input_ids', 'attention_mask'):
            stacked = tf.concat([_gather(tokens, key, idx) for tokens, idx in sources], axis=0)
            assembled[key] = tf.gather(stacked, order)
        return assembled

    return {
        'anchor': _assemble([
            (student_tokens, ss_anchor), (chatgpt_tokens, cc_anchor), (student_tokens, sc_anchor)
        ]),
        'positive': _assemble([
            (student_tokens, ss_positive), (chatgpt_tokens, cc_positive), (student_tokens, sc_positive)
        ]),
        'negative': _assemble([
            (chatgpt_tokens, ss_negative), (student_tokens, cc_negative), (chatgpt_tokens, sc_negative)
        ]),
        'labels': tf.convert_to_tensor(labels[order], dtype=tf.float32)
    }

# Build the training triplets for the Student_ChatGPT model
# (student texts vs. the ChatGPT texts from the same dataset, n_pairs=1000)
student_chatgpt_pairs = create_contrastive_pairs(std_tokens, gpt_tokens_1, n_pairs=1000)


In [None]:
# Contrastive loss implementation
def contrastive_loss(y_true, y_pred):
    """
    Pairwise contrastive loss with a fixed margin of 0.5.

    For label 1 the loss is y_pred squared; for label 0 it is the squared
    amount by which y_pred falls short of the margin (zero once y_pred
    reaches the margin). The result is averaged over all elements.

    Args:
        y_true: Labels (1 for a positive pair, 0 for a negative pair).
        y_pred: Predicted value per pair; the formula treats it as a distance
            (small for positive pairs, at least `margin` for negative pairs).

    Returns:
        tf.Tensor: Scalar loss value.
    """
    margin = 0.5
    positive_term = y_true * tf.square(y_pred)
    shortfall = tf.maximum(margin - y_pred, 0)
    negative_term = (1 - y_true) * tf.square(shortfall)
    return tf.reduce_mean(positive_term + negative_term)

In [None]:
# Training wrapper that feeds triplets through one shared bi-encoder
def build_triplet_model(bi_encoder, max_length=None):
    """
    Build the model used to train a bi-encoder with triplet loss.

    The same bi-encoder (shared weights) embeds the anchor, the positive and
    the negative text. The model outputs, per triplet, the dot product of the
    anchor with the positive and with the negative embedding. When the
    bi-encoder emits unit-length vectors (as build_bi_encoder does) these dot
    products are cosine similarities.

    Args:
        bi_encoder: Bi-encoder model taking [input_ids, attention_mask].
        max_length: Token length of the inputs. Defaults to the sequence
            length of the bi-encoder's own first input, so the two can never
            disagree.

    Returns:
        tf.keras.Model: Model with six inputs, ordered (anchor, positive,
        negative) x (input_ids, attention_mask), and an output of shape
        (batch, 2) holding [positive_similarity, negative_similarity].
    """
    if max_length is None:
        max_length = bi_encoder.inputs[0].shape[1]

    triplet_inputs = []
    embeddings = {}
    for role in ("anchor", "positive", "negative"):
        role_ids = tf.keras.layers.Input(
            shape=(max_length,), dtype=tf.int32, name=f"{role}_input_ids"
        )
        role_mask = tf.keras.layers.Input(
            shape=(max_length,), dtype=tf.int32, name=f"{role}_attention_mask"
        )
        triplet_inputs.extend([role_ids, role_mask])
        # Same bi_encoder instance for every role -> shared weights
        embeddings[role] = bi_encoder([role_ids, role_mask])

    # Dot products of the anchor with the positive / negative embedding
    pos_similarity = tf.reduce_sum(embeddings["anchor"] * embeddings["positive"], axis=1)
    neg_similarity = tf.reduce_sum(embeddings["anchor"] * embeddings["negative"], axis=1)

    # Stack both similarities so the loss can compare them
    output = tf.stack([pos_similarity, neg_similarity], axis=1)

    return tf.keras.Model(inputs=triplet_inputs, outputs=output)

In [None]:
# Build the triplet training model around the Student_ChatGPT bi-encoder
triplet_model_student_chatgpt = build_triplet_model(bi_encoder_student_chatgpt)

# Custom loss function for triplets
def triplet_loss(y_true, y_pred):
    """
    Triplet loss pushing the positive similarity above the negative one.

    Per triplet the loss is max(0, margin - (pos_sim - neg_sim)) with a margin
    of 0.5; the result is the mean over the batch.

    Args:
        y_true: Ignored; present only because Keras passes it to every loss.
        y_pred: Tensor of shape (batch, 2) holding
            [positive_similarity, negative_similarity].

    Returns:
        tf.Tensor: Scalar loss value.
    """
    margin = 0.5
    similarity_gap = y_pred[:, 0] - y_pred[:, 1]
    return tf.reduce_mean(tf.maximum(0., margin - similarity_gap))

In [None]:
# Compile the triplet model
triplet_model_student_chatgpt.compile(
    loss=triplet_loss,
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5)
)

# Train the Student_ChatGPT model.
# The six inputs are passed in the order the triplet model expects:
# (anchor, positive, negative) x (input_ids, attention_mask).
# NOTE(review): Keras validation_split uses the last 30% of the samples as
# given, so the order of student_chatgpt_pairs decides what is validated on.
print("Training Bi-Encoder untuk Student_ChatGPT...")
history_student_chatgpt = triplet_model_student_chatgpt.fit(
    x=[
        student_chatgpt_pairs[role][field]
        for role in ('anchor', 'positive', 'negative')
        for field in ('input_ids', 'attention_mask')
    ],
    y=student_chatgpt_pairs['labels'],  # Not used by the triplet loss
    epochs=3,
    batch_size=32,
    validation_split=0.3,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
    ]
)

In [None]:
#Training Datasets Only_ChatGPT (Self-Supervised, Gunakan Model Fine-Tuning IndoBERT di atas)

In [None]:
# Build triplets for the Only_ChatGPT model (self-supervised)
def create_self_supervised_pairs(tokens, n_pairs=5000, seed=None):
    """
    Build (anchor, positive, negative) triplets from a single corpus.

    For every triplet three different texts are drawn uniformly at random; the
    first becomes the anchor, the second the positive and the third the negative.

    NOTE(review): positive and negative are drawn from the same pool in the
    same way, so the triplets carry no information about which texts are
    actually similar. Consider deriving positives from the anchor itself
    (e.g. an augmented view) if this model is expected to learn a useful metric.

    Args:
        tokens: Tokenized texts ('input_ids', 'attention_mask').
        n_pairs: Number of triplets to create.
        seed: Optional seed for reproducible sampling. With None the global
            numpy random state is used.

    Returns:
        dict: Keys 'anchor', 'positive' and 'negative', each a dict of int32
        tensors 'input_ids' and 'attention_mask', plus 'labels' (float32 tensor
        of ones; dummy values that the triplet loss ignores).

    Raises:
        ValueError: If the corpus has fewer than three texts.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Corpus size
    n_samples = int(tokens['input_ids'].shape[0])
    if n_samples < 3:
        raise ValueError(
            f"Need at least three texts to build triplets of distinct texts (got {n_samples})."
        )

    # Anchor: any index. Positive: anchor plus a non-zero offset (mod n), hence different.
    anchor_idx = rng.randint(0, n_samples, size=n_pairs)
    positive_idx = (anchor_idx + rng.randint(1, n_samples, size=n_pairs)) % n_samples

    # Negative: draw a rank among the n - 2 remaining indices, then shift it
    # past the two indices already taken (smaller one first).
    taken_low = np.minimum(anchor_idx, positive_idx)
    taken_high = np.maximum(anchor_idx, positive_idx)
    negative_idx = rng.randint(0, n_samples - 2, size=n_pairs)
    negative_idx = negative_idx + (negative_idx >= taken_low)
    negative_idx = negative_idx + (negative_idx >= taken_high)

    def _select(indices):
        """Gather the token rows for one role in a single vectorized lookup."""
        return {
            'input_ids': tf.cast(tf.gather(tokens['input_ids'], indices), tf.int32),
            'attention_mask': tf.cast(tf.gather(tokens['attention_mask'], indices), tf.int32)
        }

    return {
        'anchor': _select(anchor_idx),
        'positive': _select(positive_idx),
        'negative': _select(negative_idx),
        'labels': tf.ones(n_pairs, dtype=tf.float32)  # Dummy labels
    }

# Build the training triplets for the Only_ChatGPT model
# (drawn from the Only_ChatGPT texts alone, n_pairs=1000)
only_chatgpt_pairs = create_self_supervised_pairs(gpt_tokens_2, n_pairs=1000)

In [None]:
# Build the triplet training model around the Only_ChatGPT bi-encoder
triplet_model_only_chatgpt = build_triplet_model(bi_encoder_only_chatgpt)

# Compile the triplet model
triplet_model_only_chatgpt.compile(
    loss=triplet_loss,
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5)
)

# Train the Only_ChatGPT model.
# Inputs are ordered (anchor, positive, negative) x (input_ids, attention_mask),
# matching the input order of the triplet model.
print("Training Bi-Encoder untuk Only_ChatGPT...")
history_only_chatgpt = triplet_model_only_chatgpt.fit(
    x=[
        only_chatgpt_pairs[role][field]
        for role in ('anchor', 'positive', 'negative')
        for field in ('input_ids', 'attention_mask')
    ],
    y=only_chatgpt_pairs['labels'],  # Not used by the triplet loss
    epochs=3,
    batch_size=32,
    validation_split=0.2,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
    ]
)

In [None]:
# Plot training history

In [None]:
# Plot training vs. validation loss of both bi-encoder trainings side by side:
# left panel Student_ChatGPT, right panel Only_ChatGPT.
plt.figure(figsize=(12, 5))

loss_panels = [
    (history_student_chatgpt, 'Model Student_ChatGPT: Training and Validation Loss'),
    (history_only_chatgpt, 'Model Only_ChatGPT: Training and Validation Loss'),
]
for panel_number, (run_history, panel_title) in enumerate(loss_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.plot(run_history.history['loss'], label='Training Loss')
    plt.plot(run_history.history['val_loss'], label='Validation Loss')
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Generate embeddings untuk kalimat Student, ChatGPT_1 dan ChatGPT_2

# Fungsi untuk menghitung similarity score

# Hitung similarity scores X-Student (Model Student_ChatGPT), X-ChatGPT1 (Model Student_ChatGPT), X-ChatGPT2 (Model Only_ChatGPT)

In [None]:
# Embedding generation
def generate_embeddings(tokens, model, batch_size=32):
    """
    Compute embeddings for tokenized texts.

    The texts are passed through the model in chunks of `batch_size` and the
    results are concatenated. Running the whole corpus through BERT in a single
    forward pass needs memory proportional to the corpus size and fails on
    larger datasets.

    Args:
        tokens: Tokenized texts ('input_ids', 'attention_mask').
        model: Bi-encoder model taking [input_ids, attention_mask].
        batch_size: Number of texts per forward pass.

    Returns:
        tf.Tensor: Embeddings, one row per text, in input order.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1 (got {batch_size}).")

    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']
    n_samples = int(input_ids.shape[0])

    # Small (or empty) inputs fit in a single call
    if n_samples <= batch_size:
        return model([input_ids, attention_mask])

    chunks = [
        model([input_ids[start:start + batch_size], attention_mask[start:start + batch_size]])
        for start in range(0, n_samples, batch_size)
    ]
    return tf.concat(chunks, axis=0)

# Similarity score computation
def compute_similarity(embedding1, embedding2, verbose=False):
    """
    Compute rescaled cosine similarities between two sets of embeddings.

    Both inputs are L2-normalized, every row of `embedding1` is compared with
    every row of `embedding2`, and the cosine similarity is mapped from
    [-1, 1] to [0, 1] via (cos + 1) / 2.

    Args:
        embedding1: Embedding(s) of shape (dim,) or (n1, dim); tensor or array-like.
        embedding2: Embedding(s) of shape (dim,) or (n2, dim); tensor or array-like.
        verbose: When True, print the input shapes, all similarity values and
            their mean. Off by default because this function is called once
            per text inside loops, which would flood the notebook output.

    Returns:
        np.ndarray: Flat float32 array of length n1 * n2 with values in [0, 1],
        ordered row by row (all comparisons of the first row of `embedding1`
        first).
    """
    # 1-D inputs are treated as a single embedding
    first = np.atleast_2d(np.asarray(embedding1, dtype=np.float32))
    second = np.atleast_2d(np.asarray(embedding2, dtype=np.float32))

    if verbose:
        print(f"Embedding1 shape: {first.shape}")
        print(f"Embedding2 shape: {second.shape}")

    def _unit_rows(matrix):
        """Scale every row to unit L2 norm; the floor avoids division by zero."""
        squared_norms = np.sum(np.square(matrix), axis=-1, keepdims=True)
        return matrix / np.sqrt(np.maximum(squared_norms, 1e-12))

    # All-pairs cosine similarity, flattened row by row
    cosine = (_unit_rows(first) @ _unit_rows(second).T).reshape(-1)

    # Rounding can push values slightly outside [-1, 1]; clip so the documented
    # [0, 1] range is guaranteed after rescaling.
    similarities_np = (np.clip(cosine, -1.0, 1.0) + 1) / 2

    if verbose:
        print(f"Similarity values: {similarities_np}")
        print(f"Mean similarity: {np.mean(similarities_np)}")

    return similarities_np

def _mean_similarity_to_random_peers(embeddings, fraction=0.7):
    """
    Score every embedding by its mean similarity to a random subset of its own set.

    For each row a fresh random subset holding `fraction` of all rows is drawn
    without replacement, and the mean of compute_similarity(row, subset) is
    recorded. The subset is drawn from the full set, so it may contain the row itself.

    Returns:
        np.ndarray: One mean similarity per row of `embeddings`.
    """
    total = len(embeddings)
    peer_count = int(total * fraction)
    row_means = []
    for row in embeddings:
        chosen = np.random.choice(total, peer_count, replace=False)
        peers = tf.gather(embeddings, chosen)
        row_means.append(np.mean(compute_similarity(tf.expand_dims(row, 0), peers)))
    return np.array(row_means)


# Generate embeddings for all corpora
print("Generating embeddings for Student...")
student_embeddings_1 = generate_embeddings(std_tokens, bi_encoder_student_chatgpt)
print("Generating embeddings for ChatGPT (Student_ChatGPT)...")
chatgpt_embeddings_1 = generate_embeddings(gpt_tokens_1, bi_encoder_student_chatgpt)
print("Generating embeddings for ChatGPT (Only_ChatGPT)...")
chatgpt_embeddings_2 = generate_embeddings(gpt_tokens_2, bi_encoder_only_chatgpt)

# Similarity scores: each text against a random 70% of its own corpus
# 1. Student texts (Student_ChatGPT model)
student_similarity_scores = _mean_similarity_to_random_peers(student_embeddings_1)

# 2. ChatGPT texts (Student_ChatGPT model)
chatgpt1_similarity_scores = _mean_similarity_to_random_peers(chatgpt_embeddings_1)

# 3. ChatGPT knowledge texts (Only_ChatGPT model)
chatgpt2_similarity_scores = _mean_similarity_to_random_peers(chatgpt_embeddings_2)

# Reference embeddings and scores per corpus, pickled by a later cell
reference_embeddings = {
    corpus_name: {
        'embeddings': corpus_embeddings.numpy(),
        'similarity_scores': corpus_scores
    }
    for corpus_name, corpus_embeddings, corpus_scores in (
        ('student', student_embeddings_1, student_similarity_scores),
        ('chatgpt', chatgpt_embeddings_1, chatgpt1_similarity_scores),
        ('chatgpt_knowledge', chatgpt_embeddings_2, chatgpt2_similarity_scores),
    )
}


In [None]:
# Simpan model Bi-Encoder untuk student_chatgpt

# Simpan model Bi-Encoder untuk only_chatgpt

# Simpan tokenizer configuration

In [None]:
# Create the output directory. exist_ok=True replaces the separate
# "exists?" check, which can race with another process creating the directory.
os.makedirs('saved_models', exist_ok=True)

# Save the bi-encoder for Student_ChatGPT
bi_encoder_student_chatgpt.save('saved_models/bi_encoder_student_chatgpt.h5')

# Save the bi-encoder for Only_ChatGPT
bi_encoder_only_chatgpt.save('saved_models/bi_encoder_only_chatgpt.h5')

# Save the tokenizer configuration
tokenizer.save_pretrained('saved_models/tokenizer')

# Save the reference embeddings and their similarity scores
with open('saved_models/reference_embeddings.pkl', 'wb') as f:
    pickle.dump(reference_embeddings, f)

print("Reference embeddings saved successfully!")

In [None]:
# Pada bagian klasifikasi, dataset dari examples dan knowledge digabung, lalu tiap data menghasilkan 2 embeddings dari model, 2 similarity score(1. lebih mirip Student atau ChatGPT, 2. Seberapa mirip dengan Knowledge/Pengetahuan ChatGPT), dan fitur stylometric 
#Dataset dibagi menjadi data training 70%, Validation 20%, Test 10%

In [None]:
# Stylometric feature extraction
def extract_stylometric_features(text):
    """
    Extract stylometric features from a text.

    Words are the whitespace-separated tokens of the text.

    Args:
        text (str): Input text.

    Returns:
        dict: Stylometric features:
            - 'avg_word_length': mean number of characters per word
              (whitespace is not counted; 0 for a text without words)
            - 'unique_word_ratio': distinct words / total words
              (0 for a text without words)
            - 'punctuation_ratio': count of . , ! ? ; : divided by the total
              number of characters (0 for an empty text)
            - 'sentence_length': total number of words in the text
    """
    # Basic features
    words = text.split()
    n_words = len(words)
    n_chars = len(text)

    if n_words > 0:
        # Average over the words themselves. Dividing the full text length by
        # the word count would add every space to the "word length".
        avg_word_length = sum(len(word) for word in words) / n_words
        unique_word_ratio = len(set(words)) / n_words
    else:
        avg_word_length = 0
        unique_word_ratio = 0

    # Syntactic features
    punctuation_ratio = len(re.findall(r'[.,!?;:]', text)) / n_chars if n_chars > 0 else 0

    return {
        'avg_word_length': avg_word_length,
        'unique_word_ratio': unique_word_ratio,
        'punctuation_ratio': punctuation_ratio,
        'sentence_length': n_words
    }

# Extract the stylometric features for every corpus (one dict per text)
print("Ekstraksi fitur stylometric untuk Student...")
student_features = list(map(extract_stylometric_features, std_par))

print("Ekstraksi fitur stylometric untuk ChatGPT (Student_ChatGPT)...")
chatgpt_features_1 = list(map(extract_stylometric_features, gpt_par_1))

print("Ekstraksi fitur stylometric untuk ChatGPT (Only_ChatGPT)...")
chatgpt_features_2 = list(map(extract_stylometric_features, gpt_par_2))

# Turn the feature dicts into DataFrames (one column per feature)
student_features_df = pd.DataFrame(student_features)
chatgpt_features_1_df = pd.DataFrame(chatgpt_features_1)
chatgpt_features_2_df = pd.DataFrame(chatgpt_features_2)

# Preview the extracted features of each corpus
feature_previews = (
    ("\nFitur Stylometric Student:", student_features_df),
    ("\nFitur Stylometric ChatGPT (Student_ChatGPT):", chatgpt_features_1_df),
    ("\nFitur Stylometric ChatGPT (Only_ChatGPT):", chatgpt_features_2_df),
)
for preview_heading, preview_frame in feature_previews:
    print(preview_heading)
    display(preview_frame.head())

In [None]:
# Stack the features of all three corpora so one scaler covers them all
all_features = pd.concat([student_features_df, chatgpt_features_1_df, chatgpt_features_2_df], axis=0)

# Standardize every feature (zero mean, unit variance) with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/validation/test
# split made later in the notebook, so held-out statistics influence the scaling.
scaler = StandardScaler()
normalized_features = scaler.fit_transform(all_features)

# Cut the scaled matrix back into the three corpora, in concatenation order
n_student = len(student_features_df)
n_chatgpt_1 = len(chatgpt_features_1_df)

(
    student_features_normalized,
    chatgpt_features_1_normalized,
    chatgpt_features_2_normalized,
) = np.split(normalized_features, [n_student, n_student + n_chatgpt_1])

print("Fitur Student setelah normalisasi:")
print(student_features_normalized[:5])

# Persist the fitted scaler so inference can apply the same scaling
with open('scaler_stylometric.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [None]:
# Attach a label to each corpus for visualization and combine them.
# .assign() returns labelled copies instead of adding a 'label' column to the
# feature frames in place: with in-place labelling, re-running the scaling cell
# would feed the text column to StandardScaler and fail.
# ignore_index=True gives the combined frame a unique index; the three frames
# otherwise all start at 0, and duplicate index labels can break plotting helpers.
combined_features = pd.concat(
    [
        student_features_df.assign(label='Esai Siswa'),
        chatgpt_features_1_df.assign(label='Esai ChatGPT'),
        chatgpt_features_2_df.assign(label='Pengetahuan ChatGPT'),
    ],
    axis=0,
    ignore_index=True
)

In [None]:
# Box plot of every stylometric feature per label, arranged in a 2x2 grid
stylometric_columns = ['avg_word_length', 'unique_word_ratio', 'punctuation_ratio', 'sentence_length']

plt.figure(figsize=(15, 10))
for panel_number, feature in enumerate(stylometric_columns, start=1):
    plt.subplot(2, 2, panel_number)
    sns.boxplot(data=combined_features, x='label', y=feature)
    plt.title(f'Distribusi {feature}')
plt.tight_layout()
plt.show()

In [None]:
# Violin plot of avg_word_length per label, with an embedded box plot (inner='box')
plt.figure(figsize=(12, 7))
sns.violinplot(x='label', y='avg_word_length', data=combined_features, inner='box')
plt.title('Distribusi Panjang Kata Rata-Rata')
plt.show()

In [None]:
# Pairwise scatter plots of the four stylometric features, colored by label
sns.pairplot(combined_features, hue='label', vars=['avg_word_length', 'unique_word_ratio', 'punctuation_ratio', 'sentence_length'])
plt.show()

In [None]:
# Prepare the similarity scores used as classifier input.
#
# Every sample gets the same three measurements, in the column order
# (Student, ChatGPT1, ChatGPT2) expected by the similarity_score input:
#   0: mean similarity to the student reference embeddings   (Student_ChatGPT model)
#   1: mean similarity to the ChatGPT reference embeddings   (Student_ChatGPT model)
#   2: mean similarity to the ChatGPT knowledge embeddings   (Only_ChatGPT model)
#
# Filling the "other" columns with zeros would leak the label (column 0 would
# be non-zero only for student rows and column 1 only for ChatGPT rows), and
# copying knowledge scores by position would attach scores of unrelated texts.
# Computing all three scores for every sample avoids both problems.
def _mean_similarity_to_references(query_embeddings, reference_matrix, exclude_self=False):
    """
    Mean rescaled cosine similarity ([0, 1]) of each query row to all reference rows.

    Set exclude_self=True when the queries ARE the references (same rows, same
    order): each row's similarity to itself is then left out of its mean.
    """
    queries = np.asarray(query_embeddings, dtype=np.float32)
    references = np.asarray(reference_matrix, dtype=np.float32)
    queries = queries / np.maximum(np.linalg.norm(queries, axis=1, keepdims=True), 1e-12)
    references = references / np.maximum(np.linalg.norm(references, axis=1, keepdims=True), 1e-12)
    scores = (queries @ references.T + 1.0) / 2.0
    if exclude_self:
        return (scores.sum(axis=1) - np.diag(scores)) / max(scores.shape[1] - 1, 1)
    return scores.mean(axis=1)


# Column 2 needs embeddings from the Only_ChatGPT model for the classifier samples
student_embeddings_2 = generate_embeddings(std_tokens, bi_encoder_only_chatgpt)
chatgpt1_embeddings_2 = generate_embeddings(gpt_tokens_1, bi_encoder_only_chatgpt)

student_similarity_combined = np.column_stack([
    _mean_similarity_to_references(student_embeddings_1, student_embeddings_1, exclude_self=True),
    _mean_similarity_to_references(student_embeddings_1, chatgpt_embeddings_1),
    _mean_similarity_to_references(student_embeddings_2, chatgpt_embeddings_2)
])

chatgpt1_similarity_combined = np.column_stack([
    _mean_similarity_to_references(chatgpt_embeddings_1, student_embeddings_1),
    _mean_similarity_to_references(chatgpt_embeddings_1, chatgpt_embeddings_1, exclude_self=True),
    _mean_similarity_to_references(chatgpt1_embeddings_2, chatgpt_embeddings_2)
])

# Input layers for the three kinds of features.
# The layer names double as the dictionary keys of the classifier inputs built later.
bert_embedding_input = tf.keras.layers.Input(
    shape=(128,),  # Embedding size (taken from the bi-encoder output)
    dtype=tf.float32, 
    name="bert_embedding"
)
stylometric_input = tf.keras.layers.Input(
    shape=(4,),  # Number of stylometric features
    dtype=tf.float32, 
    name="stylometric_features"
)
similarity_score_input = tf.keras.layers.Input(
    shape=(3,),  # Similarity scores (Student, ChatGPT1, ChatGPT2)
    dtype=tf.float32, 
    name="similarity_score"
)

# One dense layer per input, sized to the width of that input
bert_dense = tf.keras.layers.Dense(64, activation="relu")(bert_embedding_input)
style_dense = tf.keras.layers.Dense(16, activation="relu")(stylometric_input)
sim_dense = tf.keras.layers.Dense(8, activation="relu")(similarity_score_input)

# Merge all feature branches
combined = tf.keras.layers.Concatenate()([bert_dense, style_dense, sim_dense])

# Dense layers for classification; the sigmoid output is a single probability
x = tf.keras.layers.Dense(64, activation="relu")(combined)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(32, activation="relu")(x)
output = tf.keras.layers.Dense(1, activation="sigmoid")(x)

# Build the classifier model
classifier = tf.keras.Model(
    inputs=[bert_embedding_input, stylometric_input, similarity_score_input],
    outputs=output,
    name="text_classifier"
)

# Compile the model. The metric order (accuracy, precision, recall, AUC) is the
# order in which classifier.evaluate returns its values after the loss.
classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tf.keras.metrics.AUC()]
)

# Show the model architecture
classifier.summary()

In [None]:
# Prepare the classifier inputs.
# Embeddings come from the Student_ChatGPT bi-encoder.
student_embeddings_reduced = student_embeddings_1.numpy()
chatgpt1_embeddings_reduced = chatgpt_embeddings_1.numpy()

# Class labels: 0 = Student, 1 = ChatGPT
student_labels = np.zeros(len(student_embeddings_reduced))
chatgpt_labels = np.ones(len(chatgpt1_embeddings_reduced))

# Stack student rows first, then ChatGPT rows, identically for every feature group
all_embeddings = np.vstack([student_embeddings_reduced, chatgpt1_embeddings_reduced])
all_stylometric_features = np.vstack([student_features_normalized, chatgpt_features_1_normalized])
all_similarity_scores = np.vstack([student_similarity_combined, chatgpt1_similarity_combined])
all_labels = np.hstack([student_labels, chatgpt_labels])

# Stratified split on row positions: 70% training, then the remaining 30%
# into validation and test (test is 0.33 * 0.3, i.e. about 10% of all rows)
sample_positions = np.arange(len(all_labels))
train_idx, temp_idx = train_test_split(
    sample_positions,
    test_size=0.3,
    random_state=42,
    stratify=all_labels
)
val_idx, test_idx = train_test_split(
    temp_idx,
    test_size=0.33,
    random_state=42,
    stratify=all_labels[temp_idx]
)

# Model inputs per split, keyed by the names of the classifier's input layers
feature_arrays = {
    "bert_embedding": all_embeddings,
    "stylometric_features": all_stylometric_features,
    "similarity_score": all_similarity_scores
}
train_inputs, val_inputs, test_inputs = (
    {input_name: values[split_idx] for input_name, values in feature_arrays.items()}
    for split_idx in (train_idx, val_idx, test_idx)
)

train_labels, val_labels, test_labels = (
    all_labels[split_idx] for split_idx in (train_idx, val_idx, test_idx)
)

print(f"Training data: {len(train_labels)}")
print(f"Validation data: {len(val_labels)}")
print(f"Test data: {len(test_labels)}")


In [None]:
# Train the classifier; stop once validation loss has not improved for
# 3 epochs and roll back to the best weights seen.
print("Training Text Classifier...")
stop_on_plateau = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)
history_classifier = classifier.fit(
    train_inputs,
    train_labels,
    validation_data=(val_inputs, val_labels),
    epochs=50,
    batch_size=32,
    callbacks=[stop_on_plateau]
)

# Plot the training history: loss on the left, accuracy on the right
curve_panels = [
    {
        'train_key': 'loss', 'val_key': 'val_loss',
        'train_label': 'Training Loss', 'val_label': 'Validation Loss',
        'title': 'Model Loss', 'ylabel': 'Loss',
    },
    {
        'train_key': 'accuracy', 'val_key': 'val_accuracy',
        'train_label': 'Training Accuracy', 'val_label': 'Validation Accuracy',
        'title': 'Model Accuracy', 'ylabel': 'Accuracy',
    },
]

plt.figure(figsize=(12, 5))
for panel_number, panel in enumerate(curve_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.plot(history_classifier.history[panel['train_key']], label=panel['train_label'])
    plt.plot(history_classifier.history[panel['val_key']], label=panel['val_label'])
    plt.title(panel['title'])
    plt.xlabel('Epoch')
    plt.ylabel(panel['ylabel'])
    plt.legend()

plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
def evaluate_split(model, inputs, labels, metric_prefix, cm_title, report_header,
                   class_names=('Student', 'ChatGPT'), threshold=0.5):
    """Evaluate a binary classifier on one data split and display the results.

    Runs ``model.evaluate``, prints the five returned values, thresholds the
    predicted scores into class labels, draws the confusion matrix as a
    heatmap, and prints the scikit-learn classification report.

    Args:
        model: Compiled Keras model. ``evaluate`` must return exactly five
            values, read here as loss, accuracy, precision, recall and AUC.
            NOTE(review): that order depends on the ``compile(metrics=...)``
            call, which is outside this chunk -- confirm.
        inputs: Model inputs for the split (here a dict of named arrays).
        labels: Ground-truth labels aligned with ``inputs``.
        metric_prefix: Text placed before each metric name when printing,
            e.g. ``"Validation"`` or ``"Test"``.
        cm_title: Title for the confusion-matrix figure.
        report_header: Line printed right before the classification report.
        class_names: Display names for the classes, in sorted-label order.
            NOTE(review): assumes label 0 is Student and 1 is ChatGPT -- verify
            against where ``all_labels`` is built.
        threshold: Scores strictly above this value are mapped to class 1.

    Returns:
        dict with keys ``"metrics"`` (tuple: loss, accuracy, precision,
        recall, auc), ``"predictions"`` (raw model outputs),
        ``"predictions_binary"`` (thresholded int array) and
        ``"confusion_matrix"``.
    """
    names = list(class_names)

    # Aggregate metrics computed by Keras.
    loss, acc, precision, recall, auc = model.evaluate(inputs, labels)
    print(f"{metric_prefix} Loss: {loss:.4f}")
    print(f"{metric_prefix} Accuracy: {acc:.4f}")
    print(f"{metric_prefix} Precision: {precision:.4f}")
    print(f"{metric_prefix} Recall: {recall:.4f}")
    print(f"{metric_prefix} AUC: {auc:.4f}")

    # Per-sample scores, thresholded into hard class labels.
    predictions = model.predict(inputs)
    predictions_binary = (predictions > threshold).astype(int)

    # Confusion matrix: rows are actual classes, columns are predicted classes.
    cm_values = confusion_matrix(labels, predictions_binary)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm_values, annot=True, fmt='d', cmap='Blues',
                xticklabels=names, yticklabels=names, ax=ax)
    ax.set_title(cm_title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

    # Per-class precision / recall / F1.
    print(report_header)
    print(classification_report(labels, predictions_binary, target_names=names))

    return {
        "metrics": (loss, acc, precision, recall, auc),
        "predictions": predictions,
        "predictions_binary": predictions_binary,
        "confusion_matrix": cm_values,
    }


# Evaluate the model on the validation set.
val_eval = evaluate_split(
    classifier, val_inputs, val_labels,
    metric_prefix="Validation",
    cm_title='Confusion Matrix',
    report_header="\nClassification Report:",
)
# Keep the original notebook-level names available.
val_loss, val_acc, val_precision, val_recall, val_auc = val_eval["metrics"]
val_predictions = val_eval["predictions"]
val_predictions_binary = val_eval["predictions_binary"]
cm = val_eval["confusion_matrix"]

# Evaluate the model on the test set.
test_eval = evaluate_split(
    classifier, test_inputs, test_labels,
    metric_prefix="Test",
    cm_title='Test Set Confusion Matrix',
    report_header="\nTest Set Classification Report:",
)
test_loss, test_acc, test_precision, test_recall, test_auc = test_eval["metrics"]
test_predictions = test_eval["predictions"]
test_predictions_binary = test_eval["predictions_binary"]
cm_test = test_eval["confusion_matrix"]


In [None]:
# Directory that receives every persisted artifact.
SAVE_DIR = 'saved_models'

# Create the directory if it is missing. exist_ok=True avoids the
# check-then-create pattern and still raises if the path exists as a plain file.
os.makedirs(SAVE_DIR, exist_ok=True)

# Save the bi-encoder model for Student_ChatGPT.
bi_encoder_student_chatgpt.save(f'{SAVE_DIR}/bi_encoder_student_chatgpt.h5')

# Save the bi-encoder model for Only_ChatGPT.
bi_encoder_only_chatgpt.save(f'{SAVE_DIR}/bi_encoder_only_chatgpt.h5')

# Save the classifier model.
classifier.save(f'{SAVE_DIR}/text_classifier.h5')

# Save the tokenizer configuration.
tokenizer.save_pretrained(f'{SAVE_DIR}/tokenizer')

# Save the scaler used for the stylometric features.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# this file from a trusted location.
with open(f"{SAVE_DIR}/scaler_stylometric.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

print("Model dan konfigurasi berhasil disimpan!")

In [None]:
# TODO: Model inference (not implemented yet)
# - Import all required packages
# - Load all required models
# - Run inference