In [None]:
#Import Library

import os
import pickle
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from nltk.tokenize import sent_tokenize
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import BertTokenizer, TFBertModel

In [None]:
# Load both Excel datasets.
# `student_chatgpt` is later read through its 'Pelajar' (student) and 'GPT' columns,
# `only_chatgpt` through its 'GPT' column.
student_chatgpt = pd.read_excel("example_datasets/examples-datasets-mar25.xlsx")
only_chatgpt = pd.read_excel("example_datasets/knowledge-datasets-mar25.xlsx")

In [None]:
# Show the structure (columns, dtypes, non-null counts) of both datasets
print("Dataset Student_ChatGPT:")
student_chatgpt.info()
print("\nDataset Only_ChatGPT:")
only_chatgpt.info()

# Show the first rows of each dataset.
# display() is called explicitly because a notebook cell only auto-renders its
# LAST expression; a bare `.head()` in the middle of the cell is silently dropped.
print("\nContoh Data Student_ChatGPT:")
display(student_chatgpt.head())

print("\nContoh Data Only_ChatGPT:")
display(only_chatgpt.head())

In [None]:
def preprocess_text(text):
    """
    Normalize a raw text into one lowercase paragraph.

    The text is lower-cased, stripped of leading/trailing whitespace, and every
    run of whitespace (spaces, tabs, newlines) is collapsed into a single space,
    so the paragraph is kept as one continuous string.

    Args:
        text (str): Input text.

    Returns:
        str: The normalized text, or None when the input is not a string or
        contains nothing but whitespace.
    """
    is_usable = isinstance(text, str) and text.strip() != ""
    if not is_usable:
        return None

    lowered = text.lower().strip()
    # Collapse any whitespace run into a single space
    return re.sub(r'\s+', ' ', lowered)

In [None]:
# Preprocess the Student_ChatGPT dataset.
# Each column is cleaned independently; entries for which preprocess_text()
# returns None (non-string or blank cells) are dropped.

# Student essays
std_par = [
    cleaned
    for cleaned in map(preprocess_text, student_chatgpt['Pelajar'])
    if cleaned
]

# ChatGPT essays from the Student_ChatGPT dataset
gpt_par_1 = [
    cleaned
    for cleaned in map(preprocess_text, student_chatgpt['GPT'])
    if cleaned
]

# Preprocess the Only_ChatGPT dataset (ChatGPT texts only)
gpt_par_2 = [
    cleaned
    for cleaned in map(preprocess_text, only_chatgpt['GPT'])
    if cleaned
]

# Report how many paragraphs survived preprocessing
for description, paragraphs in (
    ("Student", std_par),
    ("ChatGPT (Student_ChatGPT)", gpt_par_1),
    ("ChatGPT (Only_ChatGPT)", gpt_par_2),
):
    print(f"Total paragraf {description}: {len(paragraphs)}")

In [None]:
# Initialize the IndoBERT tokenizer (same checkpoint name as the model loaded later)
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p2")

# Text tokenization helper
def tokenize_text(texts, max_length=512):
    """
    Tokenize a batch of texts with the notebook-level IndoBERT tokenizer.

    Every sequence is padded to exactly ``max_length`` tokens and longer
    sequences are truncated, so all rows of the result have the same width.

    Args:
        texts (list): Texts to tokenize.
        max_length (int): Number of tokens each sequence is padded/truncated to.

    Returns:
        The tokenizer output holding TensorFlow tensors (``return_tensors="tf"``);
        this notebook reads its ``input_ids`` and ``attention_mask`` entries.
    """
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_length,
        'return_tensors': "tf",
    }
    return tokenizer(texts, **encoding_options)

In [None]:
# Tokenize the three cleaned corpora with tokenize_text() (default max_length)
print("Tokenisasi kalimat Student...")
student_tokens = tokenize_text(std_par)

print("Tokenisasi kalimat ChatGPT (Student_ChatGPT)...")
chatgpt_tokens = tokenize_text(gpt_par_1 )

print("Tokenisasi kalimat ChatGPT (Only_ChatGPT)...")
chatgpt_tokens_2 = tokenize_text(gpt_par_2)

# Show a tokenization sample: the first student text only
print("\nContoh hasil tokenisasi:")
print(student_tokens['input_ids'][:1])  # Token ids of the first text
print(student_tokens['attention_mask'][:1])  # Its attention mask

In [None]:
# Inspect the distribution of active (non-padding) token counts to judge
# whether max_length is large enough.
student_mask = student_tokens['attention_mask'].numpy()
student_lengths = student_mask.sum(axis=1)   # active tokens per text
padded_length = student_mask.shape[1]        # width the texts were padded/truncated to

plt.figure(figsize=(10, 6))
plt.hist(student_lengths, bins=30)
plt.title('Distribusi Panjang Token (Student)')
plt.xlabel('Jumlah Token Aktif')
plt.ylabel('Frekuensi')
plt.show()
print(f"Rata-rata panjang token: {np.mean(student_lengths):.2f}")

# A text whose mask is fully active filled the whole window, i.e. it reached
# max_length. The limit is read from the tensor itself rather than hard-coded:
# the previous literal (128) did not match the 512-token window used by
# tokenize_text(), so the reported percentage was wrong.
truncated_share = np.mean(student_lengths == padded_length) * 100
print(f"Persentase terpotong: {truncated_share:.2f}%")

In [None]:
# Round-trip check: encode the first student text and decode it again to verify
# that tokenization behaves as expected
sample_text = std_par[0]
sample_tokens = tokenizer.encode(sample_text)
print(f"Teks asli: {sample_text}")
print(f"Token ID: {sample_tokens}")
print(f"Token dekode: {tokenizer.decode(sample_tokens)}")

In [None]:
# Tokenization can be slow, so the results are cached on disk.
# The keys are plain identifiers: the previous 'chatgpt ' key carried a stray
# trailing space, which made lookups by 'chatgpt' fail.
tokenized_data = {
    'student': student_tokens,
    'chatgpt': chatgpt_tokens,
    'chatgpt_2': chatgpt_tokens_2
}

# Keep only input_ids and attention_mask, converted to NumPy arrays before pickling.
# Derived from tokenized_data so both dictionaries always share the same keys.
tokenized_numpy = {
    name: {
        'input_ids': tokens['input_ids'].numpy(),
        'attention_mask': tokens['attention_mask'].numpy()
    }
    for name, tokens in tokenized_data.items()
}

with open('tokenized_data.pkl', 'wb') as f:
    pickle.dump(tokenized_numpy, f)

In [None]:
# Load the pretrained IndoBERT model (TensorFlow weights) used as the encoder backbone
bert_model = TFBertModel.from_pretrained("indobenchmark/indobert-base-p2")

#Buat Model Bi-Encoder Student_ChatGPT

#Buat Model Bi-Encoder Only_ChatGPT

In [None]:
# Build a Bi-Encoder model on top of a BERT backbone
def create_bi_encoder(bert_model, max_length=512, embedding_dim=512):
    """
    Build a Keras model that maps a tokenized text to an L2-normalized embedding.

    Args:
        bert_model: BERT backbone, called with ``[input_ids, attention_mask]``.
            Its weights are part of the returned model, so passing the same
            instance to several bi-encoders makes them share (and jointly
            train) the backbone.
        max_length (int): Sequence length of the two inputs. Must match the
            ``max_length`` used when tokenizing. Defaults to 512.
        embedding_dim (int): Width of the two dense layers and therefore of the
            output embedding. Defaults to 512.

    Returns:
        tf.keras.Model: Model named "bi_encoder" with inputs
        ``[input_ids, attention_mask]`` and one unit-length embedding of size
        ``embedding_dim`` per input row.
    """
    # Input layers
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

    # BERT layer
    bert_outputs = bert_model([input_ids, attention_mask])
    pooled_output = bert_outputs[1]  # Use the pooled output for sentence embedding

    # Projection head: dense(tanh) -> dropout -> linear dense
    embedding = tf.keras.layers.Dense(embedding_dim, activation="tanh")(pooled_output)
    embedding = tf.keras.layers.Dropout(0.1)(embedding)
    embedding = tf.keras.layers.Dense(embedding_dim, activation=None)(embedding)

    # Normalize to unit length so that a dot product of two embeddings is
    # their cosine similarity
    embedding = tf.keras.layers.Lambda(
        lambda x: tf.nn.l2_normalize(x, axis=1)
    )(embedding)

    # Create model
    model = tf.keras.Model(
        inputs=[input_ids, attention_mask],
        outputs=embedding,
        name="bi_encoder"
    )

    return model

# Create Bi-Encoder for Student_ChatGPT
bi_encoder_student_chatgpt = create_bi_encoder(bert_model)

# Create Bi-Encoder for Only_ChatGPT.
# It gets its OWN freshly loaded IndoBERT backbone. Wrapping the same
# `bert_model` object in both bi-encoders would make them share the BERT
# weights, so fine-tuning the Only_ChatGPT model would silently overwrite the
# backbone the Student_ChatGPT model was trained with (and vice versa).
# Trade-off: a second copy of the backbone is held in memory.
bert_model_only_chatgpt = TFBertModel.from_pretrained("indobenchmark/indobert-base-p2")
bi_encoder_only_chatgpt = create_bi_encoder(bert_model_only_chatgpt)

In [None]:
# Contrastive loss function
def contrastive_loss(y_true, y_pred, margin=1.0):
    """
    Contrastive loss for a model whose output is a cosine SIMILARITY.

    The classic contrastive loss is defined on a distance. The Siamese models
    in this notebook output the dot product of two unit-length embeddings
    (cosine similarity, higher = more alike), so the similarity is first
    converted to a cosine distance ``d = 1 - similarity``. Feeding the raw
    similarity into the distance formula would do the opposite of what is
    intended: it would push similar pairs towards similarity 0 and dissimilar
    pairs towards similarity 1.

    Args:
        y_true: Pair labels, 1 for similar (positive) pairs, 0 for dissimilar.
        y_pred: Predicted cosine similarity per pair.
        margin (float): Dissimilar pairs are only penalized while their
            distance is below this margin. Defaults to 1.0, i.e. negatives are
            penalized only while their similarity is above 0.

    Returns:
        Scalar tensor: mean loss over the batch.
    """
    # Flatten both so a (batch, 1) label tensor cannot broadcast against a
    # (batch,) prediction into a (batch, batch) matrix.
    y_pred = tf.reshape(tf.convert_to_tensor(y_pred), [-1])
    y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), [-1])

    distance = 1.0 - y_pred
    positive_term = y_true * tf.square(distance)
    negative_term = (1.0 - y_true) * tf.square(tf.maximum(margin - distance, 0.0))
    return tf.reduce_mean(positive_term + negative_term)

In [None]:
# Function to create training pairs
def create_training_pairs(texts1, texts2, is_positive=True):
    """
    Build text pairs and their similarity labels for contrastive training.

    Two modes, selected by ``texts2``:

    * ``texts2 is None``: every unordered pair of distinct positions within
      ``texts1``; all labelled 1 (positive).
    * ``texts2`` given: every cross pair ``(texts1[i], texts2[j])``; all
      labelled 1 if ``is_positive`` else 0.

    In the second mode NO pairs within ``texts1`` are produced. Emitting them
    there as well duplicated the intra-set positives that callers already
    build with a separate ``texts2=None`` call, and mixed positive pairs into
    what callers treat as the negative set.

    Note that the number of pairs grows quadratically with the input size.

    Args:
        texts1 (list): First collection of texts.
        texts2 (list or None): Second collection, or None for intra-set pairs.
        is_positive (bool): Label of the cross pairs (ignored when
            ``texts2`` is None).

    Returns:
        tuple: ``(pairs, labels)`` where ``pairs`` is a list of 2-tuples of
        texts and ``labels`` a list of ints (1 or 0) of the same length.
    """
    if texts2 is None:
        # Pairs within the same dataset are always positive
        pairs = [
            (texts1[i], texts1[j])
            for i in range(len(texts1))
            for j in range(i + 1, len(texts1))
        ]
        return pairs, [1] * len(pairs)

    # Pairs between the two datasets share one label
    cross_label = 1 if is_positive else 0
    pairs = [(first, second) for first in texts1 for second in texts2]
    return pairs, [cross_label] * len(pairs)

In [None]:
# Create training pairs for Student_ChatGPT
# Positive pairs: Student-Student, ChatGPT-ChatGPT
student_student_pairs, student_student_labels = create_training_pairs(std_par, None)
chatgpt_chatgpt_pairs, chatgpt_chatgpt_labels = create_training_pairs(gpt_par_1, None)

# Negative pairs: Student-ChatGPT
student_chatgpt_pairs, student_chatgpt_labels = create_training_pairs(std_par, gpt_par_1, is_positive=False)

# Show the number of pairs per group
print(f"Jumlah pasangan Student-Student (positif): {len(student_student_pairs)}")
print(f"Jumlah pasangan ChatGPT-ChatGPT (positif): {len(chatgpt_chatgpt_pairs)}")
print(f"Jumlah pasangan Student-ChatGPT (negatif): {len(student_chatgpt_pairs)}")

# Combine all pairs for Student_ChatGPT model
all_pairs_student_chatgpt = student_student_pairs + chatgpt_chatgpt_pairs + student_chatgpt_pairs
all_labels_student_chatgpt = student_student_labels + chatgpt_chatgpt_labels + student_chatgpt_labels

print(f"Total jumlah pasangan data untuk model Student_ChatGPT: {len(all_pairs_student_chatgpt)}")
print(f"Distribusi label: Positif = {sum(all_labels_student_chatgpt)}, Negatif = {len(all_labels_student_chatgpt) - sum(all_labels_student_chatgpt)}")

# The pairs are tokenized and converted to training tensors (input_ids_1/2,
# attention_mask_1/2, labels) in the next cell, which also reports how many
# pairs reach max_length. Doing the same tokenization here as well only
# repeated that expensive step, so it was removed from this cell.

In [None]:
# Create training pairs for Student_ChatGPT
# Positive pairs: Student-Student, ChatGPT-ChatGPT
student_student_pairs, student_student_labels = create_training_pairs(std_par, None)
chatgpt_chatgpt_pairs, chatgpt_chatgpt_labels = create_training_pairs(gpt_par_1, None)

# Negative pairs: Student-ChatGPT
student_chatgpt_pairs, student_chatgpt_labels = create_training_pairs(std_par, gpt_par_1, is_positive=False)

# Show the number of pairs per group
print(f"Jumlah pasangan Student-Student (positif): {len(student_student_pairs)}")
print(f"Jumlah pasangan ChatGPT-ChatGPT (positif): {len(chatgpt_chatgpt_pairs)}")
print(f"Jumlah pasangan Student-ChatGPT (negatif): {len(student_chatgpt_pairs)}")

# Combine all pairs for Student_ChatGPT model
all_pairs_student_chatgpt = student_student_pairs + chatgpt_chatgpt_pairs + student_chatgpt_pairs
all_labels_student_chatgpt = student_student_labels + chatgpt_chatgpt_labels + student_chatgpt_labels

print(f"Total jumlah pasangan data untuk model Student_ChatGPT: {len(all_pairs_student_chatgpt)}")
print(f"Distribusi label: Positif = {sum(all_labels_student_chatgpt)}, Negatif = {len(all_labels_student_chatgpt) - sum(all_labels_student_chatgpt)}")

# Training tensors for Student_ChatGPT.
# Every text appears in many pairs, so each DISTINCT text is tokenized exactly
# once and the per-pair tensors are then gathered by index. Tokenizing both
# sides of every pair separately repeated the same work once per pair.
unique_texts = list(dict.fromkeys(std_par + gpt_par_1))
text_index = {text: position for position, text in enumerate(unique_texts)}
unique_tokens = tokenize_text(unique_texts)

pair_idx_1 = np.array([text_index[first] for first, _ in all_pairs_student_chatgpt], dtype=np.int64)
pair_idx_2 = np.array([text_index[second] for _, second in all_pairs_student_chatgpt], dtype=np.int64)

# A text counts as truncated when its attention mask is fully active,
# i.e. it fills the whole max_length window.
unique_mask = unique_tokens['attention_mask'].numpy()
text_hits_max_length = unique_mask.sum(axis=1) == unique_mask.shape[1]
truncated_pairs_count = int(np.sum(text_hits_max_length[pair_idx_1] | text_hits_max_length[pair_idx_2]))

print(f"Jumlah pasangan yang terpotong (mencapai max_length 512): {truncated_pairs_count}")
if all_pairs_student_chatgpt:  # avoid dividing by zero when there are no pairs
    print(f"Persentase pasangan yang terpotong: {truncated_pairs_count / len(all_pairs_student_chatgpt) * 100:.2f}%")

# Per-pair tensors: one row per pair, left side (…_1) and right side (…_2)
input_ids_1 = tf.gather(unique_tokens['input_ids'], pair_idx_1)
attention_mask_1 = tf.gather(unique_tokens['attention_mask'], pair_idx_1)
input_ids_2 = tf.gather(unique_tokens['input_ids'], pair_idx_2)
attention_mask_2 = tf.gather(unique_tokens['attention_mask'], pair_idx_2)
labels = tf.convert_to_tensor(all_labels_student_chatgpt, dtype=tf.float32)


In [None]:
# Siamese network for Student_ChatGPT: both sides of a pair go through the
# same bi-encoder instance.

# Four inputs, created in the order a-ids, a-mask, b-ids, b-mask
input_ids_a, attention_mask_a, input_ids_b, attention_mask_b = (
    tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name=input_name)
    for input_name in ("input_ids_a", "attention_mask_a", "input_ids_b", "attention_mask_b")
)

# Embed side a, then side b, with the shared encoder
embedding_a = bi_encoder_student_chatgpt([input_ids_a, attention_mask_a])
embedding_b = bi_encoder_student_chatgpt([input_ids_b, attention_mask_b])

# Row-wise dot product of the two embeddings (cosine similarity, since the
# bi-encoder L2-normalizes its output)
similarity = tf.reduce_sum(embedding_a * embedding_b, axis=1)

# Model from the four inputs to one similarity value per pair
siamese_inputs_student_chatgpt = [input_ids_a, attention_mask_a, input_ids_b, attention_mask_b]
siamese_model_student_chatgpt = tf.keras.Model(
    inputs=siamese_inputs_student_chatgpt,
    outputs=similarity
)

In [None]:
# Compile model.
# The accuracy metric is spelled out: the plain string 'accuracy' makes Keras
# pick an accuracy variant from the target/output shapes, which is not
# reliable for a model that outputs one similarity value per pair. Here a pair
# counts as "predicted similar" when its similarity exceeds 0.5. The metric
# keeps the name 'accuracy', so the history keys are unchanged.
siamese_model_student_chatgpt.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=contrastive_loss,
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5)]
)

# Shuffle the pairs once, with a fixed seed, before training.
# `validation_split` takes the LAST 20% of the samples as they are passed in,
# before any shuffling. The pair tensors are ordered by group (Student-Student,
# ChatGPT-ChatGPT, then Student-ChatGPT), so without this step the validation
# set would be drawn from the final group only.
shuffled_order = np.random.default_rng(42).permutation(int(labels.shape[0]))
siamese_inputs_shuffled = [
    tf.gather(tensor, shuffled_order)
    for tensor in (input_ids_1, attention_mask_1, input_ids_2, attention_mask_2)
]
siamese_labels_shuffled = tf.gather(labels, shuffled_order)

# Train model
history_student_chatgpt = siamese_model_student_chatgpt.fit(
    siamese_inputs_shuffled,
    siamese_labels_shuffled,
    batch_size=16,
    epochs=3,
    validation_split=0.2
)

In [None]:
# Contrastive loss digunakan untuk melatih Bi-Encoder dengan tujuan:
# - Pasangan kalimat yang mirip memiliki nilai similarity tinggi (loss rendah jika similarity tinggi).
# - Pasangan kalimat yang tidak mirip memiliki nilai similarity rendah (loss rendah jika similarity rendah, loss tinggi jika similarity tinggi).

# Datasets Student_ChatGPT: variable -> std_par & gpt_par_1
# Pasangan Positif:
# - Student - Student Dalam satu data, antar kalimat
# - Student - Student Beda data, Antar kalimat
# - ChatGPT - ChatGPT Dalam satu data, antar kalimat
# - ChatGPT - ChatGPT Beda data, Antar kalimat

# Pasangan Negatif:
# - Student - ChatGPT Beda data, Antar kalimat

#Training Datasets Student_ChatGPT (Gunakan Model Fine-Tuning IndoBERT di atas)

In [None]:
# Create training pairs for Only_ChatGPT
# Positive pairs: ChatGPT-ChatGPT
chatgpt2_chatgpt2_pairs, chatgpt2_chatgpt2_labels = create_training_pairs(gpt_par_2, None)

# NOTE(review): every label in this set is 1 (there are no negative pairs), so
# a contrastive objective can be satisfied by mapping all texts to the same
# embedding. Confirm this is intended or add negative pairs.

# Show the number of pairs
print(f"Jumlah pasangan ChatGPT-ChatGPT (Only_ChatGPT, positif): {len(chatgpt2_chatgpt2_pairs)}")
print(f"Total jumlah pasangan data untuk model Only_ChatGPT: {len(chatgpt2_chatgpt2_pairs)}")
print(f"Distribusi label: Positif = {sum(chatgpt2_chatgpt2_labels)}, Negatif = {len(chatgpt2_chatgpt2_labels) - sum(chatgpt2_chatgpt2_labels)}")

# Training tensors for Only_ChatGPT.
# Each DISTINCT text is tokenized exactly once and the per-pair tensors are
# gathered by index; tokenizing both sides of every pair separately repeated
# the same work once per pair.
unique_texts_only = list(dict.fromkeys(gpt_par_2))
text_index_only = {text: position for position, text in enumerate(unique_texts_only)}
unique_tokens_only = tokenize_text(unique_texts_only)

pair_idx_1_only = np.array([text_index_only[first] for first, _ in chatgpt2_chatgpt2_pairs], dtype=np.int64)
pair_idx_2_only = np.array([text_index_only[second] for _, second in chatgpt2_chatgpt2_pairs], dtype=np.int64)

# A text counts as truncated when its attention mask is fully active,
# i.e. it fills the whole max_length window.
unique_mask_only = unique_tokens_only['attention_mask'].numpy()
text_hits_max_length_only = unique_mask_only.sum(axis=1) == unique_mask_only.shape[1]
truncated_pairs_count_only = int(
    np.sum(text_hits_max_length_only[pair_idx_1_only] | text_hits_max_length_only[pair_idx_2_only])
)

print(f"Jumlah pasangan yang terpotong (mencapai max_length 512): {truncated_pairs_count_only}")
if chatgpt2_chatgpt2_pairs:  # avoid dividing by zero when there are no pairs
    print(f"Persentase pasangan yang terpotong: {truncated_pairs_count_only / len(chatgpt2_chatgpt2_pairs) * 100:.2f}%")

# Per-pair tensors: one row per pair, left side (…_1_only) and right side (…_2_only)
input_ids_1_only = tf.gather(unique_tokens_only['input_ids'], pair_idx_1_only)
attention_mask_1_only = tf.gather(unique_tokens_only['attention_mask'], pair_idx_1_only)
input_ids_2_only = tf.gather(unique_tokens_only['input_ids'], pair_idx_2_only)
attention_mask_2_only = tf.gather(unique_tokens_only['attention_mask'], pair_idx_2_only)
labels_only = tf.convert_to_tensor(chatgpt2_chatgpt2_labels, dtype=tf.float32)

In [None]:
# Siamese network for Only_ChatGPT: both sides of a pair go through the same
# bi-encoder instance.

# Four inputs, created in the order a-ids, a-mask, b-ids, b-mask
input_ids_a_only, attention_mask_a_only, input_ids_b_only, attention_mask_b_only = (
    tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name=input_name)
    for input_name in (
        "input_ids_a_only",
        "attention_mask_a_only",
        "input_ids_b_only",
        "attention_mask_b_only",
    )
)

# Embed side a, then side b, with the shared encoder
embedding_a_only = bi_encoder_only_chatgpt([input_ids_a_only, attention_mask_a_only])
embedding_b_only = bi_encoder_only_chatgpt([input_ids_b_only, attention_mask_b_only])

# Row-wise dot product of the two embeddings (cosine similarity, since the
# bi-encoder L2-normalizes its output)
similarity_only = tf.reduce_sum(embedding_a_only * embedding_b_only, axis=1)

In [None]:
# Create model
siamese_model_only_chatgpt = tf.keras.Model(
    inputs=[input_ids_a_only, attention_mask_a_only, input_ids_b_only, attention_mask_b_only],
    outputs=similarity_only
)

# Compile model.
# The accuracy metric is spelled out: the plain string 'accuracy' makes Keras
# pick an accuracy variant from the target/output shapes, which is not
# reliable for a model that outputs one similarity value per pair. The metric
# keeps the name 'accuracy', so the history keys are unchanged.
siamese_model_only_chatgpt.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=contrastive_loss,
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5)]
)

# Shuffle the pairs once, with a fixed seed, before training.
# `validation_split` takes the LAST 20% of the samples as they are passed in,
# before any shuffling. The pairs are generated text by text, so without this
# step the validation set would contain only pairs among the last few texts.
shuffled_order_only = np.random.default_rng(42).permutation(int(labels_only.shape[0]))
siamese_inputs_only_shuffled = [
    tf.gather(tensor, shuffled_order_only)
    for tensor in (input_ids_1_only, attention_mask_1_only, input_ids_2_only, attention_mask_2_only)
]
siamese_labels_only_shuffled = tf.gather(labels_only, shuffled_order_only)

# Train model
history_only_chatgpt = siamese_model_only_chatgpt.fit(
    siamese_inputs_only_shuffled,
    siamese_labels_only_shuffled,
    batch_size=16,
    epochs=3,
    validation_split=0.2
)

In [None]:
# Datasets Only_ChatGPT: variable -> gpt_par_2
# Pasangan Positif:
# - ChatGPT - ChatGPT Dalam satu data, antar kalimat
# - ChatGPT - ChatGPT Beda data, Antar kalimat


#Training Datasets Only_ChatGPT (Self-Supervised, Gunakan Model Fine-Tuning IndoBERT di atas)

In [None]:
# Plot training history

In [None]:
# Plot training history: one loss panel per Siamese model, side by side
plt.figure(figsize=(12, 5))

loss_panels = (
    (history_student_chatgpt, 'Student_ChatGPT Model Loss'),
    (history_only_chatgpt, 'Only_ChatGPT Model Loss'),
)

for panel_number, (training_history, panel_title) in enumerate(loss_panels, start=1):
    # Training vs. validation loss per epoch for this model
    plt.subplot(1, 2, panel_number)
    plt.plot(training_history.history['loss'], label='Train Loss')
    plt.plot(training_history.history['val_loss'], label='Val Loss')
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Generate embeddings untuk kalimat Student, ChatGPT_1 dan ChatGPT_2

# Fungsi untuk menghitung similarity score

# Hitung similarity scores X-Student (Model Student_ChatGPT), X-ChatGPT1 (Model Student_ChatGPT), X-ChatGPT2 (Model Only_ChatGPT)

In [None]:
# Generate embeddings for Student, ChatGPT_1 and ChatGPT_2
def generate_embeddings(texts, model, tokenizer, batch_size=16, max_length=512):
    """
    Embed a list of texts with a bi-encoder, batch by batch.

    The ``tokenizer`` argument is the one actually used for encoding;
    previously it was accepted but ignored in favour of the notebook-level
    tokenizer.

    Args:
        texts (list): Texts to embed; must not be empty.
        model: Bi-encoder called with ``[input_ids, attention_mask]``.
        tokenizer: Tokenizer used to encode the texts.
        batch_size (int): Number of texts per forward pass. Defaults to 16.
        max_length (int): Length every sequence is padded/truncated to; must
            match the model's input length. Defaults to 512.

    Returns:
        Tensor with one embedding row per input text, in input order.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    if len(texts) == 0:
        raise ValueError("generate_embeddings() needs at least one text")

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = list(texts[start:start + batch_size])
        tokens = tokenizer(
            batch_texts,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors="tf"
        )
        # training=False keeps dropout switched off while embedding
        batch_embeddings = model([tokens['input_ids'], tokens['attention_mask']], training=False)
        embeddings.append(batch_embeddings)

    return tf.concat(embeddings, axis=0)

# Generate embeddings for the three corpora.
# Student and ChatGPT_1 texts are embedded with the Student_ChatGPT bi-encoder,
# ChatGPT_2 texts with the Only_ChatGPT bi-encoder.
# NOTE(review): later cells take dot products between embeddings produced by
# these two different encoders — confirm that comparing across them is intended.
student_embeddings = generate_embeddings(std_par, bi_encoder_student_chatgpt, tokenizer)
chatgpt1_embeddings = generate_embeddings(gpt_par_1, bi_encoder_student_chatgpt, tokenizer)
chatgpt2_embeddings = generate_embeddings(gpt_par_2, bi_encoder_only_chatgpt, tokenizer)

In [None]:
# Function to compute similarity scores
def compute_similarity(embedding, reference_embeddings):
    """
    Best match of each query embedding against a set of reference embeddings.

    Args:
        embedding: 2-D tensor of query embeddings, one per row.
        reference_embeddings: 2-D tensor of reference embeddings, one per row.

    Returns:
        1-D tensor holding, for every query row, the largest dot product with
        any reference row (the cosine similarity when both sides are
        unit-length).
    """
    references_as_columns = tf.transpose(reference_embeddings)
    # Dot product of every query row with every reference row
    pairwise_scores = tf.matmul(embedding, references_as_columns)
    # Keep only the best-matching reference per query
    best_scores = tf.reduce_max(pairwise_scores, axis=1)
    return best_scores

# Containers for the per-text similarity scores (one independent list per corpus)
student_similarity_scores, chatgpt1_similarity_scores, chatgpt2_similarity_scores = [], [], []

In [None]:
# For each text, compute its best similarity with the student, chatgpt1 and
# chatgpt2 reference sets. Result: one (n_texts, 3) array per corpus with the
# columns [student, chatgpt1, chatgpt2].
#
# Two fixes compared with a per-text loop over compute_similarity():
# * A text is never compared with ITSELF. When a corpus is scored against its
#   own reference set, the self-match always gives similarity 1.0, so the
#   "own class" column would be a constant that reveals the label to the
#   classifier, something no unseen text can reproduce at inference time.
# * The arrays are assigned, not appended to, so re-running the cell cannot
#   accumulate duplicate rows.
#
# NOTE(review): chatgpt2_embeddings come from a different bi-encoder than the
# student/chatgpt1 embeddings; scores across the two are kept as in the
# original design — confirm that this comparison is intended.
student_np = student_embeddings.numpy()
chatgpt1_np = chatgpt1_embeddings.numpy()
chatgpt2_np = chatgpt2_embeddings.numpy()


def _max_similarity(queries, references, exclude_self=False):
    """Highest dot product of each query row with any reference row.

    With ``exclude_self=True`` (only valid when ``queries`` and ``references``
    are the same set) the diagonal is overwritten with -1.0, the lowest
    possible cosine similarity, so a text cannot match itself.
    """
    scores = queries @ references.T
    if exclude_self:
        np.fill_diagonal(scores, -1.0)
    return scores.max(axis=1)


student_similarity_scores = np.column_stack([
    _max_similarity(student_np, student_np, exclude_self=True),
    _max_similarity(student_np, chatgpt1_np),
    _max_similarity(student_np, chatgpt2_np),
])

chatgpt1_similarity_scores = np.column_stack([
    _max_similarity(chatgpt1_np, student_np),
    _max_similarity(chatgpt1_np, chatgpt1_np, exclude_self=True),
    _max_similarity(chatgpt1_np, chatgpt2_np),
])

chatgpt2_similarity_scores = np.column_stack([
    _max_similarity(chatgpt2_np, student_np),
    _max_similarity(chatgpt2_np, chatgpt1_np),
    _max_similarity(chatgpt2_np, chatgpt2_np, exclude_self=True),
])

In [None]:
# Simpan model Bi-Encoder untuk student_chatgpt

# Simpan model Bi-Encoder untuk only_chatgpt

# Simpan tokenizer configuration

In [None]:
# Save the Bi-Encoder trained on the Student_ChatGPT pairs
bi_encoder_student_chatgpt.save('saved_models/bi_encoder_student_chatgpt')

# Save the Bi-Encoder trained on the Only_ChatGPT pairs
bi_encoder_only_chatgpt.save('saved_models/bi_encoder_only_chatgpt')

# Save the tokenizer files next to the models (loaded again by load_models())
tokenizer.save_pretrained('saved_models/tokenizer')

In [None]:
# Stylometric feature extraction
def extract_stylometric_features(text):
    """
    Extract simple stylometric features from a text.

    Words are the whitespace-separated tokens of ``text`` (punctuation attached
    to a word counts as part of it).

    Features:
    - avg_word_length: mean number of characters per word. Computed from the
      words themselves; dividing the total text length by the word count would
      also count the separating spaces and overstate the value.
    - unique_word_ratio: distinct words / total words.
    - punctuation_ratio: count of ``. , ! ? ; :`` / total characters.
    - sentence_length: total number of words in the text.

    Args:
        text (str): Input text.

    Returns:
        dict: The four features above, in that key order. All ratios are 0
        for an empty text.
    """
    words = text.split()
    n_words = len(words)
    n_chars = len(text)

    # Word-level features
    if n_words > 0:
        avg_word_length = sum(len(word) for word in words) / n_words
        unique_word_ratio = len(set(words)) / n_words
    else:
        avg_word_length = 0
        unique_word_ratio = 0

    # Punctuation density over the whole text
    punctuation_ratio = len(re.findall(r'[.,!?;:]', text)) / n_chars if n_chars > 0 else 0

    return {
        'avg_word_length': avg_word_length,
        'unique_word_ratio': unique_word_ratio,
        'punctuation_ratio': punctuation_ratio,
        'sentence_length': n_words
    }

# Extract stylometric features for every corpus (one dict per text)
print("Ekstraksi fitur stylometric untuk Student...")
student_features = list(map(extract_stylometric_features, std_par))

print("Ekstraksi fitur stylometric untuk ChatGPT (Student_ChatGPT)...")
chatgpt_features_1 = list(map(extract_stylometric_features, gpt_par_1))

print("Ekstraksi fitur stylometric untuk ChatGPT (Only_ChatGPT)...")
chatgpt_features_2 = list(map(extract_stylometric_features, gpt_par_2))

# One DataFrame per corpus: a row per text, a column per feature
student_features_df = pd.DataFrame(student_features)
chatgpt_features_1_df = pd.DataFrame(chatgpt_features_1)
chatgpt_features_2_df = pd.DataFrame(chatgpt_features_2)

# Preview the first rows of each feature table
feature_previews = (
    ("\nFitur Stylometric Student:", student_features_df),
    ("\nFitur Stylometric ChatGPT (Student_ChatGPT):", chatgpt_features_1_df),
    ("\nFitur Stylometric ChatGPT (Only_ChatGPT):", chatgpt_features_2_df),
)
for preview_heading, feature_frame in feature_previews:
    print(preview_heading)
    display(feature_frame.head())

In [None]:
# Combine all stylometric features for normalization.
# The numeric feature columns are selected explicitly, in the order produced by
# extract_stylometric_features(). A later cell adds a string 'label' column to
# these same DataFrames; without the selection, re-running this cell after
# that one would make StandardScaler fail on the non-numeric column.
STYLOMETRIC_COLUMNS = ['avg_word_length', 'unique_word_ratio', 'punctuation_ratio', 'sentence_length']
all_features = pd.concat(
    [student_features_df, chatgpt_features_1_df, chatgpt_features_2_df],
    axis=0
)[STYLOMETRIC_COLUMNS]

# Standardize the features (zero mean, unit variance per column).
# NOTE(review): the scaler is fitted on every text, including those that later
# end up in the classifier's validation split — confirm this is acceptable.
scaler = StandardScaler()
normalized_features = scaler.fit_transform(all_features)

# Split the normalized matrix back into the three corpora (same row order as the concat)
n_student = len(student_features_df)
n_chatgpt_1 = len(chatgpt_features_1_df)

student_features_normalized = normalized_features[:n_student]
chatgpt_features_1_normalized = normalized_features[n_student:n_student + n_chatgpt_1]
chatgpt_features_2_normalized = normalized_features[n_student + n_chatgpt_1:]

print("Fitur Student setelah normalisasi:")
print(student_features_normalized[:5])

# Persist the fitted scaler so inference can apply the same scaling
with open('scaler_stylometric.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [None]:
# Build one labelled table for visualization.
# .assign() returns labelled COPIES, so the per-corpus feature DataFrames stay
# purely numeric and the cells that scale them remain re-runnable.
# ignore_index=True gives the combined table a unique row index instead of
# three overlapping 0..n ranges.
combined_features = pd.concat(
    [
        student_features_df.assign(label='Esai Siswa'),
        chatgpt_features_1_df.assign(label='Esai ChatGPT'),
        chatgpt_features_2_df.assign(label='Pengetahuan ChatGPT'),
    ],
    axis=0,
    ignore_index=True
)

In [None]:
# Box plot of every stylometric feature per text source, on a 2x2 grid
boxplot_features = ['avg_word_length', 'unique_word_ratio', 'punctuation_ratio', 'sentence_length']

plt.figure(figsize=(15, 10))
for panel_number, feature in enumerate(boxplot_features, start=1):
    plt.subplot(2, 2, panel_number)
    sns.boxplot(x='label', y=feature, data=combined_features)
    plt.title(f'Distribusi {feature}')
plt.tight_layout()
plt.show()

In [None]:
# Violin plot (with an inner box plot) of the average word length per text source
plt.figure(figsize=(12, 7))
sns.violinplot(x='label', y='avg_word_length', data=combined_features, inner='box')
plt.title('Distribusi Panjang Kata Rata-Rata')
plt.show()

In [None]:
# Pairwise scatter plots of the four stylometric features, coloured by text source
sns.pairplot(combined_features, hue='label', vars=['avg_word_length', 'unique_word_ratio', 'punctuation_ratio', 'sentence_length'])
plt.show()

In [None]:
# Input layers for the three feature groups fed to the classifier
bert_embedding_input = tf.keras.layers.Input(
    shape=(512,), # Width of the bi-encoder embedding
    dtype=tf.float32, 
    name="bert_embedding"
)
stylometric_input = tf.keras.layers.Input(
    shape=(4,), # Number of stylometric features
    dtype=tf.float32, 
    name="stylometric_features"
)
similarity_score_input = tf.keras.layers.Input(
    shape=(3,), # Similarity scores against Student, ChatGPT1, ChatGPT2
    dtype=tf.float32, 
    name="similarity_score"
)

# One dense projection per input group
bert_dense = tf.keras.layers.Dense(64, activation="relu")(bert_embedding_input)
style_dense = tf.keras.layers.Dense(16, activation="relu")(stylometric_input)
sim_dense = tf.keras.layers.Dense(8, activation="relu")(similarity_score_input)

# Concatenate the three projections into one feature vector
combined = tf.keras.layers.Concatenate()([bert_dense, style_dense, sim_dense])

# Classification head; the single sigmoid unit outputs a probability
x = tf.keras.layers.Dense(64, activation="relu")(combined)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(32, activation="relu")(x)
output = tf.keras.layers.Dense(1, activation="sigmoid")(x)

# Build the classifier model; inputs are matched by their layer names when
# the training data is passed as a dict
classifier = tf.keras.Model(
    inputs=[bert_embedding_input, stylometric_input, similarity_score_input],
    outputs=output,
    name="single_sentence_classifier"
)

# Compile for binary classification
classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tf.keras.metrics.AUC()]
)

classifier.summary()

In [None]:
# Labels per text
student_labels = np.zeros(len(student_embeddings))  # Label 0 for Student
chatgpt_labels = np.ones(len(chatgpt1_embeddings))  # Label 1 for ChatGPT

# Combine embedding data (students first, then ChatGPT)
all_embeddings = np.vstack([student_embeddings.numpy(), chatgpt1_embeddings.numpy()])

# Combine stylometric features (same order)
all_stylometric_features = np.vstack([student_features_normalized, chatgpt_features_1_normalized])

n_student = len(student_embeddings)
n_chatgpt = len(chatgpt1_embeddings)

# Create similarity scores array (same order)
all_similarity_scores = np.vstack([student_similarity_scores, chatgpt1_similarity_scores])

all_labels = np.hstack([student_labels, chatgpt_labels])

# Make sure all feature groups really describe the same texts. The three
# arrays are built in different cells; if one of them is stale or was built
# from a different number of texts, rows would be silently paired with the
# wrong labels.
assert len(all_embeddings) == n_student + n_chatgpt, "embeddings do not match the label count"
assert len(all_stylometric_features) == len(all_labels), (
    f"stylometric features ({len(all_stylometric_features)}) and labels ({len(all_labels)}) differ in length"
)
assert len(all_similarity_scores) == len(all_labels), (
    f"similarity scores ({len(all_similarity_scores)}) and labels ({len(all_labels)}) differ in length"
)

# Split row indices into training and validation, keeping the class ratio
train_idx, val_idx = train_test_split(
    np.arange(len(all_labels)),
    test_size=0.2,
    random_state=42,
    stratify=all_labels
)

# Prepare inputs for training and validation; keys are the classifier's input layer names
train_inputs = {
    "bert_embedding": all_embeddings[train_idx],
    "stylometric_features": all_stylometric_features[train_idx],
    "similarity_score": all_similarity_scores[train_idx]
}
val_inputs = {
    "bert_embedding": all_embeddings[val_idx],
    "stylometric_features": all_stylometric_features[val_idx],
    "similarity_score": all_similarity_scores[val_idx]
}

train_labels_split = all_labels[train_idx]
val_labels_split = all_labels[val_idx]

In [None]:
# Train the classifier on the training split and validate on the held-out split.
# Early stopping ends training once val_loss has not improved for 2 epochs and
# restores the weights of the best epoch.
print("Training Single Sentence Classifier...")
history = classifier.fit(
    train_inputs,
    train_labels_split,
    validation_data=(val_inputs, val_labels_split),
    epochs=20,
    batch_size=32,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=2,
            restore_best_weights=True
        )
    ]
)


In [None]:
# Evaluate the classifier on the validation set.
# (confusion_matrix / classification_report are imported in the notebook's
# import cell at the top.)

# Predict on the validation set. The model returns one probability per row in
# a column vector; flatten it so labels and predictions are both 1-D.
val_predictions = classifier.predict(val_inputs)
val_predictions_binary = (val_predictions > 0.5).astype(int).ravel()
val_true_binary = val_labels_split.astype(int)

# Confusion matrix. `labels=[0, 1]` fixes the layout to 2x2 (Student, ChatGPT)
# even if one class is missing from the labels or the predictions, so it
# always matches the tick labels below.
cm = confusion_matrix(val_true_binary, val_predictions_binary, labels=[0, 1])

# Visualize the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Student', 'ChatGPT'], yticklabels=['Student', 'ChatGPT'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Classification report
print("\nClassification Report:")
print(classification_report(
    val_true_binary,
    val_predictions_binary,
    labels=[0, 1],
    target_names=['Student', 'ChatGPT']
))

In [None]:
# Create the output directory if it does not exist yet.
# exist_ok=True does the check and the creation in one step.
os.makedirs('saved_models', exist_ok=True)

# Save the classifier model
classifier.save('saved_models/single_sentence_classifier.h5')

# Save the tokenizer configuration
tokenizer.save_pretrained('saved_models/tokenizer')

# Save the scaler used for the stylometric features
with open("scaler_stylometric.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("Model dan konfigurasi berhasil disimpan!")

In [None]:
def load_models():
    """
    Load every saved artifact needed for inference.

    Reads the tokenizer, both bi-encoders and the classifier from the
    ``saved_models`` directory, and the stylometric scaler from
    ``scaler_stylometric.pkl`` in the working directory.

    Returns:
        tuple: ``(tokenizer, bi_encoder_student_chatgpt,
        bi_encoder_only_chatgpt, classifier, scaler)``.
    """
    # Tokenizer
    loaded_tokenizer = BertTokenizer.from_pretrained('saved_models/tokenizer')

    # Bi-encoders: Student_ChatGPT first, then Only_ChatGPT
    student_chatgpt_encoder = tf.keras.models.load_model('saved_models/bi_encoder_student_chatgpt')
    only_chatgpt_encoder = tf.keras.models.load_model('saved_models/bi_encoder_only_chatgpt')

    # Final classifier
    loaded_classifier = tf.keras.models.load_model('saved_models/single_sentence_classifier.h5')

    # Stylometric scaler. pickle can execute code while loading, so this file
    # must come from a trusted source.
    with open('scaler_stylometric.pkl', 'rb') as scaler_file:
        loaded_scaler = pickle.load(scaler_file)

    return (
        loaded_tokenizer,
        student_chatgpt_encoder,
        only_chatgpt_encoder,
        loaded_classifier,
        loaded_scaler,
    )

def predict_text_source(text, tokenizer, bi_encoder_student_chatgpt, bi_encoder_only_chatgpt, classifier, scaler,
                        reference_student_embeddings, reference_chatgpt1_embeddings, reference_chatgpt2_embeddings,
                        threshold=0.5):
    """
    Predict if a text is written by a student or ChatGPT.

    Args:
        text (str): Input text.
        tokenizer: BERT tokenizer. Not used directly in this function;
            `tokenize_text` is called without it (presumably it relies on a
            module-level tokenizer -- TODO confirm).
        bi_encoder_student_chatgpt: Bi-encoder model whose embedding is fed to
            the classifier and used for all three similarity scores.
        bi_encoder_only_chatgpt: Bi-encoder model for the only-ChatGPT data.
            Accepted for interface compatibility; no feature is derived from it.
        classifier: Final classifier model. Receives a dict with the keys
            "bert_embedding", "stylometric_features" and "similarity_score".
        scaler: Fitted scaler applied to the stylometric features.
        reference_student_embeddings: Reference embeddings of student texts,
            passed to `compute_similarity`.
        reference_chatgpt1_embeddings: Reference embeddings of the first
            ChatGPT corpus, passed to `compute_similarity`.
        reference_chatgpt2_embeddings: Reference embeddings of the second
            ChatGPT corpus, passed to `compute_similarity`.
        threshold (float): Probability strictly above which the text is
            labelled "ChatGPT". Defaults to 0.5.

    Returns:
        dict: `{"error": "Invalid input text"}` when preprocessing rejects the
        input; otherwise a dict with "probability" (float), "classification"
        ("ChatGPT" or "Student"), "stylometric_features" (the unscaled
        features) and "similarity_scores" (three floats keyed by
        "student_similarity", "chatgpt1_similarity", "chatgpt2_similarity").
    """
    # Preprocess text; a falsy result means the input was rejected
    processed_text = preprocess_text(text)
    if not processed_text:
        return {"error": "Invalid input text"}

    # Tokenize text
    tokens = tokenize_text([processed_text])

    # Generate the embedding used by the classifier and the similarity scores.
    # NOTE(review): the original code also ran bi_encoder_only_chatgpt here but
    # never used its output, so that forward pass was removed. All three
    # similarities (including the ChatGPT-2 one) are computed from the
    # student/ChatGPT encoder's embedding -- confirm this matches how the
    # similarity features were built during training.
    embedding_student_chatgpt = bi_encoder_student_chatgpt([tokens['input_ids'], tokens['attention_mask']])

    # Extract stylometric features and scale them like the training data
    style_features = extract_stylometric_features(processed_text)
    style_features_df = pd.DataFrame([style_features])
    style_features_normalized = scaler.transform(style_features_df)

    # Compute similarity scores against each reference set
    embedding_reshaped = tf.reshape(embedding_student_chatgpt, [1, -1])
    reference_sets = (
        ("student_similarity", reference_student_embeddings),
        ("chatgpt1_similarity", reference_chatgpt1_embeddings),
        ("chatgpt2_similarity", reference_chatgpt2_embeddings),
    )
    # Convert each similarity tensor to a scalar exactly once
    similarities = {
        name: compute_similarity(embedding_reshaped, reference).numpy()[0]
        for name, reference in reference_sets
    }

    # Single row holding the three scores, in student / chatgpt1 / chatgpt2 order
    similarity_scores = np.array([list(similarities.values())])

    # Prepare inputs for classifier
    inputs = {
        "bert_embedding": embedding_student_chatgpt.numpy(),
        "stylometric_features": style_features_normalized,
        "similarity_score": similarity_scores
    }

    # Make prediction
    prediction = classifier.predict(inputs)
    probability = float(prediction[0][0])

    # Determine classification
    classification = "ChatGPT" if probability > threshold else "Student"

    return {
        "probability": probability,
        "classification": classification,
        "stylometric_features": style_features,
        "similarity_scores": {name: float(value) for name, value in similarities.items()}
    }

# Example usage
def demo_prediction():
    """
    Load the saved models and print predictions for one student sample and
    one ChatGPT sample taken from the notebook's preprocessed data.
    """
    tokenizer, bi_encoder_student_chatgpt, bi_encoder_only_chatgpt, classifier, scaler = load_models()

    # Reference embeddings would normally be saved during training; for the
    # demo they are regenerated from the first 10 paragraphs of each corpus.
    ref_student = generate_embeddings(std_par[:10], bi_encoder_student_chatgpt, tokenizer)
    ref_chatgpt1 = generate_embeddings(gpt_par_1[:10], bi_encoder_student_chatgpt, tokenizer)
    ref_chatgpt2 = generate_embeddings(gpt_par_2[:10], bi_encoder_only_chatgpt, tokenizer)

    # Everything predict_text_source needs after the text itself, in order
    shared_args = (
        tokenizer,
        bi_encoder_student_chatgpt,
        bi_encoder_only_chatgpt,
        classifier,
        scaler,
        ref_student,
        ref_chatgpt1,
        ref_chatgpt2,
    )

    # (heading, sample text) pairs, student first
    samples = (
        ("Student Text Prediction:", std_par[0]),
        ("\nChatGPT Text Prediction:", gpt_par_1[0]),
    )

    # Run every prediction before printing anything, so the report lines
    # come after all model output
    outcomes = [
        (heading, predict_text_source(sample_text, *shared_args))
        for heading, sample_text in samples
    ]

    for heading, outcome in outcomes:
        print(heading)
        print(f"Classification: {outcome['classification']}")
        print(f"Probability: {outcome['probability']:.4f}")

# Uncomment to run demo
# demo_prediction()

# Create a simple web interface for the model
def create_web_interface():
    """
    Launch a Gradio text-in / text-out interface around predict_text_source.

    Models are loaded and reference embeddings are generated once, before the
    interface starts; each request then reuses them.
    """
    # Imported here so gradio is only required when the interface is launched
    import gradio as gr

    tokenizer, bi_encoder_student_chatgpt, bi_encoder_only_chatgpt, classifier, scaler = load_models()

    # Arguments for predict_text_source after the text, in positional order.
    # The reference embeddings come from the first 10 paragraphs of each corpus.
    shared_args = (
        tokenizer,
        bi_encoder_student_chatgpt,
        bi_encoder_only_chatgpt,
        classifier,
        scaler,
        generate_embeddings(std_par[:10], bi_encoder_student_chatgpt, tokenizer),
        generate_embeddings(gpt_par_1[:10], bi_encoder_student_chatgpt, tokenizer),
        generate_embeddings(gpt_par_2[:10], bi_encoder_only_chatgpt, tokenizer),
    )

    def predict(text):
        """Return a two-line summary, or the error message for rejected input."""
        outcome = predict_text_source(text, *shared_args)
        if "error" not in outcome:
            return f"Classification: {outcome['classification']}\nProbability: {outcome['probability']:.4f}"
        return outcome["error"]

    text_input = gr.Textbox(lines=10, placeholder="Enter text to analyze...")
    interface = gr.Interface(
        fn=predict,
        inputs=text_input,
        outputs="text",
        title="Student vs ChatGPT Text Classifier",
        description="This model analyzes text to determine if it was written by a student or generated by ChatGPT.",
    )

    interface.launch()

# Uncomment to launch web interface
# create_web_interface()