# Import Libraries


In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import numpy as np
import pandas as pd
import pickle
import os
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import re
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Load Essay Dataset

In [2]:
stdset = pd.read_excel("datasets/esai_siswa_24_25.xlsx")
gptset = pd.read_csv("datasets/esai_gpt_24_25.csv")

In [3]:
stdset.info()
stdset.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 327 entries, 0 to 326
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   teks_esai     327 non-null    object
 1   nama          327 non-null    object
 2   asal_sekolah  327 non-null    object
 3   tahun         327 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 10.3+ KB


Unnamed: 0,teks_esai,nama,asal_sekolah,tahun
0,P3KTAS: Menciptakan Persatuan dalam Masyarakat...,Achmad Muchasan Nafi,SMA PRIBADI BANDUNG,2024
1,Apakah Anda memiliki teman atau saudara yang b...,Achmad Muchasan Nafi,SMA PRIBADI BANDUNG,2024
2,Dengan adanya perbedaan budaya tersebut banyak...,Achmad Muchasan Nafi,SMA PRIBADI BANDUNG,2024
3,Kasus diskriminasi terbesar di Indonesia lainn...,Achmad Muchasan Nafi,SMA PRIBADI BANDUNG,2024
4,Diskriminasi merupakan salah satu faktor yang ...,Achmad Muchasan Nafi,SMA PRIBADI BANDUNG,2024


In [4]:
gptset.info()
gptset.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273 entries, 0 to 272
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Response  273 non-null    object
dtypes: object(1)
memory usage: 2.3+ KB


Unnamed: 0,Response
0,harmoni keberagaman: jalan menuju kejayaan ba...
1,di tengah bising derap langkah modernitas yang...
2,kilas balik pada sejarah panjang negeri ini me...
3,"namun, mengelola keragaman tidaklah semudah me..."
4,salah satu modal utama untuk mencapai kejayaan...


In [5]:
def preprocess_text(text):
    """
    Preprocessing text:
    - Convert text to lowercase
    - Cleaning up excess whitespace
    - Keeps the text as a whole paragraph
    
    Args:
        text (str): input text.
        
    Returns:
        str: the processed text or None if invalid.
    """
    if not isinstance(text, str) or text.strip() == "":
        return None
    
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('_x000d_', '')
    return text

# Essay Informations

In [6]:
# Count number of texts per author
author_counts = stdset.groupby('nama')['teks_esai'].count().sort_values(ascending=False)
total_authors = len(author_counts)

# Count sentences in each text
def count_sentences(text):
    if not isinstance(text, str):
        return 0
    # Clean text first using existing preprocess_text function
    text = preprocess_text(text)
    if not text:
        return 0
    # Split by common sentence endings
    sentences = re.split('[.!?]+', text)
    # Remove empty strings
    sentences = [s.strip() for s in sentences if s.strip()]
    return len(sentences)

# Apply sentence counting to teks_esai
sentence_counts = stdset['teks_esai'].apply(count_sentences)

# Calculate statistics
single_sentence = sum(sentence_counts == 1)
multiple_sentences = sum(sentence_counts > 1)
total_texts = len(sentence_counts)

# Create a DataFrame to show statistics per author
author_stats = pd.DataFrame()
author_stats['nama'] = author_counts.index
author_stats['single_sentence'] = [sum(sentence_counts[stdset['nama'] == author] == 1) for author in author_stats['nama']]
author_stats['multiple_sentences'] = [sum(sentence_counts[stdset['nama'] == author] > 1) for author in author_stats['nama']]
author_stats['total_texts'] = author_counts.values

# Display the table
print("Statistics per Author:")
print(author_stats.to_string(index=False))

# Calculate and display summary statistics
print("\nStudent Summary:")
print(f"Total authors: {total_authors}")
print(f"Total texts: {total_texts}")
print(f"Total single sentences: {single_sentence}")
print(f"Total multiple sentences: {multiple_sentences}")
print(f"Minimum multiple sentences per author: {author_stats['multiple_sentences'].min()}")
print(f"Maximum multiple sentences per author: {author_stats['multiple_sentences'].max()}")
mean_multiple_sentences = author_stats['multiple_sentences'].mean()
print(f"Mean multiple sentences per author: {mean_multiple_sentences:.2f}")


Statistics per Author:
                                           nama  single_sentence  multiple_sentences  total_texts
                     Syarifuddin Muhammad Wahib                2                  28           30
                       Nandra Nadhesya Larasati                2                  18           20
                               Arif Nurfadlilah                5                  15           20
                        Nadia Izzati Firmansyah                1                  19           20
                                Dewi Tyas Utami                1                  18           19
                                   Levi Linardi                1                  18           19
                                 Shulhan Tasdiq                2                  15           17
                      Sekar Anggita Widyatamaka                1                  16           17
                             Belva Aqila Irwani                1                  15           

In [7]:
# Create subsets based on year
author_stats_2024 = author_stats[author_stats['nama'].isin(stdset[stdset['tahun'] == 2024]['nama'].unique())]
author_stats_2025 = author_stats[author_stats['nama'].isin(stdset[stdset['tahun'] == 2025]['nama'].unique())]

# Print statistics for 2024
print("\nStatistics for 2024:")
print(f"Total authors: {len(author_stats_2024)}")
print(f"Total texts: {author_stats_2024['total_texts'].sum()}")
print(f"Total single sentences: {author_stats_2024['single_sentence'].sum()}")
print(f"Total multiple sentences: {author_stats_2024['multiple_sentences'].sum()}")
print(f"Minimum multiple sentences per author: {author_stats_2024['multiple_sentences'].min()}")
print(f"Maximum multiple sentences per author: {author_stats_2024['multiple_sentences'].max()}")
print(f"Mean multiple sentences per author: {author_stats_2024['multiple_sentences'].mean():.2f}")

# Print statistics for 2025  
print("\nStatistics for 2025:")
print(f"Total authors: {len(author_stats_2025)}")
print(f"Total texts: {author_stats_2025['total_texts'].sum()}")
print(f"Total single sentences: {author_stats_2025['single_sentence'].sum()}")
print(f"Total multiple sentences: {author_stats_2025['multiple_sentences'].sum()}")
print(f"Minimum multiple sentences per author: {author_stats_2025['multiple_sentences'].min()}")
print(f"Maximum multiple sentences per author: {author_stats_2025['multiple_sentences'].max()}")
print(f"Mean multiple sentences per author: {author_stats_2025['multiple_sentences'].mean():.2f}")



Statistics for 2024:
Total authors: 10
Total texts: 154
Total single sentences: 12
Total multiple sentences: 142
Minimum multiple sentences per author: 12
Maximum multiple sentences per author: 19
Mean multiple sentences per author: 14.20

Statistics for 2025:
Total authors: 10
Total texts: 173
Total single sentences: 17
Total multiple sentences: 156
Minimum multiple sentences per author: 10
Maximum multiple sentences per author: 28
Mean multiple sentences per author: 15.60


In [8]:
sentence_counts = gptset['Response'].apply(count_sentences)

# Calculate statistics
single_sentence = sum(sentence_counts == 1)
multiple_sentences = sum(sentence_counts > 1)
total_texts = len(sentence_counts)

print("\nChatGPT Summary:")
print(f"Total texts: {total_texts}")
print(f"Total single sentences: {single_sentence}")
print(f"Total multiple sentences: {multiple_sentences}")
print(f"Minimum multiple sentences per author: {author_stats['multiple_sentences'].min()}")
print(f"Maximum multiple sentences per author: {author_stats['multiple_sentences'].max()}")
mean_multiple_sentences = author_stats['multiple_sentences'].mean()
print(f"Mean multiple sentences per author: {mean_multiple_sentences:.2f}")


ChatGPT Summary:
Total texts: 273
Total single sentences: 20
Total multiple sentences: 253
Minimum multiple sentences per author: 10
Maximum multiple sentences per author: 28
Mean multiple sentences per author: 14.90


# Load Dataset for Training

In [9]:
#load dataset
std_txt = []
gpt_txt = []

# Process Student essay
for text in stdset['teks_esai']:
    processed = preprocess_text(text)
    if processed:
        std_txt.append(processed)

# Process ChatGPT essay
for text in gptset['Response']:
    processed = preprocess_text(text)
    if processed:
        gpt_txt.append(processed)


# Show total number of processed texts
print(f"Total student text: {len(std_txt)}")
print(f"Total ChatGPT text: {len(gpt_txt)}")

Total student text: 327
Total ChatGPT text: 273


# Initialize BERT Tokenizer

In [None]:
# Initialize IndoBERT tokenizer
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p2")

# Tokenize function
def tokenize_text(texts, max_length=256):
    """
    Text tokenization using IndoBERT tokenizer.
    
    Args:
        texts (list): List of texts to be tokenized.
        max_length (int): Maximum token length.
        
    Returns:
        dict: Tokenized result, including input_ids and attention_mask.
    """
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="tf"
    )

In [None]:
# Tokenize student and ChatGPT essays
print("Tokenize student essay...")
std_tokens = tokenize_text(std_txt)

print("Tokenize ChatGPT essay...")
gpt_tokens = tokenize_text(gpt_txt)

# Display tokenization results (example: Student)
print("\nExample of tokenization results:")
print(std_tokens['input_ids'][:3])  # Input token ID
print(std_tokens['attention_mask'][:3])  # Attention mask

In [None]:
student_lengths = [sum(mask) for mask in std_tokens['attention_mask'].numpy()]
chatgpt_lengths = [sum(mask) for mask in gpt_tokens['attention_mask'].numpy()]

plt.figure(figsize=(10, 6))
plt.hist(student_lengths, bins=30, alpha=0.5, label='Student', color='purple')
plt.hist(chatgpt_lengths, bins=30, alpha=0.5, label='ChatGPT', color='orange')
plt.title('Distribution of Token Lengths')
plt.xlabel('Active Token Count')
plt.ylabel('Frequency')
plt.show()
print(f"Average token length for Student: {np.mean(student_lengths):.2f}")
print(f"Percentage truncated for Student: {sum(l == 256 for l in student_lengths) / len(student_lengths) * 100:.2f}%")
print(f"Average token length for ChatGPT: {np.mean(chatgpt_lengths):.2f}")
print(f"Percentage truncated for ChatGPT: {sum(l == 256 for l in chatgpt_lengths) / len(chatgpt_lengths) * 100:.2f}%")

In [None]:
# Decoding tokens for ensuring correctness
sample_text = gpt_txt[0]
sample_tokens = tokenizer.encode(sample_text)
print(f"Teks asli: {sample_text}")
print(f"Token ID: {sample_tokens}")
print(f"Token dekode: {tokenizer.decode(sample_tokens)}")

In [None]:
# Save tokenized data
tokenized_data = {
    'student': std_tokens,
    'chatgpt': gpt_tokens
}

# Save input_ids and attention_mask as numpy arrays
tokenized_numpy = {
    'student': {
        'input_ids': std_tokens['input_ids'].numpy(),
        'attention_mask': std_tokens['attention_mask'].numpy()
    },
    'chatgpt ': {
        'input_ids': gpt_tokens ['input_ids'].numpy(),
        'attention_mask': gpt_tokens ['attention_mask'].numpy()
    }
}

os.makedirs("paper", exist_ok=True)

with open('paper/tokenized_data.pkl', 'wb') as f:
    pickle.dump(tokenized_numpy, f)

# Build IndoBERT Semantic Similarity Model

In [None]:
# Load IndoBERT model
bert_model = TFBertModel.from_pretrained("indobenchmark/indobert-base-p2")

# Freeze BERT layers
for layer in bert_model.layers:
    layer.trainable = False

# Define Bi-Encoder model
def model(bert_model):
    """
    Create a Bi-Encoder model with IndoBERT.
    
    Args:
        bert_model (TFBertModel): Base model of IndoBERT.
        
    Returns:
        tf.keras.Model: Bi-Encoder model.
    """
    # Input layer
    input_ids = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name="attention_mask")
    
    # Extract CLS token embeddings from IndoBERT
    bert_output = bert_model(input_ids, attention_mask=attention_mask)[0][:, 0, :]  # [CLS] token
    
    # Dense layer for fine-tuning
    dense1 = tf.keras.layers.Dense(256, activation="relu")(bert_output)
    dropout1 = tf.keras.layers.Dropout(0.1)(dense1)
    dense2 = tf.keras.layers.Dense(256, activation="relu")(dropout1)
    dropout2 = tf.keras.layers.Dropout(0.1)(dense2)
    dense3 = tf.keras.layers.Dense(256)(dropout2)
    
    # Output normalization (L2 normalization)
    normalized_output = tf.nn.l2_normalize(dense3, axis=1)
    
    # Semantic model
    return tf.keras.Model(inputs=[input_ids, attention_mask], outputs=normalized_output)

# Build model
semantic_model = model(bert_model)

# Show model summary
print("Model Summary:")
semantic_model.summary()


# Create Contrastive Pairs

In [None]:
def create_contrastive_pairs(student_tokens, chatgpt_tokens, max_pairs=None):
    """
    Creates data pairs for contrastive learning with dataset-appropriate quantities.

    Args:
        student_tokens: Tokenized student text
        chatgpt_tokens_1: First set of tokenized ChatGPT text
        chatgpt_tokens_2: Second set of tokenized ChatGPT text
        max_pairs: Maximum number of pairs (optional). If None, uses all possible combinations.

    Returns:
        tuple: Dictionary of anchor, positive, negative pairs and labels, and the total number of pairs.
    """
    # Get dataset sizes
    n_student = student_tokens['input_ids'].shape[0]
    n_chatgpt = chatgpt_tokens['input_ids'].shape[0]
    
    # Calculate maximum possible combinations
    max_student_pairs = (n_student * (n_student - 1)) // 2  # student-student combinations
    max_chatgpt_pairs = (n_chatgpt * (n_chatgpt - 1)) // 2  # chatgpt-chatgpt combinations
    max_negative_pairs = n_student * n_chatgpt  # student-chatgpt combinations
    
    # Determine number of pairs to create
    if max_pairs is None:
        # Use minimum number of positive pairs for balance
        n_pos_student = min(max_student_pairs, max_chatgpt_pairs) // 2
        n_pos_chatgpt = n_pos_student
        # Limit negative pairs to balance with positives
        n_neg_pairs = min(max_negative_pairs, 2 * n_pos_student)
    else:
        # If max_pairs is specified, use that with equal proportions
        n_pos_student = max_pairs // 4
        n_pos_chatgpt = max_pairs // 4
        n_neg_pairs = max_pairs // 2
    
    # Ensure we don't exceed the maximum possible combinations
    n_pos_student = min(n_pos_student, max_student_pairs)
    n_pos_chatgpt = min(n_pos_chatgpt, max_chatgpt_pairs)
    n_neg_pairs = min(n_neg_pairs, max_negative_pairs)
    
    # Initialize arrays for data pairs
    anchor_input_ids = []
    anchor_attention_mask = []
    positive_input_ids = []
    positive_attention_mask = []
    negative_input_ids = []
    negative_attention_mask = []
    labels = []
    
    # Generate positive student-student pairs
    if n_pos_student > 0:
        # Create all possible student-student pairs
        student_pairs = [(i, j) for i in range(n_student) for j in range(i+1, n_student)]
        # Randomly select pairs
        selected_pairs = random.sample(student_pairs, n_pos_student)
        
        for idx1, idx2 in selected_pairs:
            # Anchor (student)
            anchor_input_ids.append(student_tokens['input_ids'][idx1])
            anchor_attention_mask.append(student_tokens['attention_mask'][idx1])
            
            # Positive (another student)
            positive_input_ids.append(student_tokens['input_ids'][idx2])
            positive_attention_mask.append(student_tokens['attention_mask'][idx2])
            
            # Negative (from ChatGPT)
            neg_idx = np.random.choice(n_chatgpt)
            negative_input_ids.append(chatgpt_tokens['input_ids'][neg_idx])
            negative_attention_mask.append(chatgpt_tokens['attention_mask'][neg_idx])
            
            # Label (1 for positive pair)
            labels.append(1)
    
    # Generate positive chatgpt-chatgpt pairs
    if n_pos_chatgpt > 0:
        # Create all possible chatgpt-chatgpt pairs
        chatgpt_pairs = [(i, j) for i in range(n_chatgpt) for j in range(i+1, n_chatgpt)]
        # Randomly select pairs
        selected_pairs = random.sample(chatgpt_pairs, n_pos_chatgpt)
        
        for idx1, idx2 in selected_pairs:
            # Anchor
            anchor_input_ids.append(chatgpt_tokens['input_ids'][idx1])
            anchor_attention_mask.append(chatgpt_tokens['attention_mask'][idx1])
            
            # Positive
            positive_input_ids.append(chatgpt_tokens['input_ids'][idx2])
            positive_attention_mask.append(chatgpt_tokens['attention_mask'][idx2])
            
            # Negative (from Student)
            neg_idx = np.random.choice(n_student)
            negative_input_ids.append(student_tokens['input_ids'][neg_idx])
            negative_attention_mask.append(student_tokens['attention_mask'][neg_idx])
            
            # Label (1 for positive pair)
            labels.append(1)
    
    # Generate negative student-chatgpt pairs
    if n_neg_pairs > 0:
        # Create all possible student-chatgpt pairs
        negative_pairs = [(i, j) for i in range(n_student) for j in range(n_chatgpt)]
        # Randomly select pairs
        selected_pairs = random.sample(negative_pairs, n_neg_pairs)
        
        for student_idx, chatgpt_idx in selected_pairs:
            # Anchor (Student)
            anchor_input_ids.append(student_tokens['input_ids'][student_idx])
            anchor_attention_mask.append(student_tokens['attention_mask'][student_idx])
            
            # Negative (ChatGPT)
            negative_input_ids.append(chatgpt_tokens['input_ids'][chatgpt_idx])
            negative_attention_mask.append(chatgpt_tokens['attention_mask'][chatgpt_idx])
            
            # Positive (another Student different from anchor)
            available_pos = [i for i in range(n_student) if i != student_idx]
            if available_pos:  # Ensure there are available indices
                pos_idx = np.random.choice(available_pos)
                positive_input_ids.append(student_tokens['input_ids'][pos_idx])
                positive_attention_mask.append(student_tokens['attention_mask'][pos_idx])
                
                # Label (0 for negative pair)
                labels.append(0)
    
    # Count actual pairs created
    actual_pairs = len(labels)
    
    # Convert to tensors and return
    return {
        'anchor': {
            'input_ids': tf.convert_to_tensor(anchor_input_ids, dtype=tf.int32),
            'attention_mask': tf.convert_to_tensor(anchor_attention_mask, dtype=tf.int32)
        },
        'positive': {
            'input_ids': tf.convert_to_tensor(positive_input_ids, dtype=tf.int32),
            'attention_mask': tf.convert_to_tensor(positive_attention_mask, dtype=tf.int32)
        },
        'negative': {
            'input_ids': tf.convert_to_tensor(negative_input_ids, dtype=tf.int32),
            'attention_mask': tf.convert_to_tensor(negative_attention_mask, dtype=tf.int32)
        },
        'labels': tf.convert_to_tensor(labels, dtype=tf.float32)
    }, actual_pairs

In [None]:
# Create contrastive pairs
student_chatgpt_pairs, total_pairs = create_contrastive_pairs(std_tokens, gpt_tokens, max_pairs=None)

# Show the number of pairs created
print(f"Total contrastive pairs yang dibuat: {total_pairs}")
print(f"- Pasangan positif student-student: {sum(1 for label in student_chatgpt_pairs['labels'].numpy() if label == 1)//2}")
print(f"- Pasangan positif chatgpt-chatgpt: {sum(1 for label in student_chatgpt_pairs['labels'].numpy() if label == 1)//2}")
print(f"- Pasangan negatif student-chatgpt: {sum(1 for label in student_chatgpt_pairs['labels'].numpy() if label == 0)}")

# Build Triplet Input

In [None]:
# Model for training with triplet loss
def triplet_model(semantic_model):
    """
    Build a model for training with triplet loss.
    
    Args:
        semantic_model: The semantic similarity model to be trained.
        
    Returns:
         tf.keras.Model: Model for training with triplet loss.
    """
    # Input for anchor, positive, and negative
    anchor_input_ids = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name="anchor_input_ids")
    anchor_attention_mask = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name="anchor_attention_mask")
    
    positive_input_ids = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name="positive_input_ids")
    positive_attention_mask = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name="positive_attention_mask")
    
    negative_input_ids = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name="negative_input_ids")
    negative_attention_mask = tf.keras.layers.Input(shape=(256,), dtype=tf.int32, name="negative_attention_mask")
    
    # Embedding for anchor, positive, and negative
    anchor_embedding = semantic_model([anchor_input_ids, anchor_attention_mask])
    positive_embedding = semantic_model([positive_input_ids, positive_attention_mask])
    negative_embedding = semantic_model([negative_input_ids, negative_attention_mask])
    
    # measure cosine similarity
    pos_similarity = tf.reduce_sum(anchor_embedding * positive_embedding, axis=1)
    neg_similarity = tf.reduce_sum(anchor_embedding * negative_embedding, axis=1)
    
    # Model output is the similarity score
    output = tf.stack([pos_similarity, neg_similarity], axis=1)
    
    return tf.keras.Model(
        inputs=[
            anchor_input_ids, anchor_attention_mask,
            positive_input_ids, positive_attention_mask,
            negative_input_ids, negative_attention_mask
        ],
        outputs=output
    )

# Create Triplet Loss

In [None]:
# Triplet loss function
def triplet_loss(y_true, y_pred):
    """
    Triplet loss: tunes the network such that
the distance between a and p is smaller than the
distance between a and n.
    
    Args:
        y_true: not used triplet loss.
        y_pred: stack of [positive_similarity, negative_similarity].
        
    Returns:
        tf.Tensor: loss value.
    """
    pos_sim = y_pred[:, 0]
    neg_sim = y_pred[:, 1]
    margin = 0.5
    
    # Triplet loss: max(0, margin - (pos_sim - neg_sim))
    loss = tf.maximum(0., margin - (pos_sim - neg_sim))
    return tf.reduce_mean(loss)

# Train The Model

In [None]:
# Build triplet model for student vs ChatGPT essay
build_triplet = triplet_model(semantic_model)

# Compile model
build_triplet.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=triplet_loss
)

# Training model Student_ChatGPT
print("Training model...")
history = build_triplet.fit(
    x=[
        student_chatgpt_pairs['anchor']['input_ids'],
        student_chatgpt_pairs['anchor']['attention_mask'],
        student_chatgpt_pairs['positive']['input_ids'],
        student_chatgpt_pairs['positive']['attention_mask'],
        student_chatgpt_pairs['negative']['input_ids'],
        student_chatgpt_pairs['negative']['attention_mask']
    ],
    y=student_chatgpt_pairs['labels'],  # not used in triplet loss
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
    ]
)

# Training History

In [None]:
# Plot training history untuk model Student_ChatGPT
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Student_ChatGPT: Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Generate Embeddings Function

In [None]:
def gen_emb(tokens, model):
    """
    Generate embeddings using IndoBERT.
    
    Args:
        tokens: Token from text.
        model: IndoBERT Semantic Similarity.
        
    Returns:
        tf.Tensor: Embeddings.
    """
    return model([tokens['input_ids'], tokens['attention_mask']])

# Cosine Similarity Function

In [None]:
def cos_sim(embedding1, embedding2):
    """
    Calculate similarity score using standard cosine similarity.
    
    Args:
        embedding1: First embedding (input text)
        embedding2: Second embedding (reference)
        
    Returns:
        float: Average cosine similarity score
    """
    # Normalize embeddings (L2 norm)
    embedding1_norm = tf.nn.l2_normalize(embedding1, axis=-1)
    embedding2_norm = tf.nn.l2_normalize(embedding2, axis=-1)
    
    # Calculate cosine similarity
    similarity = tf.reduce_sum(embedding1_norm * embedding2_norm)
    
    return similarity.numpy()


In [None]:
if not os.path.exists('paper'):
    os.makedirs('paper')

history.save('paper/semantic_model.h5')

# Save tokenizer configuration
tokenizer.save_pretrained('paper/tokenizer')

In [None]:
custom_objects = {'TFBertModel': TFBertModel}

with tf.keras.utils.custom_object_scope(custom_objects):
    semantic_model = tf.keras.models.load_model('paper/semantic_model.h5')

# Generate Embeddings

In [None]:
# Generate embeddings untuk semua data
print("Generating embeddings for Student...")
std_emb = gen_emb(std_tokens, semantic_model)
print("Generating embeddings for ChatGPT...")
gpt_emb = gen_emb(gpt_tokens, semantic_model)

# Measure Similarity Score

In [None]:
# 1. Similarity dengan Student (dari model Student_ChatGPT)
std_std_sim_scores = []
std_gpt_sim_scores = []
for emb in std_emb:
    avg_similarity = cos_sim(tf.expand_dims(emb, 0), std_emb)
    std_std_sim_scores.append(avg_similarity)

    avg_similarity = cos_sim(tf.expand_dims(emb, 0), gpt_emb)
    std_gpt_sim_scores.append(avg_similarity)

# 2. Similarity dengan ChatGPT (dari model Student_ChatGPT)
gpt_std_sim_scores = []
gpt_gpt_sim_scores = []
for emb in gpt_emb:
    avg_similarity = cos_sim(tf.expand_dims(emb, 0), std_emb)
    gpt_std_sim_scores.append(avg_similarity)

    avg_similarity = cos_sim(tf.expand_dims(emb, 0), gpt_emb)
    gpt_gpt_sim_scores.append(avg_similarity)

In [None]:
min_length = min(len(std_std_sim_scores), len(std_gpt_sim_scores))
student_similarity_scores = np.array([
    std_std_sim_scores[:min_length], 
    std_gpt_sim_scores[:min_length],
])

min_length = min(len(gpt_std_sim_scores), len(gpt_gpt_sim_scores))
chatgpt_similarity_scores = np.array([
    gpt_std_sim_scores[:min_length], 
    gpt_gpt_sim_scores[:min_length],
])

print(student_similarity_scores.shape)
print(chatgpt_similarity_scores.shape)

ref_emb = {
    'student': {
        'embeddings': std_emb.numpy(),
        'similarity_scores': student_similarity_scores
    },
    'chatgpt': {
        'embeddings': gpt_emb.numpy(),
        'similarity_scores': chatgpt_similarity_scores
    }
}

In [None]:
all_pairs = []
pd.set_option('display.max_colwidth', None)  
pd.set_option('display.width', 1000)        
pd.set_option('display.max_rows', None)      

for i, student_text in enumerate(std_txt):
    if i >= len(std_emb):
        continue
        
    student_embedding = tf.expand_dims(std_emb[i], 0)

    for j in range(len(gpt_txt)):
        if j >= len(gpt_emb):
            continue

        chatgpt_embedding = tf.expand_dims(gpt_emb[j], 0)
        similarity = cos_sim(student_embedding, chatgpt_embedding)
        
        all_pairs.append({
            'student_idx': i,
            'chatgpt_idx': j,
            'student_text': student_text,
            'chatgpt_text': gpt_txt[j],
            'similarity_score': similarity
        })

sorted_pairs = sorted(all_pairs, key=lambda x: x['similarity_score'], reverse=True)

result_v1 = sorted_pairs[:5]

df_v1 = pd.DataFrame(result_v1)[['student_text', 'chatgpt_text', 'similarity_score']]
df_v1.columns = ['Student Essay', 'ChatGPT Essay', 'Similarity Score']

print("=== 1: Standard ===")
display(df_v1)

used_student_indices_v2 = set()
used_chatgpt_indices_v2 = set()
result_v2 = []

for pair in sorted_pairs:
    if (pair['student_idx'] in used_student_indices_v2 or 
        pair['chatgpt_idx'] in used_chatgpt_indices_v2):
        continue
    result_v2.append(pair)
    used_student_indices_v2.add(pair['student_idx'])
    used_chatgpt_indices_v2.add(pair['chatgpt_idx'])
    if len(result_v2) >= 5:
        break

df_v2 = pd.DataFrame(result_v2)[['student_text', 'chatgpt_text', 'similarity_score']]
df_v2.columns = ['Student Essay', 'ChatGPT Essay', 'Similarity Score']

print("=== 2: Unique ===")
display(df_v2)

# Visualization of Similarity Score

In [None]:
plt.figure(figsize=(15, 6))

student_student_color = "#9747FF"  
student_chatgpt_color = "#FCD19C"  
chatgpt_chatgpt_color = "#FFA629"  
chatgpt_student_color = "#E4CCFF"  

# Subplot 1: Student
plt.subplot(1, 3, 1)
plt.scatter(range(len(std_std_sim_scores)), std_std_sim_scores, 
            label='Student-Student', color=student_student_color, s=70, alpha=0.8, edgecolor='black', linewidth=0.5)
plt.scatter(range(len(std_gpt_sim_scores)), std_gpt_sim_scores, 
            label='Student-ChatGPT', color=student_chatgpt_color, s=70, alpha=0.8, edgecolor='black', linewidth=0.5)
plt.title('Student Essay', fontsize=14, fontweight='bold')
plt.ylim(-1.05, 1.05)  
plt.yticks(np.arange(-1, 1.1, 0.1))
plt.legend(fontsize=10)
plt.grid(True, linestyle='--', alpha=0.3)

# Subplot 2: ChatGPT
plt.subplot(1, 3, 2)
plt.scatter(range(len(gpt_std_sim_scores)), gpt_std_sim_scores, 
            label='ChatGPT-Student', color=chatgpt_student_color, s=70, alpha=0.8, edgecolor='black', linewidth=0.5)
plt.scatter(range(len(gpt_gpt_sim_scores)), gpt_gpt_sim_scores, 
            label='ChatGPT-ChatGPT', color=chatgpt_chatgpt_color, s=70, alpha=0.8, edgecolor='black', linewidth=0.5)
plt.title('ChatGPT Essay', fontsize=14, fontweight='bold')
plt.ylim(-1.05, 1.05)  
plt.yticks(np.arange(-1, 1.1, 0.1))
plt.legend(fontsize=10)
plt.grid(True, linestyle='--', alpha=0.3)

plt.suptitle('Similarity Scores Comparison', fontsize=16, fontweight='bold', y=0.98)
plt.tight_layout()
plt.subplots_adjust(top=0.85)  

os.makedirs('paper/images', exist_ok=True)
plt.savefig('paper/images/similarity_scores_comparison.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
with open('paper/reference_embeddings_1.pkl', 'wb') as f:
    pickle.dump(ref_emb, f)

print("Reference embeddings saved successfully!")

# Define Linguistic Features

In [None]:
def linguistic_features(text):
    """
    Features extraction from text:
    1.	Lexical Diversity
    2.	Total words in the essay
    3.	Total unique words*
    4.	Modals
    5.	Stopwords ratio*
    6.	Average sentence length*
    7.	Sentence length variation*
    8.	Punctuation Ratio*

    
    Args:
        text (str): Input text.
        
    Returns:
        dict: Linguistic features.
    """
    words = re.findall(r'\b\w+\b', text)
    sentences = re.split(r'[.!?]', text)
    sentences = [s.strip() for s in sentences if s.strip()]

    word_count = len(words)
    unique_count = len(set(words))
    
    ld = (unique_count / word_count * 100) if word_count > 0 else 0
    
    # Load modals from corpus file
    modals = set()
    if os.path.exists('corpus/Indonesian_Manually_Tagged_Corpus_ID.tsv'):
        with open('corpus/Indonesian_Manually_Tagged_Corpus_ID.tsv', 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    parts = line.strip().split('\t')
                    if len(parts) >= 2 and parts[1] == 'MD':
                        modals.add(parts[0].lower())
    
    # Count modals in text
    modal_count = sum(1 for word in words if word.lower() in modals)
    
    # Load stopwords from file
    stopwords = set()
    if os.path.exists('corpus/stopwords.txt'):
        with open('corpus/stopwords.txt', 'r', encoding='utf-8') as f:
            for line in f:
                stopwords.add(line.strip())
    
    # Calculate stopword ratio
    stopword_count = sum(1 for word in words if word.lower() in stopwords)
    stopword_ratio = (stopword_count / word_count * 100) if word_count > 0 else 0
    
    # Calculate sentence length statistics
    sentence_lengths = [len(re.findall(r'\b\w+\b', s)) for s in sentences]
    avg_sent_len = np.mean(sentence_lengths) if sentence_lengths else 0
    sent_len_var = np.std(sentence_lengths) if len(sentence_lengths) > 1 else 0
    
    # Calculate punctuation ratio
    punct_count = len(re.findall(r'[.!?]', text))
    punct_ratio = (punct_count / word_count) * 100 if word_count > 0 else 0

    return {
        'lexical_diversity': ld,
        'total_words': word_count,
        'total_unique_words': unique_count,
        'modals': modal_count,
        'stopwords_ratio': stopword_ratio,
        'avg_sentence_length': avg_sent_len,
        'sentence_length_variation': sent_len_var,
        'punctuation_ratio': punct_ratio
    }

# Features Extraction

In [None]:
# Feature extraction for Student and ChatGPT essays
print("Features extraction for Student...")
std_features = [linguistic_features(text) for text in std_txt]

print("Features extraction for ChatGPT...")
gpt_features = [linguistic_features(text) for text in gpt_txt]


# convert
std_features_df = pd.DataFrame(std_features)
gpt_features_df = pd.DataFrame(gpt_features)


# Tampilkan beberapa fitur hasil ekstraksi
print("\nStudent Linguistic Features:")
display(std_features_df.head())

print("\nChatGPT Linguistic Features:")
display(gpt_features_df.head())

# Normalize Linguistic Features

In [None]:
# Combine features into a single DataFrame
all_features = pd.concat([std_features_df, gpt_features_df], axis=0)

# Normalize features
scaler = StandardScaler()
normalized_features = scaler.fit_transform(all_features)

# Separate normalized features back into student and ChatGPT
n_student = len(std_features_df)
n_chatgpt_1 = len(gpt_features_df)

std_features_norm = normalized_features[:n_student]
gpt_features_norm = normalized_features[n_student:n_student + n_chatgpt_1]

print("Student features after normalization:")
print(std_features_norm[:5])

print("ChatGPT features after normalization:")
print(gpt_features_norm[:5])

# Save scaler for later inference
try:
    with open('paper/scaler_stylometric.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    print("Scaler saved successfully.")
except Exception as e:
    print(f"Error saving scaler: {e}")

In [None]:
# Add labels to features
std_features_df['label'] = 'Student Essay'
gpt_features_df['label'] = 'ChatGPT Essay'

# Combine datasets
combined_features = pd.concat([std_features_df, gpt_features_df], axis=0)

# Visualize Linguistic Features

In [None]:
plt.figure(figsize=(15, 20))
for i, feature in enumerate([
    'lexical_diversity',
    'total_words',
    'total_unique_words',
    'modals',
    'stopwords_ratio',
    'avg_sentence_length',
    'sentence_length_variation',
    'punctuation_ratio'
]):
    plt.subplot(3, 3, i+1)
    sns.boxplot(x='label', y=feature, data=combined_features)
    plt.title(f'Distribution {feature}')
plt.tight_layout()
plt.show()

In [None]:
print(f"Student Similarity Score shape: {student_similarity_scores.shape}")
print(f"ChatGPT Similarity Score shape: {chatgpt_similarity_scores.shape}")

In [None]:
# Combine embeddings for model 1 (already correct)
emb_features = np.vstack([std_emb.numpy(), gpt_emb.numpy()])

student_features_selected = std_features_norm
chatgpt_features_selected = gpt_features_norm

all_linguistic_features = np.vstack([
    student_features_selected,
    chatgpt_features_selected
])

all_similarity_scores = np.vstack([
    student_similarity_scores,
    chatgpt_similarity_scores,
])

# Create labels
student_labels = np.zeros(len(std_emb))  # Label 0 for Student
chatgpt_labels = np.ones(len(gpt_emb))   # Label 1 for ChatGPT
all_labels = np.hstack([student_labels, chatgpt_labels])

# Verify shapes
print(f"embeddings_model shape: {emb_features.shape}")
print(f"all_linguistic_features shape: {all_linguistic_features.shape}")
print(f"all_similarity_scores shape: {all_similarity_scores.shape}")
print(f"all_labels shape: {all_labels.shape}")

# Build Classification Model

In [None]:
emb_input = tf.keras.layers.Input(
    shape=(256,),
    dtype=tf.float32, 
    name="bert_embedding_1"
)

sim_score_input = tf.keras.layers.Input(
    shape=(2,),  # Shape score similarity (Student, ChatGPT1, ChatGPT2)
    dtype=tf.float32, 
    name="similarity_score"
)

linguistic_input = tf.keras.layers.Input(
    shape=(8,),
    dtype=tf.float32, 
    name="stylometric_features"
)

emb_dense = tf.keras.layers.Dense(256, activation="relu")(emb_input)
sim_dense = tf.keras.layers.Dense(16, activation="relu")(sim_score_input)
lin_dense = tf.keras.layers.Dense(64, activation="relu")(linguistic_input)

combined = tf.keras.layers.Concatenate()([emb_dense, sim_dense, lin_dense])

x = tf.keras.layers.Dense(128, activation="relu")(combined)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(64, activation="relu")(x)
output = tf.keras.layers.Dense(1, activation="sigmoid")(x)

classifier = tf.keras.Model(
    inputs=[emb_input, sim_score_input, linguistic_input],
    outputs=output,
    name="text_classifier"
)

classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tf.keras.metrics.AUC()]
)

classifier.summary()

In [None]:
texts = std_txt + gpt_txt
labels = [0] * len(std_txt) + [1] * len(gpt_txt)  # 0 for student, 1 for ChatGPT
data = pd.DataFrame({'text': texts, 'label': labels})

train_data, test_data = train_test_split(
    data, test_size=0.2, random_state=42, stratify=data['label']
)

print(f"Initial training set: {len(train_data)} samples")
print(f"Initial test set: {len(test_data)} samples")
print(f"Initial training distribution: Student={sum(train_data['label']==0)}, ChatGPT={sum(train_data['label']==1)}")


X_train = train_data[['text']]
y_train = train_data['label']

undersampler = RandomUnderSampler(random_state=42)
X_train_indices = pd.DataFrame({'index': range(len(X_train))})
X_resampled_indices, y_resampled = undersampler.fit_resample(X_train_indices, y_train)

selected_indices = X_resampled_indices['index'].values

balanced_train_data = train_data.iloc[selected_indices].reset_index(drop=True)

removed_indices = set(range(len(train_data))) - set(selected_indices)
removed_samples = train_data.iloc[list(removed_indices)]

test_set = pd.concat([test_data, removed_samples]).reset_index(drop=True)

train_set, val_set = train_test_split(
    balanced_train_data, test_size=0.05/0.80, random_state=42, stratify=balanced_train_data['label']
)

# Print final dataset statistics
print("\nAfter undersampling:")
print(f"Training set: {len(train_set)} samples (Student={sum(train_set['label']==0)}, ChatGPT={sum(train_set['label']==1)})")
print(f"Validation set: {len(val_set)} samples (Student={sum(val_set['label']==0)}, ChatGPT={sum(val_set['label']==1)})")
print(f"Test set: {len(test_set)} samples (Student={sum(test_set['label']==0)}, ChatGPT={sum(test_set['label']==1)})")


# Prepare inputs for training, validation, and test
train_inputs = {
    "embeddings_input": emb_input[train_set],
    "similarity_score": sim_score_input[train_set],
    "linguistic_features": linguistic_input[train_set]
}

val_inputs = {
    "embeddings_input": emb_input[val_set],
    "similarity_score": sim_score_input[val_set],
    "linguistic_features": linguistic_input[val_set]
}


test_inputs = {
    "embeddings_input": emb_input[test_set],
    "similarity_score": sim_score_input[test_set],
    "linguistic_features": linguistic_input[test_set]
}

train_labels = all_labels[train_set]
val_labels = all_labels[train_set]
test_labels = all_labels[train_set]

In [None]:
# Training classifier
print("Training Classification Model...")
history_classifier = classifier.fit(
    train_inputs,
    train_labels,
    validation_data=(val_inputs, val_labels),
    epochs=20,
    batch_size=32,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

# Plot training history
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history_classifier.history['loss'], label='Training Loss')
plt.plot(history_classifier.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history_classifier.history['accuracy'], label='Training Accuracy')
plt.plot(history_classifier.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Model evaluation
print("Evaluating model on validation set...")
val_loss, val_acc, val_precision, val_recall, val_auc = classifier.evaluate(val_inputs, val_labels)
print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Validation Precision: {val_precision:.4f}")
print(f"Validation Recall: {val_recall:.4f}")
print(f"Validation AUC: {val_auc:.4f}")

# Predictions on validation set
print("Making predictions on validation set...")
val_predictions = classifier.predict(val_inputs)
val_predictions_binary = (val_predictions > 0.5).astype(int)

# Confusion matrix
cm = confusion_matrix(val_labels, val_predictions_binary)

# Visualisasi confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Student', 'ChatGPT'], yticklabels=['Student', 'ChatGPT'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Classification report
print("\nClassification Report:")
print(classification_report(val_labels, val_predictions_binary, target_names=['Student', 'ChatGPT']))

# Model evaluation on test set
print("Evaluating model on test set...")
test_loss, test_acc, test_precision, test_recall, test_auc = classifier.evaluate(test_inputs, test_labels)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test AUC: {test_auc:.4f}")

# Predictions on test set
print("Making predictions on test set...")
test_predictions = classifier.predict(test_inputs)
test_predictions_binary = (test_predictions > 0.5).astype(int)

# Confusion matrix for test set
cm_test = confusion_matrix(test_labels, test_predictions_binary)

# Visualizing confusion matrix for test set
plt.figure(figsize=(8, 6))
sns.heatmap(cm_test, annot=True, fmt='d', cmap='Blues', xticklabels=['Student', 'ChatGPT'], yticklabels=['Student', 'ChatGPT'])
plt.title('Test Set Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Classification report for test set
print("\nTest Set Classification Report:")
print(classification_report(test_labels, test_predictions_binary, target_names=['Student', 'ChatGPT']))


In [None]:
# Plot ROC curve
def plot_roc_curve(labels, predictions, title):
    fpr, tpr, _ = roc_curve(labels, predictions)
    roc_auc = auc(fpr, tpr)
    
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {title}')
    plt.legend(loc="lower right")
    plt.show()
    
    return roc_auc

# Plot ROC curve for validation set
val_auc_score = plot_roc_curve(val_labels, val_predictions, "Validation Set")
print(f"Validation AUC from ROC curve: {val_auc_score:.4f}")

# Plot ROC curve for test set
test_auc_score = plot_roc_curve(test_labels, test_predictions, "Test Set")
print(f"Test AUC from ROC curve: {test_auc_score:.4f}")

# Plot both ROC curves in one graph for comparison
plt.figure(figsize=(8, 6))

# Validation set
fpr_val, tpr_val, _ = roc_curve(val_labels, val_predictions)
roc_auc_val = auc(fpr_val, tpr_val)
plt.plot(fpr_val, tpr_val, color='darkorange', lw=2, label=f'Validation ROC (area = {roc_auc_val:.2f})')

# Test set
fpr_test, tpr_test, _ = roc_curve(test_labels, test_predictions)
roc_auc_test = auc(fpr_test, tpr_test)
plt.plot(fpr_test, tpr_test, color='green', lw=2, label=f'Test ROC (area = {roc_auc_test:.2f})')

# Diagonal line
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves - Comparison')
plt.legend(loc="lower right")
plt.show()

In [None]:
if not os.path.exists('paper'):
    os.makedirs('paper')

history.save('paper/semantic_model.h5')
classifier.save('paper/classification_model.h5')

tokenizer.save_pretrained('paper/tokenizer')

with open("paper/scaler_stylometric.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("Model and configuration successfully saved!")

In [None]:
%load_ext watermark
%watermark
%watermark -iv --gpu