## 1. Import Library

In [None]:
import os
import pickle
import re
from collections import Counter

# pyplot (not the top-level matplotlib package) provides figure/subplot/plot/show.
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk import pos_tag
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel


# Fetch the NLTK resources needed by sent_tokenize/word_tokenize and pos_tag.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Seed both TensorFlow and NumPy with the same value for reproducibility.
for seed_fn in (tf.random.set_seed, np.random.seed):
    seed_fn(42)

## 2. Load Dataset

The datasets are expected to have the following structure:

Dataset Student_ChatGPT:

col 1 - Student (comes from 1 to 2 problem topics)

col 2 - ChatGPT (same as Student, comes from 1 to 2 problem topics)

---

Dataset Only_ChatGPT

col 1 - ChatGPT (responses covering one year, i.e. two semesters, of course material)

In [None]:
# Read the paired student/ChatGPT corpus and the ChatGPT-only corpus
# from CSV files in the working directory.
student_chatgpt = pd.read_csv(filepath_or_buffer="student_chatgpt.csv")
only_chatgpt = pd.read_csv(filepath_or_buffer="only_chatgpt.csv")

In [None]:
# Report the (rows, columns) shape of each loaded frame.
print("\nDataset Info:")
for frame_title, frame in (
    ("Student-ChatGPT", student_chatgpt),
    ("Only-ChatGPT", only_chatgpt),
):
    print(f"{frame_title} shape: {frame.shape}")

## 3. Preprocessing Data

In [None]:
# Function to preprocess text
def preprocess_text(text):
    """Lowercase ``text`` and split it into sentences.

    Args:
        text: Raw cell value, normally a string. Other non-missing values
            are converted with ``str()``.

    Returns:
        list[str]: The lowercased sentences produced by ``sent_tokenize``.
        Missing values (``None`` or a float NaN, which is how pandas
        represents empty CSV cells) and blank strings yield ``[]``.
    """
    # Without this guard, str(None) / str(nan) would turn into the literal
    # sentences "none" / "nan" and leak into the training data.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    lowered = str(text).lower()
    if not lowered.strip():
        return []

    # NOTE(review): lowercasing happens before sentence splitting, so the
    # tokenizer never sees the original capitalization, and downstream
    # uppercase statistics are computed on already-lowercased text.
    return sent_tokenize(lowered)

# Flatten every essay/response into its sentences. Each sentence gets the
# label of its source column: 0 = student, 1 = ChatGPT.

# Student essays
student_sentences = [
    sentence
    for essay in student_chatgpt['student']
    for sentence in preprocess_text(essay)
]
student_labels = [0 for _ in student_sentences]

# ChatGPT responses paired with the student essays
chatgpt_sentences_1 = [
    sentence
    for response in student_chatgpt['chatgpt']
    for sentence in preprocess_text(response)
]
chatgpt_labels_1 = [1 for _ in chatgpt_sentences_1]

# ChatGPT responses from the ChatGPT-only dataset
chatgpt_sentences_2 = [
    sentence
    for response in only_chatgpt['chatgpt']
    for sentence in preprocess_text(response)
]
chatgpt_labels_2 = [1 for _ in chatgpt_sentences_2]

sentence_counts = (
    len(student_sentences),
    len(chatgpt_sentences_1),
    len(chatgpt_sentences_2),
)
print(f"Number of student sentences: {sentence_counts[0]}")
print(f"Number of ChatGPT sentences (from student_chatgpt): {sentence_counts[1]}")
print(f"Number of ChatGPT sentences (from only_chatgpt): {sentence_counts[2]}")

In [None]:
def extract_stylometric_features(text):
    """Compute seven stylometric features for one piece of text.

    Args:
        text (str): The text (here: a single sentence) to analyse.

    Returns:
        dict: Features in a fixed key order (callers build arrays from the
        key order, so it must not change):

        - ``avg_word_length``: characters in ``text`` / whitespace-separated words
        - ``unique_words_ratio``: distinct words / words
        - ``punctuation_ratio``: count of ``. , ! ? ; :`` / characters
        - ``uppercase_ratio``: uppercase characters / characters
        - ``noun_ratio``: tokens tagged ``NN*`` / POS-tagged tokens
        - ``verb_ratio``: tokens tagged ``VB*`` / POS-tagged tokens
        - ``sentence_length``: number of whitespace-separated words

        Empty or whitespace-only text yields zeros for every feature.
    """
    feature_names = (
        'avg_word_length',
        'unique_words_ratio',
        'punctuation_ratio',
        'uppercase_ratio',
        'noun_ratio',
        'verb_ratio',
        'sentence_length',
    )

    # Nothing to measure; returning early also avoids dividing by a zero
    # character count below.
    if not text.strip():
        return dict.fromkeys(feature_names, 0)

    # Basic counts. Non-blank text always has at least one word and one
    # character, so the divisions below are safe.
    words = text.split()
    n_chars = len(text)
    n_words = len(words)

    # Lexical features
    # NOTE(review): n_chars includes whitespace and punctuation, so this is
    # "characters per word" rather than the mean length of the words.
    avg_word_length = n_chars / n_words
    unique_words = len(set(words)) / n_words

    # Syntactic features
    punctuation_marks = len(re.findall(r'[.,!?;:]', text)) / n_chars
    uppercase_ratio = sum(1 for c in text if c.isupper()) / n_chars

    # POS features: count every noun tag (NN, NNS, NNP, NNPS) and every verb
    # tag (VB, VBD, VBG, VBN, VBP, VBZ) of the Penn Treebank tag set.
    pos_tags = pos_tag(word_tokenize(text))
    pos_counts = Counter(tag for _, tag in pos_tags)
    n_tagged = len(pos_tags)
    if n_tagged:
        noun_ratio = sum(
            count for tag, count in pos_counts.items() if tag.startswith('NN')
        ) / n_tagged
        verb_ratio = sum(
            count for tag, count in pos_counts.items() if tag.startswith('VB')
        ) / n_tagged
    else:
        noun_ratio = 0
        verb_ratio = 0

    return dict(zip(feature_names, (
        avg_word_length,
        unique_words,
        punctuation_marks,
        uppercase_ratio,
        noun_ratio,
        verb_ratio,
        n_words,
    )))

print("Extracting stylometric features...")

# For each corpus: one feature dict per sentence, then a 2-D array whose
# columns follow the dict's key order.
student_features = list(map(extract_stylometric_features, student_sentences))
student_features_array = np.array(
    [list(feat.values()) for feat in student_features]
)

chatgpt1_features = list(map(extract_stylometric_features, chatgpt_sentences_1))
chatgpt1_features_array = np.array(
    [list(feat.values()) for feat in chatgpt1_features]
)

chatgpt2_features = list(map(extract_stylometric_features, chatgpt_sentences_2))
chatgpt2_features_array = np.array(
    [list(feat.values()) for feat in chatgpt2_features]
)

## 4. Inisialisasi BERT Tokenizer

In [None]:
# IndoBERT tokenizer shared by every tokenization call in this notebook.
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p2")


def tokenize_text(texts, max_length=512):
    """Tokenize ``texts`` with the module-level IndoBERT tokenizer.

    Args:
        texts: Text input accepted by the tokenizer (the notebook passes
            lists of sentences).
        max_length (int): Every sequence is padded and truncated to exactly
            this many tokens.

    Returns:
        The tokenizer's output with TensorFlow tensors
        (``return_tensors="tf"``).
    """
    tokenizer_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "tf",
    }
    return tokenizer(texts, **tokenizer_options)

## 5. Membuat Model BERT Bi-Encoder

In [None]:
# Load the base IndoBERT model
bert_model = TFBertModel.from_pretrained("indobenchmark/indobert-base-p2")

# Architecture of the ChatGPT bi-encoder: token ids and attention mask inputs,
# both fixed at 512 positions (the default max_length of tokenize_text).
input_ids_chatgpt = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name="input_ids")
attention_mask_chatgpt = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name="attention_mask")

# Get the embeddings from BERT. Element [0] of the model output is used as the
# per-token hidden states (presumably last_hidden_state — TODO confirm).
bert_outputs_chatgpt = bert_model(input_ids_chatgpt, attention_mask=attention_mask_chatgpt)[0]
cls_token_chatgpt = bert_outputs_chatgpt[:, 0, :]  # Take the [CLS] token (first position)

# Additional layers for fine-tuning: 256 -> 128 -> 128 projection head with
# light dropout; the last Dense has no activation.
dense1_chatgpt = tf.keras.layers.Dense(256, activation="relu")(cls_token_chatgpt)
dropout1_chatgpt = tf.keras.layers.Dropout(0.1)(dense1_chatgpt)
dense2_chatgpt = tf.keras.layers.Dense(128, activation="relu")(dropout1_chatgpt)
dropout2_chatgpt = tf.keras.layers.Dropout(0.1)(dense2_chatgpt)
output_chatgpt = tf.keras.layers.Dense(128)(dropout2_chatgpt)

# Normalize the output to unit L2 norm per row, so a dot product between two
# embeddings equals their cosine similarity.
normalized_output_chatgpt = tf.nn.l2_normalize(output_chatgpt, axis=1)

# Build the bi-encoder model for ChatGPT text; its output is a 128-d unit vector.
bi_encoder_chatgpt = tf.keras.Model(
    inputs=[input_ids_chatgpt, attention_mask_chatgpt],
    outputs=normalized_output_chatgpt,
    name="bi_encoder_chatgpt"
)

In [None]:
# Architecture of the student bi-encoder (same structure, different variables)
input_ids_student = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name="input_ids")
attention_mask_student = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name="attention_mask")

# NOTE(review): this calls the same `bert_model` instance as the ChatGPT
# encoder, so both encoders share the BERT weights; only the dense heads
# below are separate. Confirm that weight sharing is intended.
bert_outputs_student = bert_model(input_ids_student, attention_mask=attention_mask_student)[0]
cls_token_student = bert_outputs_student[:, 0, :]  # [CLS] position

# Projection head: 256 -> 128 -> 128 with dropout, last Dense without activation.
dense1_student = tf.keras.layers.Dense(256, activation="relu")(cls_token_student)
dropout1_student = tf.keras.layers.Dropout(0.1)(dense1_student)
dense2_student = tf.keras.layers.Dense(128, activation="relu")(dropout1_student)
dropout2_student = tf.keras.layers.Dropout(0.1)(dense2_student)
output_student = tf.keras.layers.Dense(128)(dropout2_student)

# Unit-normalize each embedding row.
normalized_output_student = tf.nn.l2_normalize(output_student, axis=1)

bi_encoder_student = tf.keras.Model(
    inputs=[input_ids_student, attention_mask_student],
    outputs=normalized_output_student,
    name="bi_encoder_student"
)

# NOTE(review): this single Adam instance is later passed to compile() for
# both bi-encoders; confirm the installed Keras version allows one optimizer
# object to be reused across models.
optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5)

## 6. Train Bi-Encoder

Problem resolved

In [None]:
# Contrastive loss implementation
def contrastive_loss(margin=1.0):
    """Build a contrastive loss over cosine similarities.

    Args:
        margin (float): Dissimilar pairs are only penalized while their
            distance (``1 - similarity``) is smaller than this margin.

    Returns:
        A function ``loss(y_true, y_pred)`` returning a scalar tensor, where
        ``y_true`` is 1 for similar pairs and 0 for dissimilar pairs and
        ``y_pred`` is the cosine similarity between the two embeddings.
        Both must have broadcast-compatible shapes.
    """
    def loss(y_true, y_pred):
        # Cast the labels to the prediction dtype so integer labels work
        # when the loss is called directly (outside Keras' own casting).
        y_pred = tf.convert_to_tensor(y_pred)
        y_true = tf.cast(y_true, y_pred.dtype)

        # Convert similarity to distance
        distance = 1 - y_pred

        # Similar pairs: pull together (penalize any remaining distance)
        positive_loss = y_true * tf.square(distance)

        # Dissimilar pairs: push apart until the distance reaches the margin
        negative_loss = (1 - y_true) * tf.square(tf.maximum(margin - distance, 0))

        return tf.reduce_mean(positive_loss + negative_loss)
    return loss

In [None]:
# Stage 1: train bi_encoder_chatgpt on the only_chatgpt data

# Prepare the training data
print("Mempersiapkan data training untuk ChatGPT Bi-Encoder...")

# Tokenize the ChatGPT sentences
chatgpt2_train_encodings = tokenize_text(chatgpt_sentences_2)

# Batch size for fit() below. (No sentence pairs are constructed in this
# cell; the only target is the identity matrix passed to fit().)
batch_size = 32

# Compile the model with the contrastive loss
bi_encoder_chatgpt.compile(
    optimizer=optimizer,
    loss=contrastive_loss(margin=1.0)
)

# Training
# NOTE(review): the model outputs one 128-d embedding per sentence, while the
# target is an N x N identity matrix (N = number of sentences) and the loss
# treats y_pred as cosine similarities. Per batch that pairs a (batch, N)
# target with a (batch, 128) prediction, which is only shape-compatible when
# N happens to be 128. The pairing/similarity step needs to be defined before
# this can train as intended.
print("Training Bi-Encoder ChatGPT...")
history_chatgpt = bi_encoder_chatgpt.fit(
    {
        "input_ids": chatgpt2_train_encodings['input_ids'],
        "attention_mask": chatgpt2_train_encodings['attention_mask']
    },
    # Target matrix: 1 on the diagonal (similar pairs), 0 everywhere else
    tf.eye(len(chatgpt_sentences_2)),
    epochs=3,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[
        # Stop once val_loss has not improved for 2 epochs and roll back to
        # the best weights seen.
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=2,
            restore_best_weights=True
        )
    ]
)

In [None]:
# Stage 2: train bi_encoder_student on the student_chatgpt data

# Prepare the training data
print("Mempersiapkan data training untuk Student Bi-Encoder...")

# Tokenize the student and ChatGPT sentences
student_train_encodings = tokenize_text(student_sentences)
chatgpt1_train_encodings = tokenize_text(chatgpt_sentences_1)

# Combine the data for training: student rows first, then ChatGPT rows
combined_input_ids = tf.concat([
    student_train_encodings['input_ids'],
    chatgpt1_train_encodings['input_ids']
], axis=0)

combined_attention_mask = tf.concat([
    student_train_encodings['attention_mask'],
    chatgpt1_train_encodings['attention_mask']
], axis=0)

# Build the similarity matrix (starts as all zeros)
total_samples = len(student_sentences) + len(chatgpt_sentences_1)
similarity_matrix = tf.zeros((total_samples, total_samples))

# Set similarity to 1 on the diagonal of the student block.
# NOTE(review): only the (i, i) entries are written here and below, so the
# result equals an identity matrix of size total_samples; off-diagonal pairs
# from the same source stay 0. Confirm whether whole same-source blocks were
# meant to be 1.
student_size = len(student_sentences)
similarity_matrix = tf.tensor_scatter_nd_update(
    similarity_matrix,
    tf.where(tf.eye(student_size) > 0),
    tf.ones(student_size)
)

# Same for the ChatGPT block: diagonal indices shifted by the student count
chatgpt_start = student_size
chatgpt_size = len(chatgpt_sentences_1)
similarity_matrix = tf.tensor_scatter_nd_update(
    similarity_matrix,
    tf.where(tf.eye(chatgpt_size) > 0) + chatgpt_start,
    tf.ones(chatgpt_size)
)

# Compile model (reuses the module-level optimizer and the contrastive loss)
bi_encoder_student.compile(
    optimizer=optimizer,
    loss=contrastive_loss(margin=1.0)
)

# Training
# NOTE(review): as in stage 1, the target rows have total_samples columns
# while the model emits 128-d embeddings, and the loss expects similarities
# as y_pred; the shapes only line up when total_samples is 128.
print("Training Bi-Encoder Student...")
history_student = bi_encoder_student.fit(
    {
        "input_ids": combined_input_ids,
        "attention_mask": combined_attention_mask
    },
    similarity_matrix,
    epochs=3,
    batch_size=32,
    validation_split=0.2,
    callbacks=[
        # Stop once val_loss has not improved for 2 epochs and roll back to
        # the best weights seen.
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=2,
            restore_best_weights=True
        )
    ]
)

In [None]:
# Draw the training/validation loss curves of both bi-encoders side by side.
plt.figure(figsize=(15, 5))

loss_panels = (
    (
        1,
        history_chatgpt,
        'Training Loss (ChatGPT)',
        'Validation Loss (ChatGPT)',
        'Training History - Bi-Encoder ChatGPT',
    ),
    (
        2,
        history_student,
        'Training Loss (Student)',
        'Validation Loss (Student)',
        'Training History - Bi-Encoder Student',
    ),
)
for panel_no, run, train_label, val_label, panel_title in loss_panels:
    plt.subplot(1, 2, panel_no)
    plt.plot(run.history['loss'], label=train_label)
    plt.plot(run.history['val_loss'], label=val_label)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

plt.tight_layout()
plt.show()

## 8. Evaluasi Bi-Encoder

In [None]:
# Evaluation subset: the last 100 sentences of each corpus.
# NOTE(review): these sentences come from the same lists that were passed to
# fit(), so they are not guaranteed to be unseen during training.
test_student = student_sentences[-100:]
test_chatgpt_1 = chatgpt_sentences_1[-100:]  # from the student_chatgpt dataset
test_chatgpt_2 = chatgpt_sentences_2[-100:]  # from the only_chatgpt dataset

# Tokenize the evaluation sentences
test_student_encodings = tokenize_text(test_student)
test_chatgpt1_encodings = tokenize_text(test_chatgpt_1)
test_chatgpt2_encodings = tokenize_text(test_chatgpt_2)

# Embed them: the student encoder handles both columns of student_chatgpt,
# the ChatGPT encoder handles only_chatgpt.
print("Generating test embeddings...")
encoder_input_names = ("input_ids", "attention_mask")

test_student_embeddings = bi_encoder_student.predict(
    {name: test_student_encodings[name] for name in encoder_input_names}
)

test_chatgpt1_embeddings = bi_encoder_student.predict(
    {name: test_chatgpt1_encodings[name] for name in encoder_input_names}
)

test_chatgpt2_embeddings = bi_encoder_chatgpt.predict(
    {name: test_chatgpt2_encodings[name] for name in encoder_input_names}
)

# Compute similarity scores
def calculate_similarity_metrics(embeddings1, embeddings2):
    """Summarize all pairwise cosine similarities between two embedding sets.

    Each row of ``embeddings1`` is compared with each row of ``embeddings2``.

    Returns:
        tuple: ``(mean, min, max)`` of the pairwise similarity matrix, as
        NumPy scalars.
    """
    # Unit-normalize the rows so the matrix product yields cosine similarity.
    unit_rows_1, unit_rows_2 = (
        tf.nn.l2_normalize(rows, axis=1) for rows in (embeddings1, embeddings2)
    )
    pairwise_cosine = tf.matmul(unit_rows_1, unit_rows_2, transpose_b=True)

    reducers = (tf.reduce_mean, tf.reduce_min, tf.reduce_max)
    return tuple(reducer(pairwise_cosine).numpy() for reducer in reducers)

# Compare the embedding sets of the different text sources.
print("\nEvaluasi Similarity Scores:")

similarity_comparisons = (
    # First half vs second half of the student test embeddings
    (
        "\nStudent vs Student Similarity:",
        test_student_embeddings[:50],
        test_student_embeddings[50:],
    ),
    # Student vs ChatGPT (both from student_chatgpt)
    (
        "\nStudent vs ChatGPT (student_chatgpt) Similarity:",
        test_student_embeddings,
        test_chatgpt1_embeddings,
    ),
    # ChatGPT vs ChatGPT across the two datasets
    (
        "\nChatGPT vs ChatGPT (across datasets) Similarity:",
        test_chatgpt1_embeddings,
        test_chatgpt2_embeddings,
    ),
)
for heading, left_embeddings, right_embeddings in similarity_comparisons:
    mean_sim, min_sim, max_sim = calculate_similarity_metrics(
        left_embeddings, right_embeddings)
    print(heading)
    print(f"Mean: {mean_sim:.3f}, Min: {min_sim:.3f}, Max: {max_sim:.3f}")

## 9. Simpan Model Bi-Encoder

In [None]:
# Create the output directory; exist_ok avoids the check-then-create race and
# is a no-op when the directory is already there.
os.makedirs('saved_models', exist_ok=True)

# Save the bi-encoder trained on student_chatgpt
# NOTE(review): both models embed a Hugging Face TFBertModel; confirm that the
# HDF5 (.h5) format can serialize and reload it in the installed versions.
print("Menyimpan model Bi-Encoder student_chatgpt...")
bi_encoder_student.save('saved_models/bi_encoder_student_chatgpt.h5')

# Save the bi-encoder trained on only_chatgpt
print("Menyimpan model Bi-Encoder only_chatgpt...")
bi_encoder_chatgpt.save('saved_models/bi_encoder_only_chatgpt.h5')

# Save the tokenizer configuration
tokenizer.save_pretrained('saved_models/tokenizer')

print("Model dan tokenizer berhasil disimpan!")

## 10. Generate Embeddings

In [None]:
print("Generating embeddings for all sentences...")

embedding_inputs = ("input_ids", "attention_mask")

# Student sentences -> student bi-encoder
print("\nGenerating student embeddings...")
student_encodings = tokenize_text(student_sentences)
student_embeddings = bi_encoder_student.predict(
    {name: student_encodings[name] for name in embedding_inputs}
)

# ChatGPT sentences from student_chatgpt -> student bi-encoder
print("Generating ChatGPT (student_chatgpt) embeddings...")
chatgpt1_encodings = tokenize_text(chatgpt_sentences_1)
chatgpt1_embeddings = bi_encoder_student.predict(
    {name: chatgpt1_encodings[name] for name in embedding_inputs}
)

# ChatGPT sentences from only_chatgpt -> ChatGPT bi-encoder
print("Generating ChatGPT (only_chatgpt) embeddings...")
chatgpt2_encodings = tokenize_text(chatgpt_sentences_2)
chatgpt2_embeddings = bi_encoder_chatgpt.predict(
    {name: chatgpt2_encodings[name] for name in embedding_inputs}
)



In [None]:
# Function to look up similar sentences
def get_top_k_similar(query_embedding, target_embeddings, k=50):
    """Find the targets most similar to a query by cosine similarity.

    Args:
        query_embedding: 2-D array/tensor; only its first row is used as the
            query.
        target_embeddings: 2-D array/tensor with one candidate per row.
        k (int): Maximum number of neighbours to return. It is capped at the
            number of candidates, so fewer than ``k`` results come back when
            the candidate pool is smaller.

    Returns:
        tuple: ``(indices, scores)`` as NumPy arrays, sorted by descending
        similarity; ``indices`` refer to rows of ``target_embeddings``.
    """
    query_normalized = tf.nn.l2_normalize(query_embedding, axis=1)
    target_normalized = tf.nn.l2_normalize(target_embeddings, axis=1)

    similarity_scores = tf.matmul(query_normalized, target_normalized, transpose_b=True)

    # tf.nn.top_k raises when k exceeds the size of the last dimension.
    effective_k = min(k, int(target_normalized.shape[0]))
    scores, indices = tf.nn.top_k(similarity_scores[0], k=effective_k)

    return indices.numpy(), scores.numpy()

# Collected neighbour pairs; each entry records the query sentence, one
# neighbour, their similarity score, and which pool the neighbour came from
# (label 0 = student pool, label 1 = ChatGPT pool).
similar_pairs = []

# Loop-invariant: stack the two ChatGPT embedding sets once instead of on
# every iteration, and keep a sentence list aligned with the stacked rows.
combined_chatgpt_embeddings = np.vstack([chatgpt1_embeddings, chatgpt2_embeddings])
combined_chatgpt_sentences = list(chatgpt_sentences_1) + list(chatgpt_sentences_2)

# Process every student sentence
for idx, student_embedding in enumerate(student_embeddings):
    print(f"\rMemproses kalimat ke-{idx+1}/{len(student_embeddings)}", end="")

    query_emb = tf.reshape(student_embedding, (1, -1))

    # Nearest neighbours among the student sentences
    student_indices, student_scores = get_top_k_similar(
        query_emb, student_embeddings, k=50
    )

    # Nearest neighbours among all ChatGPT sentences
    chatgpt_indices, chatgpt_scores = get_top_k_similar(
        query_emb, combined_chatgpt_embeddings, k=50
    )

    # Student neighbours; skip the query sentence matching itself
    for i, score in zip(student_indices, student_scores):
        if i != idx:
            similar_pairs.append({
                'input_sentence': student_sentences[idx],
                'similar_sentence': student_sentences[i],
                'similarity_score': score,
                'label': 0
            })

    # ChatGPT neighbours; row i of the stacked embeddings corresponds to
    # entry i of the concatenated sentence list
    for i, score in zip(chatgpt_indices, chatgpt_scores):
        similar_pairs.append({
            'input_sentence': student_sentences[idx],
            'similar_sentence': combined_chatgpt_sentences[i],
            'similarity_score': score,
            'label': 1
        })

print("\n\nStatistik similar pairs:")
print(f"Total pasangan: {len(similar_pairs)}")
print(f"Pasangan student-student: {sum(1 for pair in similar_pairs if pair['label'] == 0)}")
print(f"Pasangan student-chatgpt: {sum(1 for pair in similar_pairs if pair['label'] == 1)}")

# Persist the pairs (the saved_models directory must already exist)
with open('saved_models/similar_pairs.pkl', 'wb') as f:
    pickle.dump(similar_pairs, f)

## 11. Membuat Model BERT Single Sentence Classifier

In [None]:
# The classifier combines three kinds of features:
# 1. Embeddings from the bi-encoder
# 2. Stylometric features
# 3. Similarity scores from the similar sentences

# %%
print("Membuat model Single Sentence Classifier...")

# Input layers for the three feature types
# 128 matches the output width of the bi-encoders.
bert_embedding_input = tf.keras.layers.Input(
    shape=(128,), 
    dtype=tf.float32, 
    name="bert_embedding"
)
# 7 matches the number of keys returned by extract_stylometric_features.
stylometric_input = tf.keras.layers.Input(
    shape=(7,), 
    dtype=tf.float32, 
    name="stylometric_features"
)
# A single scalar similarity score per example.
similarity_score_input = tf.keras.layers.Input(
    shape=(1,), 
    dtype=tf.float32, 
    name="similarity_score"
)

# Processing BERT embeddings
bert_dense = tf.keras.layers.Dense(
    256, 
    activation="relu", 
    name="bert_dense"
)(bert_embedding_input)
bert_dropout = tf.keras.layers.Dropout(0.2)(bert_dense)

# Processing stylometric features
style_dense = tf.keras.layers.Dense(
    32, 
    activation="relu", 
    name="style_dense"
)(stylometric_input)
style_dropout = tf.keras.layers.Dropout(0.2)(style_dense)

# Processing similarity score
sim_dense = tf.keras.layers.Dense(
    16, 
    activation="relu", 
    name="sim_dense"
)(similarity_score_input)
sim_dropout = tf.keras.layers.Dropout(0.2)(sim_dense)

# Combine all features (256 + 32 + 16 units)
combined_features = tf.keras.layers.Concatenate()(
    [bert_dropout, style_dropout, sim_dropout]
)

# Classification layers; the sigmoid output is a single probability
dense1 = tf.keras.layers.Dense(128, activation="relu")(combined_features)
dropout1 = tf.keras.layers.Dropout(0.2)(dense1)
dense2 = tf.keras.layers.Dense(64, activation="relu")(dropout1)
dropout2 = tf.keras.layers.Dropout(0.2)(dense2)
output = tf.keras.layers.Dense(1, activation="sigmoid")(dropout2)

# Create classifier model
classifier = tf.keras.Model(
    inputs=[
        bert_embedding_input, 
        stylometric_input, 
        similarity_score_input
    ],
    outputs=output,
    name="single_sentence_classifier"
)

# Compile the model with a full set of metrics; the metric names given here
# are the keys that appear in the training history.
classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss="binary_crossentropy",
    metrics=[
        "accuracy",
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
        tf.keras.metrics.AUC(name="auc")
    ]
)

## 12. Membuat Data Training

In [None]:
print("Menyiapkan data training untuk classifier...")

# Map each student sentence to the index of its first occurrence. This gives
# the same index as list.index() but with an O(1) lookup per pair instead of
# a linear scan of student_sentences.
first_index_of = {}
for position, sentence in enumerate(student_sentences):
    first_index_of.setdefault(sentence, position)

# Build the training examples from the similar pairs.
# NOTE(review): every example uses the embedding and stylometric features of
# the *student* input sentence; the label only records whether the paired
# neighbour came from the student pool (0) or the ChatGPT pool (1). Confirm
# this is the intended target for a student-vs-ChatGPT classifier.
training_data = []
training_labels = []

for pair in similar_pairs:
    # Locate the input sentence to fetch its embedding and features
    input_sent_idx = first_index_of[pair['input_sentence']]

    training_data.append({
        'bert_embedding': student_embeddings[input_sent_idx],
        'stylometric_features': student_features_array[input_sent_idx],
        # Similarity score from the bi-encoder neighbour search
        'similarity_score': pair['similarity_score']
    })
    training_labels.append(pair['label'])

# Convert to NumPy arrays
train_embeddings = np.array([x['bert_embedding'] for x in training_data])
train_style_features = np.array([x['stylometric_features'] for x in training_data])
train_sim_scores = np.array([x['similarity_score'] for x in training_data]).reshape(-1, 1)
train_labels = np.array(training_labels)

# Split example indices into training and validation sets, stratified by
# label so both splits keep the same class balance
train_idx, val_idx = train_test_split(
    np.arange(len(training_data)),
    test_size=0.2,
    random_state=42,
    stratify=train_labels
)

# Prepare training inputs; the keys match the classifier's input layer names
train_inputs = {
    'bert_embedding': train_embeddings[train_idx],
    'stylometric_features': train_style_features[train_idx],
    'similarity_score': train_sim_scores[train_idx]
}

val_inputs = {
    'bert_embedding': train_embeddings[val_idx],
    'stylometric_features': train_style_features[val_idx],
    'similarity_score': train_sim_scores[val_idx]
}

train_labels_split = train_labels[train_idx]
val_labels_split = train_labels[val_idx]

## 13. Train Model

In [None]:
# Fit the classifier, stopping once val_loss has not improved for 2 epochs
# and restoring the best weights seen.
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)
history = classifier.fit(
    train_inputs,
    train_labels_split,
    validation_data=(val_inputs, val_labels_split),
    epochs=5,
    batch_size=32,
    callbacks=[stop_early]
)

# Plot the training history: loss, accuracy, and the remaining training
# metrics, one panel each.
plt.figure(figsize=(15, 5))

metric_log = history.history
history_panels = (
    ('Model Loss', 'Loss', (
        ('loss', 'Training Loss'),
        ('val_loss', 'Validation Loss'),
    )),
    ('Model Accuracy', 'Accuracy', (
        ('accuracy', 'Training Accuracy'),
        ('val_accuracy', 'Validation Accuracy'),
    )),
    ('Additional Metrics', 'Score', (
        ('precision', 'Precision'),
        ('recall', 'Recall'),
        ('auc', 'AUC'),
    )),
)
for panel_no, (panel_title, y_label, curves) in enumerate(history_panels, start=1):
    plt.subplot(1, 3, panel_no)
    for metric_key, curve_label in curves:
        plt.plot(metric_log[metric_key], label=curve_label)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()

plt.tight_layout()
plt.show()

## 14. Evaluasi Model

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

print("Melakukan evaluasi model...")

# Predict on the validation set and threshold the probabilities at 0.5
val_predictions = classifier.predict(val_inputs)
above_threshold = val_predictions > 0.5
val_predictions_binary = above_threshold.astype(int)

# Confusion matrix (rows = actual, columns = predicted)
cm = confusion_matrix(val_labels_split, val_predictions_binary)

# Visualize the confusion matrix as an annotated heatmap
class_names = ['Student', 'ChatGPT']
heatmap_options = {
    'annot': True,
    'fmt': 'd',
    'cmap': 'Blues',
    'xticklabels': class_names,
    'yticklabels': class_names,
}
plt.figure(figsize=(8, 6))
sns.heatmap(cm, **heatmap_options)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Full per-class classification report
print("\nClassification Report:")
report_text = classification_report(
    val_labels_split,
    val_predictions_binary,
    target_names=class_names
)
print(report_text)

# %% [markdown]
# # 12. Analisis Error Cases
# Menganalisis kasus-kasus di mana model membuat kesalahan untuk pemahaman lebih dalam

# %%
# Identify the misclassified validation examples
errors = {
    'false_positives': [],
    'false_negatives': []
}

# The classifier emits one column per example, so predictions are (n, 1).
# Flatten them so each label/probability is a scalar rather than a
# one-element array when compared with the true labels.
flat_pred_labels = np.asarray(val_predictions_binary).ravel()
flat_pred_probs = np.asarray(val_predictions).ravel()

for idx, (true_label, pred_label) in enumerate(zip(val_labels_split, flat_pred_labels)):
    if true_label == pred_label:
        continue

    # Record the details of this error case
    error_case = {
        'true_label': true_label,
        'predicted_prob': flat_pred_probs[idx],
        'stylometric_features': val_inputs['stylometric_features'][idx],
        'similarity_score': val_inputs['similarity_score'][idx][0]
    }

    # Actual 0 predicted 1 is a false positive; any other mismatch is
    # recorded as a false negative.
    if true_label == 0 and pred_label == 1:
        errors['false_positives'].append(error_case)
    else:
        errors['false_negatives'].append(error_case)

print("Analisis Error Cases:")
print(f"Total False Positives: {len(errors['false_positives'])}")
print(f"Total False Negatives: {len(errors['false_negatives'])}")

# Visualize the distribution of predicted probabilities for the error cases
plt.figure(figsize=(10, 5))

# False positives
plt.subplot(1, 2, 1)
plt.hist([case['predicted_prob'] for case in errors['false_positives']], bins=20)
plt.title('Confidence Distribution - False Positives')
plt.xlabel('Predicted Probability')
plt.ylabel('Count')

# False negatives
plt.subplot(1, 2, 2)
plt.hist([case['predicted_prob'] for case in errors['false_negatives']], bins=20)
plt.title('Confidence Distribution - False Negatives')
plt.xlabel('Predicted Probability')
plt.ylabel('Count')

plt.tight_layout()
plt.show()

## 15. Simpan Model

In [None]:
# Create the output directory if needed; exist_ok avoids the
# check-then-create race and is a no-op when the directory already exists.
os.makedirs('saved_models', exist_ok=True)

# Save the classifier
classifier.save('saved_models/single_sentence_classifier.h5')

# Save the bi-encoders
# NOTE(review): both embed a Hugging Face TFBertModel; confirm the HDF5 (.h5)
# format can serialize and reload them in the installed versions.
bi_encoder_student.save('saved_models/bi_encoder_student.h5')
bi_encoder_chatgpt.save('saved_models/bi_encoder_chatgpt.h5')

# Save the tokenizer configuration
tokenizer.save_pretrained('saved_models/tokenizer')

# Save per-feature mean/std of the stylometric training features together
# with the feature order.
# NOTE(review): in the visible code these statistics are stored but never
# applied; the classifier is trained on the raw feature values.
stylometric_params = {
    'feature_names': list(student_features[0].keys()),
    'mean': np.mean(train_style_features, axis=0),
    'std': np.std(train_style_features, axis=0)
}

with open('saved_models/stylometric_params.pkl', 'wb') as f:
    pickle.dump(stylometric_params, f)

print("Model dan konfigurasi berhasil disimpan!")

## 16. Prediction Inference

In [None]:
def predict_text_source(text):
    """Predict the source of a text (Student or ChatGPT), sentence by sentence.

    Args:
        text: The text to classify.

    Returns:
        dict with:
            - ``predictions``: nested list with one classifier probability
              per sentence,
            - ``avg_confidence``: mean of those probabilities (NaN when the
              text contains no sentences),
            - ``sentence_count``: number of sentences found.
    """
    # Preprocessing
    sentences = preprocess_text(text)

    # Nothing to classify: return an empty result instead of failing inside
    # the tokenizer or the model on empty input.
    if not sentences:
        return {
            'predictions': [],
            'avg_confidence': float('nan'),
            'sentence_count': 0
        }

    # Extract stylometric features in the same column order used in training
    style_features = [extract_stylometric_features(sent) for sent in sentences]
    style_features_array = np.array([[feat[k] for k in stylometric_params['feature_names']]
                                     for feat in style_features])

    # Generate embeddings
    encodings = tokenize_text(sentences)
    embeddings = bi_encoder_student.predict({
        "input_ids": encodings['input_ids'],
        "attention_mask": encodings['attention_mask']
    })

    # Similarity feature: mean similarity to the nearest student sentences.
    # The neighbour count is capped by the size of the reference set so that
    # small reference sets do not make the top-k lookup fail.
    # NOTE(review): training feeds one pair's similarity score per example,
    # while inference feeds the mean over the top neighbours — confirm the
    # two are meant to be comparable.
    neighbour_count = min(50, len(student_embeddings))
    similarity_scores = []
    for emb in embeddings:
        _, scores = get_top_k_similar(
            emb.reshape(1, -1),
            student_embeddings,  # could be replaced with a combined embedding pool
            k=neighbour_count
        )
        similarity_scores.append(np.mean(scores))

    # Prepare inputs for classifier (keys match its input layer names)
    inputs = {
        'bert_embedding': embeddings,
        'stylometric_features': style_features_array,
        'similarity_score': np.array(similarity_scores).reshape(-1, 1)
    }

    # Get predictions
    predictions = classifier.predict(inputs)

    # Return results
    return {
        'predictions': predictions.tolist(),
        'avg_confidence': float(np.mean(predictions)),
        'sentence_count': len(sentences)
    }

# Smoke-test the prediction function on a short two-sentence sample.
test_text = "This is a test sentence. Let's see how it works."
result = predict_text_source(test_text)
print("\nHasil test prediksi:", result, sep="\n")