In [None]:
# ------------------------------------------------------------
# 📒 1. Persiapan: impor pustaka & set path
# ------------------------------------------------------------
import os, pickle, gc, json, tensorflow as tf
import numpy as np
from transformers import BertTokenizer, TFBertModel

# Root folders holding the sentence-level and paragraph-level artifacts.
SEN_DIR = "ta_sentence_2"
PAR_DIR = "ta_paragraph_2"

# Each artifact exists once per level, so the sentence/paragraph pair is
# built in one step from the two root folders.

# Saved tokenizer folders.
tok_sen_path, tok_par_path = (
    os.path.join(root, "tokenizer") for root in (SEN_DIR, PAR_DIR)
)

# Saved semantic models.
sem_sen_model, sem_par_model = (
    os.path.join(root, "semantic_model.h5") for root in (SEN_DIR, PAR_DIR)
)

# Pickled, already-tokenized inputs.
tokdata_sen_pkl, tokdata_par_pkl = (
    os.path.join(root, "tokenized_data.pkl") for root in (SEN_DIR, PAR_DIR)
)

# Destination pickles for the reference embeddings.
sen_ref_pkl, par_ref_pkl = (
    os.path.join(root, "reference_embeddings.pkl") for root in (SEN_DIR, PAR_DIR)
)


In [None]:
# ------------------------------------------------------------
# 📒 2. Load tokenizer & model
# ------------------------------------------------------------
print("Memuat tokenizer & model…")
tokenizer_sen, tokenizer_par = (
    BertTokenizer.from_pretrained(tok_dir)
    for tok_dir in (tok_sen_path, tok_par_path)
)

# The saved models contain a TFBertModel layer, so that class is registered
# as a custom object for the duration of both load_model calls.
with tf.keras.utils.custom_object_scope({'TFBertModel': TFBertModel}):
    semantic_sen, semantic_par = (
        tf.keras.models.load_model(model_file)
        for model_file in (sem_sen_model, sem_par_model)
    )


In [None]:
def gen_emb(tokens, model, batch_size=32):
    """
    Run ``model`` over ``tokens`` in mini-batches and stack the outputs.

    Args:
        tokens: Mapping with ``'input_ids'`` and ``'attention_mask'`` entries.
            Both must be sliceable along their first axis; the sample count
            is taken from ``len(tokens['input_ids'])``. The two entries are
            assumed to be aligned sample-by-sample (not checked here).
        model: Callable invoked once per batch as
            ``model([input_ids_batch, attention_mask_batch])``. Its result
            must either expose ``.numpy()`` or be convertible with
            ``np.asarray``.
        batch_size: Maximum number of samples per ``model`` call. Must be
            positive.

    Returns:
        np.ndarray: The per-batch outputs concatenated along axis 0, in the
        same order as the input samples.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if ``tokens``
            contains no samples (there would be nothing to concatenate).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']
    num_samples = len(input_ids)
    if num_samples == 0:
        raise ValueError("tokens contains no samples; nothing to embed")

    embeddings = []
    for start in range(0, num_samples, batch_size):
        end = min(start + batch_size, num_samples)

        batch_output = model([input_ids[start:end], attention_mask[start:end]])

        # Convert each batch to a NumPy array straight away; only these
        # arrays are kept, not the objects returned by the model.
        if hasattr(batch_output, "numpy"):
            batch_output = batch_output.numpy()
        embeddings.append(np.asarray(batch_output))

        # NOTE: tf.keras.backend.clear_session() is intentionally NOT called
        # here. It resets Keras' global state while `model` is still being
        # used for inference and does not release the batch outputs.

    return np.concatenate(embeddings, axis=0)

In [None]:
# ------------------------------------------------------------
# 📒 4. Load tokenized_data.pkl & hitung embedding
# ------------------------------------------------------------
def load_pickle(path):
    """
    Read the file at ``path`` in binary mode and return its unpickled content.

    Note: unpickling can execute arbitrary code, so this should only be used
    on files from a trusted source.
    """
    with open(path, "rb") as handle:
        payload = pickle.load(handle)
    return payload

print("Memuat tokenized_data…")
tokdata_sen, tokdata_par = (
    load_pickle(pkl_file) for pkl_file in (tokdata_sen_pkl, tokdata_par_pkl)
)

# -------- Sentence level --------
# One embedding matrix per essay source, computed in this order:
# student essay, ChatGPT essay, ChatGPT knowledge.
print("⏳ Menghitung embedding kalimat…")
emb_std_sen, emb_gpt1_sen, emb_gpt2_sen = (
    gen_emb(tokdata_sen[field], semantic_sen)
    for field in ("student_essay", "chatgpt_essay", "chatgpt_knowledge")
)

# -------- Paragraph level --------
print("⏳ Menghitung embedding paragraf…")
emb_std_par, emb_gpt1_par, emb_gpt2_par = (
    gen_emb(tokdata_par[field], semantic_par)
    for field in ("student_essay", "chatgpt_essay", "chatgpt_knowledge")
)

# The tokenized inputs are no longer needed; drop them and run a GC pass.
del tokdata_sen, tokdata_par
gc.collect()


In [None]:
# ------------------------------------------------------------
# 📒 5. Simpan ke reference_embeddings.pkl
# ------------------------------------------------------------
# Output locations, one pickle per level.
sen_path = os.path.join(SEN_DIR, "reference_embeddings.pkl")
par_path = os.path.join(PAR_DIR, "reference_embeddings.pkl")

# Sentence-level embeddings keyed by source.
sen_emb_dict = dict(zip(
    ("embeddings_std_sen", "embeddings_gpt1_sen", "embeddings_gpt2_sen"),
    (emb_std_sen, emb_gpt1_sen, emb_gpt2_sen),
))

# Paragraph-level embeddings keyed by source.
par_emb_dict = dict(zip(
    ("embeddings_std_par", "embeddings_gpt1_par", "embeddings_gpt2_par"),
    (emb_std_par, emb_gpt1_par, emb_gpt2_par),
))

# Write the sentence-level embeddings.
with open(sen_path, "wb") as f:
    pickle.dump(sen_emb_dict, f)
print(f"✅ Sentence Selesai! File tersimpan di: {sen_path}")

# Write the paragraph-level embeddings.
with open(par_path, "wb") as f:
    pickle.dump(par_emb_dict, f)
print(f"✅ Paragraph Selesai! File tersimpan di: {par_path}")



In [None]:
# ------------------------------------------------------------
# 📒 6. (Opsional) Verifikasi cepat
# ------------------------------------------------------------
# Re-open both saved pickles and print the shape stored under each key.
# The paths come from the configuration cell (`sen_ref_pkl` / `par_ref_pkl`);
# the previously used names `sen_ref` / `par_ref` are not defined anywhere in
# this notebook and raised NameError on a fresh kernel.
# NOTE: pickle.load can execute arbitrary code; only run this against files
# produced by this notebook.
for ref_path in (sen_ref_pkl, par_ref_pkl):
    with open(ref_path, "rb") as f:
        data = pickle.load(f)
    for k, v in data.items():
        print(f"{k:25s}: {v.shape}")
