In [1]:
!pip install fasttext
!wget -q https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin



In [2]:
import re
import os
import logging
import random
from collections import Counter
from datasets import load_dataset, DatasetDict
import fasttext

# ======================= KONFIGURASI =============================
OUTPUT_DIR = "/kaggle/working/One2Set/data"
SEED = 42
TRAIN_SAMPLE_SIZE = 30000
FASTTEXT_MODEL_PATH = "lid.176.bin"
# ================================================================

# Memuat model deteksi bahasa FastText
lang_model = fasttext.load_model(FASTTEXT_MODEL_PATH)

def is_english_fast(text):
    try:
        prediction = lang_model.predict(text.strip().replace('\n', ' '))[0][0]
        return prediction == '__label__en'
    except:
        return False

def clean_text(text):
    text = str(text).replace('\n', ' ').strip()
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Hapus karakter non-ASCII
    text = re.sub(r'\s+', ' ', text)
    return text

def replace_digits(text):
    return re.sub(r'\b\d+\b', '<digit>', text)

def tokenize(text):
    return text.lower().strip().split()

def preprocess_and_save():
    logging.getLogger("datasets").setLevel(logging.ERROR)

    # 1. Load & Sampling
    full_dataset = load_dataset("taln-ls2n/kptimes", trust_remote_code=True)
    train_sample = full_dataset['train'].shuffle(seed=SEED).select(range(TRAIN_SAMPLE_SIZE))

    # 2. Siapkan dataset akhir dengan split: train, valid, test
    dataset_sampled = DatasetDict({
        'train': train_sample,
        'valid': full_dataset['validation'],
        'test': full_dataset['test']
    })

    # 3. Buat direktori output
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Menyimpan hasil ke: {OUTPUT_DIR}")

    # 4. Loop tiap split dan simpan .txt
    for split_name, dataset_split in dataset_sampled.items():
        src_path = os.path.join(OUTPUT_DIR, f"{split_name}_src.txt")
        trg_path = os.path.join(OUTPUT_DIR, f"{split_name}_trg.txt")

        total = 0
        dipakai = 0

        with open(src_path, 'w', encoding='utf-8') as f_src, \
             open(trg_path, 'w', encoding='utf-8') as f_trg:

            for example in dataset_split:
                total += 1
                title_raw = example['title']
                abstract_raw = example['abstract']

                if not is_english_fast(title_raw) or not is_english_fast(abstract_raw):
                    continue

                title = replace_digits(clean_text(title_raw))
                abstract = replace_digits(clean_text(abstract_raw))
                source_text = f"{title} <eos> {abstract}"

                MAX_TOKENS = 512
                tokens = source_text.split()
                if len(tokens) > MAX_TOKENS:
                    print(f"⚠ Truncated from {len(tokens)} to {MAX_TOKENS} tokens")
                    tokens = tokens[:MAX_TOKENS]
                source_text = ' '.join(tokens)

                f_src.write(source_text + '\n')

                # Proses target
                keyphrases = example['keyphrases']
                prmu = example['prmu']

                present_kps = []
                absent_kps = []

                for kp, p in zip(keyphrases, prmu):
                    if not is_english_fast(kp):
                        continue
                    kp_clean = replace_digits(clean_text(kp))
                    if kp_clean.strip() == '':
                        continue
                    if p.strip() == 'P':
                        present_kps.append(kp_clean)
                    else:
                        absent_kps.append(kp_clean)

                final_kps = []
                if present_kps:
                    final_kps.extend(present_kps)
                final_kps.append('<peos>')
                if absent_kps:
                    final_kps.extend(absent_kps)

                f_trg.write(';'.join(final_kps) + '\n')

                dipakai += 1

        print(f"Split '{split_name}' selesai: {dipakai}/{total} baris digunakan.")

    # 5. Tampilkan hasil akhir
    print("\nSemua file disimpan di:", OUTPUT_DIR)
    for fname in sorted(os.listdir(OUTPUT_DIR)):
        print(f" - {fname}")

if __name__ == "__main__":
    preprocess_and_save()

📥 Memuat model deteksi bahasa FastText...
🚀 Memulai pra-pemrosesan KPTimes...
📥 Memuat dataset KPTimes dan mengambil sample...


README.md:   0%|          | 0.00/3.37k [00:00<?, ?B/s]

kptimes.py:   0%|          | 0.00/7.79k [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/84.7M [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

dev.jsonl:   0%|          | 0.00/50.9M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

📂 Menyimpan hasil ke: /kaggle/working/One2Set/data
⚠ Truncated from 1693 to 512 tokens
⚠ Truncated from 1928 to 512 tokens
⚠ Truncated from 628 to 512 tokens
⚠ Truncated from 890 to 512 tokens
⚠ Truncated from 519 to 512 tokens
⚠ Truncated from 792 to 512 tokens
⚠ Truncated from 881 to 512 tokens
⚠ Truncated from 906 to 512 tokens
⚠ Truncated from 828 to 512 tokens
⚠ Truncated from 712 to 512 tokens
⚠ Truncated from 815 to 512 tokens
⚠ Truncated from 1383 to 512 tokens
⚠ Truncated from 1335 to 512 tokens
⚠ Truncated from 814 to 512 tokens
⚠ Truncated from 797 to 512 tokens
⚠ Truncated from 1208 to 512 tokens
⚠ Truncated from 794 to 512 tokens
⚠ Truncated from 2198 to 512 tokens
⚠ Truncated from 1168 to 512 tokens
⚠ Truncated from 824 to 512 tokens
⚠ Truncated from 1006 to 512 tokens
⚠ Truncated from 5133 to 512 tokens
⚠ Truncated from 863 to 512 tokens
⚠ Truncated from 901 to 512 tokens
⚠ Truncated from 972 to 512 tokens
⚠ Truncated from 919 to 512 tokens
⚠ Truncated from 1024 to 512 t

In [3]:
def check_substring_in_file(file_path, expected_substring):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    missing_lines = [i+1 for i, line in enumerate(lines) if expected_substring not in line]
    all_ok = len(missing_lines) == 0

    print(f"File: {file_path}")
    if all_ok:
        print(f"Semua baris mengandung substring '{expected_substring}'")
    else:
        print(f"{len(missing_lines)} baris tidak mengandung substring '{expected_substring}': baris {missing_lines}")
    print("-" * 50)


# Daftar file dan substring yang diharapkan
files_and_substrings = {
    "train_src.txt": "<eos>",
    "train_trg.txt": "<peos>",
    "valid_src.txt": "<eos>",
    "valid_trg.txt": "<peos>",
    "test_src.txt": "<eos>",
    "test_trg.txt": "<peos>",
}

# Jalankan pengecekan untuk setiap file
for file_path, substring in files_and_substrings.items():
    check_substring_in_file('/kaggle/working/One2Set/data/'+file_path, substring)


File: /kaggle/working/One2Set/data/train_src.txt
✅ Semua baris mengandung substring '<eos>'
--------------------------------------------------
File: /kaggle/working/One2Set/data/train_trg.txt
✅ Semua baris mengandung substring '<peos>'
--------------------------------------------------
File: /kaggle/working/One2Set/data/valid_src.txt
✅ Semua baris mengandung substring '<eos>'
--------------------------------------------------
File: /kaggle/working/One2Set/data/valid_trg.txt
✅ Semua baris mengandung substring '<peos>'
--------------------------------------------------
File: /kaggle/working/One2Set/data/test_src.txt
✅ Semua baris mengandung substring '<eos>'
--------------------------------------------------
File: /kaggle/working/One2Set/data/test_trg.txt
✅ Semua baris mengandung substring '<peos>'
--------------------------------------------------


In [4]:
def count_stats(src_path, trg_path):
    import numpy as np

    src_lens = []
    trg_counts = []

    with open(src_path, 'r') as f_src, open(trg_path, 'r') as f_trg:
        for src_line, trg_line in zip(f_src, f_trg):
            src_lens.append(len(src_line.strip().split()))
            trg_counts.append(len(trg_line.strip().split(';')))

    print("Statistik Dataset")
    print(f" - Jumlah data: {len(src_lens)}")
    print(f" - Panjang input rata-rata: {np.mean(src_lens):.2f} tokens")
    print(f" - Panjang input maksimal : {np.max(src_lens)} tokens")
    print(f" - Rata-rata keyphrase    : {np.mean(trg_counts):.2f}")
    print(f" - Maksimal keyphrase     : {np.max(trg_counts)}")

# Contoh penggunaan
count_stats("/kaggle/working/One2Set/data/test_src.txt", "/kaggle/working/One2Set/data/test_trg.txt")
count_stats("/kaggle/working/One2Set/data/train_src.txt", "/kaggle/working/One2Set/data/train_trg.txt")
count_stats("/kaggle/working/One2Set/data/valid_src.txt", "/kaggle/working/One2Set/data/valid_trg.txt")

# with open("data/train_src.txt") as f:
#     for i, line in enumerate(f):
#         if len(line.strip().split()) > 512:
#             print(f"Baris {i} terlalu panjang")

📝 Statistik Dataset
 - Jumlah data: 19928
 - Panjang input rata-rata: 424.34 tokens
 - Panjang input maksimal : 512 tokens
 - Rata-rata keyphrase    : 5.20
 - Maksimal keyphrase     : 11
📝 Statistik Dataset
 - Jumlah data: 29807
 - Panjang input rata-rata: 444.21 tokens
 - Panjang input maksimal : 512 tokens
 - Rata-rata keyphrase    : 5.64
 - Maksimal keyphrase     : 11
📝 Statistik Dataset
 - Jumlah data: 9937
 - Panjang input rata-rata: 445.00 tokens
 - Panjang input maksimal : 512 tokens
 - Rata-rata keyphrase    : 5.60
 - Maksimal keyphrase     : 11
