In [2]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-SFT/resolve/main/extra/translation-instructions.json

In [14]:
import malaysian_sft
from malaysian_sft import accept, post_accept

import unicodedata

def detect_majority_language(text):
    """Return the script family that most characters of ``text`` belong to.

    Every character is assigned to at most one bucket by its Unicode code
    point:

    * ``'arabic'``  - Arabic, Arabic Supplement, Arabic Extended-A and the
      two Arabic Presentation Forms blocks.
    * ``'chinese'`` - CJK Unified Ideographs, Extension A, and the
      supplementary-plane extensions in 0x20000-0x2EBEF.
    * ``'indian'``  - the contiguous Indic blocks from Devanagari through
      Malayalam (including Gurmukhi), plus Thai.
    * ``'other'``   - any remaining character for which ``str.isalpha()``
      is true (Latin letters, etc.).

    Digits, punctuation, whitespace and other non-letters are not counted.

    Args:
        text: The string to inspect.

    Returns:
        One of ``'arabic'``, ``'chinese'``, ``'indian'``, ``'other'``, or
        ``'unknown'`` when no character was counted at all (e.g. empty
        string or digits/punctuation only). On a tie, the bucket listed
        first in the order above wins, because ``max`` returns the first
        maximal key of the insertion-ordered ``counts`` dict.
    """
    counts = {
        'arabic': 0,
        'chinese': 0,
        'indian': 0,
        'other': 0
    }

    for char in text:
        code = ord(char)

        # Arabic script
        if (0x0600 <= code <= 0x06FF or  # Arabic
            0x0750 <= code <= 0x077F or  # Arabic Supplement
            0x08A0 <= code <= 0x08FF or  # Arabic Extended-A
            0xFB50 <= code <= 0xFDFF or  # Arabic Presentation Forms-A
            # Arabic Presentation Forms-B, stopping before U+FEFF: that code
            # point is the byte-order mark / zero-width no-break space, not
            # an Arabic letter, and must not tip the majority vote.
            0xFE70 <= code <= 0xFEFE):
            counts['arabic'] += 1

        # Chinese (Han characters)
        elif (0x4E00 <= code <= 0x9FFF or  # CJK Unified Ideographs
              0x3400 <= code <= 0x4DBF or  # CJK Unified Ideographs Extension A
              0x20000 <= code <= 0x2A6DF or  # Extension B
              0x2A700 <= code <= 0x2EBEF):   # Extensions C and later
            counts['chinese'] += 1

        # Indian scripts: Devanagari, Bengali, Gurmukhi, Gujarati, Oriya,
        # Tamil, Telugu, Kannada and Malayalam occupy one contiguous span.
        # Gurmukhi (0x0A00-0x0A7F) was previously skipped, so Punjabi text
        # fell through to 'other'.
        elif (0x0900 <= code <= 0x0D7F or
              0x0E00 <= code <= 0x0E7F):   # Thai, grouped here as before
            counts['indian'] += 1

        # Other: only alphabetic characters count, so digits, spaces and
        # punctuation never influence the result.
        elif char.isalpha():
            counts['other'] += 1

    if not any(counts.values()):
        return 'unknown'

    return max(counts, key=counts.get)

# if 'jawi.parquet' in f:
#     check_arabic = False
#     check_indian = True
#     check_mandarin = True
# elif 'tamil.parquet' in f:
#     check_arabic = True
#     check_indian = False
#     check_mandarin = True
# elif 'mandarin.parquet' in f:
#     check_arabic = True
#     check_indian = True
#     check_mandarin = False
# else:
#     check_arabic = True
#     check_indian = True
#     check_mandarin = True

In [3]:
import json

# Load the raw translation-instruction records downloaded above.
# The encoding is pinned to UTF-8 so the multilingual text is decoded the same
# way on every platform instead of depending on the locale's default codec.
with open('translation-instructions.json', encoding='utf-8') as fopen:
    d = json.load(fopen)

In [4]:
# Number of records loaded from translation-instructions.json.
len(d)

70000

In [15]:
from tqdm import tqdm

# Keep only the records whose `output` text passes `post_accept`.
# NOTE(review): `post_accept` comes from `malaysian_sft`; presumably each
# `check_*` flag enables a rejection rule for that script - confirm there.
filtered = []
for d_ in tqdm(d):
    lang = detect_majority_language(d_['output'])
    # A script's check is switched off only when that script is the dominant
    # one in the output; every other check stays enabled.
    check_arabic = lang != 'arabic'
    check_indian = lang != 'indian'
    check_mandarin = lang != 'chinese'
    if post_accept(
        d_['output'],
        check_arabic=check_arabic,
        check_indian=check_indian,
        check_mandarin=check_mandarin,
    ):
        filtered.append(d_)
        

100%|███████████████████████████████████| 70000/70000 [00:10<00:00, 6402.73it/s]


In [16]:
# Number of records that survived the post_accept filter.
len(filtered)

69911

In [22]:
# Spot-check one surviving record (the second-to-last one).
filtered[-2]

{'prompt_input': None,
 'input': '`Huruf ke-10 dalam alfabet adalah huruf "J" dalam Kode Morse. Dalam Kode Morse, setiap huruf direpresentasikan dengan serangkaian titik dan garis. Huruf "J" direpresentasikan dengan satu titik, diikuti oleh tiga garis, yang ditransmisikan sebagai "dot dash dash dash" atau ".---" dalam notasi Kode Morse. \n\nUntuk lebih memahami Kode Morse, disarankan untuk belajar mengenali setiap karakter dan polanya, serta cara mengucapkan dan mendengarkan kode tersebut. Ini dapat dilakukan dengan membaca buku atau tutorial online, atau bahkan dengan berlatih menggunakan alat Kode Morse, seperti keyer atau program komputer. Dengan berlatih secara teratur, seseorang dapat menjadi ahli dalam Kode Morse dan menggunakannya untuk berkomunikasi dengan orang lain dalam situasi yang tepat, seperti dalam situasi darurat atau selama aktivitas radio.` terjemah ke malay',
 'output': 'Huruf ke-10 dalam abjad ialah huruf "J" dalam Kod Morse. Dalam Kod Morse, setiap huruf diwakili 

In [18]:
# Persist the filtered records as a single JSON array.
with open('translation-instructions-postprocessing.json', 'w') as fout:
    serialized = json.dumps(filtered)
    fout.write(serialized)

In [19]:
from huggingface_hub import HfApi
# Hub client used by the upload cell below.
# NOTE(review): presumably relies on credentials already configured in the
# environment, since none are passed here - confirm before re-running.
api = HfApi()

In [20]:
# Publish the filtered file to the dataset repository under extra/.
api.upload_file(
    repo_id="mesolitica/Malaysian-SFT",
    repo_type="dataset",
    path_in_repo="extra/translation-instructions-postprocessing.json",
    path_or_fileobj="translation-instructions-postprocessing.json",
)

translation-instructions-postprocessing.json:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Malaysian-SFT/commit/a7e5bf576ac73395af9c30893f2993567890a03d', commit_message='Upload extra/translation-instructions-postprocessing.json with huggingface_hub', commit_description='', oid='a7e5bf576ac73395af9c30893f2993567890a03d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Malaysian-SFT', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Malaysian-SFT'), pr_revision=None, pr_num=None)