In [13]:
from glob import glob
from tqdm import tqdm
import json
import re
import pandas as pd
import os

In [2]:
import tiktoken
# Tokenizer for the "cl100k_base" encoding; used further down to count the
# number of tokens in each instruction.
enc = tiktoken.get_encoding("cl100k_base")



In [9]:
# Vocabulary that causes `check_indon` to flag a text. Matching is
# case-sensitive, which is why some words appear in two capitalisations.
# NOTE(review): judging by the helper's name these are presumably Indonesian
# (rather than Malay) word forms -- confirm with the dataset owner.
rejected_words = {
    'kebutuhan',
    'berbeda',
    'bahwa',
    'Kode',
    'kode',
    'nomor',
    'RMXX,XXX',
    'kompleksitas',
    'listrik',
    'jawaban',
    'teknis',
    'berkualitas',
    'mencoba',
    'kampanye',
    'komunitas',
    'stabilitas',
    'Stabilitas',
    'metode',
    'pria',
    'butuh',
    'jadwal',
    'kasus',
    'otomatis',
    'populer',
    'bisnis',
    'probabilitas',
    'rusak',
    'kapasitas',
    'rutinitas',
    'pertama-tama',
}

# Entries containing anything other than ASCII letters ('RMXX,XXX',
# 'pertama-tama') are broken apart by the tokenisation in `check_indon`, so a
# token lookup can never find them. They are matched against the raw text
# instead, bounded so they do not fire in the middle of a longer word.
_rejected_patterns = [
    re.compile('(?<![A-Za-z])' + re.escape(word) + '(?![A-Za-z])')
    for word in sorted(rejected_words)
    if not re.fullmatch('[A-Za-z]+', word)
]


def check_indon(string):
    """Return True when `string` contains at least one entry of `rejected_words`.

    Purely alphabetic entries are compared against the text's tokens: every
    run of characters other than ASCII letters and spaces is replaced by a
    space and the result is split on whitespace. Entries that themselves
    contain punctuation are searched for in the unmodified text.

    Parameters
    ----------
    string : str
        Text to inspect.

    Returns
    -------
    bool
        True if any rejected word occurs, False otherwise (including for an
        empty string).
    """
    tokens = set(re.sub('[^A-Za-z ]+', ' ', string).split())
    if tokens & rejected_words:
        return True
    return any(pattern.search(string) for pattern in _rejected_patterns)

In [11]:
# Read the dataset and keep its `instruction` column as a plain Python list;
# the trailing expression displays how many instructions there are.
df = pd.read_parquet('glaive_dataset_recreated.parquet')
instructions = df['instruction'].to_list()
len(instructions)

136109

In [19]:
# Load the per-row JSON files (one file per index into `instructions`) and
# attach the original instruction to each record as `instruction_en`.
data = []
for i, instruction_en in enumerate(instructions):
    filename = f'glaive_coder_raw_text/{i}.json'

    # Best-effort load: a row whose file is missing, unreadable or not valid
    # JSON is skipped. OSError covers the missing-file case, ValueError covers
    # json.JSONDecodeError and UnicodeDecodeError. Catching only these keeps
    # KeyboardInterrupt working, so the loop can still be interrupted.
    try:
        with open(filename, encoding='utf-8') as fopen:
            d = json.load(fopen)
    except (OSError, ValueError):
        continue

    d['instruction_en'] = instruction_en
    data.append(d)

In [20]:
# Number of records that were loaded from the per-row JSON files.
len(data)

131900

In [21]:
# Lower-case fragments; a record whose lower-cased instruction contains any of
# them is skipped when the texts are collected below.
# NOTE(review): these look like refusal / apology boilerplate -- confirm.
rejected_instructions = [
    'tidak dapat melakukan penulisan semula',  # "unable to do the rewriting"
    'maaf',                                    # "sorry"
    'not able to fulfill your request',
    'saya tidak dapat membantu',               # "I cannot help"
    'dengan permintaan itu',                   # "with that request"
    'feel free to ask',
    'saya tidak bisa',                         # "I can't"
]

In [24]:
count, indon_count = 0, 0

# Collect the instruction and output text of every usable record. A record is
# skipped when it has no output or when its lower-cased instruction contains
# one of the `rejected_instructions` fragments. `count` tallies kept records.
indon_texts = []
for record in data:

    if record['output'] is None:
        continue

    for fragment in rejected_instructions:
        if fragment in record['instruction'].lower():
            break
    else:
        # No fragment matched: keep both texts of this record.
        indon_texts += [record['instruction'], record['output']]
        count += 1

In [23]:
# Remove duplicate texts, drop empty ones, and display how many remain.
indon_texts = [text for text in set(indon_texts) if len(text) > 0]
len(indon_texts)

260002

In [25]:
# Dump the collected texts, one JSON-encoded string per line.
with open('glaive_coder_raw_text.texts', 'w') as fopen:
    fopen.writelines(json.dumps(text) + '\n' for text in indon_texts)

In [26]:
# Shell escape: copy the exported texts file to ~/ssd3/ctranslate2.
!cp glaive_coder_raw_text.texts ~/ssd3/ctranslate2

In [29]:
# Build a lookup from each line's `src` value to its `r` value, reading the
# `.requested` file one JSON object per line.
# NOTE(review): `r` is presumably the processed (translated) form of `src`,
# since it is stored under `*_ms` keys further down -- confirm.
mapping = {}
with open('/home/husein/ssd3/ctranslate2/glaive_coder_raw_text.texts.requested') as fopen:
    for line in fopen:
        # Best-effort parse: skip lines that are not valid JSON (ValueError),
        # lack one of the two keys (KeyError), or are not a JSON object
        # (TypeError). Catching only these keeps KeyboardInterrupt working.
        try:
            row = json.loads(line)
            mapping[row['src']] = row['r']
        except (ValueError, KeyError, TypeError):
            continue

len(mapping)

259998

In [31]:
# Display the first loaded record to inspect its keys.
data[0]

{'instruction': 'Bagaimana cara saya menghasilkan teks tebal dalam Bash? Saya mempunyai skrip Bash yang mencetak beberapa teks ke skrin menggunakan perintah `echo "Beberapa Teks"`. Adakah cara untuk memformat teks tersebut supaya ia menjadi tebal?',
 'output': 'Anda boleh menggunakan perintah `echo -e` untuk memformat teks tersebut supaya menjadi tebal. Contohnya, `echo -e "\\033[1mTeks Tebal\\033[0m"`. Dalam contoh ini, `\\033[1m` akan membuat teks menjadi tebal, manakala `\\033[0m` digunakan untuk menetapkan semula pemformatan teks. Semoga membantu!',
 'instruction_en': 'How can I output bold text in Bash? I have a Bash script that prints some text to the screen using the `echo "Some Text"` command. Is there a way I can format the text to make it bold?'}

In [34]:
# Annotate every record that has an output and write it as one JSON object per
# line. Each record gains, in this order:
#   indon_ins / indon_output   -- result of `check_indon` on instruction / output
#   instruction_ms / output_ms -- `mapping` lookup of the text (None if absent)
#   rejected_ins               -- True when the instruction encodes to exactly
#                                 1024 or 1025 tokens
#   rejected_output            -- always False
# NOTE(review): the 1024-1025 token window presumably marks instructions that
# were cut off at a generation limit, and `rejected_output` is never computed
# -- confirm both are intended.
with open('synthetic-glaive_coder_raw_text.jsonl', 'w') as fopen_l:
    for d in data:

        if d['output'] is None:
            continue

        translated_output = mapping.get(d['output'])
        translated_instruction = mapping.get(d['instruction'])

        instruction_flag = True if check_indon(d['instruction']) else False
        output_flag = True if check_indon(d['output']) else False

        n_tokens = len(enc.encode(d['instruction']))
        hit_token_window = n_tokens in (1024, 1025)

        # Keys are added in a fixed order so the serialised field order is stable.
        d.update({
            'indon_ins': instruction_flag,
            'indon_output': output_flag,
            'instruction_ms': translated_instruction,
            'output_ms': translated_output,
            'rejected_ins': hit_token_window,
            'rejected_output': False,
        })

        fopen_l.write(json.dumps(d) + '\n')
        count += 1

In [None]:
from huggingface_hub import HfApi

# Upload the JSONL file to the dataset repository, keeping the same file name
# inside the repo as on disk.
jsonl_name = 'synthetic-glaive_coder_raw_text.jsonl'
api = HfApi()
api.upload_file(
    repo_type='dataset',
    repo_id='mesolitica/chatgpt-malay-instructions',
    path_in_repo=jsonl_name,
    path_or_fileobj=jsonl_name,
)

synthetic-glaive_coder_raw_text.jsonl:   0%|          | 0.00/460M [00:00<?, ?B/s]