In [1]:
from glob import glob
import json
import random
from tqdm import tqdm
import copy

In [2]:
fopen_l = open('combine-malay-no-alignment-multitasks-v5-no-code.jsonl', 'w')

In [3]:
rejected = [
    'help.openai.com',
    'openassistant'
]

break_at = [
    'help.openai.com',
    'openai',
    'cannot have personal opinions',
    's an ai language model',
    "i'm sorry",
    'many factors',
    'lgbt',
    'lesbian',
    'gender-neutral',
    'remain neutral',
    'without bias',
    'and neutral',
    'more inclusive',
    'neutrality',
    'non-bias',
    'discrimination',
    'avoid any forms of discrimination',
    'regardless of their gender',
    'inclusive and tolerant environment',
    'have personal views',
    'sexual orientation should be a top priority',
    's an objective ai',
    'avoid any forms of prejudice or hate',
    'regardless of their personal',
    'you understand this direction',
    'tolerant environment within ai',
    'cannot express my',
    'requires more context',
    'personal opinion',
    'have updated information',
    "don't have personal experiences",
    'there is no information',
    'tidak mempunyai akses kepada data atau maklumat',
    '10 april 2021',
    'ebagai model bahasa AI',
    'model bahasa AI',
    'mempunyai kepercayaan atau pendapat peribadi',
    'tidak mempunyai pendapat peribadi',
    'tidak mempunyai kepercayaan',
    'tidak mempunyai falsafah peribadi',
    'tidak mempunyai pengalaman peribadi',
    'tidak mempunyai pendapat atau pengalaman peribadi',
    'tidak mempunyai maklumat terkini',
    'tidak mempunyai emosi peribadi',
    'tidak mempunyai keutamaan',
    'saya tidak mempunyai akses',
    'tidak mempunyai pengalaman',
    'saya tidak mempunyai keupayaan',
    'tidak mempunyai keupayaan',
    'tidak mempunyai hubungan',
    'tidak mempunyai maklumat',
    'Saya tidak mempunyai',
    'Saya tidak pernah',
    'saya tidak dapat memahami jawapan',
    '=====',
    '-----',
]

break_at_terjemah = [
    'terjemah',
    'translate'
]

rejected_words = [
    'kebutuhan',
    'berbeda',
    'bahwa',
    'Kode',
    'kode',
    'nomor',
    'RMXX,XXX',
    'kompleksitas',
    'listrik',
    'teknis',
    'berkualitas',
    'mencoba',
    'kampanye',
    'komunitas',
    'stabilitas',
    'Stabilitas',
    'metode',
    'pria',
    'butuh',
    'jadwal',
    'kasus',
    'otomatis',
    'populer',
    'bisnis',
    'probabilitas',
    'rusak',
    'kapasitas',
    'rutinitas',
    'pertama-tama'
]

def found_word(s, words):
    for i in range(len(words)):
        if words[i] in s:
            return True, words[i]
    return False, None

indons = []

def accept(d, min_len = 10, skip_indon = True, skip_translation = True):
    global indons
    
    d = d.strip()
    
    if len(d.split()) < min_len:
        return False
        
    if 'terjemahkan teks' in d.lower():
        return False

    if 'no need to translate' in d.lower():
        return False

    if 'can be translated' in d.lower():
        return False

    if 'cannot translate' in d.lower():
        return False

    if 'should be translated to' in d.lower():
        return False

    if 'cannot be translated' in d.lower():
        return False

    if 'standard malay' in d.lower():
        return False

    if 'would not be translated' in d.lower():
        return False

    if 'as an AI language model' in d:
        return False

    if 'should be translated as' in d.lower():
        return False

    if 'Bahasa Malaysia Standard' in d:
        return False
    
    if 'Saya adalah model AI' in d:
        return False
    
    if 'saya model AI' in d:
        return False
    
    if 'Saya model AI' in d:
        return False
    
    if 'sebagai model AI' in d:
        return False
    
    if 'Sebagai model AI' in d:
        return False
    
    if 'model bahasa AI' in d:
        return False
    
    if 'model AI yang dibangunkan' in d:
        return False
    
    if d == '<s>':
        return False
    
    if 'tidak dapat memberikan maklumat' in d:
        return False
    
    if 'Sebagai model bahasa' in d:
        return False
    
    d_lower = d.lower()
    splitted = d_lower.split()
    if (len(set(splitted)) / len(splitted)) < 0.2:
        return False
    
    if found_word(d_lower, rejected)[0]:
        return False
    
    if found_word(d_lower, break_at)[0]:
        return False
    
    if skip_translation and found_word(d_lower, break_at_terjemah)[0]:
        return False
    
    if skip_indon:
        found_indon = found_word(d_lower, rejected_words)
        if found_indon[0]:
            indons.append((d, found_indon[1]))
            return False
    
    return True

def split(row):
    inputs, outputs = [], []
    splitted = row['input'].split('<bot>:')
    for i in range(len(splitted) - 1):
        if i == 0:
            human = splitted[i].replace('<manusia>:', '')
        else:
            try:
                human = splitted[i].split('<manusia>:')[1]
            except:
                continue
        bot = splitted[i + 1].split('<manusia>:')[0]
        inputs.append(human.strip())
        outputs.append(bot.strip())
    return inputs, outputs

def generate_and_tokenize_prompt(row, min_length_input = 1, min_length_output = 1):
    texts = ['<s>']

    if 'function_call' in row:
        t = row['function_call']
        texts.append(f'\n[FUNCTIONCALL]\n{t}\n')

    if '<bot>:' in row['input'] and row['output'] is None:
        inputs, outputs = [], []
        splitted = row['input'].split('<bot>:')
        for i in range(len(splitted) - 1):
            if i == 0:
                human = splitted[i].replace('<manusia>:', '')
            else:
                try:
                    human = splitted[i].split('<manusia>:')[1]
                except:
                    continue
            bot = splitted[i + 1].split('<manusia>:')[0]
            inputs.append(human.strip())
            outputs.append(bot.strip())
    else:
        inputs = [row['input']]
        outputs = [row['output']]

    for u, a in zip(inputs, outputs):
        u = u.strip()
        a = a.strip()
        if len(u) < min_length_input or len(a) < min_length_output:
            print(u, len(u), a, len(a), row)
            return
        texts.append(f'[INST] {u.strip()} [/INST] {a.strip()}</s> ')

    prompt = ''.join(texts)
    return {'text': prompt}

In [4]:
options = ['A.', 'B.', 'C.', 'D.']
options2 = ['A', 'B', 'C', 'D']
def rejected_output(q, a):
    q_, a_ = [], []
    for o in options:
        q_.append(o in q)
    for o in options2:
        a_.append(o in a)
    return sum(a_) > 1 or sum(q_) < 4

In [5]:
# !wget https://huggingface.co/datasets/malaysia-ai/filtered-aya-dataset-zsm/resolve/main/filtered-aya_dataset-zsm.jsonl

In [6]:
with open('filtered-aya_dataset-zsm.jsonl') as fopen:
    for l in fopen:
        l = json.loads(l)
        data = {
            'prompt_input': None,
            'input': l['inputs'],
            'output': l['targets']
        }
        fopen_l.write(f'{json.dumps(data)}\n')
        fopen_l.flush()

In [7]:
!wc -l combine-malay-no-alignment-multitasks-v5-no-code.jsonl

10073 combine-malay-no-alignment-multitasks-v5-no-code.jsonl


In [8]:
files = [
    'prepared-chatgpt-malay-function-call.jsonl',
    'prepared-function-call-qa-choice.jsonl',
]

for f in files:
    with open(f) as fopen:
        for l in tqdm(fopen):
            data = json.loads(l)
            if random.random() > 0.2:
                continue
            
            fopen_l.write(f'{json.dumps(data)}\n')
            fopen_l.flush()

179339it [00:01, 126654.26it/s]
13358it [00:00, 36600.89it/s]


In [9]:
!wc -l combine-malay-no-alignment-multitasks-v5-no-code.jsonl

48481 combine-malay-no-alignment-multitasks-v5-no-code.jsonl


In [10]:
files = [
    'prepared-function-call-malaysian-open-qa.jsonl',
    'prepared-function-call-kg.jsonl'
]

for f in files:
    with open(f) as fopen:
        for l in tqdm(fopen):
            data = json.loads(l)
            if random.random() > 0.05:
                continue
            
            fopen_l.write(f'{json.dumps(data)}\n')
            fopen_l.flush()

157120it [00:02, 59761.17it/s]
104689it [00:00, 105510.76it/s]


In [11]:
!wc -l combine-malay-no-alignment-multitasks-v5-no-code.jsonl

61662 combine-malay-no-alignment-multitasks-v5-no-code.jsonl


In [12]:
files = [
    'prepared-chatgpt-malay-function-call-complex.jsonl',
]

for f in files:
    with open(f) as fopen:
        for l in tqdm(fopen):
            data = json.loads(l)
            
            fopen_l.write(f'{json.dumps(data)}\n')
            fopen_l.flush()

24984it [00:00, 75358.54it/s]


In [13]:
!wc -l combine-malay-no-alignment-multitasks-v5-no-code.jsonl

86646 combine-malay-no-alignment-multitasks-v5-no-code.jsonl


In [14]:
files = [
    'prepared-synthetic-json-translation.jsonl',
]

for f in files:
    with open(f) as fopen:
        for l in tqdm(fopen):
            data = json.loads(l)
            if random.random() > 0.03:
                continue
            
            fopen_l.write(f'{json.dumps(data)}\n')
            fopen_l.flush()

1798326it [00:04, 416903.91it/s]


In [15]:
!wc -l combine-malay-no-alignment-multitasks-v5-no-code.jsonl

140980 combine-malay-no-alignment-multitasks-v5-no-code.jsonl


In [16]:
files = [
    'prepared-synthetic-json-sentiment.jsonl',
]

for f in files:
    with open(f) as fopen:
        for l in tqdm(fopen):
            data = json.loads(l)
            if random.random() > 0.3:
                continue
            
            fopen_l.write(f'{json.dumps(data)}\n')
            fopen_l.flush()

162902it [00:00, 277953.82it/s]


In [17]:
!wc -l combine-malay-no-alignment-multitasks-v5-no-code.jsonl

189947 combine-malay-no-alignment-multitasks-v5-no-code.jsonl


In [18]:
all_data = []
with open('/home/husein/ssd3/one-for-all/jawi-rumi.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        all_data.append(l)
        
for data in random.sample(all_data, 10000):
    fopen_l.write(f'{json.dumps(data)}\n')
    fopen_l.flush()

749797it [00:09, 81935.02it/s] 


In [19]:
all_data = []
with open('/home/husein/ssd3/one-for-all/rumi-jawi.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        all_data.append(l)
        
for data in random.sample(all_data, 10000):
    fopen_l.write(f'{json.dumps(data)}\n')
    fopen_l.flush()

749797it [00:08, 84012.13it/s] 


In [20]:
all_data = []
with open('prepared-mixtral-malaysian-abstractive-summarization.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        all_data.append(l)
        
for data in random.sample(all_data, 20000):
    fopen_l.write(f'{json.dumps(data)}\n')
    fopen_l.flush()

38760it [00:00, 97872.62it/s] 


In [21]:
# git lfs clone https://huggingface.co/datasets/mesolitica/noisy-standard-malay-translation-instructions
# git lfs clone https://huggingface.co/datasets/mesolitica/standard-malay-translation-instructions

translation_files = glob('/home/husein/ssd2/translation/*-instructions.jsonl')
for f in translation_files:
    all_data = []
    with open(f) as fopen:
        for l in tqdm(fopen):
            data = json.loads(l)
            try:
                if not accept(data['input'], skip_indon = False, skip_translation = False):
                    continue
                if not accept(data['output'], min_len = 1, skip_translation = False):
                    continue
                all_data.append(data)
            except:
                pass
    
    all_data = random.sample(all_data, min(len(all_data), 10000))
    for data in all_data:
        fopen_l.write(f'{json.dumps(data)}\n')
        fopen_l.flush()

799733it [00:22, 35481.61it/s]
1036411it [00:21, 47285.91it/s]
1699454it [00:10, 155781.40it/s]
0it [00:00, ?it/s]
141851it [00:02, 64183.97it/s]
333758it [00:15, 22020.33it/s]
799731it [00:22, 35418.59it/s]
740431it [00:22, 32443.52it/s]
0it [00:00, ?it/s]
2187464it [00:48, 44946.49it/s]
631727it [00:19, 32122.84it/s]
2496675it [00:54, 45403.03it/s]


In [22]:
!wc -l combine-malay-no-alignment-multitasks-v5-no-code.jsonl

329947 combine-malay-no-alignment-multitasks-v5-no-code.jsonl


In [23]:
with open('post-translation-instructions.jsonl') as fopen:
    mapping = json.load(fopen)
len(mapping)

25957

In [24]:
augmentation = {}
with open('/home/husein/ssd1/ctranslate2/augmentation-questions.jsonl') as fopen:
    for l in fopen:
        l = json.loads(l)
        augmentation[l['question']] = l['augmentation']
        
len(augmentation)

1689426

In [25]:
files = [
    'prepared-chatgpt-malaysian-general-qa.jsonl',
    'prepared-chatgpt-malaysian-open-qa.jsonl',
    'prepared-chatgpt-malay-instructions.jsonl',
    'prepared-chatgpt4-malaysian-general-qa.jsonl',
    'prepared-chatgpt4-noisy-translation.jsonl',
    'prepared-mixtral-malaysian-general-qa.jsonl',
]

for f in files:
    print(f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = json.loads(l)
                if data['output'] in mapping:
                    data['output'] = mapping[data['output']]
                if not accept(data['input'], skip_indon = False):
                    continue
                if not accept(data['output'], min_len = 1, skip_indon = 'noisy-translation' not in f):
                    continue
                
                if data['input'] in augmentation:
                    d = copy.deepcopy(data)
                    q = random.choice(augmentation[d['input']])
                    d['input'] = q
                
                    fopen_l.write(f'{json.dumps(d)}\n')
                    fopen_l.flush()
                    
                fopen_l.write(f'{json.dumps(data)}\n')
                fopen_l.flush()
            except Exception as e:
                pass
            
    print(len(indons))

prepared-chatgpt-malaysian-general-qa.jsonl


31236it [00:03, 7856.89it/s] 


60925
prepared-chatgpt-malaysian-open-qa.jsonl


265828it [01:07, 3918.11it/s]


60985
prepared-chatgpt-malay-instructions.jsonl


533562it [01:12, 7385.30it/s] 


63432
prepared-chatgpt4-malaysian-general-qa.jsonl


70258it [00:09, 7346.10it/s] 


63630
prepared-chatgpt4-noisy-translation.jsonl


37674it [00:00, 82439.22it/s]


63630
prepared-mixtral-malaysian-general-qa.jsonl


265851it [00:39, 6724.51it/s] 

63648





In [26]:
!wc -l combine-malay-no-alignment-multitasks-v5.jsonl

2853603 combine-malay-no-alignment-multitasks-v5.jsonl


In [27]:
files = [
    'prepared-kertas-1.jsonl',
    'prepared-synthetic-commonsense.jsonl',
    'prepared-kesalahan-tatabahasa.jsonl',
    'prepared-malaysian-qa-choice.jsonl',
]

for f in files:
    print(f)
    with open(f) as fopen:
        for l in fopen:
            try:
                data = json.loads(l)
                if not accept(data['input'], skip_indon = False):
                    continue
                if not len(data['output'].strip()):
                    continue
                if rejected_output(data['input'], data['output']):
                    continue
                fopen_l.write(f'{json.dumps(data)}\n')
                fopen_l.flush()
            except Exception as e:
                print(e)
                pass

prepared-kertas-1.jsonl
'NoneType' object has no attribute 'strip'
'NoneType' object has no attribute 'strip'
'NoneType' object has no attribute 'strip'
'NoneType' object has no attribute 'strip'
'NoneType' object has no attribute 'strip'
prepared-synthetic-commonsense.jsonl
prepared-kesalahan-tatabahasa.jsonl
prepared-malaysian-qa-choice.jsonl


In [28]:
!wc -l combine-malay-no-alignment-multitasks-v5.jsonl

2865789 combine-malay-no-alignment-multitasks-v5.jsonl


In [29]:
files = [
    'prepared-metamathqa.jsonl',
]

for f in files:
    print(f)
    with open(f) as fopen:
        for l in fopen:
            try:

                data = json.loads(l)
                
                if data['output'].strip().endswith('ialah:'):
                    continue
                if not accept(data['input'], skip_indon = False):
                    continue
                if not accept(data['output']):
                    continue
                    
                if random.random() < 0.8:
                    continue
                    
                fopen_l.write(f'{json.dumps(data)}\n')
                fopen_l.flush()
            except:
                pass

prepared-metamathqa.jsonl


In [30]:
files = [
    'prepared-camel-ai.jsonl',
]

for f in files:
    print(f)
    with open(f) as fopen:
        for l in fopen:
            try:

                data = json.loads(l)
                
                if data['output'].strip().endswith('ialah:'):
                    continue
                if not accept(data['input'], skip_indon = False):
                    continue
                if not accept(data['output']):
                    continue
                fopen_l.write(f'{json.dumps(data)}\n')
                fopen_l.flush()
            except:
                pass

prepared-camel-ai.jsonl


In [31]:
files = [
    'prepared-mixtral-malaysian-rag.jsonl',
    'prepared-mixtral-malaysian-rag-knowledge-base-part2.jsonl',
]

for f in files:
    with open(f) as fopen:
        for l in tqdm(fopen):

            if 'tolong terjemah' in l.lower():
                continue

            try:
                data = json.loads(l)
                if not accept(data['input']):
                    continue

                if random.random() > 0.3:
                    continue

                fopen_l.write(f'{json.dumps(data)}\n')
                fopen_l.flush()
            except:
                pass

591596it [03:23, 2903.86it/s]
167410it [02:02, 1372.12it/s]


In [32]:
files = [
    'prepared-mixtral-malaysian-rag.jsonl',
    'prepared-mixtral-malaysian-rag-knowledge-base-part2.jsonl',
]

for f in files:
    with open(f) as fopen:
        for l in tqdm(fopen):

            if 'tolong terjemah' in l.lower():
                continue

            try:
                data = json.loads(l)
                if not accept(data['input']):
                    continue
                    
                if random.random() > 0.2:
                    continue
                
                count = 0
                inputs, outputs = split(data)
                for i in range(len(inputs)):
                    if inputs[i] in augmentation:
                        inputs[i] = random.choice(augmentation[inputs[i]])
                        count += 1
                        
                if count == 0:
                    continue
                        
                chats = []
                for i in range(len(inputs)):
                    chats.extend([
                        f'<manusia>: {inputs[i]}',
                        f'<bot>: {outputs[i]}'
                    ])
                    
                data['input'] = '\n'.join(chats)

                fopen_l.write(f'{json.dumps(data)}\n')
                fopen_l.flush()
            except Exception as e:
                print(e)
                pass

591596it [03:22, 2924.07it/s]
167410it [02:03, 1357.75it/s]


In [33]:
with open('prepared-mixtral-malaysian-multiturn.jsonl') as fopen:
    for l in tqdm(fopen):

        if 'tolong terjemah' in l.lower():
            continue

        try:
            data = json.loads(l)
            if not accept(data['input']):
                continue
            fopen_l.write(f'{json.dumps(data)}\n')
            fopen_l.flush()
        except:
            pass

397786it [02:16, 2919.10it/s] 


In [34]:
with open('prepared-mixtral-malaysian-multiturn.jsonl') as fopen:
    for l in tqdm(fopen):

        if 'tolong terjemah' in l.lower():
            continue

        try:
            data = json.loads(l)
            if not accept(data['input']):
                continue
            
            if 'tambah' in data['input'].lower():
                continue
            
            if '2 + 2' in data['input'].lower():
                continue
                
            if 'anda seorang chatbot' in data['input'].lower():
                continue
                
            if random.random() > 0.2:
                continue
            
            count = 0
            inputs, outputs = split(data)
            for i in range(len(inputs)):
                if inputs[i] in augmentation:
                    inputs[i] = random.choice(augmentation[inputs[i]])
                    count += 1
                    
            if count == 0:
                continue

            chats = []
            for i in range(len(inputs)):
                chats.extend([
                    f'<manusia>: {inputs[i]}',
                    f'<bot>: {outputs[i]}'
                ])

            data['input'] = '\n'.join(chats)
            fopen_l.write(f'{json.dumps(data)}\n')
            fopen_l.flush()
        except Exception as e:
            print(e)
            pass

397786it [02:16, 2912.44it/s] 


In [35]:
with open('prepared-malaysian-ultrachat.jsonl') as fopen:
    for l in tqdm(fopen):

        if 'tolong terjemah' in l.lower():
            continue

        try:
            data = json.loads(l)
            if not accept(data['input']):
                continue
            fopen_l.write(f'{json.dumps(data)}\n')
            fopen_l.flush()
        except:
            pass

361788it [02:32, 2378.11it/s]


In [36]:
files = glob('prepared-ultrachat-*')
files = [f for f in files if 'mesolitica' not in f]

count = 0
for f in files:
    print(f)
    with open(f) as fopen:
        
        for l in tqdm(fopen):
            
            if 'Saya mahir dalam bahasa' in l:
                continue
                
            if 'tolong terjemah' in l.lower():
                continue
                
            if '```' in l:
                continue
                    
            try:
                data = json.loads(l)
                if not accept(data['input']):
                    continue
                    
                if random.random() > 0.4:
                    continue
                    
                fopen_l.write(f'{json.dumps(data)}\n')
                fopen_l.flush()
                
                count += 1
            except:
                pass
            
count

prepared-ultrachat-train_8.jsonl.translated.malay.no-alignment


138162it [00:45, 3045.42it/s]


prepared-ultrachat-train_4.jsonl.translated.malay.no-alignment


121671it [00:56, 2138.77it/s]


prepared-ultrachat-train_9.jsonl.translated.malay.no-alignment


109062it [00:35, 3034.52it/s]


prepared-ultrachat-train_7.jsonl.translated.malay.no-alignment


121653it [00:50, 2396.65it/s]


prepared-ultrachat-train_1.jsonl.translated.malay.no-alignment


81561it [00:24, 3272.69it/s]


prepared-ultrachat-train_0.jsonl.translated.malay.no-alignment


89787it [00:28, 3203.03it/s]


prepared-ultrachat-train_6.jsonl.translated.malay.no-alignment


112491it [00:53, 2093.27it/s]


prepared-ultrachat-train_5.jsonl.translated.malay.no-alignment


113098it [00:54, 2093.34it/s]


prepared-ultrachat-train_3.jsonl.translated.malay.no-alignment


126937it [00:46, 2756.60it/s]


345884

In [37]:
with open('prepared-combine-multiple-chats-no-code.jsonl') as fopen:
    for l in fopen:

        if 'tolong terjemah' in l.lower():
            continue

        try:
            data = json.loads(l)
            if not accept(data['input']):
                continue
            fopen_l.write(f'{json.dumps(data)}\n')
            fopen_l.flush()
        except:
            pass

In [38]:
!wc -l combine-malay-no-alignment-multitasks-v5-no-code.jsonl

3419449 combine-malay-no-alignment-multitasks-v5-no-code.jsonl


In [39]:
from tqdm import tqdm

data = []
with open('combine-malay-no-alignment-multitasks-v5-no-code.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        if 'function_call' in l and not isinstance(l['function_call'], str):
            print(l['function_call'])
        if generate_and_tokenize_prompt(l, min_length_input = 1, min_length_output = 1) is not None:
            data.append(l)

2616586it [00:26, 33991.60it/s] 

Apakah insentif yang disediakan oleh kerajaan Malaysia untuk syarikat yang menggunakan biojisim kelapa sawit untuk menghasilkan produk nilai tambah? 148  0 {'prompt_input': None, 'input': '<manusia>: Apakah insentif yang disediakan oleh kerajaan Malaysia untuk syarikat yang menggunakan biojisim kelapa sawit untuk menghasilkan produk nilai tambah?\n<bot>:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

2725260it [00:29, 22308.74it/s]

يف سورة اإلخالص داللة اجلزم
Assertive Semantic in Surah al-Ikhlas 



 
 2نورفائزة حممد حامدين ،1هشام الدين حممد يوسف 



 
 البحث   ملخص



قضية التوحيد عن  ألهنا تفسر    ، كيز على سورة اإلخالصداللة اجلزم يف النحو العريب ابلت   اسةالدر هذه  ت  تناول
اجلازم  االعتقاد  للدفاع عن  اجلزم هي  للمسلم  اخلالص  داللة  اآلايت   دالالت الحدى  إ .  تظهر من  اليت  الصوتية 



، فبناء على ذلك، البحث والدراسة يف توضيح مبينة  مقاصدمعينة و وهلا معان  من صفات األحرف العربية  القرآنية  
للغاية. ومقاصدها    معانيها  مهما  أمرا  فقط    يعد  الفعل  على  اجلزم  يقع  الرفعوملاذا  مثل  االسم   والنصب.  دون 



فعل األمر مباشرة وداللة اجلزم على على    اجلزم  لى داللةعلى منهج التحليل، ستقتصر الدراسة ع  تطرقت الدراسة
وأتثريها على   يف سورة اإلخالص   اللة اجلزم إىل بيان أمهية دالدراسة  وهدفت  .  زمعوامل اجلالفعل املضارع بواسطة  



والضغط   أن هناك شدة يف معىن الكلمة يف تلك السورة  أمهها  ، وتوصلت إىل عدة نتائجمعىن اآلية بشكل أعمق
 . السورةالصويت عند تالوة  







 داللة، اجلزم، سورة اإلخالص :  الكلمات املفتاحي

2733541it [00:29, 24892.29it/s]

 0 Berkawan dengan baik dan bekerjasama dengan sesama manusia adalah penting dalam membina masyarakat yang berkembang. Jika kita mempunyai hubungan yang baik dengan orang lain, kita boleh menguatkan dan membantu satu sama lain. Dalam Islam, kita diajar untuk membina hubungan yang baik dan mesra dengan semua orang, sama ada keluarga, rakan, jiran, atau masyarakat secara keseluruhan. Dalam hadis yang diriwayatkan oleh Abu Musa, Nabi Muhammad SAW bersabda, "Seorang mu'min terhadap mu'min yang lain adalah seperti bangunan yang kukuh, setiap bahagian menguatkan bahagian yang lain." Jadi, keakraban dan persaudaraan adalah penting dalam menyatukan umat Islam. Walaupun kita hidup dalam masyarakat yang berbilang bangsa dan budaya, tidak mustahil untuk membina keharmonian dan hubungan saling membantu, terutamanya jika setiap individu dalam masyarakat memahami peranan dan tanggungjawab mereka dalam memelihara ikatan persaudaraan. Hanya dengan itu, kita boleh menikmati kehidupan yang aman dan berk

3419449it [00:45, 74992.12it/s]


In [40]:
len(data)

3419427

In [41]:
with open('combine-malay-no-alignment-multitasks-v5-no-code.jsonl', 'w') as fopen:
    for d in data:
        fopen.write(f'{json.dumps(d)}\n')

In [42]:
with open('combine-malay-no-alignment-multitasks-v5-no-code.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        if 'function_call' in l and not isinstance(l['function_call'], str):
            print(l['function_call'])
        if generate_and_tokenize_prompt(l, min_length_input = 1, min_length_output = 1) is None:
            break

3419427it [00:43, 79308.47it/s] 


In [43]:
with open('combine-malay-no-alignment-multitasks-v5-no-code.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        if 'function_call' in l:
            break

3419427it [00:26, 130990.52it/s]


In [44]:
with open('combine-malay-no-alignment-multitasks-v5-no-code.jsonl') as fopen:
    for l in fopen:
        if '-----' in l:
            print(l)

{"prompt_input": null, "input": "Berikan saya contoh surat berhenti kerja dengan notis sebulan. ", "output": "(Nama pekerja) \n(Alamat tempat tinggal pekerja) \n\n--------------------------------------------------------------------------------\n\nTarikh: (\u2013/\u2013/\u2014-) \n\n(Nama Syarikat)\n(Alamat Syarikat) \n\nPER: NOTIS BERHENTI KERJA SEBULAN DI (Nama Syarikat) \n\nPerkara di atas adalah dirujuk. \n\n2. Saya (Nama pekerja), bekerja di (Nama syarikat) memegang jawatan (Nama jawatan) ingin memberitahu pihak tuan bahawa saya akan berhenti kerja mulai tarikh (Tarikh berhenti kerja) dengan otis selama sebulan. \n\n3. Hal ini adalah kerana saya telah ditawarkan untuk bekerja sebagai (jawatan baharu) di (Nama syarikat) bermula pada (tarikh kerja baharu). \n\n4. Sehubungan itu, saya memohon agar pihak tuan dapat mempertimbangkan permohonan perletakan jawatan saya. Sebelum mengundurkan diri, ribuan terima kasih diucapkan kepada pihak tuan di atas peluang yang diberikan kepada saya se

In [45]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj='combine-malay-no-alignment-multitasks-v5-no-code.jsonl',
    path_in_repo='combine-malay-no-alignment-multitasks-v5-no-code.jsonl',
    repo_id='mesolitica/instructions-dataset',
    repo_type='dataset',
)



combine-malay-no-alignment-multitasks-v5-no-code.jsonl:   0%|          | 0.00/11.8G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/instructions-dataset/commit/044eb0b0b1261eb8feabd9283d83338398715384', commit_message='Upload combine-malay-no-alignment-multitasks-v5-no-code.jsonl with huggingface_hub', commit_description='', oid='044eb0b0b1261eb8feabd9283d83338398715384', pr_url=None, pr_revision=None, pr_num=None)

In [46]:
!head -n 10 combine-malay-no-alignment-multitasks-v5-no-code.jsonl

{"prompt_input": null, "input": "Tuliskan sebuah cerita fiksyen pendek berdasarkan judul \"Bukan Airin\"", "output": "Judul: Bukan Airin\n\nAku menjeling majalah hiburan yang dibawa oleh rakan sekelas. Mereka agak sibuk memuji kecantikan seorang selebriti popular dalam busana mewah yang menghiasi halaman utama majalah tersebut.\n\n\u201cDah berumur pun masih cantik Airin ni. Dah besar nanti, aku nak ikut jejak langkah Airin.\u201d\n\n\u201cAku minat sangat dengan dia ni.\u201d\n\n\u201cMenyanyi dan berlakon semua hebat. Cantik sangat!\u201d\nAku mengerling sekilas kemudian menyambung catatanku yang terhenti tadi. Aku menulis sesuatu dalam buku nota itu.\n\n\u201cPopular, kaya-raya, cantik, berdisiplin, berjaya\u2026 memang idaman semua orang.\u201d Rakan-rakanku masih terus memuji. Aku mengeluh. Rasa sebal.\n\n\u201cTapi\u2026 dia ada rahsia, kan? Tak ada satu pun pihak media yang dapat bongkar macam mana rupa anak dia.\u201d Semua terdiam. Kemudian, Dayah memandang ke arahku. Aku buat