In [44]:
from glob import glob
from tqdm import tqdm
import json
import os
from huggingface_hub import HfApi

# Hugging Face Hub client; used by the export loop to upload each generated
# JSONL file to the dataset repository.
# NOTE(review): no token is passed here, so authentication presumably comes
# from the environment or a cached login - confirm before running.
api = HfApi()
# Maps the language key used in the output file name to the local directory
# that holds the raw `*.json` generations for that language.
# Insertion order is the order in which the languages are processed.
folders = dict(
    dockerfile='mixtral-dockerfile',
    swift='mixtral-swift',
    kotlin='mixtral-kotlin',
    csharp='mixtral-c-sharp',
    cpp='mixtral-cpp',
    rust='mixtral-rust',
    python='mixtral-python',
    cuda='mixtral-cuda',
    ruby='mixtral-ruby',
    scala='mixtral-scala',
    go='mixtral-go',
    typescript='mixtral-typescript',
    pythonanalytics='mixtral-data-analytics',
    php='mixtral-php',
    sql='mixtral-sql',
    javascript='mixtral-javascript',
    shell='mixtral-shell',
    java='mixtral-java',
)

In [45]:
# Header markers that introduce the answer part of a generated sample.
# A sample must contain at least one of them to be kept, and the longest
# marker present in a sample is used as the point where it is split into
# instruction and answer.
# NOTE(review): "**[Usage]**" is treated as a solution header too - confirm
# that this is intended.
solutions = [
    "**Solution**",
    "**[Solution]**",
    "[Solution]",
    "**[Solution]:**",
    "**Solution:**",
    "**[Usage]**",
    "Solution:",
    "**[Problem Solution]**",
]

# Header / separator markers that are stripped from the instruction part of a
# sample.
#
# The header markers are ordered longest-first on purpose: they are removed one
# after another with `str.replace`, so a marker that is a substring of a longer
# one (e.g. '[Problem Description]' inside '**[Problem Description]:**') must
# come after it. Otherwise the short marker is removed first, the longer one
# can no longer match, and residue such as '**:**' is left in the instruction.
problems = [
    '**[Problem Description]:**',
    '**[Problem Description]**',
    '**Problem Description:**',
    '**Problem Description**',
    '[Problem Description]',
    'Problem Description:',
    # Horizontal-rule style separators.
    '--------',
    '==========',
]

In [48]:
def _passes_prefilter(text):
    """Return True when a raw generation is worth parsing.

    A sample is kept only if it contains at least one code fence, the number
    of fence markers is even (every fence is closed), and at least one of the
    `solutions` header markers occurs in it.
    """
    fence_count = text.count('```')
    if fence_count == 0 or fence_count % 2 != 0:
        return False
    return any(marker in text for marker in solutions)


def _split_sample(text):
    """Split a raw generation into an instruction/answer record.

    Returns a dict with the keys 'instruction' and 'answer', or None when the
    sample cannot be split unambiguously or the answer has no balanced code
    fence.
    """
    text = text.strip()

    present = [marker in text and marker for marker in solutions]
    present = [marker for marker in present if marker]
    if not present:
        return None

    # Split on the longest marker found so that e.g. '**[Solution]**' wins
    # over the '[Solution]' it contains.
    separator = sorted(present, key=len)[-1]
    parts = text.split(separator)
    if len(parts) != 2:
        # The marker occurs more than once; the split point is ambiguous.
        return None

    instruction = parts[0]
    for marker in problems:
        instruction = instruction.replace(marker, '')
    # The first paragraph on each side is dropped (what is left of the header
    # line before the actual content).
    instruction = '\n\n'.join(instruction.split('\n\n')[1:]).strip()
    answer = '\n\n'.join(parts[1].split('\n\n')[1:]).strip()

    fence_count = answer.count('```')
    if fence_count == 0 or fence_count % 2 != 0:
        return None

    return {
        'instruction': instruction,
        'answer': answer
    }


# For every language: filter the raw generations, write the usable ones to a
# JSONL file and upload that file to the dataset repository.
for k, v in folders.items():
    # Oldest file first.
    files = sorted(glob(f'{v}/*.json'), key=os.path.getmtime)
    if not files:
        # Nothing to export; also avoids dividing by zero below and uploading
        # an empty file over an existing one.
        print(k, 'no input files found, skipping')
        continue

    filtered = []
    rejected = []
    for path in tqdm(files):
        with open(path, encoding='utf-8') as fopen:
            data = json.load(fopen)

        if not data:
            continue

        if _passes_prefilter(data):
            filtered.append(data)
        else:
            rejected.append(path)

    print(k, len(filtered), len(filtered) / len(files))

    out = f'{k}-00000-of-00001.jsonl'
    count = 0
    with open(out, 'w', encoding='utf-8') as fopen:
        for text in filtered:
            d = _split_sample(text)
            if d is None:
                continue
            # json.dumps never emits a raw newline, so one write is one line.
            fopen.write(f'{json.dumps(d)}\n')
            count += 1

    print(out, count)

    api.upload_file(
        path_or_fileobj=out,
        # Repository paths always use forward slashes, regardless of the OS.
        path_in_repo=f'data/{out}',
        repo_id='mesolitica/mixtral-wizardcoder',
        repo_type='dataset',
    )


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 16773/16773 [00:00<00:00, 47165.75it/s]


dockerfile 13316 0.7938949502176116
dockerfile-00000-of-00001.jsonl 13286


dockerfile-00000-of-00001.jsonl:   0%|          | 0.00/36.0M [00:00<?, ?B/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 13314/13314 [00:00<00:00, 43389.12it/s]


swift 10330 0.775875018777227
swift-00000-of-00001.jsonl 10187


swift-00000-of-00001.jsonl:   0%|          | 0.00/32.5M [00:00<?, ?B/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 16010/16010 [00:00<00:00, 42500.43it/s]


kotlin 12898 0.8056214865708932
kotlin-00000-of-00001.jsonl 12790


kotlin-00000-of-00001.jsonl:   0%|          | 0.00/40.7M [00:00<?, ?B/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 13942/13942 [00:00<00:00, 44617.57it/s]


csharp 10314 0.739779084779802
csharp-00000-of-00001.jsonl 10201


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 19144/19144 [00:00<00:00, 52123.31it/s]


cpp 10860 0.5672795653990806
cpp-00000-of-00001.jsonl 10747
