In [None]:
# !wget https://huggingface.co/datasets/mesolitica/mixtral-python-data-analytics-instructions/resolve/main/filtered-data-analytics-0.jsonl
# !wget https://huggingface.co/datasets/mesolitica/mixtral-python-data-analytics-instructions/resolve/main/filtered-data-analytics-1.jsonl

In [None]:
from huggingface_hub import InferenceClient
from tqdm import tqdm
import os
import json
from glob import glob

In [None]:
# Gather the downloaded JSONL shards from the working directory, in
# lexicographic filename order, and display the list as the cell output.
files = glob('filtered-data-analytics-*.jsonl')
files.sort()
files

In [None]:
# Read every shard listed in `files` and collect the 'content' field of each
# JSON record into `ls` (one entry per non-empty line).
ls = []

for f in files:
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default locale encoding, which can fail on non-ASCII text.
    with open(f, encoding='utf-8') as fopen:
        for l in fopen:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which would otherwise make json.loads raise.
            if not l.strip():
                continue
            l = json.loads(l)
            ls.append(l['content'])
len(ls)

In [None]:
# Shared inference client used by every worker; requests time out after 120 s.
# NOTE(review): the first argument (model id / endpoint URL) is an empty
# string here — presumably redacted; confirm and fill in before running.
client = InferenceClient("", timeout=120)


def format_prompt(message, history):
    """Render a conversation plus a new user message as one prompt string.

    Args:
        message: The new user message; it is placed in the final, open
            ``[INST] ... [/INST]`` turn.
        history: Iterable of ``(user_prompt, bot_response)`` pairs for the
            turns that already happened.

    Returns:
        ``"<s>"`` followed by each past turn rendered as
        ``"[INST] user [/INST] bot</s> "`` and finally
        ``"[INST] message [/INST]"``.
    """
    past_turns = [
        f"[INST] {user_prompt} [/INST] {bot_response}</s> "
        for user_prompt, bot_response in history
    ]
    return "<s>" + "".join(past_turns) + f"[INST] {message} [/INST]"

def format_user(history):
    """Render a conversation and leave a bare ``[INST]`` tag open at the end.

    Args:
        history: Iterable of ``(user_prompt, bot_response)`` pairs.

    Returns:
        ``"<s>"`` followed by each turn rendered as
        ``"[INST] user [/INST] bot</s> "`` and a trailing ``"[INST]"`` with
        no message and no closing tag.
    """
    pieces = ["<s>"]
    for user_prompt, bot_response in history:
        pieces.append(f"[INST] {user_prompt} [/INST] {bot_response}</s> ")
    pieces.append("[INST]")
    return "".join(pieces)

In [None]:
# Sampling settings passed as keyword arguments to every
# client.text_generation call made by `answer`.
generate_kwargs = {
    "temperature": 1.0,
    "max_new_tokens": 5120,
    "top_p": 0.95,
    "repetition_penalty": 1.0,
    "do_sample": True,
}

In [None]:
# Create the output directory for the generated JSON files. exist_ok=True
# keeps this cell safe to re-run (a plain `mkdir` errors when the directory
# already exists) and avoids depending on a shell.
os.makedirs('mixtral-data-analytics', exist_ok=True)
# To discard previously generated results and start over, uncomment:
# !rm -rf mixtral-data-analytics/*.json

In [None]:
def answer(q, i, max_retries=5):
    """Generate a problem/solution pair inspired by snippet `q` and cache it.

    The result is stored in ``mixtral-data-analytics/{i}.json``. If that file
    already exists the call is a no-op, so an interrupted run can be resumed.

    Args:
        q: Code snippet inserted into the prompt as inspiration.
        i: Index of the snippet; used as the output file name.
        max_retries: Maximum number of generation attempts (default 5).

    Returns:
        None. The outcome is written to disk:
        - the generated text (as a JSON string) on success;
        - ``false`` when the server reports that the prompt plus
          ``max_new_tokens`` exceeds its limit, so the item is not retried
          on later runs;
        - nothing when every attempt fails for another reason (the last
          error is printed and the item will be retried on the next run).
    """
    filename = f'mixtral-data-analytics/{i}.json'
    if os.path.exists(filename):
        return

    # Do not depend on the directory having been created by another cell.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    def _save(payload):
        # Write to a sibling temp file and rename it into place, so an
        # interrupted write never leaves a truncated `{i}.json` that the
        # existence check above would treat as finished.
        tmp_name = f'{filename}.tmp'
        with open(tmp_name, 'w') as fopen:
            json.dump(payload, fopen)
        os.replace(tmp_name, filename)

    # The prompt is identical for every attempt, so build it once.
    prompt = f"""
Please gain inspiration from the following random code snippet to create a high-quality programming problem. Present your output in two distinct sections: [Problem Description] and [Solution].

Code snippet for inspiration:
```
{q}
```

Guidelines for each section:

1. [Problem Description]: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included.

2. [Solution]: Offer a comprehensive, **correct** solution that accurately addresses the [Problem Description] you provided.
""".strip()
    formatted_prompt = format_prompt(prompt, [])

    last_error = None
    for _ in range(max_retries):
        try:
            response = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, details=True, return_full_text=False)
            _save(response.generated_text)
            return
        except Exception as e:
            if 'tokens + `max_new_tokens`' in str(e):
                # Retrying cannot help when the input is too long: record a
                # sentinel so this item is skipped from now on.
                print(e)
                _save(False)
                return
            # Any other failure is retried; remember it so it can be
            # reported instead of being silently dropped.
            last_error = e

    print(f'failed to generate {filename} after {max_retries} attempts: {last_error!r}')

In [None]:
def consumer(queue, name):
    """Worker loop: pull ``(q, i)`` items off `queue` and run `answer` on each.

    Args:
        queue: Queue of argument tuples for `answer`.
        name: Label for this worker, used only in the completion message.

    The loop ends as soon as the queue has no item available.
    """
    # Imported locally: both the parameter and this notebook's module-level
    # variable are named `queue`, so the module is not reachable by name.
    from queue import Empty

    while True:
        try:
            # Non-blocking get. Checking qsize() first and then calling a
            # blocking get() is racy: another worker can take the last item
            # in between, leaving this thread blocked forever.
            item = queue.get_nowait()
        except Empty:
            break
        answer(*item)
    print(f'consumer {name} done')

In [None]:
from threading import Thread
from queue import Queue

# Build the work queue: one (snippet, index) tuple per loaded snippet, in
# the argument order `answer` expects.
questions = ls
urls = list(zip(questions, range(len(questions))))

queue = Queue()
for u in urls:
    queue.put(u)

# Initial queue length; used as the progress-bar total.
ori_size = queue.qsize()

In [None]:
import time

# Start the worker threads; each one drains the shared queue via `consumer`.
max_worker = 100
consumers = [Thread(target=consumer, args=(queue, i)) for i in range(max_worker)]
for worker in consumers:
    worker.start()

# Progress reporting. The bar follows how many items have been taken off the
# queue; an item is dequeued before `answer` runs, so the bar can be slightly
# ahead of the files actually written.
pbar = tqdm(total=ori_size)
last_size = 0
while True:
    size = queue.qsize()
    left = ori_size - size
    minus = left - last_size
    if minus > 0:
        pbar.update(minus)
        last_size += minus
    # Check for completion only after updating, so the final items are
    # counted and the bar reaches its total.
    if size == 0:
        break
    # Sleep between polls instead of spinning: a busy loop burns a core and
    # competes with the worker threads for the GIL.
    time.sleep(0.1)

pbar.close()

In [None]:
# Block until every worker thread has finished its in-flight item and exited.
for worker in consumers:
    worker.join()