In [1]:
from huggingface_hub import InferenceClient
from tqdm import tqdm
import os
import json



In [2]:
from concurrent.futures import ThreadPoolExecutor, as_completed

generate_kwargs = dict(
    temperature=1.0,
    max_new_tokens=4096,
    top_p=0.95,
    top_k=50,
    repetition_penalty=1.0,
    do_sample=True,
)

In [3]:
client = InferenceClient(
    "", timeout = 120
)


def format_prompt(message, history):
  prompt = "<s>"
  for user_prompt, bot_response in history:
    prompt += f"[INST] {user_prompt} [/INST]"
    prompt += f" {bot_response}</s> "
  prompt += f"[INST] {message} [/INST]"
  return prompt

In [8]:
prompts = ['generate curse doubt sentences, eg, are you sure as hell'] * 1000  
len(prompts)

1000

In [5]:
!mkdir generate-doubt
!rm generate-doubt/*.json

rm: cannot remove 'generate-doubt/*.json': No such file or directory


In [9]:
prompt = prompts[0]
formatted_prompt = format_prompt(prompt, [])
stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, details=True, return_full_text=False)
output = stream.generated_text

In [10]:
output

' 1. Are you absolutely certain?\n2. Can you swear to it?\n3. Are you positive beyond a shadow of a doubt?\n4. Are you as sure as the day is long?\n5. Are you dead set on this?\n6. Are you one hundred percent convinced?\n7. Are you as sure as sunrise?\n8. Are you absolutely positive?\n9. Are you as sure as death and taxes?\n10. Are you as sure as the nose on your face?\n11. Are you as sure as the setting sun?\n12. Are you as sure as the ground beneath your feet?\n13. Are you as sure as the air you breathe?\n14. Are you as sure as the sky is blue?\n15. Are you as sure as your own name?'

In [11]:
def answer(q, i):
    filename = f'generate-doubt/{i}.json'
    if os.path.exists(filename):
        return
    
    while True:
        try:
            prompt = q
            formatted_prompt = format_prompt(prompt, [])
            stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=False, details=True, return_full_text=False)
            output = stream.generated_text
            with open(filename, 'w') as fopen:
                json.dump(output, fopen)
            break
        except Exception as e:
            print(e)
            pass

In [12]:
max_worker = 300

questions = prompts
for i in tqdm(range(0, len(questions), max_worker)):
    urls_ = [(q, no + i) for no, q in enumerate(questions[i: i + max_worker])]
    
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(answer, url[0], url[1]): url for url in urls_}

        for future in as_completed(futures):
            future.result()


  0%|                                                                                                                        | 0/4 [00:00<?, ?it/s]

504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/



 25%|████████████████████████████                                                                                    | 1/4 [01:08<03:24, 68.05s/it]

504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/



 50%|████████████████████████████████████████████████████████                                                        | 2/4 [02:16<02:16, 68.18s/it]

504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/
504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/



 75%|████████████████████████████████████████████████████████████████████████████████████                            | 3/4 [03:57<01:23, 83.29s/it]

504 Server Error: Gateway Time-out for url: https://mixtral.us-west-2.mesolitica.com/


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [05:05<00:00, 76.39s/it]


In [14]:
from glob import glob

files = glob('generate-doubt/*.json')
files = sorted(files, key = lambda x: int(x.split('/')[-1].replace('.json', '')))
        
len(files)

1000

In [15]:
import json

questions = []
for f in files:
    with open(f) as fopen:
        data = json.load(fopen)
    splitted = data.strip().split('\n')
    splitted = [s for s in splitted if '.' if s]
    splitted = ['.'.join(s.split('.')[1:]).strip() for s in splitted]
    splitted = [s for s in splitted if len(s) > 3]
    splitted = [s[1:] if s[0] == '"' else s for s in splitted]
    splitted = [s[:-1] if s[-1] == '"' else s for s in splitted]
    splitted = [s.strip() for s in splitted]
    questions.extend(splitted)
    
questions = [s for s in questions if len(s) > 3]
questions = [q for q in questions if 'Additionally' not in q]
questions = sorted(list(set(questions)))
len(questions)

3900

In [16]:
questions

['Also, I added some regional/slang variations, but please be aware that the level of appropriateness may vary depending on the audience and context.',
 "Also, it's important to be mindful of the context and the relationship with the person you are talking to, as some of these expressions could be considered too strong or rude in some situations.)",
 'Are you 100% certain about this?',
 'Are you 100% certain without a doubt?',
 'Are you 100% certain, without a sliver of a doubt?',
 'Are you 100% certain?',
 'Are you 100% convinced?',
 'Are you 100% persuaded?',
 "Are you 100% positive that you're not mistaken?",
 'Are you 100% positive?',
 'Are you 100% sure about that?',
 'Are you 100% sure?',
 'Are you 1000% certain?',
 'Are you 1000% certain? (informal, exaggerated)',
 'Are you 99% sure?',
 'Are you 99.9% sure?',
 'Are you God-damned certain?',
 'Are you a bit hesitant?',
 'Are you a bit skeptical?',
 'Are you a hundred percent certain?',
 'Are you a hundred percent convinced?',
 'A

In [17]:
with open('doubt.json', 'w') as fopen:
    json.dump(questions, fopen)