In [None]:
import pandas as pd
import time

from joblib import Parallel, delayed
from typing import Optional
from tqdm import tqdm
from configparser import ConfigParser

from groq import Groq

# Register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

### Load and prepare the dataset

In [None]:
# Replace the placeholder below with the location of your dataset.
# Alternative loader: df = load_from_disk('your dataset path').to_pandas()
dataset_path = 'your dataset path'
df = pd.read_csv(dataset_path)

In [None]:
# Column whose values are sent to the model as user prompts
# (select 'output' instead to process that column).
instructions = df.loc[:, 'input']

### Setup 

In [None]:
# Path of the INI file holding the API token and generation settings
# (relative to the notebook's working directory).
CONFIG_PATH = '../config.ini'

config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check the result and fail fast here instead
# of hitting an opaque KeyError when the sections are accessed later.
if not config.read(CONFIG_PATH):
	raise FileNotFoundError(f'Could not read configuration file: {CONFIG_PATH}')

In [None]:
# --- [base] section: credentials and runtime settings ---
# ConfigParser returns every value as a string; numeric settings are
# converted at the point of use.
base_section = config['base']
GROQ_TOKEN = base_section['GROQ_TOKEN']
N_JOBS = base_section['N_JOBS']
OUTPUT_FILE_NAME = base_section['OUTPUT_FILE_NAME']
SLEEP_TIME_S = base_section['SLEEP_TIME_S']
TIMEOUT = base_section['TIMEOUT']
NUM_RETRIES = base_section['NUM_RETRIES']

# --- [groq_generation] section: model and sampling settings ---
generation_section = config['groq_generation']
MODEL_NAME = generation_section['MODEL_NAME']
TEMPERATURE = generation_section['TEMPERATURE']
INSTRUCTION = generation_section['INSTRUCTION']
# A blank (whitespace-only) instruction means "no system prompt".
if not INSTRUCTION.strip():
	INSTRUCTION = None

In [None]:
def request(text: str, instruction: Optional[str] = None):
	"""Send one prompt to the Groq chat-completions API and return a result record.

	The call is attempted up to ``NUM_RETRIES`` times (at least once), pausing
	``SLEEP_TIME_S`` seconds between failed attempts. Failures never propagate:
	after the last failed attempt a record with an empty ``'output'`` is returned
	so the caller can filter it out.

	Args:
		text: User prompt sent as the ``user`` message.
		instruction: Optional system prompt; when ``None`` no ``system`` message
			is sent.

	Returns:
		dict with keys ``'instruction'`` (as passed in), ``'output'`` (cleaned
		model reply, or ``''`` on failure) and ``'input'`` (``text`` stripped).
	"""
	client = Groq(
		api_key=GROQ_TOKEN,
	)

	def query(messages):
		# Single API round-trip; returns the text of the first choice.
		response = client.chat.completions.create(
			messages=messages,
			model=MODEL_NAME,
			temperature=float(TEMPERATURE),
		)
		return response.choices[0].message.content

	# The conversation is identical for every attempt, so build it once.
	messages = []
	if instruction is not None:
		messages.append({'role': 'system', 'content': instruction})
	messages.append({'role': 'user', 'content': text})

	# Always try at least once, even if NUM_RETRIES is configured as 0 or less
	# (previously that configuration looped forever on persistent failures).
	max_attempts = max(1, int(NUM_RETRIES))
	pause_s = float(SLEEP_TIME_S)

	for attempt in range(1, max_attempts + 1):
		try:
			reply = query(messages)

			# Remove echoed prompt parts and chat-template markers. Empty or
			# missing fragments are skipped: str.replace(None, '') raises
			# TypeError, which used to make every call without a system
			# prompt look like a failure.
			for fragment in (instruction, text, '<|end|>', '<|assistant|>', '<|user|>'):
				if fragment:
					reply = reply.replace(fragment, '')

			return {'instruction': instruction, 'output': reply.strip(), 'input': text.strip()}
		except Exception:
			# Best-effort: any API/processing error triggers a retry.
			# KeyboardInterrupt/SystemExit are deliberately NOT caught.
			if attempt < max_attempts:
				time.sleep(pause_s)

	# Every attempt failed: signal failure with an empty output.
	return {'instruction': instruction, 'output': '', 'input': text.strip()}

### Start the generation process

In [None]:
# Dispatch one `request` call per prompt across joblib workers.
# - N_JOBS and TIMEOUT come from ConfigParser as strings, so both are converted
#   to int here (joblib compares n_jobs against integers).
# - The prompts are iterated by value rather than looked up with
#   `instructions[i]`, which is label-based and breaks on a non-default index.
# Note: the progress bar advances as tasks are dispatched, not as they finish.
results = Parallel(n_jobs=int(N_JOBS), batch_size=1, timeout=int(TIMEOUT))(
	delayed(request)(text, INSTRUCTION)
	for text in tqdm(instructions, total=len(instructions))
)

In [None]:
# Number of result records returned by the parallel run (one per dispatched prompt).
len(results)

### Remove failed responses

In [None]:
# `request` marks a failed generation with an empty 'output' string;
# keep only the records that actually contain a reply.
result_cleaned = [record for record in results if record['output']]
# Show how many records survived the filter.
len(result_cleaned)

### Save to file

In [None]:
# Write the successful records as JSON Lines (one JSON object per line).
output_path = f'{OUTPUT_FILE_NAME}.jsonl'
results_frame = pd.DataFrame(result_cleaned)
results_frame.to_json(output_path, orient='records', lines=True)