In [None]:
import pandas as pd
import requests
import time

from joblib import Parallel, delayed
from typing import Optional
from tqdm import tqdm
from configparser import ConfigParser

tqdm.pandas()

### Load and prepare dataset

In [None]:
# Load the source dataset into a DataFrame; replace the placeholder with a real path.
# NOTE(review): the alternative in the trailing comment uses `load_from_disk`, which is
# not imported in this notebook — presumably `datasets.load_from_disk`; confirm and
# add the import before switching to it.
df = pd.read_csv('your dataset path') # or df = load_from_disk('your dataset path').to_pandas()

In [None]:
# Column whose values are sent to the model, one request per row.
# Switch to df['output'] to generate from the other column instead.
instructions = df['input'] # or df['output']

### Setup 

In [None]:
CONFIG_PATH = '../config.ini'

config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed. Fail here with a clear message instead of letting a
# missing file surface later as an opaque KeyError on config['base'].
if not config.read(CONFIG_PATH):
	raise FileNotFoundError(f"Could not read configuration file: {CONFIG_PATH}")

In [None]:
# ConfigParser returns every value as a string, so numeric settings are converted
# to int here. Downstream code needs real integers: joblib's n_jobs, the retry
# counter comparison, and the max_new_tokens generation parameter.

# Basic
HF_TOKEN = config['base']['HF_TOKEN']
N_JOBS = int(config['base']['N_JOBS'])
OUTPUT_FILE_NAME = config['base']['OUTPUT_FILE_NAME']
SLEEP_TIME_S = int(config['base']['SLEEP_TIME_S'])
TIMEOUT = int(config['base']['TIMEOUT'])
NUM_RETRIES = int(config['base']['NUM_RETRIES'])

# Generation
MODEL_API = config['generation']['MODEL_API_ENDPOINT']
MAX_NEW_TOKENS = int(config['generation']['MAX_RESPONSE_TOKENS'])
INSTRUCTION = config['generation']['INSTRUCTION']
# A blank instruction means "no system prompt"; represent that as None.
INSTRUCTION = INSTRUCTION if INSTRUCTION.strip() else None

In [None]:
API_URL = MODEL_API
headers = {"Authorization": f"Bearer {HF_TOKEN}"}

def query(payload):
	"""POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

	Args:
		payload: JSON-serialisable request body.

	Returns:
		The response body parsed by ``response.json()``.

	Raises:
		requests.exceptions.RequestException: on connection errors or when the
			request exceeds the timeout.
		ValueError: if the response body is not valid JSON.
	"""
	# Without an explicit timeout `requests` waits indefinitely, so a stalled
	# connection would block the worker forever. Reuse the configured TIMEOUT.
	response = requests.post(API_URL, headers=headers, json=payload, timeout=int(TIMEOUT))
	return response.json()

def request(text: str, instruction: Optional[str] = None):
	"""Generate a model response for ``text``, retrying on failure.

	Builds a chat-formatted prompt (optional system part, then the user text),
	sends it through ``query`` and strips the echoed prompt and chat markers
	from the generated text.

	Args:
		text: User message sent to the model.
		instruction: Optional system prompt. ``None`` omits the system part.

	Returns:
		A dict with keys ``'instruction'``, ``'output'`` and ``'input'``.
		``'output'`` is the cleaned response, or ``''`` when every attempt
		failed. ``'input'`` is ``text`` with surrounding whitespace removed.
	"""
	# Config values may arrive as strings; convert once so that the retry limit
	# is compared as a number and a malformed value fails immediately.
	max_retries = int(NUM_RETRIES)
	sleep_seconds = int(SLEEP_TIME_S)

	system = f'<|system|>{instruction}<|end|>' if instruction is not None else ''
	payload = {
		"inputs": f"{system}<|user|>\n{text}<|end|>\n<|assistant|>",
		"parameters": {
			"temperature": 0.6,
			"max_new_tokens": int(MAX_NEW_TOKENS)
		}
	}

	retry = 0
	while True:
		try:
			output = query(payload)
			response = output[0]['generated_text']

			# Remove the echoed prompt. The instruction is only removed when one
			# was given, because str.replace(None, '') raises TypeError.
			if instruction is not None:
				response = response.replace(instruction, '')
			response = response.replace(text, '')
			for marker in ('<|system|>', '<|end|>', '<|assistant|>', '<|user|>'):
				response = response.replace(marker, '')

			return {'instruction': instruction, 'output': response.strip(), 'input': text.strip()}
		except Exception:
			# Deliberately best-effort: any failure (network error, error payload
			# from the API, unexpected response shape) counts as a failed attempt.
			# `Exception` rather than a bare except keeps Ctrl-C working.
			retry += 1
			if retry >= max_retries:
				return {'instruction': instruction, 'output': '', 'input': text.strip()}
			time.sleep(sleep_seconds)

### Start the generation process

In [None]:
# Send one request per instruction across N_JOBS workers.
# - n_jobs must be an integer; values read from config.ini are strings, hence int().
# - Iterating the values directly (instead of instructions[i]) avoids label-based
#   lookups, which fail when the Series index is not 0..n-1.
results = Parallel(n_jobs=int(N_JOBS), batch_size=1, timeout=int(TIMEOUT))(
	delayed(request)(text, INSTRUCTION) for text in tqdm(instructions)
)

In [None]:
# Number of result records returned by the parallel run.
len(results)

### Remove failed responses

In [None]:
# Keep only the records that carry a non-empty model output, then show how many remain.
result_cleaned = [record for record in results if record['output']]
len(result_cleaned)

### Save to file

In [None]:
# Write the surviving records as JSON Lines: one JSON object per line.
pd.DataFrame(result_cleaned).to_json(
	path_or_buf=f'{OUTPUT_FILE_NAME}.jsonl',
	orient='records',
	lines=True,
)