In [None]:
import os
import time
import json
import random
import numpy as np
import re
import requests
from tqdm import tqdm
from deep_translator import GoogleTranslator
from deep_translator.exceptions import TranslationNotFound

from src.gotify_functions import send_gotify_notification
from src.data_utils.json_utils import read_json_file, save_json
from src.time_utils.time_formatters import format_time
from src.api_utils.http_requests_custom import make_post_request
from src.api_utils.response_parsers import extract_json_responses
from src.qa_tools.data_processing import count_answer_types_qas, count_answer_types_SQuAD
from src.data_processing.validation import validate_squad_format
from src.data_processing.dataset_utils import reshuffle_questions

In [None]:
# ==== Configuration ====

# Endpoint and model name passed to make_post_request for text generation.
API_URL = "http://localhost:11434/api/generate"
MODEL = "llama3.3:latest"
LANGS = ['en', 'de', 'fr']

# `__file__` exists when this code runs as a script but is undefined inside a
# Jupyter kernel, where referencing it raises NameError. Fall back to the
# current working directory in that case.
# NOTE(review): the fallback assumes the kernel's working directory is the
# folder containing this notebook (one level below the project root) — confirm.
try:
    _CONFIG_BASE_DIR = os.path.dirname(__file__)
except NameError:
    _CONFIG_BASE_DIR = os.getcwd()

# The project root is the parent of the directory holding this file/notebook.
PROJECT_ROOT = os.path.abspath(os.path.join(_CONFIG_BASE_DIR, ".."))
DATA_PATH = os.path.join(PROJECT_ROOT, "data")
# Prompt templates and company placeholders are loaded once at configuration time.
PROMPTS = read_json_file(os.path.join(PROJECT_ROOT, "prompts.json"))
COMPANIES = read_json_file(os.path.join(DATA_PATH, "placeholders/companies.json"))["companies"]

In [None]:
# ==== Prompt Generation ====
def generate_random_prompt(prompt_type, lang, sample):
    """Build a prompt from a randomly chosen template and a randomly chosen company.

    Args:
        prompt_type: Top-level key into ``PROMPTS``.
        lang: Language key under ``PROMPTS[prompt_type]``.
        sample: Value substituted for the ``{sample}`` placeholder.

    Returns:
        The chosen template with ``sample`` and ``company`` filled in.
    """
    # The template is drawn with the stdlib RNG, the company with NumPy's.
    template = random.choice(PROMPTS[prompt_type][lang])
    chosen_company = np.random.choice(COMPANIES)
    return template.format(sample=sample, company=chosen_company)

def pick_question_prompt(prompt_type, lang, context, prompt_num=0, num_questions=10):
    """Build a question-generation prompt from a specific (non-random) template.

    Args:
        prompt_type: Key under ``PROMPTS["question_generate"]``.
        lang: Language key under the selected prompt type.
        context: Value substituted for the ``{context}`` placeholder.
        prompt_num: Index of the template to use within the language's list.
        num_questions: Value substituted for the ``{num_questions}`` placeholder.

    Returns:
        The selected template with ``context``, ``num_questions`` and
        ``qa_format`` filled in.
    """
    question_prompts = PROMPTS["question_generate"]
    template = question_prompts[prompt_type][lang][prompt_num]
    return template.format(
        context=context,
        num_questions=num_questions,
        qa_format=question_prompts["qa_format"],
    )

In [None]:
# ==== QA Extraction from response ====
def _find_qa_payload(response):
    """Locate the first JSON object in ``response`` whose "qa" value is a list.

    Returns ``(payload, None)`` on success, otherwise ``(None, last_error)``
    where ``last_error`` is the most recent ``json.JSONDecodeError`` seen, or
    ``None`` if no decoding attempt failed.
    """
    decoder = json.JSONDecoder()
    last_error = None
    start = response.find("{")
    while start != -1:
        try:
            # raw_decode parses one complete JSON value starting at `start`
            # and ignores trailing text, so nested brackets/braces and
            # bracket characters inside strings are handled correctly.
            candidate, _ = decoder.raw_decode(response, start)
        except json.JSONDecodeError as exc:
            last_error = exc
        else:
            if isinstance(candidate, dict) and isinstance(candidate.get("qa"), list):
                return candidate, None
        start = response.find("{", start + 1)
    return None, last_error


def extract_qas(response, lang, translators, answer_type):
    """Extract question/answer pairs from a model response and translate the questions.

    The response is scanned for the first JSON object of the form
    ``{"qa": [{"q": ..., "a": ...}, ...]}``; text around the object is ignored.

    Args:
        response: Raw text returned by the model.
        lang: Language code recorded on the original (untranslated) entries.
        translators: Iterable of objects exposing ``translate(text)`` and a
            ``target`` attribute; each question is translated once per translator.
        answer_type: Label stored on every produced entry.

    Returns:
        A list of dicts with keys ``question``, ``answer``, ``answer_type`` and
        ``lang``. For each QA pair the original entry comes first, followed by
        one entry per successful translation. Returns ``[]`` when no usable
        JSON payload is found.
    """
    if not isinstance(response, str):
        print("No JSON data found.")
        return []

    parsed_data, decode_error = _find_qa_payload(response)
    if parsed_data is None:
        if decode_error is None:
            print("No JSON data found.")
        else:
            print("Error decoding JSON:", decode_error)
            print(response)
            print("------------------------------")
        return []

    qas_list = []
    for qa in parsed_data["qa"]:
        # Model output is not guaranteed to follow the schema; drop a broken
        # entry instead of losing the whole batch to a KeyError/TypeError.
        if not isinstance(qa, dict) or "q" not in qa or "a" not in qa:
            print(f"Skipping malformed QA entry: {qa!r}")
            continue

        answer = qa["a"]
        qa_dict = {
            "question": qa["q"],
            "answer": answer.strip() if isinstance(answer, str) else answer,
            "answer_type": answer_type,
            "lang": lang
        }
        qas_list.append(qa_dict)

        # Translate the question into other languages; the answer is reused as-is.
        for translator in translators:
            try:
                qas_list.append({
                    "question": translator.translate(qa_dict["question"]),
                    "answer": qa_dict["answer"],
                    "answer_type": answer_type,
                    "lang": translator.target
                })
            except TranslationNotFound:
                print(
                    f"Error: Translation not found.\n"
                    f"Question: {qa_dict['question']}\n"
                    f"Continuing the execution"
                )
    return qas_list

In [None]:
# ==== Main Data Generation ====
def main(start_idx=328, end_idx=500):
    """Generate FR and DE contexts for a slice of the source dataset.

    For each entry in ``source_dataset[start_idx:end_idx]`` the English text is
    kept as the ``en`` context, FR and DE contexts are requested from the model,
    and the accumulated result is written to the output file after every entry
    so that progress survives an interruption.

    Args:
        start_idx: Index of the first source entry to process (inclusive).
        end_idx: Index one past the last source entry to process.
    """
    source_dataset = read_json_file(os.path.join(DATA_PATH, "raw/synthetic_data_1000.json"))
    # Slice first so the progress total and the average reflect the entries
    # that actually exist, even if the dataset is shorter than end_idx.
    entries = source_dataset[start_idx:end_idx]
    total = len(entries)

    out_path = os.path.join(DATA_PATH, "raw/synthetic_data_300_multilingual.json")
    # Load previously saved entries once instead of re-reading the file on
    # every iteration; new entries are appended to this list.
    if os.path.exists(out_path):
        collected = read_json_file(out_path)["data"]
    else:
        collected = []

    start_time = time.time()
    # The context manager closes the progress bar even if a request fails.
    with tqdm(total=total, desc="Generating contexts", dynamic_ncols=True) as progress:
        for entry in entries:
            translated_entry = {"sample": entry["sample"], "context": {}}
            translated_entry["context"]["en"] = entry["text"].strip()

            # Generate context for FR and DE
            for lang in ["fr", "de"]:
                prompt = generate_random_prompt("context_generate", lang.upper(), entry['sample'])
                translated_entry["context"][lang] = extract_json_responses(make_post_request(prompt, MODEL, API_URL)).strip()

            # Save after every entry so an interrupted run keeps its results.
            collected.append(translated_entry)
            save_json(out_path, {'data': collected})

            progress.update(1)
        completed = progress.n

    elapsed_time = time.time() - start_time
    # Guard against an empty slice, which would otherwise divide by zero.
    avg_time_per_iter = elapsed_time / completed if completed else 0.0
    print(f"Generating contexts completed\nTotal: {completed}/{total}\nElapsed: {elapsed_time:.2f}s\nAvg/iter: {avg_time_per_iter:.2f}s")

# Entry point: run the context-generation pipeline when this code executes as
# the main module.
if __name__ == "__main__":
    main()