In [None]:
import os
import time
import json
import random
import numpy as np
import re
import requests
import traceback
from tqdm import tqdm
from deep_translator import GoogleTranslator
from deep_translator.exceptions import TranslationNotFound

from src.gotify_functions import send_gotify_notification
from src.data_utils.json_utils import read_json_file, save_json
from src.time_utils.time_formatters import format_time
from src.api_utils.http_requests_custom import make_post_request
from src.api_utils.response_parsers import extract_json_responses
from src.qa_tools.data_processing import count_answer_types_qas, count_answer_types_SQuAD
from src.data_processing.validation import validate_squad_format
from src.data_processing.dataset_utils import reshuffle_questions

In [None]:
# ==== Configuration ====

# Endpoint and model name handed to make_post_request() when generating text.
API_URL = "http://localhost:11434/api/generate"
MODEL = "llama3.3:latest"
# Language codes for which questions are generated and cross-translated.
LANGS = ['en', 'de', 'fr']

# `__file__` only exists when this code runs as a script/module. Inside a
# notebook kernel the name is undefined, so fall back to the current working
# directory instead of failing with NameError on a fresh "Run All".
try:
    _BASE_DIR = os.path.dirname(__file__)
except NameError:
    _BASE_DIR = os.getcwd()

# The project root is the parent of the directory resolved above.
PROJECT_ROOT = os.path.abspath(os.path.join(_BASE_DIR, ".."))
DATA_PATH = os.path.join(PROJECT_ROOT, "data")
# Prompt templates and the company placeholder values used to fill them.
PROMPTS = read_json_file(os.path.join(PROJECT_ROOT, "prompts.json"))
COMPANIES = read_json_file(os.path.join(DATA_PATH, "placeholders/companies.json"))["companies"]

In [None]:
def generate_random_prompt(prompt_type, lang, sample):
    """Fill a randomly chosen prompt template with a sample and a random company.

    Args:
        prompt_type: Top-level key into PROMPTS.
        lang: Language key under ``PROMPTS[prompt_type]``.
        sample: Value substituted for the ``{sample}`` placeholder.

    Returns:
        The formatted prompt string.

    Raises:
        ValueError: If a KeyError occurs while looking up the templates or
            while formatting the selected template.
    """
    try:
        candidates = PROMPTS[prompt_type][lang]
        # Draw the template first and the company second so the two random
        # generators are consumed in a fixed order.
        template, company = random.choice(candidates), np.random.choice(COMPANIES)
        return template.format(company=company, sample=sample)
    except KeyError as e:
        raise ValueError(f"Invalid prompt type or language: {e}")

# Load the raw synthetic dataset stored under DATA_PATH.
path = os.path.join(DATA_PATH, "raw/synthetic_data_1000.json")
source_dataset = read_json_file(path)
# Build one context-generation prompt from the first entry's 'sample' field.
# The returned string is not assigned to anything, so this call only confirms
# that the template lookup and formatting succeed.
# NOTE(review): "FR" is upper-case while LANGS lists lower-case codes -- confirm
# the key casing used under PROMPTS["context_generate"].
generate_random_prompt("context_generate", "FR", source_dataset[0]['sample'])

def pick_question_prompt(prompt_type, lang, context, prompt_num=0, num_questions=10):
    """Build a question-generation prompt from the template at a given index.

    The template is selected by position (``prompt_num``), not at random.

    Args:
        prompt_type: Key under ``PROMPTS["question_generate"]``.
        lang: Language key under the chosen prompt type.
        context: Text substituted for the ``{context}`` placeholder.
        prompt_num: Index of the template to use for that type and language.
        num_questions: Value substituted for ``{num_questions}``.

    Returns:
        The formatted prompt string, including the shared ``qa_format`` text.

    Raises:
        ValueError: If a KeyError occurs during lookup or formatting.
    """
    try:
        question_prompts = PROMPTS["question_generate"]
        template = question_prompts[prompt_type][lang][prompt_num]
        return template.format(
            context=context,
            num_questions=num_questions,
            qa_format=question_prompts["qa_format"],
        )
    except KeyError as e:
        raise ValueError(f"Invalid prompt type or language: {e}")

In [None]:
def is_square_brackets_answer(answer):
    """Tell whether ``answer`` contains a square-bracketed span.

    A span counts when it opens with ``[``, closes with ``]`` and holds no
    double quotes or further brackets in between (an empty ``[]`` counts too).

    Returns:
        True if at least one such span is present, otherwise False.
    """
    bracket_span = re.search(r'\[[^"\[\]]*\]', answer)
    return bracket_span is not None

def _find_qa_payload(response):
    """Locate the first JSON object in ``response`` that has a "qa" key.

    Every ``{`` is tried as a potential start of a JSON value and decoded with
    the real JSON parser, so brackets or braces inside strings cannot cut the
    payload short.

    Returns:
        ``(payload, None)`` when an object with a "qa" key was decoded, else
        ``(None, last_error)`` where ``last_error`` is the most recent
        ``json.JSONDecodeError`` (or None if nothing failed to decode).
    """
    decoder = json.JSONDecoder()
    last_error = None
    start = response.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(response, start)
        except json.JSONDecodeError as exc:
            last_error = exc
        else:
            if isinstance(candidate, dict) and "qa" in candidate:
                return candidate, None
        start = response.find("{", start + 1)
    return None, last_error


def extract_qas(response, lang, translators, answer_type):
    """Parse question/answer pairs out of a model response and translate the questions.

    The response may contain free text around the JSON payload. The payload is
    expected to look like ``{"qa": [{"q": ..., "a": ...}, ...]}``.

    Args:
        response: Raw text returned by the model.
        lang: Language code recorded on the original question entries.
        translators: Objects exposing ``translate(text)`` and ``target``; each
            original question is translated once per translator.
        answer_type: Value stored under "answer_type" on every entry.

    Returns:
        A list of dicts with keys "question", "answer", "answer_type" and
        "lang". Each original pair is followed by its translated variants; the
        answer is kept in the original language. Returns an empty list when no
        usable payload is found. Entries lacking "q" or "a" are skipped.
    """
    parsed_data, decode_error = _find_qa_payload(response)
    if parsed_data is None:
        if decode_error is None:
            print("No JSON data found.")
        else:
            print("Error decoding JSON:", decode_error)
            print(response)
            print("------------------------------")
        return []

    qa_items = parsed_data["qa"]
    if not isinstance(qa_items, list):
        print("Unexpected 'qa' payload (not a list):", qa_items)
        return []

    qas_list = []
    for qa in qa_items:
        # One malformed entry must not abort the whole response.
        if not isinstance(qa, dict) or 'q' not in qa or 'a' not in qa:
            print("Skipping malformed QA entry:", qa)
            continue

        answer = qa['a']
        qa_dict = {
            "question": qa['q'],
            # Only strings are stripped; other JSON values are kept as they are.
            "answer": answer.strip() if isinstance(answer, str) else answer,
            "answer_type": answer_type,
            "lang": lang
        }
        qas_list.append(qa_dict)

        # Translate the question into the other languages.
        for translator in translators:
            try:
                qas_list.append({
                    "question": translator.translate(qa_dict["question"]),
                    "answer": qa_dict["answer"],  # Answer remains in original language
                    "answer_type": answer_type,
                    "lang": translator.target
                })
            except TranslationNotFound:
                # Report the failure and keep going with the next translator.
                send_gotify_notification(
                    f"Error: Translation not found.\n"
                    f"Question: {qa_dict['question']}\n"
                    f"Continuing the execution"
                )

    return qas_list

def process_entry(item, langs, prompt_number=0):
    """Generate question/answer pairs for one dataset entry in every language.

    Args:
        item: Entry with a "sample" value and a "context" mapping keyed by
            language code.
        langs: Language codes to process; the codes other than the current one
            become translation targets for its questions.
        prompt_number: Index of the "general" question prompt to use.

    Returns:
        A dict with the entry's "sample" and "context" plus "qas", which maps
        each language code to the list produced by ``extract_qas``.
    """
    # Read both fields up front so a malformed entry fails before any request.
    sample, context = item["sample"], item["context"]
    qas_by_lang = {}
    result = {"sample": sample, "context": context, "qas": qas_by_lang}

    for lang in langs:
        # Ask the model for questions about this language's context.
        prompt = pick_question_prompt("general", lang, context[lang], prompt_num=prompt_number)
        raw_reply = make_post_request(prompt, MODEL, API_URL)
        response = extract_json_responses(raw_reply).strip()

        # One translator per remaining language, in the order given by `langs`.
        translators = []
        for other in langs:
            if other != lang:
                translators.append(GoogleTranslator(source=lang, target=other))

        # Parse the reply and attach the translated question variants.
        qas_by_lang[lang] = extract_qas(response, lang, translators, answer_type="string")

    return result

In [None]:
# Same value as in the configuration cell; re-assigned here so the model used
# for this run is visible next to the loop.
MODEL = "llama3.3:latest"

# Load input and output data files
data_from_file = read_json_file("synthetic_data_500_multilingual.json")
output_file_name = "synthetic_data_500_multilingual_qas.json"

# NOTE(review): `data` and `num_total_errors` are not read anywhere in this
# cell; kept in case other cells rely on them -- confirm and remove if unused.
data = {}
data['data'] = []
num_total_errors = 0

start_idx, end_idx = 459, 500  # Define the range of data entries to process
planned_iterations = end_idx - start_idx
progress = tqdm(total=planned_iterations, desc="Generovanie otazok a odpovedi", position=0, leave=True, dynamic_ncols=True)
start_time = time.time()

try:
    # Extract relevant portion of the dataset
    data_slice = data_from_file['data'][start_idx:end_idx]
    # Results are appended to whatever the output file already holds.
    output_data = read_json_file(output_file_name)['data']

    for i, item in enumerate(data_slice):
        new_dict_item = process_entry(item, LANGS, 0)  # Process each item to generate Q&A pairs
        output_data.append(new_dict_item)
        # Save after every entry so an interrupted run keeps its finished work.
        save_json(output_file_name, {'data': output_data})
        progress.update(1)

        # Send periodic status update every 10 iterations
        if (i + 1) % 10 == 0:
            elapsed_time = time.time() - start_time
            send_gotify_notification(
                f"Progress Update: {progress.n}/{planned_iterations} iterations completed.\n"
                f"Elapsed time: {format_time(elapsed_time)}"
            )

    elapsed_time = time.time() - start_time
    # Average over the entries actually processed: the slice can be shorter
    # than the requested range (or empty), in which case dividing by the
    # nominal range gives a wrong figure or a ZeroDivisionError.
    completed_iterations = progress.n
    avg_time_per_iter = elapsed_time / completed_iterations if completed_iterations else 0.0

    # Send completion notification
    send_gotify_notification(
        f"Generating questions completed\n"
        f"Total iterations: {progress.n}/{planned_iterations}\n"
        f"Elapsed time: {format_time(elapsed_time)}\n"
        f"Avg time per iteration: {format_time(avg_time_per_iter)}"
    )

    print("Done")

except Exception as e:
    # Report any failure through a notification and on the console; the
    # exception is not re-raised, so the cell itself finishes normally.
    error_message = f"Error occurred: {str(e)}\n{traceback.format_exc()}"
    send_gotify_notification(error_message)
    print(error_message)  # Optional: Also print to console

finally:
    # Close the progress bar even if an error occurs
    progress.close()