In [2]:
import time
import json
import numpy as np
import random
import re
import requests
from tqdm import tqdm
from deep_translator import GoogleTranslator
from deep_translator.exceptions import TranslationNotFound
import traceback
import os

from gotify_functions import send_gotify_notification
from data_utils.json_utils import read_json_file, save_json
from time_utils.time_formatters import format_time
from api_utils.http_requests_custom import make_post_request
from api_utils.response_parsers import extract_json_responses
from qa_tools.data_processing import count_answer_types_qas, count_answer_types_SQuAD
from data_processing.validation import validate_squad_format
from data_processing.dataset_utils import reshuffle_questions

In [3]:
# Generation endpoint URL (local server)
API_URL = "http://localhost:11434/api/generate"
# Supported languages
LANGS = ['en', 'de', 'fr']

# Absolute project root: the parent of the current working directory
# (moves up from 'notebooks' to 'MB')
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
DATA_PATH = os.path.join(PROJECT_ROOT, "data/")

PROMPTS = read_json_file(os.path.join(PROJECT_ROOT, "prompts.json"))


def _load_placeholder_values(name):
    """Read data/placeholders/<name>.json and return the value stored under the key <name>."""
    placeholder_file = os.path.join(DATA_PATH, f"placeholders/{name}.json")
    return read_json_file(placeholder_file)[name]


# Placeholder pools, read through absolute paths
COMPANIES = _load_placeholder_values("companies")
STREETS = _load_placeholder_values("streets")
CITIES = _load_placeholder_values("cities")
COUNTRIES = _load_placeholder_values("countries")

In [4]:
def generate_random_prompt(prompt_type, lang, sample):
    """Build a prompt from a randomly chosen template.

    One template is picked at random from ``PROMPTS[prompt_type][lang]`` and
    its ``{sample}`` and ``{company}`` fields are filled in; the company is
    drawn at random from ``COMPANIES``.

    Args:
        prompt_type: Top-level key in ``PROMPTS`` (e.g. ``"context_generate"``).
        lang: Language key under that prompt type. The lookup is a plain dict
            access, so the key must match exactly (including letter case).
        sample: Value substituted for the ``{sample}`` field.

    Returns:
        The formatted prompt string.

    Raises:
        ValueError: If the prompt type or language is unknown, or if the
            selected template references a named field other than
            ``sample`` / ``company``.
    """
    # Only the lookup may report "invalid prompt type or language"; a KeyError
    # raised later by str.format is a template problem and is reported as such.
    try:
        prompt_templates = PROMPTS[prompt_type][lang]
    except KeyError as e:
        raise ValueError(f"Invalid prompt type or language: {e}") from e

    selected_prompt = random.choice(prompt_templates)
    company = np.random.choice(COMPANIES)
    try:
        return selected_prompt.format(sample=sample, company=company)
    except KeyError as e:
        raise ValueError(f"Prompt template references an unknown placeholder: {e}") from e

# Load the raw synthetic dataset and build one French context-generation prompt
# from its first sample as a quick check. The call's result is not assigned.
path = os.path.join(DATA_PATH, "raw/synthetic_data_1000.json")
source_dataset = read_json_file(path)
generate_random_prompt("context_generate", "FR", source_dataset[0]['sample'])

def pick_question_prompt(prompt_type, lang, context, prompt_num=0, num_questions=10):
    """Build a question-generation prompt from a specific (not random) template.

    The template is ``PROMPTS["question_generate"][prompt_type][lang][prompt_num]``;
    its ``{context}``, ``{num_questions}`` and ``{qa_format}`` fields are filled
    in, with ``qa_format`` taken from ``PROMPTS["question_generate"]["qa_format"]``.

    Args:
        prompt_type: Key under ``PROMPTS["question_generate"]`` (e.g. ``"general"``).
        lang: Language key under that prompt type; must match exactly.
        context: Text substituted for the ``{context}`` field.
        prompt_num: Index/key of the template to use within the language entry.
        num_questions: Number substituted for the ``{num_questions}`` field.

    Returns:
        The formatted prompt string.

    Raises:
        ValueError: If a key on the lookup path is missing, or if the template
            references a named field other than the three supplied here.
        IndexError: Propagated unchanged when the templates are a sequence and
            ``prompt_num`` is out of range.
    """
    # Keep the lookup separate from formatting so that a KeyError raised by
    # str.format is not misreported as an invalid prompt type or language.
    try:
        selected_prompt = PROMPTS["question_generate"][prompt_type][lang][prompt_num]
        qa_format = PROMPTS["question_generate"]["qa_format"]
    except KeyError as e:
        raise ValueError(f"Invalid prompt type or language: {e}") from e

    try:
        return selected_prompt.format(context=context, num_questions=num_questions, qa_format=qa_format)
    except KeyError as e:
        raise ValueError(f"Prompt template references an unknown placeholder: {e}") from e

In [28]:
# Check for unwanted answers in square brackets
def is_square_brackets_answer(answer):
    """Return True when ``answer`` contains a ``[...]`` span.

    A span counts when it is an opening bracket, any run of characters other
    than double quotes and square brackets (possibly empty), and a closing
    bracket, e.g. ``[COMPANY_PICKUP]`` or ``[]``.

    Args:
        answer: The answer value. Non-string answers (booleans, numbers,
            ``None``) cannot contain such a span and yield ``False`` instead
            of raising ``TypeError``.

    Returns:
        bool: ``True`` if at least one bracketed span is present.
    """
    if not isinstance(answer, str):
        return False
    # One hit is enough, so search() replaces collecting every match.
    return re.search(r'\[[^"\[\]]*\]', answer) is not None

def _find_qa_payload(response):
    """Return ``(payload, last_error)`` for the first JSON object in ``response`` that has a ``"qa"`` list."""
    decoder = json.JSONDecoder()
    last_error = None
    for brace in re.finditer(r'\{', response):
        try:
            # raw_decode parses one JSON value and ignores whatever follows it,
            # so surrounding chatter and later objects do not matter.
            candidate, _ = decoder.raw_decode(response[brace.start():])
        except json.JSONDecodeError as e:
            last_error = e
            continue
        if isinstance(candidate, dict) and isinstance(candidate.get("qa"), list):
            return candidate, None
    return None, last_error


def extract_qas(response, lang, translators, answer_type):
    """Extract Q&A pairs from a model response and add translated questions.

    The response may wrap the JSON in free text and may contain several JSON
    objects; the first object that has a ``"qa"`` list is used. Parsing uses a
    real JSON decoder rather than a regex, so answers that themselves contain
    ``]`` or ``}`` no longer truncate the payload.

    Args:
        response: Raw text returned by the model.
        lang: Language code stored on the original (untranslated) questions.
        translators: Objects exposing ``translate(text)`` and a ``target``
            attribute; each question is translated once per translator.
        answer_type: Value stored under ``"answer_type"`` on every entry.

    Returns:
        A list of dicts with keys ``question``, ``answer``, ``answer_type`` and
        ``lang``. Each original entry is followed by its translations. An empty
        list is returned when no usable JSON payload is found.
    """
    payload, decode_error = _find_qa_payload(response)
    if payload is None:
        if decode_error is not None:
            print("Error decoding JSON:", decode_error)
            print(response)
            print("------------------------------")
        else:
            print("No JSON data found.")
        return []

    qas_list = []
    for qa in payload["qa"]:
        # Skip malformed items instead of aborting the whole response.
        if not isinstance(qa, dict) or "q" not in qa or "a" not in qa:
            continue

        answer = qa['a']
        qa_dict = {
            "question": qa['q'],
            "answer": answer.strip() if isinstance(answer, str) else answer,
            "answer_type": answer_type,
            "lang": lang
        }
        qas_list.append(qa_dict)

        # Translate the question into other languages
        for translator in translators:
            try:
                qas_list.append({
                    "question": translator.translate(qa_dict["question"]),
                    "answer": qa_dict["answer"],  # Answer remains in original language
                    "answer_type": answer_type,
                    "lang": translator.target
                })
            except TranslationNotFound:
                # Best effort: report the failure and keep going.
                send_gotify_notification(
                    f"Error: Translation not found.\n"
                    f"Question: {qa_dict['question']}\n"
                    f"Continuing the execution"
                )

    return qas_list
    
def process_entry(item, langs, prompt_number=0):
    """Generate "string" Q&A pairs for one dataset item in every requested language.

    For each language a "general" question prompt is built from the item's
    context in that language and sent to the model (the globals ``MODEL`` and
    ``API_URL`` are read at call time). The reply is parsed with
    ``extract_qas``, which also translates each question into the other
    languages in ``langs``.

    Args:
        item: Dict with ``"sample"`` and ``"context"``; ``item["context"]`` is
            indexed by every language in ``langs``.
        langs: Language codes to process.
        prompt_number: Forwarded to ``pick_question_prompt`` as ``prompt_num``.

    Returns:
        A new dict holding the item's ``sample`` and ``context`` plus ``"qas"``,
        which maps each language to the list returned by ``extract_qas``.
    """
    qas_by_lang = {}
    result = {
        "sample": item["sample"],
        "context": item["context"],
        "qas": qas_by_lang,
    }

    for lang in langs:
        prompt = pick_question_prompt("general", lang, item['context'][lang], prompt_num=prompt_number)
        raw_reply = make_post_request(prompt, MODEL, API_URL)
        response = extract_json_responses(raw_reply).strip()

        # One translator per target language other than the current one
        translators = []
        for other in langs:
            if other != lang:
                translators.append(GoogleTranslator(source=lang, target=other))

        qas_by_lang[lang] = extract_qas(response, lang, translators, answer_type="string")

    return result

response = """
    Here are 10 questions based on the provided text in valid JSON format:

    {
        "qa": [
            {
                "q": "What time is the pickup scheduled for?",
                "a": "False"
            },
            {
                "q": "Is the shipment stackable?",
                "a": "False"
            },
            {
                "q": "Will a single driver be sufficient for this job?",
                "a": "False"
            },
            {
                "q": "Does the delivery require a special vehicle?",
                "a": "True"
            },
            {
                "q": "Is the pickup location in CITY_DELIVERY?",
                "a": "False"
            },
            {
                "q": "Are the goods heavy and bulky?",
                "a": "True"
            },
            {
                "q": "What is the weight of the shipment?",
                "a": "False"
            },
            {
                "q": "Is the delivery scheduled for January 11th, 2003?",
                "a": "False"
            },
            {
                "q": "Will a Big lorry be used to transport the goods?",
                "a": "True"
            },
            {
                "q": "Is the pickup scheduled before the delivery?",
                "a": "True"
            }
        ]
    }


    However, this response does not fully meet your requirements since the questions do not directly lead to a True or False answer based on their phrasing. Here's an alternative version with corrected question phrasing:

    {
        "qa": [
            {
                "q": "Is the shipment stackable?",
                "a": "False"
            },
            {
                "q": "Will a double driver be required for this job?",
                "a": "True"
            },
            {
                "q": "Is the delivery scheduled for January 11th, 2003?",
                "a": "False"
            },
            {
                "q": "Does the shipment weigh less than 50 kg?",
                "a": "False"
            },
            {
                "q": "Is the pickup location in CITY_DELIVERY?",
                "a": "False"
            },
            {
                "q": "Are the goods heavy and bulky?",
                "a": "True"
            },
            {
                "q": "Will a Big lorry be used to transport the goods?",
                "a": "True"
            },
            {
                "q": "Is the pickup scheduled after the delivery?",
                "a": "False"
            },
            {
                "q": "Does the shipment require a special vehicle for transportation?",
                "a": "True"
            },
            {
                "q": "Is a single driver sufficient for this job?",
                "a": "False"
            }
        ]
    }


    In this revised version, each question directly leads to an answer that can be identified as either True or False based on the provided text.
"""

# Run extract_qas on the fixture response above: English source questions,
# one translator for every other language in LANGS.
lang = "en"
translators = [GoogleTranslator(source=lang, target=other) for other in LANGS if other != lang]
extract_qas(response, lang, translators, answer_type="boolean")

[{'question': 'What time is the pickup scheduled for?',
  'answer': 'False',
  'answer_type': 'boolean',
  'lang': 'en'},
 {'question': 'Wann ist die Abholung geplant?',
  'answer': 'False',
  'answer_type': 'boolean',
  'lang': 'de'},
 {'question': 'À quelle heure est prévue le ramassage?',
  'answer': 'False',
  'answer_type': 'boolean',
  'lang': 'fr'},
 {'question': 'Is the shipment stackable?',
  'answer': 'False',
  'answer_type': 'boolean',
  'lang': 'en'},
 {'question': 'Ist die Versandstapelbar?',
  'answer': 'False',
  'answer_type': 'boolean',
  'lang': 'de'},
 {'question': "L'envoi est-il empilable?",
  'answer': 'False',
  'answer_type': 'boolean',
  'lang': 'fr'},
 {'question': 'Will a single driver be sufficient for this job?',
  'answer': 'False',
  'answer_type': 'boolean',
  'lang': 'en'},
 {'question': 'Wird ein einzelner Treiber für diesen Job ausreichen?',
  'answer': 'False',
  'answer_type': 'boolean',
  'lang': 'de'},
 {'question': 'Un seul pilote sera-t-il suff

# Generovanie textov multilingualne

In [14]:
API_URL = "http://localhost:11434/api/generate"
MODEL = "llama3.3:latest"

source_dataset = read_json_file("synthetic_data_1000.json")
# File the generated entries are appended to. It is read before every save, so
# it must already exist and hold a top-level "data" list.
output_file_name = "synthetic_data_300_multilingual.json"

start_idx, end_idx = 328, 500
progress = tqdm(total=end_idx-start_idx, desc="Generating contexts", position=0, leave=True, dynamic_ncols=True)
start_time = time.time()

try:
    for entry in source_dataset[start_idx: end_idx]:
        translated_entry = {}
        translated_entry['sample'] = entry['sample']
        translated_entry['context'] = {}

        # EN: the source text is taken as-is
        translated_entry['context']['en'] = entry['text'].strip()

        # FR
        prompt = generate_random_prompt("context_generate", "FR", entry['sample'])
        translated_entry['context']['fr'] = extract_json_responses(make_post_request(prompt, MODEL, API_URL)).strip()

        # DE
        prompt = generate_random_prompt("context_generate", "DE", entry['sample'])
        translated_entry['context']['de'] = extract_json_responses(make_post_request(prompt, MODEL, API_URL)).strip()

        # Save to file after every entry, so an interrupted run loses at most
        # the entry in flight.
        data_from_file = read_json_file(output_file_name)
        merged_dict = {'data': data_from_file["data"] + [translated_entry]}
        # Fix: the merged data used to be built and then never passed to save_json.
        save_json(data=merged_dict, file_path=output_file_name)

        progress.update(1)
finally:
    # Close the progress bar even if a request or save fails
    progress.close()

elapsed_time = time.time() - start_time
avg_time_per_iter = elapsed_time / (end_idx-start_idx)

send_gotify_notification(
    f"Generating contexts completed\n"
    f"Total iterations: {progress.n}/{end_idx-start_idx}\n"
    f"Elapsed time: {format_time(elapsed_time)}\n"
    f"Avg time per iteration: {format_time(avg_time_per_iter)}"
)

print("Done")

Generating contexts: 100%|██████████| 172/172 [5:22:07<00:00, 112.37s/it]  


Done


In [9]:
# Number of entries in the multilingual context file
len(read_json_file("synthetic_data_500_multilingual.json")["data"])

500

# Generovanie general otazok a odpovedi multilingualne

In [None]:
MODEL = "llama3.3:latest"

# Load input and output data files
data_from_file = read_json_file("synthetic_data_500_multilingual.json")
output_file_name = "synthetic_data_500_multilingual_qas.json"

data = {}
data['data'] = []
num_total_errors = 0

start_idx, end_idx = 459, 500  # Define the range of data entries to process
progress = tqdm(total=end_idx-start_idx, desc="Generovanie otazok a odpovedi", position=0, leave=True, dynamic_ncols=True)
start_time = time.time()

try:
    # Extract relevant portion of the dataset
    data_slice = data_from_file['data'][start_idx:end_idx]
    # Fix: load_json_file / save_json_file do not exist in this notebook; the
    # imported helpers are read_json_file and save_json.
    # Results are appended to what the output file already holds, so it must
    # exist and contain a top-level "data" list.
    output_data = read_json_file(output_file_name)['data']

    for i, item in enumerate(data_slice):
        new_dict_item = process_entry(item, LANGS, 0)  # Process each item to generate Q&A pairs
        output_data.append(new_dict_item)
        # Save after every item so an interrupted run can resume from start_idx
        save_json(data={'data': output_data}, file_path=output_file_name)
        progress.update(1)

        # Send periodic status update every 10 iterations
        if (i + 1) % 10 == 0:
            elapsed_time = time.time() - start_time
            send_gotify_notification(
                f"Progress Update: {progress.n}/{end_idx-start_idx} iterations completed.\n"
                f"Elapsed time: {format_time(elapsed_time)}"
            )

    elapsed_time = time.time() - start_time
    avg_time_per_iter = elapsed_time / (end_idx-start_idx)

    # Send completion notification
    send_gotify_notification(
        f"Generating questions completed\n"
        f"Total iterations: {progress.n}/{end_idx-start_idx}\n"
        f"Elapsed time: {format_time(elapsed_time)}\n"
        f"Avg time per iteration: {format_time(avg_time_per_iter)}"
    )

    print("Done")

except Exception as e:
    # Handle errors and send notification
    error_message = f"Error occurred: {str(e)}\n{traceback.format_exc()}"
    send_gotify_notification(error_message)
    print(error_message)  # Optional: Also print to console

finally:
    # Close progressbar even if an error occurs
    progress.close()

In [9]:
# Number of entries in the Q&A file that carry a "qas" key
sum([1 for order in read_json_file("synthetic_data_500_multilingual_qas.json")["data"] if "qas" in order])

500

# Generovanie boolean otazok a priradenie do datasetu

In [5]:
# Po index 200 uz mame vygenerovane boolean otazky v subore "synthetic_data_500_multilingual_SQuAD.json" - preto nebudeme riesit indexy mensie ako 200
# Teraz vygenerujeme boolean otazky pre indexy vacsie ako 200, ulozime ich do suboru "synthetic_data_500_multilingual_qas.json".

In [None]:
MODEL = "llama3.3:latest"

# The same file is both input and output: boolean Q&A pairs are appended to
# the existing entries and the file is rewritten after each order.
source_data = read_json_file("synthetic_data_500_multilingual_qas.json")
output_file_name = "synthetic_data_500_multilingual_qas.json"

data = {'data': []}
num_total_errors = 0

# Range of dataset entries handled by this run
start_idx, end_idx = 400, 500
progress = tqdm(total=end_idx-start_idx, desc="Generovanie otazok a odpovedi boolean", position=0, leave=True, dynamic_ncols=True)
start_time = time.time()

try:
    data_slice = source_data['data'][start_idx:end_idx]

    for i, order in enumerate(data_slice):
        for lang in LANGS:
            context = order['context'][lang]
            if lang in order["qas"]:
                qas = order["qas"][lang]

                # Ask the model for questions about this language's context
                prompt = pick_question_prompt("general", lang.upper(), context, prompt_num=2)
                raw_reply = make_post_request(prompt, MODEL, API_URL)
                response = extract_json_responses(raw_reply).strip()

                # One translator per language other than the current one
                translators = []
                for other in LANGS:
                    if other != lang:
                        translators.append(GoogleTranslator(source=lang, target=other))

                zoznam_qas = extract_qas(response, lang, translators, answer_type="boolean")
                # Extend the existing list in place with the new boolean pairs
                qas.extend(zoznam_qas)

        # Persist the whole dataset once the order is complete
        save_json(source_data, output_file_name)
        progress.update(1)

        # Status notification after every 10th order
        if not (i + 1) % 10:
            elapsed_time = time.time() - start_time
            send_gotify_notification(
                f"Progress Update: {progress.n}/{end_idx-start_idx} iterations completed.\n"
                f"Elapsed time: {format_time(elapsed_time)}"
            )

    elapsed_time = time.time() - start_time
    avg_time_per_iter = elapsed_time / (end_idx-start_idx)

    # Final notification with timing summary
    send_gotify_notification(
        f"Generating questions completed\n"
        f"Total iterations: {progress.n}/{end_idx-start_idx}\n"
        f"Elapsed time: {format_time(elapsed_time)}\n"
        f"Avg time per iteration: {format_time(avg_time_per_iter)}"
    )

    print("Done")

except Exception as e:
    # Report the failure through the notifier and on the console
    error_message = f"Error occurred: {str(e)}\n{traceback.format_exc()}"
    send_gotify_notification(error_message)
    print(error_message)

finally:
    # The progress bar is closed whether or not the run succeeded
    progress.close()

In [9]:
# Smoke test that generation works - it fails when GPU memory is full or the GPUs are busy :(

MODEL = "llama3.3:latest"
# MODEL = "mistral"

# prompt = pick_question_prompt("general", lang.upper(), context, prompt_num=2)  # Generate questions for the given language
# response = extract_json_responses(make_post_request(prompt, MODEL, API_URL)).strip()
# Minimal request: a one-word prompt ("Ahoj") sent to the configured model
response = extract_json_responses(make_post_request("Ahoj", MODEL, API_URL)).strip()
response

'Here are 10 questions based on the provided text in valid JSON format:\n\n```\n{\n    "qa": [\n        {\n            "q": "What is the pickup time frame from November 9th to November 11th?",\n            "a": "False"\n        },\n        {\n            "q": "Are both items designated as ADR-designated dangerous goods?",\n            "a": "True"\n        },\n        {\n            "q": "Will the delivery take place on November 12th?",\n            "a": "False"\n        },\n        {\n            "q": "Is the weight of Item 1 greater than the weight of Item 2?",\n            "a": "True"\n        },\n        {\n            "q": "Does the pickup need to occur before 5:15 am on November 9th?",\n            "a": "False"\n        },\n        {\n            "q": "Are the items being transported from COMPANY_PICKUP to COMPANY_DELIVERY?",\n            "a": "True"\n        },\n        {\n            "q": "Is the delivery time frame between 1:15 pm and 3:30 pm on November 16th to November 19th?"

In [None]:
# Print, for every order in the current start_idx:end_idx range, the number of
# boolean Q&A pairs per language. start_idx / end_idx come from an earlier cell.
source_data = read_json_file("synthetic_data_500_multilingual_qas.json")["data"]
for i, order in enumerate(source_data[start_idx:end_idx]):
    print(f"Index: {start_idx + i}")
    for lang in LANGS:
        counter = 0
        if lang in order["qas"]:
            for qa in order["qas"][lang]:
                # A bool adds as 0 or 1, so this counts the boolean entries
                counter += qa["answer_type"] == "boolean"
            print(counter)
    print("-------------------")

In [5]:
# Load the Q&A dataset from the project root and count answers per answer type
file_path = os.path.join(PROJECT_ROOT, "synthetic_data_500_multilingual_qas.json")
source_data = read_json_file(file_path)
# print(source_data["data"][0])
count_answer_types_qas(source_data, LANGS)

{'string': 40581, 'number': 0, 'boolean': 26648}

Error rate:
FR: 34 z 500 => 6.8%
DE: 64 z 500 => 12.8%

# Nahradenie True/False odpovedi v inych jazykoch

In [5]:
# staci nam nahradit od indexu 200, lebo SQuAD500 uz ma prvych 200 orders v spravnom formate aj nahradene placeholders

In [6]:
# Reload the Q&A dataset and display the Q&A pairs of the entry at index 200
path = os.path.join(PROJECT_ROOT, "synthetic_data_500_multilingual_qas.json")
source_data = read_json_file(path, verbose=True)
source_data["data"][200]["qas"]

Loading data from file: /home/jupyter-simon_horvat/MB/synthetic_data_500_multilingual_qas.json
Data loaded successfully, 1 items found.


{'en': [{'question': 'What is the date of pickup for the shipment?',
   'answer': 'January 11th, 2003',
   'lang': 'en',
   'answer_type': 'string'},
  {'question': 'Was ist das Datum der Abholung für die Sendung?',
   'answer': 'January 11th, 2003',
   'lang': 'de',
   'answer_type': 'string'},
  {'question': "Quelle est la date du ramassage pour l'expédition?",
   'answer': 'January 11th, 2003',
   'lang': 'fr',
   'answer_type': 'string'},
  {'question': 'What time is the pickup scheduled for?',
   'answer': '14:15',
   'lang': 'en',
   'answer_type': 'string'},
  {'question': 'Wann ist die Abholung geplant?',
   'answer': '14:15',
   'lang': 'de',
   'answer_type': 'string'},
  {'question': 'À quelle heure est prévue le ramassage?',
   'answer': '14:15',
   'lang': 'fr',
   'answer_type': 'string'},
  {'question': 'What type of vehicle will be used to transport the goods?',
   'answer': 'a Big lorry',
   'lang': 'en',
   'answer_type': 'string'},
  {'question': 'Mit welcher Art von

**Nahradenie True/False odpovedi v inych jazykoch:**

    1. najprv na zaklade typu odpovede ponechat len boolean otazky
    2. Pozriet sa na jedinecne hodnoty a ci koresponduju s priradenym jazykom odpovede
    3. Ak budu korespondovat, staci nahradit... Ak nebudu - vymysliet dalsi postup na zaklade aktualnej situacie

In [9]:
# 1. najprv na zaklade typu odpovede ponechat len boolean otazky

def zisti_unikatne_boolean_odpovede(data, langs=None):
    """Count how often each distinct boolean answer occurs in the dataset.

    Args:
        data: Iterable of order dicts. Q&A entries are read from
            ``order["qas"][lang]``; orders without ``"qas"`` or without a
            given language contribute nothing.
        langs: Language keys to scan. Defaults to the notebook-wide ``LANGS``.

    Returns:
        dict mapping each ``answer`` value of the entries whose
        ``answer_type`` is ``"boolean"`` to its number of occurrences.
    """
    if langs is None:
        langs = LANGS

    odpovede = {}
    for order in data:
        qas_by_lang = order.get("qas", {})
        for lang in langs:
            for qa in qas_by_lang.get(lang, []):
                if qa.get("answer_type") == "boolean":
                    odpovede[qa["answer"]] = odpovede.get(qa["answer"], 0) + 1

    return odpovede

zisti_unikatne_boolean_odpovede(source_data["data"])

# Original output:
# {'False': 4778,
#  'True': 4030,
#  'Falsch': 4412,
#  'Richtig': 4437,
#  'Vrai': 4470,
#  'Faux': 4521}

{'False': 13711, 'True': 12937}

In [8]:
# 2. Pozriet sa na jedinecne hodnoty a ci koresponduju s priradenym jazykom odpovede

# Nebudeme robit, lebo parameter "lang" hovori o jazyku otazky, nie jazyku odpovede (nekoresponduju)

In [9]:
# 3. If they correspond, a plain replacement is enough... If not - work out the next step based on the actual situation
# Replacing the expressions in the other languages with the matching True/False really is enough

# German / French boolean words mapped to the canonical English spelling
nahrady = {"Richtig": "True", "Vrai": "True", "Falsch": "False", "Faux": "False"}

# Rewrites the answers of source_data in place
for order in source_data["data"]:
    for lang in LANGS:
        for qa in order.get("qas", {}).get(lang, []):
            if qa.get("answer_type") == "boolean":
                qa["answer"] = nahrady.get(qa["answer"], qa["answer"])  # Replace only if present in the map

zisti_unikatne_boolean_odpovede(source_data["data"])    

# Output after the replacement:
# {'False': 13711, 'True': 12937}

{'False': 13711, 'True': 12937}

In [10]:
# Write the dataset with normalised boolean answers to a new file.
# NOTE(review): the file name already says "placeholder_replaced", but the
# placeholder replacement happens in a later cell that reloads and rewrites this file.
path = os.path.join(PROJECT_ROOT, "synthetic_data_500_multilingual_qas_placeholder_replaced.json")
with open(path, 'w') as file:
    json.dump(source_data, file, indent=4)

# Nahradenie placeholder hodnot za nahodne generovane

In [11]:
def replace_all_replacements(text, replacements):
    """Replace every occurrence of each key in ``text`` with its value.

    Keys are matched literally and case-insensitively. Replacements are
    applied one after another in the mapping's iteration order, so a later key
    also sees the text produced by earlier substitutions.

    Args:
        text: The string to process. Non-string values (e.g. a boolean or
            numeric answer) are returned unchanged instead of raising.
        replacements: Mapping of placeholder text to replacement string.

    Returns:
        The text with all replacements applied.
    """
    if not isinstance(text, str):
        return text

    for key, value in replacements.items():
        pattern = re.compile(re.escape(key), re.IGNORECASE)
        # A function replacement is inserted verbatim. A plain string would be
        # treated as a template, where a backslash in a name (e.g. "\1")
        # raises or is rewritten.
        text = pattern.sub(lambda _match, literal=value: literal, text)

    return text

def generate_random_data():
    """Generate random address data for a pickup and a delivery location.

    Company, street, city and country are each drawn as two distinct items
    from ``COMPANIES``, ``STREETS``, ``CITIES`` and ``COUNTRIES``, so pickup
    and delivery never share the same list item. Postal codes are independent
    random five-digit strings, zero-padded.

    Returns:
        dict with keys ``"pickup"`` and ``"delivery"``, each a dict with the
        keys ``company``, ``street``, ``city``, ``postal_code`` and ``country``.
    """
    pickup_company_name, delivery_company_name = random.sample(COMPANIES, 2)
    pickup_street_name, delivery_street_name = random.sample(STREETS, 2)
    pickup_city, delivery_city = random.sample(CITIES, 2)
    # randint includes both endpoints: the upper bound must be 99_999, since
    # 100_000 would give the six-digit code "100000".
    pickup_postal_code, delivery_postal_code = [str(random.randint(0, 99_999)).zfill(5) for _ in range(2)]
    pickup_country, delivery_country = random.sample(COUNTRIES, 2)

    return {
        "pickup": {
            "company": pickup_company_name,
            "street": pickup_street_name,
            "city": pickup_city,
            "postal_code": pickup_postal_code,
            "country": pickup_country,
        },
        "delivery": {
            "company": delivery_company_name,
            "street": delivery_street_name,
            "city": delivery_city,
            "postal_code": delivery_postal_code,
            "country": delivery_country,
        }
    }

def create_replacements(random_data):
    """Map every known placeholder spelling to the generated pickup/delivery value.

    Args:
        random_data: Dict with ``"pickup"`` and ``"delivery"`` entries, each
            providing ``company``, ``street``, ``city``, ``postal_code`` and
            ``country``.

    Returns:
        dict from placeholder text to replacement value. Entries are inserted
        in the order of the table below: canonical names first, then the
        misspelled variants for pickup, then those for delivery.
    """
    # (placeholder text, side, field) in insertion order
    placeholder_table = (
        ("COMPANY_PICKUP", "pickup", "company"),
        ("STREET_PICKUP", "pickup", "street"),
        ("CITY_PICKUP", "pickup", "city"),
        ("ZIP_PICKUP", "pickup", "postal_code"),
        ("COUNTRY_PICKUP", "pickup", "country"),
        ("COMPANY_DELIVERY", "delivery", "company"),
        ("STREET_DELIVERY", "delivery", "street"),
        ("CITY_DELIVERY", "delivery", "city"),
        ("ZIP_DELIVERY", "delivery", "postal_code"),
        ("COUNTRY_DELIVERY", "delivery", "country"),

        # Catch inaccurate spellings of the placeholder names (pickup)
        ("CUSTOMER_PICKUP", "pickup", "company"),
        ("COMPANY PICKUP", "pickup", "company"),
        ("COMPANY\\_PICKUP", "pickup", "company"),
        ("STREET PICKUP", "pickup", "street"),
        ("STREET\\_PICKUP", "pickup", "street"),
        ("STREET NAME PICKUP", "pickup", "street"),
        ("CITY PICKUP", "pickup", "city"),
        ("CITY\\_PICKUP", "pickup", "city"),
        ("ZIP CODE PICKUP", "pickup", "postal_code"),
        ("ZIP\\_PICKUP", "pickup", "postal_code"),
        ("COUNTRY\\_PICKUP", "pickup", "country"),

        # Catch inaccurate spellings of the placeholder names (delivery)
        ("CUSTOMER_DELIVERY", "delivery", "company"),
        ("COMPANY DELIVERY", "delivery", "company"),
        ("COMPANY\\_DELIVERY", "delivery", "company"),
        ("STREET DELIVERY", "delivery", "street"),
        ("STREET NAME DELIVERY", "delivery", "street"),
        ("STREET\\_DELIVERY", "delivery", "street"),
        ("CITY DELIVERY", "delivery", "city"),
        ("CITY_DELIVER", "delivery", "city"),
        ("CITY\\_DELIVERY", "delivery", "city"),
        ("ZIP CODE DELIVERY", "delivery", "postal_code"),
        ("ZIP\\_DELIVERY", "delivery", "postal_code"),
        ("COUNTRY\\_DELIVERY", "delivery", "country"),
    )

    return {
        placeholder: random_data[side][field]
        for placeholder, side, field in placeholder_table
    }

In [12]:
# Reload the dataset saved after boolean normalisation
path = os.path.join(PROJECT_ROOT, "synthetic_data_500_multilingual_qas_placeholder_replaced.json")
source_data = read_json_file(path)

start_idx, end_idx = 200, 500

# Updates the context and QAs of an item with replacements
for i, item in enumerate(source_data["data"][start_idx:end_idx]):
    # One set of random values per item, shared by all of its languages
    replacements = create_replacements(generate_random_data())
    
    for lang in LANGS:
        item["context"][lang] = replace_all_replacements(item["context"][lang], replacements)
        if not lang in item["qas"]:
            continue
        for qa in item["qas"][lang]:
            qa["question"] = replace_all_replacements(qa["question"], replacements)
            qa["answer"] = replace_all_replacements(qa["answer"], replacements)
            # TODO: finish locating the answer indices
            
print(json.dumps(source_data["data"][200], indent=4))

{
    "sample": {
        "pickup": {
            "company_name": "COMPANY_PICKUP",
            "street_name": "STREET_PICKUP",
            "city": "CITY_PICKUP",
            "postal_code": "ZIP_PICKUP",
            "country": "COUNTRY_PICKUP",
            "datetime": "2003-01-11 14:15:00"
        },
        "delivery": {
            "company_name": "COMPANY_DELIVERY",
            "street_name": "STREET_DELIVERY",
            "city": "CITY_DELIVERY",
            "postal_code": "ZIP_DELIVERY",
            "country": "COUNTRY_DELIVERY",
            "datetime": "2003-01-14 16:15:00"
        },
        "goods": {
            "weight": "97 kg",
            "dimensions": [
                {
                    "length": "58cm",
                    "width": "43cm",
                    "height": "48cm",
                    "weight": "97 kg"
                }
            ]
        },
        "stackable": "no",
        "required vehicle": "Big lorry",
        "special (vehicle) request": "Double

In [13]:
# Write the in-memory dataset (placeholders now replaced) to the placeholder_replaced file
path = os.path.join(PROJECT_ROOT, "synthetic_data_500_multilingual_qas_placeholder_replaced.json")
with open(path, 'w') as file:
    json.dump(source_data, file, indent=4)

# Prevod generovanych Q&A do SQuAD formatu

In [25]:
# Reload the placeholder-replaced dataset and print two sanity counts:
# answer types for the entries from index 200 on, and the distinct boolean answers overall
path = os.path.join(PROJECT_ROOT, "synthetic_data_500_multilingual_qas_placeholder_replaced.json")
source_data = read_json_file(path, verbose=True)
print(count_answer_types_qas(source_data["data"][200:], LANGS))
print(zisti_unikatne_boolean_odpovede(source_data["data"]))

Loading data from file: /home/jupyter-simon_horvat/MB/synthetic_data_500_multilingual_qas_placeholder_replaced.json
Data loaded successfully, 1 items found.
{'string': 25239, 'number': 0, 'boolean': 26648}
{'False': 13711, 'True': 12937}


In [26]:
# # Pre indexy 0:199 pridame boolean otazky do SQuAD500

# for order in source_data["data"][:1]:
# #     print(order)
#     for paragraph in order["paragraphs"]:
# #         print(paragraph)
# #         print(paragraph["qas"])
#         for qa in paragraph["qas"]:
            
#             for answer in qa["answers"]:
#                 if answer["answer_type"] == "boolean":
                    

In [29]:
# For indices 200:499, create new records in SQuAD500

new_data = {"data": []}
invalid_answers_counter = 0

start_idx, end_idx = 200, 500
for i, item in enumerate(source_data["data"][start_idx:end_idx]):
    # One SQuAD "article" per order, with one paragraph per available language
    order_dict = {"title": f"order{start_idx+i+1}", "paragraphs": []}

    for lang in LANGS:
        if lang in item["qas"]:
            context = item["context"][lang].strip()
            qas_list = []

            for j, qa in enumerate(item["qas"][lang]):
                # Answers containing a [...] span are dropped
                if is_square_brackets_answer(qa["answer"]):
                    continue

                answer_type = qa["answer_type"]
                answer_entry = {"text": qa["answer"], "answer_type": answer_type}
                new_qa = {
                    "id": j+1,
                    "question": qa["question"],
                    "lang": qa["lang"],
                    "answers": [answer_entry],
                }

                if answer_type == "string":
                    # String answers must occur verbatim in the context;
                    # otherwise the pair is counted as invalid and left out.
                    answer_start = context.find(qa["answer"])
                    if answer_start < 0:
                        invalid_answers_counter += 1
                        continue
                    answer_entry["answer_start"] = answer_start

                qas_list.append(new_qa)

            order_dict["paragraphs"].append({"context": context, "lang": lang, "qas": qas_list})

    new_data["data"].append(order_dict)

print(invalid_answers_counter)
count_answer_types_SQuAD(new_data["data"])

2601


{'string': 22446, 'number': 0, 'boolean': 26648}

In [30]:
# Display the first converted record
new_data["data"][0]

{'title': 'order201',
 'paragraphs': [{'context': 'Sure! Here\'s the natural language text based on the transportation order you provided:\n\n"We have a new transportation order for you! We need to pick up a shipment of goods from Robert Half International Inc., located at Daisy 5967 in Arlington, 52332, Lebanon on January 11th, 2003 at 14:15. We\'ll be using a Big lorry to transport the goods, which weighs 97 kg and measures 58cm x 43cm x 48cm. The shipment is not stackable, and we require a double driver for this job.\n\nWe\'ll be delivering the goods to J.C. Penney, located at Ash 5478 in Oklahoma City, 59235, Bosnia and Herzegovina on January 14th, 2003 at 16:15. The delivery time is quite specific, so please make sure to be there on time!\n\nThe goods are quite heavy and bulky, so we\'ll need a special vehicle to transport them safely. Please let us know if you have any questions or concerns about this job."\n\nI hope this text meets your requirements! Let me know if there\'s anyt

In [31]:
# Check if dataset follows SQuAD format

# validate_squad_format returns a flag and a list of error messages
is_valid, validation_errors = validate_squad_format(new_data)
if is_valid:
    print("✅ Dataset follows SQuAD format!")
else:
    print("❌ Dataset has errors:")
    print("\n".join(validation_errors))

✅ Dataset follows SQuAD format!


In [32]:
# Append the newly converted records to the existing 200-order SQuAD file and
# save the result under the 500-order file name.
existing_file_path = os.path.join(PROJECT_ROOT, "synthetic_data_200_multilingual_SQuAD_extended_boolean.json")
target_file_path = os.path.join(PROJECT_ROOT, "synthetic_data_500_multilingual_SQuAD.json")

existing_data = read_json_file(existing_file_path)
existing_data["data"] += new_data["data"]
print(len(existing_data["data"]))

# Check if dataset follows SQuAD format
# The merged dataset is written only when validation passes
is_valid, validation_errors = validate_squad_format(existing_data)
if is_valid:
    print("✅ Dataset follows SQuAD format!")
    save_json(file_path=target_file_path, data=existing_data, verbose=True)
else:
    print("❌ Dataset has errors:")
    print("\n".join(validation_errors))

500
✅ Dataset follows SQuAD format!
✅ JSON data successfully saved to /home/jupyter-simon_horvat/MB/synthetic_data_500_multilingual_SQuAD.json


In [34]:
# Load the SQuAD-format file that the following cells inspect.
# NOTE(review): this path is relative to the current working directory, whereas
# the other dataset paths in this notebook are built from PROJECT_ROOT —
# confirm the file is expected to live next to the notebook.
source_data = read_json_file("synthetic_data_200_multilingual_SQuAD.json", verbose=True)

In [35]:
# Spot-check a single answer span of the first order: print the question and
# its stored answer text, then display the context from the recorded
# answer_start onward. If the offset is right, the displayed text begins with
# the answer.
context_index = 2
answer_index = 8

inspected_paragraph = source_data["data"][0]['paragraphs'][context_index]
context = inspected_paragraph['context']
qas = inspected_paragraph['qas']

inspected_qa = qas[answer_index]
first_answer = inspected_qa['answers'][0]

print(inspected_qa['question'])
print(first_answer['text'])
context[first_answer['answer_start']:]

Wie hoch ist das Gesamtgewicht der zu transportierenden Ware?
92 kg


"92 kg.\n\n<h4>Exigences Spécifiques</h4>\n\n- **Empilement** : Les marchandises ne peuvent pas être empilées.\n- **Véhicule Requis** : Un camion léger (Small lorry) est nécessaire pour le transport.\n- **Demande Spéciale (Véhicule)** : Des planches latérales (Sideboards) sont obligatoires pour le véhicule.\n\nNous sommes convaincus que votre équipe expérimentée sera en mesure de gérer ces exigences avec soin et professionnalisme. Nous apprécierions une confirmation de réception de cette demande ainsi que des informations sur la disponibilité et les coûts associés à ce service.\n\nMerci d'avance pour votre temps et votre considération. Nous sommes impatients de collaborer avec vous pour ce transport.\n\nCordialement,\n\n[Votre Nom]"

In [36]:
# Total number of question/answer entries across every order and paragraph.
counter = sum(
    len(paragraph["qas"])
    for order in source_data["data"]
    for paragraph in order["paragraphs"]
)
counter

15171

In [9]:
# Load the "qas" variant of the data and display its first record.
# NOTE(review): this rebinds `source_data`, which an earlier cell assigned from
# the SQuAD-format file; cells that index source_data["data"][i]['paragraphs']
# will see this object instead if they are re-run afterwards.
# NOTE(review): the path is relative to the current working directory rather
# than PROJECT_ROOT — confirm the intended location.
source_data = read_json_file("synthetic_data_200_multilingual_qas.json", verbose=True)
source_data["data"][0]

{'sample': {'pickup': {'company_name': 'COMPANY_PICKUP',
   'street_name': 'STREET_PICKUP',
   'city': 'CITY_PICKUP',
   'postal_code': 'ZIP_PICKUP',
   'country': 'COUNTRY_PICKUP',
   'datetime': 'between 2021-03-04 13:30:00 and 2021-03-13 05:15:00'},
  'delivery': {'company_name': 'COMPANY_DELIVERY',
   'street_name': 'STREET_DELIVERY',
   'city': 'CITY_DELIVERY',
   'postal_code': 'ZIP_DELIVERY',
   'country': 'COUNTRY_DELIVERY',
   'datetime': 'between 2021-03-17 13:45:00 and 2021-03-24 18:45:00'},
  'goods': {'weight': '92 kg',
   'dimensions': [{'length': '96m',
     'width': '85m',
     'height': '12m',
     'weight': '11 kg'},
    {'length': '86m', 'width': '48m', 'height': '78m', 'weight': '12 kg'},
    {'length': '46m', 'width': '58m', 'height': '19m', 'weight': '69 kg'}]},
  'stackable': 'no',
  'required vehicle': 'Small lorry',
  'special (vehicle) request': 'Sideboards required'},
 'context': {'en': '  Sure! Here\'s the natural language text based on the provided JSON-str

[Dein Name]
[Dein Name]
[Dein Name]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Votre Nom]
[Votre Nom]
[Votre Nom]
[Votre Nom]
[Votre Nom]
[Votre Nom]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Votre Nom]
[Votre Nom]
[Votre Nom]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Votre Nom]
[Votre Nom]
[Votre Nom]
[Dein Name]
[Dein Name]
[Dein Name]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[insert contact details]
[insert contact details]
[insert contact details]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Votre Nom]
[Votre Nom]
[Votre Nom]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Votre Nom]
[Votre Nom]
[Votre Nom]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Votre nom]
[Votre nom]
[Votre nom]
[Votre nom]
[Votre nom]
[Votre nom]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Votre nom]
[Votre nom]
[Votre nom]
[Votre Nom]
[Votre Nom]
[Votre Nom]
[Dein Name]
[Dein Name]
[Dein Name]
[Votre Nom]
[Votre Nom]
[Votre Nom]
[Ihr Name]
[Ihr Name]
[Ihr Name]
[Votre Nom]
[Votre Nom]
[