In [1]:
import numpy as np

import requests
import json

from difflib import SequenceMatcher
import re

from tqdm import tqdm

import sys
print(sys.executable)

/home/jupyter-simon_horvat/.conda/envs/simon_env/bin/python


In [None]:
from deep_translator import GoogleTranslator

API_URL = "http://localhost:11434/api/generate"
MODEL = "llama3.3:latest"

translator_de = GoogleTranslator(source="en", target="de")
translator_fr = GoogleTranslator(source="en", target="fr")


def _extract_first_json_object(text, required_key):
    """Return the first JSON object embedded in `text` that contains `required_key`.

    Every '{' in `text` is tried as a possible start of a JSON value, so prose
    surrounding the JSON payload is tolerated.

    Returns:
        The decoded dict, or None when no candidate qualifies and no decode
        error occurred (e.g. `text` contains no '{' at all).

    Raises:
        json.JSONDecodeError: when no candidate qualifies and at least one
            candidate failed to decode (the last such error is re-raised).
    """
    decoder = json.JSONDecoder()
    last_error = None
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError as exc:
            last_error = exc
        else:
            if isinstance(candidate, dict) and required_key in candidate:
                return candidate
        start = text.find("{", start + 1)
    if last_error is not None:
        raise last_error
    return None


def _id_allocator(qas):
    """Return `(highest_numeric_id, caster)` for a paragraph's question list.

    Ids are compared numerically (string ids such as "13" are converted with
    int()), and `caster` converts a new numeric id back to the type used by the
    existing ids: int when all existing ids are ints, str otherwise (including
    for an empty list).
    """
    existing = [q['id'] for q in qas]
    highest = max((int(value) for value in existing), default=0)
    all_ints = bool(existing) and all(isinstance(value, int) for value in existing)
    return highest, (int if all_ints else str)


def _boolean_qa(qa_id, question_text, lang, answer):
    """Build one question record carrying a single boolean-typed answer."""
    return {
        "id": qa_id,
        "question": question_text,
        "lang": lang,
        "answers": [{
            "text": answer,
            "answer_type": "boolean"
        }]
    }


responses = []

# tqdm infers the total from the slice, so the bar stays correct even when the
# dataset holds fewer than 10 records.
for order in tqdm(data['data'][:10], desc="Generovanie boolean otazok"):
    if len(order['paragraphs']) < 3:
        continue
    # Paragraphs 0/1/2 are used as the en/de/fr versions of the same order.
    paragraph_en = order['paragraphs'][0]
    paragraph_de = order['paragraphs'][1]
    paragraph_fr = order['paragraphs'][2]

    context = paragraph_en['context']
    # ask_question / extend_response_file are not defined in this part of the
    # notebook; presumably they come from an earlier cell -- TODO confirm.
    response = ask_question(context, "en", MODEL, API_URL, num_questions=10, prompt_number=3)

    responses.append(response)
    extend_response_file(response)

    try:
        parsed_data = _extract_first_json_object(response, required_key="qa")
    except json.JSONDecodeError as e:
        print("Error decoding JSON:", e)
        continue
    if parsed_data is None:
        print("No JSON data found.")
        continue

    max_id_en, cast_en = _id_allocator(paragraph_en['qas'])
    max_id_de, cast_de = _id_allocator(paragraph_de['qas'])
    max_id_fr, cast_fr = _id_allocator(paragraph_fr['qas'])

    # A dedicated name for the inner counter keeps the outer loop state intact.
    for offset, qa in enumerate(parsed_data['qa'], start=1):
        answer = qa['a'].strip() if isinstance(qa['a'], str) else qa['a']

        qa_dict_en = _boolean_qa(cast_en(max_id_en + offset), qa['q'], "en", answer)
        qa_dict_de = _boolean_qa(cast_de(max_id_de + offset), translator_de.translate(qa["q"]), "de", answer)
        qa_dict_fr = _boolean_qa(cast_fr(max_id_fr + offset), translator_fr.translate(qa["q"]), "fr", answer)

        # NOTE(review): every paragraph receives the question in all three
        # languages (same dict objects, as before). If each paragraph should
        # only hold questions in its own language, this needs changing.
        for paragraph in (paragraph_en, paragraph_de, paragraph_fr):
            paragraph["qas"].append(qa_dict_en)
            paragraph["qas"].append(qa_dict_de)
            paragraph["qas"].append(qa_dict_fr)

In [41]:
def extract_json_responses(response):
    """Join the "response" fragments of a newline-delimited JSON HTTP body.

    Args:
        response: object exposing the raw body as bytes in `.content`
            (the value returned by `make_post_request`).

    Returns:
        str: the concatenation of the "response" field of every JSON line.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a JSON line has no "response" field.
    """
    body = response.content.decode()
    # Skip blank lines instead of blindly dropping the last element: the body
    # may or may not end with a newline, and the final record must not be lost.
    fragments = [
        json.loads(line)["response"]
        for line in body.split("\n")
        if line.strip()
    ]
    return "".join(fragments)

def make_post_request(prompt):
    """POST `prompt` to the generation endpoint and hand back the raw response.

    The target URL and model name are read from the notebook globals `API_URL`
    and `MODEL` at call time. A status code other than 200 is reported on
    stdout, but the response object is returned to the caller either way.
    """
    response = requests.post(
        API_URL,
        json={"model": MODEL, "prompt": prompt},
        headers={"Content-Type": "application/json"},
    )

    if response.status_code != 200:
        print("Error: {}".format(response.status_code))

    return response

def is_yes_no_question(question):
    """Tell whether `question` opens with an auxiliary/copula verb ("Is ...", "Does ...").

    Only the first word is inspected, compared case-insensitively and as a
    whole word, so "Canada is ..." or "Documents are ..." are not mistaken for
    "can"/"do" questions. Negative contractions ("Isn't", "Can't", "Don't")
    count as their base verb. Leading whitespace is ignored.

    Args:
        question: the question text.

    Returns:
        bool: True when the first word is one of the yes/no opener verbs.
    """
    yes_no_keywords = {
        "is", "are", "do", "does", "did", "can", "could", "will", "would",
        "should", "has", "have", "had", "was", "were",
    }
    first_word = re.match(r"\s*(\w+(?:['’]\w+)?)", question.lower())
    if first_word is None:
        return False
    token = first_word.group(1)
    if token in yes_no_keywords:
        return True
    # "isn't" -> "is" (strip "n't"); "can't" -> "can" (strip "'t").
    for suffix in ("n't", "n’t", "'t", "’t"):
        if token.endswith(suffix) and token[:-len(suffix)] in yes_no_keywords:
            return True
    return False

def validate_response(output):
    """Check that `output` has the shape of a single question record.

    A valid record is a dict whose "id" and "question" values are strings and
    whose "answers" value is a list; every entry of that list must be a dict
    with string "text" and "answer_type" values and no other keys. An empty
    "answers" list is accepted.

    Returns:
        bool: True when all checks pass, False otherwise.
    """
    if not isinstance(output, dict):
        return False

    # Both top-level text fields must be present and be strings.
    if not all(isinstance(output.get(field), str) for field in ("id", "question")):
        return False

    answers = output.get("answers")
    if not isinstance(answers, list):
        return False

    allowed_keys = {"text", "answer_type"}

    def _answer_ok(entry):
        # Dict with no unexpected keys and string values for both known keys.
        return (
            isinstance(entry, dict)
            and set(entry.keys()) <= allowed_keys
            and isinstance(entry.get("text"), str)
            and isinstance(entry.get("answer_type"), str)
        )

    return all(_answer_ok(entry) for entry in answers)

In [42]:
def zly_format_pocet(data):
    """Count boolean answers attached to a question that is not a yes/no question.

    Walks data['data'] -> 'paragraphs' -> 'qas' -> 'answers' and counts every
    answer whose 'answer_type' is "boolean" while `is_yes_no_question` rejects
    the wording of the owning question.

    Returns:
        int: number of such answers.
    """
    return sum(
        1
        for item in data['data']
        for paragraph in item['paragraphs']
        for question in paragraph['qas']
        for answer in question['answers']
        if answer.get('answer_type') == "boolean"
        and not is_yes_no_question(question["question"])
    )

def pocty_typov_odpovedi(data):
    """Report how many answers of each type the dataset contains.

    Written to verify whether the dataset holds only string-type questions.
    Every answer under data['data'] -> 'paragraphs' -> 'qas' -> 'answers' is
    tallied by its 'answer_type'; only "string", "number" and "boolean" are
    counted, anything else (or a missing type) is ignored.

    Returns:
        str: 'Answer Type Statistics: {...}' with the three counters.
    """
    type_counts = dict.fromkeys(("string", "number", "boolean"), 0)

    every_answer = (
        answer
        for item in data['data']
        for paragraph in item['paragraphs']
        for question in paragraph['qas']
        for answer in question['answers']
    )
    for answer in every_answer:
        kind = answer.get('answer_type')
        if kind in type_counts:
            type_counts[kind] += 1

    return f'Answer Type Statistics: {type_counts}'

In [43]:
# Smoke check for validate_response: a hand-written reply with string "id" and
# "question" fields and one answer holding string "text"/"answer_type" values.
# The JSON text is parsed first because validate_response expects a dict.
response = '''{
       "id": "13",
       "question": "Is the pickup location located in Oklahoma City?",
       "answers": [
          {"text": "True", "answer_type": "boolean"}
       ]
    }'''

validate_response(json.loads(response))

True

In [57]:
# Load the dataset into the notebook-wide `data` dict (records live under
# data["data"]). The encoding is given explicitly so decoding does not depend
# on the platform's locale default.
with open("synthetic_data_200_SQuAD_placeholders_replaced.json", 'r', encoding="utf-8") as file:
    data = json.load(file)

print(len(data["data"]))
# Uncomment to work on the first 50 records only:
#     data["data"] = data["data"][:50]
#     print(len(data["data"]))

200


In [58]:
# Tally of answers per answer_type ("string" / "number" / "boolean") in `data`.
pocty_typov_odpovedi(data)

"Answer Type Statistics: {'string': 2156, 'number': 502, 'boolean': 1264}"

In [63]:
# Number of boolean answers whose question is not recognised as a yes/no question.
# NOTE(review): this cell's execution count (63) is higher than that of the
# reformulation cell further down (62), so the recorded result presumably
# reflects `data` after reformulation -- confirm with Restart & Run All.
zly_format_pocet(data)

0

In [62]:
# Rewrite every boolean-answer question that is not phrased as a yes/no
# question by asking the model for a reformulation, then persist the dataset.
API_URL = "http://localhost:11434/api/generate"
MODEL = "mistral"

for item in tqdm(data['data'], total=len(data['data']), desc="Reformulacia otazok"):
    for paragraph in item['paragraphs']:
        for question in paragraph['qas']:
            for answer in question['answers']:
                answer_type = answer.get('answer_type')
                if answer_type != "boolean" or is_yes_no_question(question["question"]):
                    continue

                # The whole question record (dict repr) is embedded in the prompt.
                prompt = f'''
                        Please reformulate these questions to questions that will start with question word and remain True/False questions: 
                         {question}
                        Please maintain the exact JSON format of questions, just replace former question with new formulation
                    '''

                raw_response = ""
                try:
                    raw_response = extract_json_responses(make_post_request(prompt)).strip()
                    response = json.loads(raw_response)
                except (json.JSONDecodeError, KeyError):
                    # Keep the original wording rather than crashing on an
                    # unusable reply; show what the model actually returned.
                    print(f"Error parsing response or missing 'question' key in response: {raw_response}")
                    continue

                reformulated_question = response.get("question") if isinstance(response, dict) else None
                if not isinstance(reformulated_question, str) or not reformulated_question.strip():
                    # Valid JSON, but no usable "question" text: leave the
                    # record untouched.
                    print(f"Error parsing response or missing 'question' key in response: {raw_response}")
                    continue

                question["question"] = reformulated_question

    # Rewritten after every top-level item (same placement as before), so the
    # file on disk always holds the progress made so far.
    with open("synthetic_data_200_SQuAD_placeholders_replaced_reformulated.json", 'w', encoding="utf-8") as file:
        json.dump(data, file, indent=4)

Reformulacia otazok: 100%|██████████| 200/200 [02:50<00:00,  1.17it/s]


In [None]:
# Count how many boolean answers are true ("yes") versus anything else ("no"),
# printing each boolean question and its answer along the way.
yes = 0
no = 0

for item in data['data']:
    for paragraph in item['paragraphs']:
        for question in paragraph['qas']:
            for answer in question['answers']:
                if answer.get('answer_type') != "boolean":
                    continue
                print(question)
                print(answer)
                # Answer text may be a real boolean or differently cased text
                # (generated answers are stored as-is when they are not
                # strings), so normalise before comparing.
                if str(answer["text"]).strip().lower() == "true":
                    yes += 1
                else:
                    no += 1

print(yes, no)

In [10]:
# Experiment: try a seq2seq paraphrasing checkpoint for turning a statement
# into a question.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load model and tokenizer
model_name = "ramsrigouthamg/t5_paraphraser"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Build the input: a free-form instruction followed by the sentence to rephrase,
# tokenised to at most 512 tokens.
# NOTE(review): the recorded output of this cell merely echoes the prompt. The
# checkpoint may expect a fixed task prefix (e.g. "paraphrase: ") instead of an
# instruction -- confirm against the model card.
input_text = "Please reformulate question given as question that starts with question word: The pickup location is in Oklahoma City?"
inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

# Generate the paraphrased output (beam search with 5 beams, up to 128 tokens)
outputs = model.generate(
    inputs,
    max_length=128,
    num_beams=5,
    early_stopping=True
)

# Decode the first returned sequence and print the paraphrase
paraphrased_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Paraphrased Question:", paraphrased_text)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Paraphrased Question: Please reformulate question given as question that starts with question word. The pickup location is in Oklahoma City?


In [19]:
# API_URL = "http://localhost:11434/api/generate"
# # MODEL = "llama3.3:latest"
# MODEL = "mistral"

# prompt = '''
#     Please reformulate these questions to questions that will start with question word and remain True/False questions: 
#     {'id': '13', 'question': 'The pickup location is in Oklahoma City', 'answers': [{'text': 'True', 'answer_type': 'boolean'}]}
#     {'id': '14', 'question': 'The delivery location is in Belarus', 'answers': [{'text': 'True', 'answer_type': 'boolean'}]}
#     {'id': '15', 'question': 'The pickup time is on July 6th at 13:00', 'answers': [{'text': 'True', 'answer_type': 'boolean'}]}
#     {'id': '16', 'question': 'The delivery time is on July 9th at 18:00', 'answers': [{'text': 'True', 'answer_type': 'boolean'}]}
#     {'id': '17', 'question': 'The cargo has a weight of 49 kg', 'answers': [{'text': 'True', 'answer_type': 'boolean'}]}
#     {'id': '18', 'question': 'The cargo has dimensions of Length - 47m, Width - 17m, Height - 35m', 'answers': [{'text': 'True', 'answer_type': 'boolean'}]}
#     Please maintain the exact JSON format of questions, just replace former question with new formulation'''

# response = extract_json_responses(make_post_request(prompt)).strip()
# print(response)

# Manual probe: send one boolean question record to the model and print the raw
# text it returns, to check that the reply keeps the JSON shape.
# These assignments rebind the notebook globals read by make_post_request.
API_URL = "http://localhost:11434/api/generate"
# MODEL = "llama3.3:latest"
MODEL = "mistral"

# The record is pasted as a Python dict repr (single quotes), not strict JSON.
prompt = '''
    Please reformulate these questions to questions that will start with question word and remain True/False questions: 
     {'id': '17', 'question': 'The cargo has a weight of 49 kg', 'answers': [{'text': 'True', 'answer_type': 'boolean'}]}
    Please maintain the exact JSON format of questions, just replace former question with new formulation'''

# Join the streamed fragments into one string and trim surrounding whitespace.
response = extract_json_responses(make_post_request(prompt)).strip()
print(response)

{
     "id": "17",
     "question": "Is the weight of the cargo 49 kg?",
     "answers": [
       {"text": "True", "answer_type": "boolean"}
     ]
   }


In [6]:
# Check whether the dataset contains only string-type questions: count the
# occurrences of each answer type and print every question that carries a
# boolean answer.
type_counts = {"string": 0, "number": 0, "boolean": 0}

for item in general_data['data']:
    for paragraph in item['paragraphs']:
        for question in paragraph['qas']:
            for answer in question['answers']:
                answer_type = answer.get('answer_type')
                if answer_type not in type_counts:
                    # Unknown or missing type: not part of the statistics.
                    continue
                type_counts[answer_type] += 1
                if answer_type == "boolean":
                    print(question)

f'Answer Type Statistics: {type_counts}'

{'id': '13', 'question': 'The pickup location is in Oklahoma City', 'answers': [{'text': 'True', 'answer_type': 'boolean'}]}
{'id': '14', 'question': 'The delivery location is in Belarus', 'answers': [{'text': 'True', 'answer_type': 'boolean'}]}
{'id': '15', 'question': 'The pickup time is on July 6th at 13:00', 'answers': [{'text': 'True', 'answer_type': 'boolean'}]}
{'id': '16', 'question': 'The delivery time is on July 9th at 18:00', 'answers': [{'text': 'True', 'answer_type': 'boolean'}]}
{'id': '17', 'question': 'The cargo has a weight of 49 kg', 'answers': [{'text': 'True', 'answer_type': 'boolean'}]}
{'id': '18', 'question': 'The cargo has dimensions of Length - 47m, Width - 17m, Height - 35m', 'answers': [{'text': 'True', 'answer_type': 'boolean'}]}
{'id': '19', 'question': 'A refrigerator truck (Frigo) is required for the transportation', 'answers': [{'text': 'True', 'answer_type': 'boolean'}]}
{'id': '20', 'question': 'The route includes a stop in LG Electronics, Oklahoma Cit

"Answer Type Statistics: {'string': 2134, 'number': 502, 'boolean': 213}"