In [1]:
import os

# Resolve the dataset path against the current working directory, so the
# printed value depends on where the kernel was started.
file_path = os.path.abspath('data/italian_w_labels.json')

# NOTE(review): hard-coded, machine-specific absolute path; a later cell opens
# the dataset through this variable rather than through `file_path`.
# NOTE(review): this file name is spelled "lables" while `file_path` uses
# "labels" -- confirm which file actually exists on disk.
filepathina = "/Users/inaocelli/Documents/CLASSES FALL 2024/CSCI 375/Final/EAMT/data/italian_w_lables.json"
print("Absolute path:", file_path)

Absolute path: /Users/inaocelli/Documents/CLASSES FALL 2024/CSCI 375/Final/EAMT/data/italian_w_labels.json


In [2]:
import json 

def get_subjects_by_Q_json(input_file, output_file):
    """
    Builds a per-entity lookup table from a question dataset and writes it as JSON.

    The input file must contain a JSON list of entries. For every entry whose
    ``answer["answerType"]`` is ``'entity'`` and whose ``answer["answer"]`` is a
    non-empty list, the ``"name"`` of the first answer becomes a key of the output.
    Each key maps to that answer's ``"label"`` dictionary together with the entry's
    ``mention``, ``category`` and ``complexityType``. When a name occurs more than
    once, the occurrence whose label has the most non-None values is kept (on a
    tie the earliest occurrence wins). Entries of any other answer type, or with a
    missing/empty answer list, are skipped.

    Args:
        input_file (str): Path to the input JSON file.
        output_file (str): Path to the output JSON file.

    Raises:
        KeyError: If an entry lacks one of the fields read above.
    """
    # Helper function to count non-None values in a label
    def count_non_none_values(label):
        return sum(1 for value in label.values() if value is not None)

    # JSON text is UTF-8; do not rely on the platform default encoding.
    with open(input_file, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    transformed_data = {}
    for entry in data:
        answer = entry['answer']

        # Only entity answers are indexed; other answer types (e.g. 'date') are ignored.
        if answer["answerType"] != 'entity':
            continue

        # None and an empty list both mean there is no entity to index.
        candidates = answer["answer"]
        if not candidates:
            continue

        first_candidate = candidates[0]
        key = first_candidate["name"]  # Use the "name" field of the first answer as key
        label = first_candidate["label"]

        # Keep the first occurrence unless a later one has strictly more non-None label fields.
        existing = transformed_data.get(key)
        if existing is None or count_non_none_values(label) > count_non_none_values(existing["label"]):
            transformed_data[key] = {
                "label": label,
                "mention": answer["mention"],
                "category": entry["category"],
                "complexityType": entry["complexityType"]
            }

    # Write transformed data to output file
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(transformed_data, outfile, indent=4)


def process_jsonl_and_json(jsonl_file, json_file, output_file, include_missing = True):
    """
    Replaces each JSONL entry's list of entity IDs with a dictionary of per-language labels.

    For every entry in ``jsonl_file`` the ``"entities"`` field is rewritten as
    ``{entity_id: {lang: label}}``, where ``lang`` ranges over the entry's
    ``source_locale`` and ``target_locale`` and only languages present in the
    entity's ``"label"`` dictionary (looked up in ``json_file``) are included.
    The result is written to ``output_file`` as JSON Lines (one object per line).
    Blank lines in the input are ignored.

    Args:
        jsonl_file (str): Path to the input JSONL file with the "entities" feature.
        json_file (str): Path to the input JSON file with entity details, keyed by entity ID.
        output_file (str): Path to the output JSONL file with modified entries.
        include_missing (bool): What to do with an entity ID that is absent from
            ``json_file``. If True (default) the entity is kept and mapped to the
            string ``"NA"``; if False the whole entry is dropped from the output.
    """
    # Load the second JSON file with entity details
    with open(json_file, 'r', encoding='utf-8') as jf:
        entity_data = json.load(jf)

    # Entries are buffered and written only after the input has been read
    # completely, so output_file may safely be the same path as jsonl_file.
    modified_entries = []

    with open(jsonl_file, 'r', encoding='utf-8') as jlf:
        for line in jlf:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue

            entry = json.loads(line)

            source_lang = entry.get("source_locale")
            target_lang = entry.get("target_locale")
            # A missing or null "entities" field is treated as "no entities".
            entities = entry.get("entities") or []

            entities_dict = {}
            has_unknown_entity = False
            for entity_id in entities:
                if entity_id in entity_data:
                    entity_labels = entity_data[entity_id]["label"]

                    # Keep only the labels for the source and target languages
                    entities_dict[entity_id] = {
                        lang: entity_labels.get(lang)
                        for lang in (source_lang, target_lang)
                        if lang in entity_labels
                    }
                elif include_missing:
                    entities_dict[entity_id] = "NA"
                else:
                    # The entry is going to be dropped; no need to look at the rest.
                    has_unknown_entity = True
                    break

            if has_unknown_entity:
                continue

            entry["entities"] = entities_dict
            modified_entries.append(entry)

    # Write the modified entries, one JSON object per line
    with open(output_file, 'w', encoding='utf-8') as outf:
        for entry in modified_entries:
            outf.write(json.dumps(entry) + '\n')

def update_source_locale(jsonl_file, new_locale, output_file):
    """
    Updates the source_locale field for all elements in a JSONL file.

    Every non-blank line of ``jsonl_file`` is parsed as a JSON object, its
    ``"source_locale"`` key is set to ``new_locale`` (added if absent), and the
    object is written as one line of ``output_file``. The input is read and
    parsed completely before the output is opened, so ``output_file`` may be the
    same path as ``jsonl_file`` and a malformed input line does not leave a
    truncated output behind.

    Args:
        jsonl_file (str): Path to the input JSONL file.
        new_locale (str): The new value for the source_locale field.
        output_file (str): Path to the output JSONL file.
    """
    updated_lines = []
    with open(jsonl_file, 'r', encoding='utf-8') as infile:
        for line in infile:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            entry = json.loads(line)  # Parse the line as a dictionary
            entry["source_locale"] = new_locale  # Update the source_locale field
            updated_lines.append(json.dumps(entry) + '\n')

    # Opening with 'w' truncates the target, so this happens only after reading.
    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(updated_lines)


In [13]:
def load_entities_mapping(file_path):
    """
    Loads entities and their translations from the given JSON file.

    The file is decoded as UTF-8 regardless of the platform's default encoding,
    so labels containing non-ASCII characters load the same way everywhere.

    Args:
        file_path (str): Path to the JSON file containing entity mappings.
    Returns:
        dict: The parsed JSON content; expected to map entity IDs to their details.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _first_free_occurrence(text, needle, taken_spans):
    """Return (start, end) of the first occurrence of needle in text that does not
    overlap any (start, end, ...) tuple in taken_spans, or None if there is none."""
    start = text.find(needle)
    while start != -1:
        end = start + len(needle)
        if all(end <= taken[0] or start >= taken[1] for taken in taken_spans):
            return start, end
        start = text.find(needle, start + 1)
    return None


def mask_entities(text, entities, entity_mapping, source_lang='en'):
    """
    Replaces named entities in the text with unique placeholders based on their category/type.

    Placeholders have the form ``[ENTITY_<TYPE>_<n>]`` where ``<TYPE>`` is the
    upper-cased ``category`` from ``entity_mapping`` (``UNKNOWN`` if absent) and
    ``<n>`` counts entities of that type in the iteration order of ``entities``.
    The counter advances even when the entity text is later not found, so numbers
    may have gaps. Only one occurrence of each entity is replaced: the first one
    that does not overlap an entity masked earlier. Entities may be listed in any
    order relative to their position in the text.

    Args:
        text (str): The original text.
        entities (dict): Maps entity IDs to ``{language: surface text}``
            dictionaries. Values that are not dictionaries (such as the ``"NA"``
            marker) or that lack a non-empty ``source_lang`` string are skipped.
        entity_mapping (dict): Mapping of Q-IDs to their details.
        source_lang (str): Language key used to pick the surface text from
            ``entities`` and the label from ``entity_mapping``. Defaults to 'en'.
    Returns:
        masked_text (str): Text with entities replaced by placeholders.
        mapping (dict): Mapping of placeholders to original entities.
    """
    mapping = {}
    type_counters = {}  # To track counts for each entity type
    spans = []  # (start, end, placeholder) positions in the ORIGINAL text

    for entity_id, entity_data in entities.items():
        entity_text = entity_data.get(source_lang) if isinstance(entity_data, dict) else None
        if not isinstance(entity_text, str) or not entity_text:
            print(f"Entity '{entity_id}' has no '{source_lang}' text; skipping.")
            continue

        entity_details = entity_mapping.get(entity_id) or {}
        entity_type = str(entity_details.get('category') or 'UNKNOWN').upper()

        type_counters[entity_type] = type_counters.get(entity_type, 0) + 1
        placeholder = f"[ENTITY_{entity_type}_{type_counters[entity_type]}]"

        if entity_text not in text:
            print(f"Entity '{entity_text}' not found in text.")
            continue  # Skip if the entity text isn't found

        span = _first_free_occurrence(text, entity_text, spans)
        if span is None:
            print(f"Entity '{entity_text}' only occurs inside an already masked entity; skipping.")
            continue

        spans.append((span[0], span[1], placeholder))
        mapping[placeholder] = {
            "original_text": entity_text,
            "label": (entity_details.get('label') or {}).get(source_lang, 'UNKNOWN'),
            "id": entity_id,
            "type": entity_type
        }

    # Apply replacements right-to-left so earlier positions stay valid without
    # any offset bookkeeping, whatever order the entities were listed in.
    masked_text = text
    for start, end, placeholder in sorted(spans, reverse=True):
        masked_text = masked_text[:start] + placeholder + masked_text[end:]

    return masked_text, mapping



In [14]:
# NOTE(review): hard-coded, machine-specific absolute path; a later cell passes
# it to load_entities_mapping() to load the per-entity details.
q_datapath_ina = "/Users/inaocelli/Documents/CLASSES FALL 2024/CSCI 375/Final/EAMT/data/Q_data.json"

In [15]:
import json

# The file is read as JSON Lines despite its .json extension: every line is
# parsed as its own JSON document.
with open(filepathina, 'r') as f:
    examples = [json.loads(line) for line in f]

# Per-entity details, looked up by entity ID inside mask_entities().
entity_mapping = load_entities_mapping(q_datapath_ina)  # Update path if needed

# Smoke-test the masking on the first example only.
example = examples[0]
masked_text, mapping = mask_entities(example['source'], example['entities'], entity_mapping)

print("Masked Text:", masked_text)
print("Mapping:", mapping)


Masked Text: What year was the first book of the [ENTITY_BOOKS_1] series published?
Mapping: {'[ENTITY_BOOKS_1]': {'original_text': 'A Song of Ice and Fire', 'label': 'A Song of Ice and Fire', 'id': 'Q45875', 'type': 'BOOKS'}}


In [16]:
# Show the raw "entities" dictionary of the current `example`.
print("Entities:", example['entities'])


Entities: {'Q45875': {'it': 'Cronache del ghiaccio e del fuoco', 'en': 'A Song of Ice and Fire'}}


In [17]:
# Debugging aid: print the inputs that mask_entities() receives for `example`.
print("Original Text:", example['source'])
print("Entities:", example['entities'])

# For each entity ID of the example, show its entry in entity_mapping
# (prints None when the ID is absent, because dict.get is used).
for entity_id in example['entities']:
    print(f"Entity ID: {entity_id}, Mapping in Q_data: {entity_mapping.get(entity_id)}")

# Re-run the masking on the same example; this rebinds the notebook-level
# `masked_text` and `mapping` variables.
masked_text, mapping = mask_entities(example['source'], example['entities'], entity_mapping)

print("Masked Text:", masked_text)
print("Mapping:", mapping)


Original Text: What year was the first book of the A Song of Ice and Fire series published?
Entities: {'Q45875': {'it': 'Cronache del ghiaccio e del fuoco', 'en': 'A Song of Ice and Fire'}}
Entity ID: Q45875, Mapping in Q_data: {'label': {'en': 'A Song of Ice and Fire', 'ar': 'أغنية الجليد والنار', 'de': 'Das Lied von Eis und Feuer', 'es': 'Canción de hielo y fuego', 'fr': 'Le Trône de fer', 'hi': 'अ सॉंग ऑफ आईस एंड फायर', 'it': 'Cronache del ghiaccio e del fuoco', 'ja': '氷と炎の歌', 'pt': 'As Crônicas de Gelo e Fogo'}, 'mention': 'Game of Thrones', 'category': 'books', 'complexityType': 'comparative'}
Masked Text: What year was the first book of the [ENTITY_BOOKS_1] series published?
Mapping: {'[ENTITY_BOOKS_1]': {'original_text': 'A Song of Ice and Fire', 'label': 'A Song of Ice and Fire', 'id': 'Q45875', 'type': 'BOOKS'}}
