In [1]:
#$ Entity Masking
import json
from span_marker import SpanMarkerModel


def load_entities_mapping(file_path):
    """
    Loads entities and their translations from the given JSON file, skipping invalid lines.
    Args:
        file_path (str): Path to the JSON file containing entity mappings.
    Returns:
        list: A list of dictionaries representing the JSON data.
    """
    with open(file_path, 'r') as jf:
        try:
            return json.load(jf)  # Load the entire JSON array
        except json.JSONDecodeError as e:
            print(f"Error loading JSON file: {e}")
            return []


def mask_entities_with_spanmarker(text, model):
    """
    Masks named entities in text using predictions from SpanMarkerModel.

    Args:
        text (Union[str, List[str]]): Input text(s). Can be a single string or a list of strings.
        model (SpanMarkerModel): Pretrained SpanMarker model for entity prediction.

    Returns:
        Union[Tuple[str, dict], List[Tuple[str, dict]]]:
            - For a single string:
                - masked_text (str): Text with entities replaced by placeholders.
                - mapping (dict): Mapping of placeholders to original entities.
            - For a list of strings:
                - List of (masked_text, mapping) tuples.
    """
    # Check if input is a single string or a list of strings
    is_single_string = isinstance(text, str)
    if is_single_string:
        text = [text]  # Convert single string to list for uniform processing

    # Perform predictions using the model
    predictions = model.predict(text)  # Handles batch processing

    results = []
    for i, sentence in enumerate(text):
        masked_text = sentence
        mapping = {}
        type_counters = {}
        offset = 0

        # If no predictions for this sentence
        if not predictions[i]:  # Empty list
            print(f"No entities found for text: {sentence}")
            results.append((masked_text, mapping))
            continue

        # Process each entity in the prediction
        for entity in predictions[i]:
            entity_text = entity['span']
            entity_type = entity['label'].upper()
            start = entity['char_start_index']
            end = entity['char_end_index']

            # Counter for entity type
            if entity_type not in type_counters:
                type_counters[entity_type] = 1
            else:
                type_counters[entity_type] += 1

            # Generate a placeholder
            placeholder = f"[ENTITY_{entity_type}_{type_counters[entity_type]}]"

            # Replace entity with placeholder in the text
            start += offset
            end += offset
            masked_text = masked_text[:start] + placeholder + masked_text[end:]
            offset += len(placeholder) - len(entity_text)

            # Add to mapping
            mapping[placeholder] = {
                "original_text": entity_text,
                "type": entity_type,
                "confidence": entity['score'],
                "start": start,
                "end": end
            }

        results.append((masked_text, mapping))

    #if text input is a single string
    return results[0] if is_single_string else results

def ensure_pad_token(model):
    if not hasattr(model.tokenizer, "pad_token") or model.tokenizer.pad_token is None:
        print("Setting pad_token manually...")
        model.tokenizer.pad_token = "[PAD]"
        model.tokenizer.pad_token_id = model.tokenizer.convert_tokens_to_ids("[PAD]")

In [2]:
#$ MT Translation

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Load the M2M100 model and tokenizer
model_name = "facebook/m2m100_418M"
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer = M2M100Tokenizer.from_pretrained(model_name)

def translate_with_placeholders_m2m(masked_text, mapping, src_lang, tgt_lang):
    """
    Translates the non-entity parts of a text and re-inserts placeholders using M2M100.

    Args:
        masked_text (str): Text with placeholders for entities.
        mapping (dict): Mapping of placeholders to entity details.
        src_lang (str): Source language code.
        tgt_lang (str): Target language code.

    Returns:
        str: Translated text with placeholders intact.
    """
    tokenizer.src_lang = src_lang

    # Encode and translate the entire sentence
    encoded_text = tokenizer(masked_text, return_tensors="pt")
    generated_tokens = model.generate(
        **encoded_text,
        forced_bos_token_id=tokenizer.get_lang_id(tgt_lang),
        max_length=128,  # Ensure sufficient length for translation
        early_stopping=True
    )

    # Decode the translated sentence
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    return translated_text




In [3]:
#$ Entity Reintegration

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

def translate_entity(entity, src_lang, tgt_lang):
    """
    Translates an entity if its type requires translation, otherwise returns the original entity.

    Args:
        entity (str): The placeholder or entity to process.
        src_lang (str): Source language code.
        tgt_lang (str): Target language code.

    Returns:
        str: The translated or original entity.
    """
    # types that should not be translated
    non_translatable_types = {"PER", "INST", "MEDIA", "PLANT", "VEHI"}

    # Check if the entity contains one of the non-translatable types
    if any(ntype in entity for ntype in non_translatable_types):
        return entity 

    # Translate the entity using M2M100
    tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
    model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
    tokenizer.src_lang = src_lang
    encoded_entity = tokenizer(entity, return_tensors="pt")
    generated_tokens = model.generate(**encoded_entity, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang))
    translated_entity = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    return translated_entity


def reintegrate_entities(translation, mapping, src_lang, tgt_lang):
    """
    Replaces placeholders in the translated text with original or translated entities.

    Args:
        translation (str): Translated text with placeholders.
        mapping (dict): Mapping of placeholders to original entities.
        src_lang (str): Source language code.
        tgt_lang (str): Target language code.

    Returns:
        str: Translation with entities reintegrated.
    """
    for placeholder, entity_info in mapping.items():
        # Get the original entity
        original_entity = entity_info["original_text"]

        # Translate the entity if needed
        translated_entity = translate_entity(original_entity, src_lang, tgt_lang)

        # Replace placeholder with the translated entity
        translation = translation.replace(placeholder, translated_entity)

    return translation


In [4]:
import os
es_datapath = os.path.abspath('data/spanish_test.json')

    # Step 1: Load the data
print("Loading Spanish data...")
spanish_data = load_entities_mapping(es_datapath)

Loading Spanish data...


In [5]:
spanish_data

[{'id': '32fd2909',
  'source_locale': 'en',
  'target_locale': 'es',
  'source': "Who was the first son of Peter the Great's second wife?",
  'target': '¿Cuál fue el primer hijo de la segunda esposa de Pedro el Grande?',
  'entities': {'Q8479': {'es': 'Pedro I', 'en': 'Peter the Great'}},
  'from': 'mintaka'},
 {'id': '71a18e1d',
  'source_locale': 'en',
  'target_locale': 'es',
  'source': 'Who is considered the founder of the Xia dynasty in ancient China?',
  'target': '¿Quién es considerado como el fundador de la dinastía Xia en China antigua?',
  'entities': {'Q169705': {'es': 'dinastía Xia', 'en': 'Xia dynasty'}},
  'from': 'mintaka'},
 {'id': '9afa8edd',
  'source_locale': 'en',
  'target_locale': 'es',
  'source': 'Who was in charge of Great Britain during World War 2?',
  'target': '¿Quién estaba a cargo de Gran Bretaña durante la Segunda Guerra Mundial?',
  'entities': {'Q362': {'es': 'Segunda Guerra Mundial', 'en': 'World War II'},
   'Q23666': {'es': 'Gran Bretaña', 'en': '

In [6]:
print("Loading SpanMarker model...")
spanmarker_model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base-multinerd")
ensure_pad_token(spanmarker_model)

Loading SpanMarker model...


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Setting pad_token manually...


In [7]:
print("Loading M2M100 translation model...")
m2m_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
m2m_tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

Loading M2M100 translation model...


In [None]:
translated_sentences = []

for idx, entry in enumerate(spanish_data):  # Process only the first 10 entries

    # Get source text (English) and target text (Italian)
    eng_text = entry['source']
    esp_text = entry['target']

    # Mask entities in the English text
    masked_eng_text, mapping = mask_entities_with_spanmarker(eng_text, spanmarker_model)
    print(f"Masked English Text: {masked_eng_text}")
    print(f"Entity Mapping: {json.dumps(mapping, indent=4)}")

    # Filter entities by confidence score
    filtered_mapping = {
        key: value for key, value in mapping.items() if value["confidence"] > 0.8
    }
    print(f"Filtered Mapping (Confidence > 0.8): {json.dumps(filtered_mapping, indent=4)}")

    # Translate non-entity parts of the text
    m2m_tokenizer.src_lang = "en"
    tgt_lang = "it"
    translated_text_with_placeholders = translate_with_placeholders_m2m(
        masked_eng_text, filtered_mapping, src_lang="en", tgt_lang="es"
    )
    print(f"Translated Text with Placeholders: {translated_text_with_placeholders}")

    # Reintegrate entities into the translated text
    final_translation = reintegrate_entities(
        translated_text_with_placeholders, filtered_mapping, src_lang="en", tgt_lang="es"
    )
    print(f"Final Translation: {final_translation}")

    # Append final translation to the results
    translated_sentences.append({
        "id": entry["id"],
        "source": eng_text,
        "target": esp_text,
        "masked_source": masked_eng_text,
        "translation": final_translation,
        "filtered_mapping": filtered_mapping
    })

# Save the results
results_path = "data/translated_spanish_test.json"
with open(results_path, "w") as results_file:
    json.dump(translated_sentences, results_file, indent=4)

print(f"Translation results for test set saved to {results_path}")


Masked English Text: Who was the first son of [ENTITY_PER_1]'s second wife?
Entity Mapping: {
    "[ENTITY_PER_1]": {
        "original_text": "Peter the Great",
        "type": "PER",
        "confidence": 0.9930858612060547,
        "start": 25,
        "end": 40
    }
}
Filtered Mapping (Confidence > 0.8): {
    "[ENTITY_PER_1]": {
        "original_text": "Peter the Great",
        "type": "PER",
        "confidence": 0.9930858612060547,
        "start": 25,
        "end": 40
    }
}
Translated Text with Placeholders: ¿Quién fue el primer hijo de la segunda esposa de [ENTITY_PER_1]?


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Final Translation: ¿Quién fue el primer hijo de la segunda esposa de Pedro el Gran?
Masked English Text: Who is considered the founder of the [ENTITY_TIME_1] in ancient [ENTITY_LOC_1]?
Entity Mapping: {
    "[ENTITY_TIME_1]": {
        "original_text": "Xia dynasty",
        "type": "TIME",
        "confidence": 0.41634589433670044,
        "start": 37,
        "end": 48
    },
    "[ENTITY_LOC_1]": {
        "original_text": "China",
        "type": "LOC",
        "confidence": 0.9988638162612915,
        "start": 64,
        "end": 69
    }
}
Filtered Mapping (Confidence > 0.8): {
    "[ENTITY_LOC_1]": {
        "original_text": "China",
        "type": "LOC",
        "confidence": 0.9988638162612915,
        "start": 64,
        "end": 69
    }
}
Translated Text with Placeholders: ¿Quién es considerado el fundador de [ENTITY_TIME_1] en el antiguo [ENTITY_LOC_1]?


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Final Translation: ¿Quién es considerado el fundador de [ENTITY_TIME_1] en el antiguo China?
Masked English Text: Who was in charge of [ENTITY_LOC_1] during [ENTITY_EVE_1]?
Entity Mapping: {
    "[ENTITY_LOC_1]": {
        "original_text": "Great Britain",
        "type": "LOC",
        "confidence": 0.9999412298202515,
        "start": 21,
        "end": 34
    },
    "[ENTITY_EVE_1]": {
        "original_text": "World War 2",
        "type": "EVE",
        "confidence": 0.9996432065963745,
        "start": 43,
        "end": 54
    }
}
Filtered Mapping (Confidence > 0.8): {
    "[ENTITY_LOC_1]": {
        "original_text": "Great Britain",
        "type": "LOC",
        "confidence": 0.9999412298202515,
        "start": 21,
        "end": 34
    },
    "[ENTITY_EVE_1]": {
        "original_text": "World War 2",
        "type": "EVE",
        "confidence": 0.9996432065963745,
        "start": 43,
        "end": 54
    }
}
Translated Text with Placeholders: ¿Quién fue responsable de [EN

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Final Translation: ¿Quién fue responsable de Gran Bretaña durante Guerra Mundial 2?
Masked English Text: What is the longest river in the [ENTITY_LOC_1] that also flows through [ENTITY_LOC_2] and [ENTITY_LOC_3]?
Entity Mapping: {
    "[ENTITY_LOC_1]": {
        "original_text": "UK",
        "type": "LOC",
        "confidence": 0.9996622800827026,
        "start": 33,
        "end": 35
    },
    "[ENTITY_LOC_2]": {
        "original_text": "Wales",
        "type": "LOC",
        "confidence": 0.9999678134918213,
        "start": 72,
        "end": 77
    },
    "[ENTITY_LOC_3]": {
        "original_text": "England",
        "type": "LOC",
        "confidence": 0.9999411106109619,
        "start": 91,
        "end": 98
    }
}
Filtered Mapping (Confidence > 0.8): {
    "[ENTITY_LOC_1]": {
        "original_text": "UK",
        "type": "LOC",
        "confidence": 0.9996622800827026,
        "start": 33,
        "end": 35
    },
    "[ENTITY_LOC_2]": {
        "original_text": "Wales",


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Final Translation: ¿Cuál es el río más largo en Reino Unido que también fluye a través de El País de Gales y Inglaterra?
Masked English Text: How many states are in the [ENTITY_LOC_1]?
Entity Mapping: {
    "[ENTITY_LOC_1]": {
        "original_text": "United States",
        "type": "LOC",
        "confidence": 0.999936580657959,
        "start": 27,
        "end": 40
    }
}
Filtered Mapping (Confidence > 0.8): {
    "[ENTITY_LOC_1]": {
        "original_text": "United States",
        "type": "LOC",
        "confidence": 0.999936580657959,
        "start": 27,
        "end": 40
    }
}
Translated Text with Placeholders: ¿Cuántos estados hay en el [ENTITY_LOC_1]?


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Final Translation: ¿Cuántos estados hay en el Estados Unidos?
Masked English Text: What is the tallest point in the 3rd smallest country in [ENTITY_LOC_1]?
Entity Mapping: {
    "[ENTITY_LOC_1]": {
        "original_text": "Europe",
        "type": "LOC",
        "confidence": 0.999890923500061,
        "start": 57,
        "end": 63
    }
}
Filtered Mapping (Confidence > 0.8): {
    "[ENTITY_LOC_1]": {
        "original_text": "Europe",
        "type": "LOC",
        "confidence": 0.999890923500061,
        "start": 57,
        "end": 63
    }
}
Translated Text with Placeholders: ¿Cuál es el punto más alto en el tercer país más pequeño en [ENTITY_LOC_1]?
