## 1. Revisar espacios en blanco

### Descripción del Código:
Este script analiza archivos JSON que contienen anotaciones para tareas de reconocimiento de entidades nombradas (NER). Su objetivo principal es verificar si las entidades tienen espacios en blanco inválidos al inicio o al final de los textos que abarcan. El script genera un resumen de los resultados y guarda un archivo de log con los detalles de las entidades problemáticas.

### Cómo usar:
1. Coloca los archivos JSON que deseas analizar en una carpeta llamada `input_annotations` en el mismo directorio donde se encuentra este script.
2. Ejecuta el script.
3. El resumen de los resultados se mostrará en la terminal.
4. Un archivo de log detallado se guardará en la carpeta `logs` con el nombre `check_whitespace_log.txt`.

### Resultado:
- El script procesará todos los archivos JSON en la carpeta `input_annotations`.
- Identificará entidades con espacios en blanco inválidos y las reportará en el log y en la terminal.
- Si no se encuentran problemas, el log indicará que no hay espacios en blanco inválidos.

In [None]:
import json
import os
from datetime import datetime

# Folder with the annotation files, resolved against the current working directory.
annotations_directory = os.path.join(os.getcwd(), "input_annotations")

# Folder that receives the log file; created on demand.
logs_directory = os.path.join(os.getcwd(), "logs")
os.makedirs(logs_directory, exist_ok=True)

# The log name is fixed, so every run overwrites the previous log.
log_file_path = os.path.join(logs_directory, "check_whitespace_log.txt")

# Running totals plus one record per entity whose span starts or ends with whitespace.
summary = {
    "total_files": 0,
    "total_annotations": 0,
    "total_entities": 0,
    "invalid_whitespace": []
}

# Stop early when there is nothing to analyse. SystemExit is raised directly:
# exit() is a helper added by the `site` module for interactive sessions and
# the Python documentation advises against using it in programs.
if not os.path.exists(annotations_directory):
    print(f"The folder 'input_annotations' does not exist in the current directory: {os.getcwd()}")
    raise SystemExit

# Only files with a .json extension are analysed.
files = [f for f in os.listdir(annotations_directory) if f.endswith(".json")]
if not files:
    print(f"No JSON files found in the directory: {annotations_directory}")
    raise SystemExit

# The log stays open for the whole run so per-file errors and the final
# summary end up in the same file.
with open(log_file_path, "w", encoding="utf-8") as log_file:
    log_file.write("=== NER Annotation Analysis Log ===\n")
    log_file.write(f"Processed on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    log_file.write(f"Directory: {annotations_directory}\n\n")

    print("Processing files...\n")
    for file_name in files:
        file_path = os.path.join(annotations_directory, file_name)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            print(f"File not found: {file_path}")
            log_file.write(f"File not found: {file_path}\n")
            continue
        except json.JSONDecodeError:
            print(f"Error decoding JSON file: {file_path}")
            log_file.write(f"Error decoding JSON file: {file_path}\n")
            continue

        # Only files that were parsed successfully are counted.
        annotations = data.get("annotations") or []
        summary["total_files"] += 1
        summary["total_annotations"] += len(annotations)

        # Check every entity span for leading/trailing whitespace.
        for i, item in enumerate(annotations):
            # Entries that are not a [text, {...}] pair (for example null)
            # have no spans to check; skip them instead of crashing.
            if not isinstance(item, (list, tuple)) or len(item) < 2:
                continue
            text, annotation = item[0], item[1]
            if not isinstance(text, str) or not isinstance(annotation, dict):
                continue

            for entity in annotation.get("entities") or []:
                if len(entity) < 3:
                    continue
                start, end, label = entity[:3]
                span_text = text[start:end]
                summary["total_entities"] += 1
                if span_text != span_text.strip():
                    summary["invalid_whitespace"].append({
                        "file_name": file_name,
                        "annotation_index": i,
                        "entity_text": span_text,
                        "label": label,
                        "start": start,
                        "end": end
                    })

    # Build the summary once; the same text goes to the log and to the terminal.
    report_lines = [
        "=== Summary ===",
        f"Total files processed: {summary['total_files']}",
        f"Total annotations: {summary['total_annotations']}",
        f"Total entities analyzed: {summary['total_entities']}",
        f"Entities with invalid whitespace: {len(summary['invalid_whitespace'])}",
        "",
    ]
    if summary["invalid_whitespace"]:
        report_lines.append("=== Invalid Whitespace Entities ===")
        report_lines.extend(
            f"- File: {issue['file_name']}, Annotation {issue['annotation_index']}: "
            f"'{issue['entity_text']}' (label: {issue['label']}, "
            f"start: {issue['start']}, end: {issue['end']})"
            for issue in summary["invalid_whitespace"]
        )
    else:
        report_lines.append("No invalid whitespace found in any entity.")
    report = "\n".join(report_lines) + "\n"
    log_file.write(report)

# Print summary to the terminal
print(report, end="")

print(f"\nLog file saved to: {log_file_path}")

Procesando archivos...

=== Resumen ===
Total de archivos procesados: 1
Total de anotaciones: 51
Total de entidades analizadas: 105
Entidades con espacios en blanco inválidos: 0

No se encontraron espacios en blanco inválidos en ninguna entidad.

Archivo de log guardado en: /home/sotavento/Documents/tejer_red/1_procesamiento_anotaciones/logs/check_whitespace_log.txt


## 2. Combinar anotaciones

### Descripción del Código:
Este script combina múltiples archivos JSON que contienen anotaciones para tareas de reconocimiento de entidades nombradas (NER). Su objetivo es unificar las clases y anotaciones de todos los archivos en un único archivo llamado `merged_annotations.json`. Además, genera un archivo de log con un resumen del proceso.

### Cómo usar:
1. Coloca los archivos JSON que deseas combinar en una carpeta llamada `input_annotations` en el mismo directorio donde se encuentra este script.
2. Ejecuta el script.
3. El archivo combinado se guardará en el mismo directorio donde se ejecuta el script con el nombre `merged_annotations.json`.
4. Un archivo de log con el resumen del proceso se guardará en la carpeta `logs` con el nombre `merge_log.txt`.

### Resultado:
- El script procesará todos los archivos JSON en la carpeta `input_annotations`.
- Unificará las clases y anotaciones, evitando duplicados.
- Guardará el archivo combinado en el directorio actual y generará un log con los detalles del proceso.

In [3]:
import json
import os
from datetime import datetime

# Input and log locations, both resolved against the current working directory.
annotations_directory = os.path.join(os.getcwd(), "input_annotations")
logs_directory = os.path.join(os.getcwd(), "logs")

# Ensure the logs directory exists
os.makedirs(logs_directory, exist_ok=True)

# Output and log files use fixed names, so each run overwrites the previous one.
output_file = os.path.join(os.getcwd(), "merged_annotations.json")
log_file_path = os.path.join(logs_directory, "merge_log.txt")

# Accumulators for the merged result; first-seen order is preserved.
classes = []
annotations = []
loaded_files = 0  # number of input files that were parsed successfully

# Stop early when there is nothing to merge. SystemExit is raised directly:
# exit() is a helper added by the `site` module for interactive sessions and
# the Python documentation advises against using it in programs.
if not os.path.exists(annotations_directory):
    print(f"The folder 'input_annotations' does not exist in the current directory: {os.getcwd()}")
    raise SystemExit

# Get all JSON files in the input directory
files = [os.path.join(annotations_directory, f) for f in os.listdir(annotations_directory) if f.endswith(".json")]

if not files:
    print(f"No JSON files found in the directory: {annotations_directory}")
    raise SystemExit

# Open the log file for writing
with open(log_file_path, "w", encoding="utf-8") as log_file:
    log_file.write("=== JSON Merge Log ===\n")
    log_file.write(f"Script executed on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    log_file.write(f"Input directory: {annotations_directory}\n")
    log_file.write(f"Output file: {output_file}\n\n")

    # Process each JSON file
    log_file.write("Processing files:\n")
    for file in files:
        # Only the load is guarded: a file that is not valid JSON is reported
        # and skipped without aborting the merge.
        try:
            with open(file, encoding="utf-8") as f:
                data = json.load(f)
        except json.JSONDecodeError:
            log_file.write(f"- Error reading JSON file: {file}\n")
            print(f"Error reading JSON file: {file}")
            continue

        loaded_files += 1
        log_file.write(f"- Successfully loaded: {file}\n")

        # Add classes while avoiding duplicates
        for cls in data.get("classes") or []:
            if cls not in classes:
                classes.append(cls)

        # Add annotations while avoiding duplicates and keeping only those
        # that carry at least one entity.
        for ann in data.get("annotations") or []:
            # Entries that are not a [text, {...}] pair (for example null)
            # cannot hold entities; skip them instead of crashing.
            if not isinstance(ann, (list, tuple)) or len(ann) < 2 or not isinstance(ann[1], dict):
                continue
            if ann[1].get("entities") and ann not in annotations:
                annotations.append(ann)

    # Create the combined JSON structure
    combined_data = {
        "classes": classes,
        "annotations": annotations
    }

    # Save the combined file (indented for readability, non-ASCII kept as is)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(combined_data, f, ensure_ascii=False, indent=4)

    log_file.write("\n=== Summary ===\n")
    # Files that failed to parse are listed above and are not counted here.
    log_file.write(f"Total files processed: {loaded_files}\n")
    log_file.write(f"Total unique classes: {len(classes)}\n")
    log_file.write(f"Total annotations: {len(annotations)}\n")
    log_file.write(f"Combined file saved as: {output_file}\n")

print(f"Combined file saved as: {output_file}")
print(f"Log file saved as: {log_file_path}")

Combined file saved as: /home/sotavento/Documents/tejer_red/1_procesamiento_anotaciones/merged_annotations.json
Log file saved as: /home/sotavento/Documents/tejer_red/1_procesamiento_anotaciones/logs/merge_log.txt


## 3. Eliminación de entidades innecesarias (opcional)

### Descripción del Código:
Este script permite eliminar entidades específicas de un archivo JSON llamado `merged_annotations.json`, que contiene anotaciones para tareas de reconocimiento de entidades nombradas (NER). El usuario puede seleccionar las entidades a eliminar mediante un rango o números individuales. El archivo actualizado se guarda como `purged_annotations.json` en el mismo directorio, y se genera un archivo de log con los detalles del proceso en la carpeta `logs`.

### Cómo usar:
1. Asegúrate de que el archivo `merged_annotations.json` esté en el mismo directorio donde se encuentra este script.
2. Ejecuta el script.
3. El script mostrará todas las entidades disponibles y te pedirá que selecciones cuáles deseas eliminar (usando números o rangos, por ejemplo, `1-3,5`).
4. Confirma la selección para proceder con la eliminación.
5. El archivo actualizado se guardará como `purged_annotations.json` en el mismo directorio.
6. Un archivo de log con los detalles del proceso se guardará en la carpeta `logs` con el nombre `purge_log.txt`.

### Resultado:
- El archivo `purged_annotations.json` contendrá las anotaciones actualizadas sin las entidades seleccionadas.
- El log incluirá un resumen de las entidades eliminadas y las que permanecen en el archivo.

In [4]:
import json
import os
from datetime import datetime

def load_json(file_path):
    """
    Read a UTF-8 encoded JSON file and decode it.
    :param file_path: Location of the JSON file to read.
    :return: The Python object obtained by decoding the file content.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def save_json(data, file_path):
    """
    Serialize data as JSON and write it to a file, replacing any previous content.
    :param data: JSON-serializable object to store.
    :param file_path: Destination of the JSON file.
    """
    # Non-ASCII characters are written as is and the output is indented by 4 spaces.
    dump_options = {"ensure_ascii": False, "indent": 4}
    with open(file_path, "w", encoding="utf-8") as target:
        json.dump(data, target, **dump_options)

def purge_entities(data, entities_to_remove):
    """
    Drop the given labels from the class list and from every annotation.

    The input object is modified in place and also returned.
    :param data: Parsed annotation data with "classes" and "annotations" keys.
    :param entities_to_remove: Labels to discard.
    :return: The same data object, without the discarded labels.
    """
    def is_kept(label):
        return label not in entities_to_remove

    # Filter the declared classes.
    data["classes"] = [name for name in data["classes"] if is_kept(name)]

    # Filter the spans of each annotation; the label is the third item of a span.
    for record in data["annotations"]:
        details = record[1]
        details["entities"] = [span for span in details["entities"] if is_kept(span[2])]

    return data

def parse_selection(selection, total_entities):
    """
    Turn a selection string into a sorted list of unique 1-based indices.

    Comma-separated items are either single numbers or inclusive ranges
    written as "start-end". Indices outside 1..total_entities are discarded.
    :param selection: String input from the user (e.g., "1-5,7,10-12").
    :param total_entities: Total number of entities available.
    :return: Sorted list of the selected indices, without duplicates.
    :raises ValueError: If an item is neither a number nor a "start-end" range.
    """
    chosen = set()
    try:
        for token in selection.split(","):
            if "-" not in token:
                # Single number (e.g., "7")
                chosen.add(int(token))
                continue
            # Inclusive range (e.g., "1-5")
            first, last = map(int, token.split("-"))
            chosen.update(range(first, last + 1))
    except ValueError:
        raise ValueError("Invalid selection format. Use numbers or ranges (e.g., 1-5,7,10-12).")
    # Keep only indices that point at an existing entity.
    return sorted(number for number in chosen if 1 <= number <= total_entities)

def main():
    """
    Interactively remove entity classes from `merged_annotations.json`.

    Lists the classes stored in the file, asks which ones to drop (numbers or
    ranges), asks for confirmation and saves the result as
    `purged_annotations.json` in the current working directory. The dialogue
    and a final summary are also written to `logs/purge_log.txt`.
    """
    # Input, output and log locations are all relative to the working directory.
    input_file = os.path.join(os.getcwd(), "merged_annotations.json")
    output_folder = os.getcwd()
    logs_folder = os.path.join(os.getcwd(), "logs")
    os.makedirs(logs_folder, exist_ok=True)  # Ensure the logs folder exists

    # Fixed log name: each run overwrites the previous log.
    log_file_path = os.path.join(logs_folder, "purge_log.txt")

    # Check if the input file exists; the message names the file that is looked up.
    if not os.path.exists(input_file):
        print(f"The file 'merged_annotations.json' does not exist in the current directory: {os.getcwd()}")
        return

    # Fixed output name: each run overwrites the previous result.
    output_file = os.path.join(output_folder, "purged_annotations.json")

    # Open the log file for writing
    with open(log_file_path, "w", encoding="utf-8") as log_file:
        log_file.write("=== Purging Log ===\n")
        log_file.write(f"Script executed on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        log_file.write(f"Input file: {input_file}\n")
        log_file.write(f"Output file: {output_file}\n\n")

        # Load the JSON file
        log_file.write(f"Loading annotations from: {input_file}\n")
        print(f"Loading annotations from: {input_file}")
        data = load_json(input_file)

        # List all entities from "classes", numbered from 1 for the user.
        log_file.write("\nAvailable entities in the annotations:\n")
        print("\nAvailable entities in the annotations:")
        all_entities = data["classes"]
        for i, entity in enumerate(all_entities, start=1):
            log_file.write(f"{i}. {entity}\n")
            print(f"{i}. {entity}")

        # Ask the user which entities to remove
        print("\nEnter the numbers or ranges of the entities you want to remove, separated by commas (e.g., 1-5,7,10-12):")
        selection = input("Your selection: ").strip()
        try:
            selected_indices = parse_selection(selection, len(all_entities))
        except ValueError as e:
            log_file.write(f"Error: {e}\n")
            print(f"Error: {e}")
            return
        # The selection is 1-based; convert it to list positions.
        entities_to_remove = [all_entities[idx - 1] for idx in selected_indices]

        # Confirm the selection
        log_file.write("\nSelected entities to remove:\n")
        print("\nYou have selected the following entities to remove:")
        for entity in entities_to_remove:
            log_file.write(f"- {entity}\n")
            print(f"- {entity}")
        confirm = input("\nDo you want to proceed? (yes/no): ").strip().lower()
        if confirm != "yes":
            log_file.write("Operation cancelled by the user.\n")
            print("Operation cancelled.")
            return

        # Purge the selected entities
        log_file.write("\nPurging selected entities...\n")
        print("\nPurging selected entities...")
        updated_data = purge_entities(data, entities_to_remove)

        # Save the updated JSON file
        save_json(updated_data, output_file)
        log_file.write(f"\nEntities purged successfully. Updated file saved to: {output_file}\n")
        print(f"\nEntities purged successfully. Updated file saved to: {output_file}")

        # Summary: the same two lines go to the log and to the terminal.
        removed_text = ", ".join(entities_to_remove)
        remaining_text = ", ".join(e for e in all_entities if e not in entities_to_remove)
        log_file.write("\nSummary:\n")
        log_file.write(f"Entities removed: {removed_text}\n")
        log_file.write(f"Remaining entities: {remaining_text}\n")
        print("\nSummary:")
        print(f"Entities removed: {removed_text}")
        print(f"Remaining entities: {remaining_text}")

# Run the interactive purge only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading annotations from: /home/sotavento/Documents/tejer_red/1_procesamiento_anotaciones/merged_annotations.json

Available entities in the annotations:
1. NOMBRE
2. DOMICILIO
3. FECHA
4. HORA
5. TELEFONO
6. EXP
7. PLACA

Enter the numbers or ranges of the entities you want to remove, separated by commas (e.g., 1-5,7,10-12):

You have selected the following entities to remove:
- HORA

Purging selected entities...

Entities purged successfully. Updated file saved to: /home/sotavento/Documents/tejer_red/1_procesamiento_anotaciones/purged_annotations.json

Summary:
Entities removed: HORA
Remaining entities: NOMBRE, DOMICILIO, FECHA, TELEFONO, EXP, PLACA


## 4. Visualizar anotaciones después de combinar, depurar y antes del entrenamiento

### Descripción del Código:
Este script genera un archivo HTML interactivo para visualizar anotaciones de reconocimiento de entidades nombradas (NER) a partir de un archivo JSON llamado `purged_annotations.json`. Las entidades detectadas se resaltan en el texto con colores específicos, y se genera un archivo de log con los detalles del proceso.

### Cómo usar:
1. Asegúrate de que el archivo `purged_annotations.json` esté en el mismo directorio donde se encuentra este script.
2. Ejecuta el script.
3. El archivo HTML generado se guardará en el mismo directorio con el nombre `annotations_visualizer.html`.
4. Un archivo de log con los detalles del proceso se guardará en la carpeta `logs` con el nombre `visualizer_log.txt`.

### Resultado:
- El archivo HTML mostrará las anotaciones resaltadas con colores, indicando las entidades y sus etiquetas.
- El archivo de log incluirá un resumen de las entidades detectadas y los colores asignados.

In [None]:
import json
import os
from datetime import datetime

def load_json(file_path):
    """
    Decode the content of a UTF-8 encoded JSON file.
    :param file_path: Location of the JSON file to read.
    :return: The Python object obtained by decoding the file content.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        content = source.read()
    parsed = json.loads(content)
    return parsed

def save_file(content, file_path):
    """
    Write text to a UTF-8 encoded file, replacing any previous content.
    :param content: Text to store.
    :param file_path: Destination of the file.
    """
    with open(file_path, "w", encoding="utf-8") as target:
        target.write(content)

def _is_annotation_pair(annotation):
    """Return True when *annotation* is a `[text, {...}]` list that can be rendered."""
    return (
        isinstance(annotation, list)
        and len(annotation) >= 2
        and isinstance(annotation[0], str)
        and isinstance(annotation[1], dict)
    )

def _render_annotation(text, entities, entity_colors):
    """
    Return *text* as an HTML fragment with each entity wrapped in a coloured span.
    :param text: Annotated sentence.
    :param entities: Spans given as (start, end, label, ...) sequences.
    :param entity_colors: Mapping from label to background colour.
    :return: HTML-escaped text with one <span class="entity"> per span.
    """
    from html import escape  # local import: the notebook cell does not import html

    # Spans with fewer than three items are dropped before sorting so the
    # sort key can always read the start offset. Sorting by start lets the
    # text be stitched left to right; overlapping spans are not resolved.
    spans = sorted((item for item in entities if len(item) >= 3), key=lambda item: item[0])

    pieces = []
    last_index = 0
    for span in spans:
        start, end, label = span[:3]
        # Text before the entity; escaped so "<" or "&" in the source text
        # cannot break the generated page.
        pieces.append(escape(text[last_index:start]))
        color = entity_colors.get(label, "#E0E0E0")  # Default color if label not found
        pieces.append(
            f'<span class="entity" style="background-color: {color};">'
            f'{escape(text[start:end])} <small>({escape(str(label))})</small></span>'
        )
        last_index = end
    # Remaining text after the last entity
    pieces.append(escape(text[last_index:]))
    return "".join(pieces)

def main():
    """
    Build an HTML page that highlights the entities of `purged_annotations.json`.

    Reads the file from the current working directory, assigns one colour per
    label, writes `annotations_visualizer.html` next to it and records the
    label/colour mapping in `logs/visualizer_log.txt`.
    """
    # Input, output and log locations are all relative to the working directory.
    input_file = os.path.join(os.getcwd(), "purged_annotations.json")
    output_file = os.path.join(os.getcwd(), "annotations_visualizer.html")
    logs_folder = os.path.join(os.getcwd(), "logs")
    os.makedirs(logs_folder, exist_ok=True)  # Ensure the logs folder exists
    log_file = os.path.join(logs_folder, "visualizer_log.txt")

    # Check if the input file exists
    if not os.path.exists(input_file):
        print(f"The file 'purged_annotations.json' does not exist in the current directory: {os.getcwd()}")
        return

    # Load the JSON file
    print(f"Loading annotations from: {input_file}")
    data = load_json(input_file)

    # Entries that are not [text, {...}] pairs (for example null) are ignored
    # by both the label detection and the rendering below.
    annotations = [
        annotation
        for annotation in (data.get("annotations") or [])
        if _is_annotation_pair(annotation)
    ]

    # Dynamically detect entity labels from the file
    detected_entities = set()
    for annotation in annotations:
        for entity in annotation[1].get("entities") or []:
            if len(entity) >= 3:
                detected_entities.add(entity[2])

    # Assign colors to detected labels; the palette is reused cyclically when
    # there are more labels than colours.
    color_palette = [
        "#FFCCCC", "#CCE5FF", "#FFFFCC", "#D5CCFF", "#FFCCF2", "#CCFFCC",
        "#FFCC99", "#FF99CC", "#99CCFF", "#FF9966", "#99FF99", "#FF9999",
        "#CCCCFF", "#99FFFF"
    ]
    entity_colors = {
        entity: color_palette[i % len(color_palette)]
        for i, entity in enumerate(sorted(detected_entities))
    }

    # Page fragments are collected in a list and joined once at the end.
    html_parts = ["""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>NER Annotations Visualizer</title>
        <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha3/dist/css/bootstrap.min.css" rel="stylesheet">
        <style>
            body { font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; }
            .entity { display: inline-block; padding: 2px 4px; margin: 1px; border-radius: 4px; font-size: 0.9em; color: #000; }
            .entity small { font-size: 0.75em; color: #555; }
            .annotation-container { margin-bottom: 20px; padding: 15px; border: 1px solid #ddd; border-radius: 5px; background-color: #f9f9f9; }
        </style>
    </head>
    <body>
        <div class="container">
            <h1 class="text-center my-4">NER Annotations Visualizer</h1>
            <p class="text-muted text-center">This page highlights named entities in the text with their corresponding labels.</p>
    """]

    # One container per annotation
    for annotation in annotations:
        annotated_text = _render_annotation(
            annotation[0], annotation[1].get("entities") or [], entity_colors
        )
        html_parts.append(f"""
        <div class="annotation-container">
            <p>{annotated_text}</p>
        </div>
        """)

    # Close the HTML content
    html_parts.append("""
        </div>
        <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha3/dist/js/bootstrap.bundle.min.js"></script>
    </body>
    </html>
    """)
    html_content = "".join(html_parts)

    # Save the HTML file
    save_file(html_content, output_file)

    # Write log file
    with open(log_file, "w", encoding="utf-8") as log:
        log.write("=== NER Annotations Visualizer Log ===\n")
        log.write(f"Script executed on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        log.write(f"Input file: {input_file}\n")
        log.write(f"Output HTML file: {output_file}\n")
        log.write("\nDetected Entities:\n")
        for entity, color in entity_colors.items():
            log.write(f"- {entity}: {color}\n")

    print(f"HTML visualizer created: {output_file}")
    print(f"Log file created: {log_file}")

# Build the visualizer only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()