In [30]:
import numpy as np
import pandas as pd
import random
import json
import re
import os
import yaml

import importlib


In [31]:
# Directory layout used by the notebook. Each path keeps its trailing slash
# because later cells build file names by plain string concatenation.
raw_names_data_path = "auxiliary_data/final/"  # name / place / organisation CSV files
save_path = "anon_data/"                       # destination of the generated JSON file
bge_data_path = "bge_data/"                    # parquet file with the source texts

Load the YAML configuration and the tokenization helpers from `utils`

In [32]:
# Read the generation settings (organisation endings and cutoff keywords).
# The YAML file contains non-ASCII entries such as 'Sàrl', so it is decoded
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open("config_synthetic_generation.yaml", 'r', encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Import the utils module and force a reload so that edits made to utils.py
# after the kernel started are picked up.
import utils
importlib.reload(utils)
from utils import tokenize_text, join_tokens

Load the random name, place and organisation generators

In [33]:
from random_name_samplers import RandomNameGenerator,RandomPlaceGenerator,RandomOrganisationGenerator

In [34]:
# Show the parsed configuration so its contents can be checked by eye
config

{'organization_endings': ['AG',
  'GmbH',
  'KG',
  'OHG',
  'e.K.',
  'eK',
  'UG',
  'Inc.',
  'Ltd.',
  'Ltd',
  'LLC',
  'PLC',
  'LP',
  'LLP',
  'SA',
  'SARL',
  'Sàrl',
  'SAS',
  'SNC',
  'EURL',
  'S.p.A.',
  'S.r.l.',
  'S.a.p.a.',
  'S.n.c.',
  'S.a.s.',
  'SpA',
  'Srl',
  'Sapa',
  'Sas'],
 'cutoff': {'start_keywords': ['Participants à la procédure',
   'Parteien',
   'Verfahrensbeteiligte',
   'Partecipanti al procedimento',
   'Parties'],
  'end_keywords': ['Lausanne', 'Losanna']}}

# Function declarations

In [44]:
import pandas as pd
import re


# Define a function to manipulate the text
def cutoff_clear_names(text):
    """Trim ``text`` to the span between the first start keyword and the last end keyword.

    The keyword lists are read from ``config['cutoff']['start_keywords']`` and
    ``config['cutoff']['end_keywords']`` and are matched literally.

    Parameters
    ----------
    text : str
        Full text of one document.

    Returns
    -------
    str
        ``text[start:end]`` where ``start`` is the beginning of the first start
        keyword (0 if none occurs) and ``end`` is the end of the last end
        keyword found at or after ``start`` (``len(text)`` if none occurs).
    """
    cutoff_config = config['cutoff']

    # Keywords are escaped so that they are matched literally
    start_pattern = '|'.join(re.escape(kw) for kw in cutoff_config['start_keywords'])
    end_pattern = '|'.join(re.escape(kw) for kw in cutoff_config['end_keywords'])

    # Start at the first start keyword, or at the beginning if there is none
    start_match = re.search(start_pattern, text)
    start_pos = start_match.start() if start_match else 0

    # Use the LAST end keyword, because e.g. "Lausanne" may also appear as a
    # place somewhere inside the text. Only occurrences at or after the start
    # position count: an end keyword that appears solely before the start
    # keyword would otherwise yield an empty slice and drop the whole document.
    end_pos = len(text)
    for end_match in re.compile(end_pattern).finditer(text, start_pos):
        end_pos = end_match.end()

    return text[start_pos:end_pos]

In [36]:
# Function to remove non-letter characters from the start and end
def strip_non_letters(text : str) -> str:
    """Return ``text`` without its leading and trailing non-letter characters.

    Any character for which ``str.isalpha()`` is true counts as a letter, so
    accented letters at the edges (e.g. the final "é" of "Société") are kept.
    Characters between the first and the last letter are never touched.

    Parameters
    ----------
    text : str
        String to clean.

    Returns
    -------
    str
        The trimmed string; empty if ``text`` contains no letter at all.
    """
    first = 0
    last = len(text)
    # Advance past everything that is not a letter (whitespace included)
    while first < last and not text[first].isalpha():
        first += 1
    # Same from the right-hand side
    while last > first and not text[last - 1].isalpha():
        last -= 1
    return text[first:last]


def text_token_to_label_matching(tokenized_text,ner):
  """Print each labelled token span together with its label.

  ``ner`` is iterated as a sequence of ``[start, end, label]`` entries whose
  ``end`` index is inclusive. One line is printed per entry, nothing is returned.
  """
  for span in ner:
    first, last, label = span[0], span[1], span[2]
    # The end index is inclusive, hence the +1 when slicing
    labelled_tokens = tokenized_text[first:last+1]
    print(f"{labelled_tokens} -> {label}")

In [37]:
def replace_entities(original_text : str, entity_to_name_mapping : dict) -> str:
    """Replace every anonymised entity in ``original_text`` by its synthetic name.

    All entities are matched in a single left-to-right pass, so every character
    of the text belongs to at most one replacement. When one entity is contained
    in another (e.g. "A.________" inside "B.A.________") the longer entity wins.
    Matching the entities independently would produce overlapping matches and
    insert a replacement twice.

    Parameters
    ----------
    original_text : str
        Text containing the anonymised entities.
    entity_to_name_mapping : dict
        Maps each entity string to the name that replaces it. Empty-string keys
        are ignored.

    Returns
    -------
    str
        The text with each entity occurrence replaced by ``name + " "``.
    """
    # An empty alternation would match the empty string at every position
    entities = [entity for entity in entity_to_name_mapping if entity]
    if not entities:
        return original_text

    # Longest alternatives first: the regex engine takes the first alternative
    # that matches at a position, so this prefers the longest entity there.
    entities.sort(key=len, reverse=True)
    entity_pattern = re.compile("|".join(re.escape(entity) for entity in entities))

    def substitute(match):
        # A white-space is appended to resolve A.________SA -> JelmoliSA, which
        # would be tokenized as one token and never match "Jelmoli" afterwards.
        return entity_to_name_mapping[match.group(0)] + " "

    return entity_pattern.sub(substitute, original_text)

from numpy.lib.stride_tricks import sliding_window_view
def get_ner_labels(tokenized_modified_text : list, entity_to_name_mapping : dict, entity_to_label_mapping : dict) -> list[list]:
    """Locate every synthetic name in the tokenized text and label its token span.

    Each name in ``entity_to_name_mapping`` is tokenized with ``tokenize_text``
    and compared against every window of the same length in
    ``tokenized_modified_text``.

    Parameters
    ----------
    tokenized_modified_text : list
        Tokens of the text in which entities were already replaced by names.
    entity_to_name_mapping : dict
        Maps an anonymised entity to its synthetic name.
    entity_to_label_mapping : dict
        Maps an anonymised entity to its NER label.

    Returns
    -------
    list[list]
        One ``[start_index, end_index, label]`` entry per occurrence, with an
        inclusive ``end_index`` and plain ``int`` indices (JSON serialisable).
        Entries are grouped by entity in mapping order, not sorted by position.
    """
    # Tokenize the names in the entity_to_name_mapping dict
    synthetic_tokens = {entity: tokenize_text(name) for entity, name in entity_to_name_mapping.items()}

    text_tokens = np.array(tokenized_modified_text)
    n_text_tokens = len(text_tokens)

    ner_tags = []
    for entity, tokens in synthetic_tokens.items():
        n_tokens = len(tokens)

        # A name without tokens has nothing to label, and a name longer than the
        # text cannot occur in it; sliding_window_view raises ValueError for a
        # window larger than the array, so both cases are skipped up front.
        if n_tokens == 0 or n_tokens > n_text_tokens:
            continue

        # Row i of the view holds text_tokens[i:i + n_tokens]
        windows = sliding_window_view(text_tokens, n_tokens)
        matching_starts = np.argwhere(np.all(windows == np.array(tokens), axis=1)).flatten()

        for start_index in matching_starts:
            start = int(start_index)  # numpy int -> int for JSON serialisability
            ner_tags.append([start, start + n_tokens - 1, entity_to_label_mapping[entity]])

    return ner_tags

In [38]:
def _draw_unique(draw, count : int) -> list:
    """Call ``draw()`` until ``count`` distinct values were produced; return them in first-drawn order.

    A dict is used instead of a set because it keeps insertion order, which
    makes the result independent of Python's per-process string hash
    randomisation. Loops forever if ``draw`` cannot yield ``count`` distinct values.
    """
    drawn = {}
    while len(drawn) < count:
        drawn[draw()] = None
    return list(drawn)


def anonymized_text_to_training_data(original_text : str, name_generator : RandomNameGenerator, place_generator : RandomPlaceGenerator, organisation_generator : RandomOrganisationGenerator,verbose=False) -> dict:
    """Turn an anonymised text into a tokenized text with NER span labels.

    Steps:
      1. Find all anonymised entities (a letter, further non-space characters,
         then exactly the 8 underscores, e.g. ``A.________``) and label each
         one, using its first occurrence:
           * ``location``     - first character is U, V or W and the entity
                                ends with ``.________``;
           * ``organization`` - the next non-whitespace chunk, stripped of
                                non-letters and lower-cased, is one of
                                ``config['organization_endings']``;
           * ``person``       - everything else.
      2. Draw one distinct synthetic name per entity from the matching generator.
      3. Replace the entities in the text, tokenize it, and locate the
         synthetic names in the token sequence.

    Parameters
    ----------
    original_text : str
        Text containing anonymised entities.
    name_generator, place_generator, organisation_generator
        Samplers providing ``get_first_name_last_name_string()``,
        ``get_place_string()`` and ``get_organisation_string()`` respectively.
    verbose : bool, default False
        Print the intermediate mappings.

    Returns
    -------
    dict
        ``{"tokenized_text": <tokens>, "ner": [[start, end, label], ...]}``.
    """
    # Anonymised part (group 1), optionally followed - with or without
    # white-space - by the next chunk of non-space characters (group 2), which
    # may be an organisation indicator like "AG".
    anonymized_pattern = r"([a-zA-Z]\S*_{8})(?:\s*(\S+))?"

    # Organisation indicators, stripped of surrounding non-letters and lower-cased
    organization_indicators = {
        strip_non_letters(indicator).lower() for indicator in config['organization_endings']
    }

    entity_to_label_mapping = {}
    entity_to_suffix_mapping = {}

    for match in re.finditer(anonymized_pattern, original_text):
        entity = match.group(1)
        pre_suffix = match.group(2)

        # The first occurrence of an entity decides its label
        if entity in entity_to_label_mapping:
            continue

        if entity[0] in ("U", "V", "W") and entity.endswith(".________"):
            entity_to_label_mapping[entity] = 'location'
        elif pre_suffix and strip_non_letters(pre_suffix).lower() in organization_indicators:
            entity_to_label_mapping[entity] = 'organization'
            entity_to_suffix_mapping[entity] = strip_non_letters(pre_suffix)
        else:
            entity_to_label_mapping[entity] = 'person'

    unique_name_entities = [entity for entity, label in entity_to_label_mapping.items() if label == 'person']
    unique_place_entities = [entity for entity, label in entity_to_label_mapping.items() if label == 'location']
    unique_orga_entities = [entity for entity, label in entity_to_label_mapping.items() if label == 'organization']

    # One distinct synthetic value per entity, kept in the order it was drawn
    synthetic_names = _draw_unique(name_generator.get_first_name_last_name_string, len(unique_name_entities))
    synthetic_places = _draw_unique(place_generator.get_place_string, len(unique_place_entities))
    synthetic_orgas = _draw_unique(organisation_generator.get_organisation_string, len(unique_orga_entities))

    # Entities and synthetic values are both ordered names, places, organisations
    unique_entities = unique_name_entities + unique_place_entities + unique_orga_entities
    synthetic = synthetic_names + synthetic_places + synthetic_orgas
    entity_to_name_mapping = dict(zip(unique_entities, synthetic))

    modified_text = replace_entities(original_text, entity_to_name_mapping)
    tokenized_modified_text = tokenize_text(modified_text)
    ner_tags = get_ner_labels(tokenized_modified_text, entity_to_name_mapping, entity_to_label_mapping)

    if verbose:
        print(f"Entity to label mapping: {entity_to_label_mapping}")
        print(f"Entity to suffix mapping: {entity_to_suffix_mapping}")
        print(f"Synthetic names: {synthetic_names}")
        print(f"Synthetic places: {synthetic_places}")
        print(f"Synthetic organisations: {synthetic_orgas}")
        print(f"Entity to name mapping: {entity_to_name_mapping}")

    return {
        "tokenized_text": tokenized_modified_text,
        "ner": ner_tags
    }

# Initialize random name generators

## Load datasets

In [39]:
# Load the first-name counts and discard rows with one or more missing values
first_names_df = pd.read_csv(raw_names_data_path + "first_names_count.csv").dropna()
# Seeded preview of five rows
first_names_df.sample(n=5, random_state=42)

Unnamed: 0,First_name,count
64972,Zorina,6
8363,Benyounes,5
15242,Edwige,193
48408,Relu,6
34703,Lynette,53


In [40]:
# Load the last-name counts and discard rows with one or more missing values
last_names_df = pd.read_csv(raw_names_data_path + "last_names_count.csv").dropna(how='any')
# Seeded preview of five rows
last_names_df.sample(n=5, random_state=42)

Unnamed: 0,Last_name,count
226418,Ruckelshausen,3
192985,Benkorachi,3
10390,Polanco,113
224473,Putta,3
196196,Cardwell,3


In [41]:
# Load the organisation names and discard rows with missing values
company_names_df = pd.read_csv(raw_names_data_path + "organization_names.csv").dropna(how='any')
# Normalise the names to Title Case
company_names_df.loc[:, 'name'] = company_names_df['name'].str.title()
# Seeded preview (as for the first/last name tables) so that re-running the
# notebook shows the same five rows
company_names_df.sample(5, random_state=42)

Unnamed: 0,name
19439,Dürr Ecoclean
80509,Sintex Servizi
294159,Parva Domus
204275,Cse-Guide.Fr
44723,Istituto Luce-Cinecitta Societa A Responsabi...


In [42]:
# Load the location names and discard rows with missing values
place_names_df = pd.read_csv(raw_names_data_path + "location_names.csv").dropna(how='any')
# Seeded preview (as for the first/last name tables) so that re-running the
# notebook shows the same five rows
place_names_df.sample(5, random_state=42)

Unnamed: 0,name
8134,Spoleto
6126,Arsac
4182,Derendingen
2380,Basel
1974,Schmölln


In [29]:
#initialize the generators
# Each sampler is built from the table(s) loaded above and receives the same fixed
# random_state; presumably this seeds its sampling so that runs are repeatable -
# TODO confirm in random_name_samplers.
name_generator = RandomNameGenerator(first_names_df,last_names_df,random_state = 42)
place_generator = RandomPlaceGenerator(place_names_df,random_state = 42)
organisation_generator = RandomOrganisationGenerator(company_names_df,random_state=42)

# Apply Function to Dataset

In [None]:
# Build the training examples and stream them to disk as one JSON array.

# tqdm tracks the progress of the row loop
from tqdm import tqdm

# Load the dataset and keep only the "text" column, renamed to "text_cutoff".
# rename() returns a new frame, so the column assignment below does not write
# into a slice of the loaded frame.
df = pd.read_parquet(os.path.join(bge_data_path,"bger-2024-2-text.parquet"))
df = df[['text']].rename(columns={'text': 'text_cutoff'})

# Trim every text to the span selected by cutoff_clear_names
df['text_cutoff'] = df['text_cutoff'].apply(cutoff_clear_names)

# Records are written one by one so the full result list never has to be held
# in memory.
with open(save_path + "None", 'w') as f:  #Enter valid filename
    # Opening bracket of the JSON array
    f.write('[')

    for position, text_cutoff in enumerate(tqdm(df['text_cutoff'], total=df.shape[0])):
        # Separator before every record except the first. The decision is based
        # on the loop position, not on the DataFrame index label, so the output
        # stays valid JSON even if the index is not 0..n-1.
        if position > 0:
            f.write(',')

        training_data = anonymized_text_to_training_data(
            text_cutoff, name_generator, place_generator, organisation_generator
        )

        # Compact separators keep the file small
        json.dump(training_data, f, separators=(',', ':'))

    # Closing bracket of the JSON array
    f.write(']')


100%|██████████| 124089/124089 [2:40:40<00:00, 12.87it/s]       
