In [None]:
import pickle
import random
import re
from copy import deepcopy

import pandas as pd
from mimesis import Person, Generic, Address
from mimesis.enums import Gender
from tqdm import tqdm

In [None]:
# Load the survey export into a DataFrame. Later cells read its
# `open_nps_reason` and `response_id` columns.
d = pd.read_csv("../de-anon-app/data/first-thousand-anon-bearbeitet.csv")

In [None]:
def fix_missing_blanks(text):
    """Surround every anonymisation placeholder in ``text`` with blanks.

    The placeholders ``PERSON``, ``DATE``, ``ORGANISATION`` and ``LOCATION``
    are each wrapped in one leading and one trailing space, so that a
    placeholder glued to a neighbouring word (``"amDATE"``) becomes a
    separate whitespace-delimited token. The padding is unconditional:
    placeholders that already have blanks around them end up with two.

    Parameters
    ----------
    text : str or any
        The free-text answer. Non-string values (for example the NaN that
        pandas produces for an empty CSV cell) are returned unchanged
        instead of raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str or any
        The padded text, or the untouched input when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # The placeholder names cannot overlap one another and the replacement
    # only inserts blanks, so a single pass is equivalent to substituting
    # each placeholder in turn.
    return re.sub(r"(PERSON|DATE|ORGANISATION|LOCATION)", r" \1 ", text)

In [None]:
# Pad the placeholders in the free-text column with blanks, overwriting the
# column in place.
# NOTE(review): fix_missing_blanks pads unconditionally, so running this cell
# more than once on the same kernel keeps adding blanks around placeholders.
d["open_nps_reason"] = d["open_nps_reason"].apply(fix_missing_blanks)

In [None]:
def generate_person():
    """Generate a random German person reference.

    A uniform draw from ``random.random()`` picks the form:

    * below 0.1        -> full female name,
    * 0.1 up to 0.2    -> full male name,
    * 0.2 and above    -> a form of address ("Herr", "Hr", "Frau" or "Fr")
      followed by a last name.

    Returns
    -------
    str
        The generated name.
    """
    person = Person('de')
    random_number = random.random()

    # Cascading upper bounds cover the whole [0, 1) range. The previous
    # strict "<" / ">" pairs matched no branch for a draw of exactly 0.1 or
    # 0.2 and then failed with UnboundLocalError on the return.
    if random_number < .1:
        return person.full_name(gender=Gender.FEMALE)
    if random_number < .2:
        return person.full_name(gender=Gender.MALE)
    return f'{random.choice(["Herr", "Hr", "Frau", "Fr"])} {person.last_name()}'


def generate_location():
    """Generate a random German location string.

    A uniform draw from ``random.random()`` picks the level of detail:

    * below 0.8                   -> city name only,
    * strictly between 0.8 and 0.9 -> postal code and city,
    * anything else (including a draw of exactly 0.8) -> street address,
      postal code and city.

    Returns
    -------
    str
        The generated location.
    """
    provider = Address("de")
    draw = random.random()

    if draw < .8:
        return provider.city()
    if .8 < draw < .9:
        return f"{provider.postal_code()} {provider.city()}"
    return f"{provider.address()}, {provider.postal_code()} {provider.city()}"

    
def generate_organisation():
    """Return one name picked at random from a fixed list of six carriers."""
    carriers = ["DHL", "Deutsche Post", "Post", "DPD", "Hermes", "UPS"]
    return random.choice(carriers)

def generate_date():
    """Generate a random date and return it formatted as ``YYYY-MM-DD``.

    The date is requested from the ``Generic("de")`` datetime provider with
    ``start=2020`` and ``end=2021``.
    """
    provider = Generic("de")
    drawn = provider.datetime.date(start=2020, end=2021)
    return drawn.strftime("%Y-%m-%d")

def generate_mail():
    """Return a constant dummy e-mail address (no randomness involved)."""
    dummy_address = "xxx.xxx@gmail.com"
    return dummy_address

In [None]:
# Hand-written sample sentence containing placeholders, handy for trying the
# functions out by hand; no other visible cell references `a`.
# NOTE(review): it uses "ORG", whereas the regexes in this notebook only match
# the full "ORGANISATION" placeholder, so that token would be left untouched.
a = "PERSON (DATE) am Eingang in LOCATION hat mich gleich gefragt (PERSON), wie ich nur bei ORG bestellen konnte am DATE"

In [None]:
def unify(text):
    """Replace anonymisation placeholders with generated values and label tokens.

    Every ``PERSON``, ``ORGANISATION`` and ``LOCATION`` placeholder in
    ``text`` is replaced, left to right, by a freshly generated value from
    ``generate_person``, ``generate_organisation`` or ``generate_location``.
    ``DATE`` placeholders are not replaced; they stay in the text and are
    labelled ``"O"`` like any other word.

    The rewritten text and the label list are built in the same pass over
    the same segments, so the n-th generated value in the text is always the
    n-th labelled entity, even when a placeholder is glued to other
    characters (``"PERSON?"``). There is no upper limit on the number of
    placeholders per text.

    Parameters
    ----------
    text : str or any
        The answer containing placeholders. Non-string values (for example
        NaN from an empty CSV cell) are treated as "nothing to replace".

    Returns
    -------
    tuple
        ``(None, None)`` when ``text`` is not a string or contains none of
        the three placeholders. Otherwise ``(new_text, labelled)`` where
        ``new_text`` is the text with the placeholders replaced and
        ``labelled`` is a list of ``(value, label)`` tuples in reading
        order. ``label`` is ``"PER"``, ``"ORG"`` or ``"LOC"`` for generated
        values (which may consist of several words) and ``"O"`` for all
        other tokens. Entries whose value has fewer than two characters are
        dropped.
    """
    placeholder_pattern = r"(PERSON|ORGANISATION|LOCATION)"

    if not isinstance(text, str) or not re.search(placeholder_pattern, text):
        return None, None

    generators = {
        "PERSON": (generate_person, "PER"),
        "ORGANISATION": (generate_organisation, "ORG"),
        "LOCATION": (generate_location, "LOC"),
    }

    text_parts = []
    labelled = []

    # Splitting on a capturing group keeps the placeholders in the result:
    # plain segments and placeholders alternate, and a plain segment can
    # never itself be a placeholder.
    for segment in re.split(placeholder_pattern, text):
        if segment in generators:
            make_value, label = generators[segment]
            value = make_value()
            text_parts.append(value)
            labelled.append((value, label))
        else:
            text_parts.append(segment)
            # Same delimiters as before: whitespace and ! . , ( ) ;
            labelled.extend(
                (token, "O") for token in re.split(r"[\s!.,();]", segment)
            )

    # Drops the empty strings produced by adjacent delimiters as well as
    # any single-character token.
    labelled = [pair for pair in labelled if len(pair[0]) > 1]

    return "".join(text_parts), labelled


In [None]:
def split_and_apply_label(e):
    """Break a labelled phrase into single words that all carry its label.

    ``e`` is indexed as ``(phrase, label)``. The phrase is split on
    whitespace and each resulting word is paired with the label.

    Returns
    -------
    list
        One ``(word, label)`` tuple per word; empty when the phrase has no
        words.
    """
    labelled_words = []
    for word in e[0].split():
        labelled_words.append((word, e[1]))
    return labelled_words
    

In [None]:
reconstructed_data = []

for each in tqdm(d.itertuples()):
    new_text, labels = unify(each.open_nps_reason)

    # unify() returns (None, None) for rows without placeholders; skip them.
    if not new_text:
        continue

    # Break multi-word entries into single words, each keeping the label of
    # the entry it came from, and collect them in one flat list.
    flat_list = [
        item
        for entry in labels
        for item in split_and_apply_label(entry)
    ]

    reconstructed_data.append({
        "response_id": each.response_id,
        "open_nps_reason": each.open_nps_reason,
        "open_nps_reason_reconstructed": new_text,
        "with_labels": flat_list,
    })

In [None]:
# Serialise the reconstructed records to disk.
# `pickle` is imported with the other dependencies in the first cell rather
# than here, so the notebook's requirements are visible in one place.
with open("../data/prepared/temp_eval.pickle", "wb") as fp:
    pickle.dump(reconstructed_data, fp)