# Data cleaning and preprocessing

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables through python-dotenv (by default from a .env file)
load_dotenv()

# Path of the data file, taken from DATA_PATH (None when the variable is unset)
PATH = os.environ.get('DATA_PATH')

In [None]:
import pandas as pd

def write_to_csv(filename="cleaned", X=None, Y=None):
    """Write the X and Y columns to ``../data/<filename>.csv``.

    The path is relative to the current working directory and the
    ``../data`` directory must already exist.

    Args:
        filename: Base name of the output file, without the ``.csv`` suffix.
        X: Values for the ``X`` column, one entry per row. Defaults to no rows.
        Y: Values for the ``Y`` column; must have the same length as ``X``.
            Defaults to no rows.
    """
    # None sentinels replace the former mutable ``[]`` defaults, which are
    # shared between calls.
    X = [] if X is None else X
    Y = [] if Y is None else Y

    result = pd.DataFrame({"X": X, "Y": Y})
    result.to_csv("../data/" + filename + ".csv")

In [None]:
def clean_label_string(token,label_lower):
    """Return ``token`` with the tags for ``label_lower`` removed.

    Every ``<label_lower>`` occurrence is removed first, then every
    ``</label_lower>`` occurrence.
    """
    for tag in ('<' + label_lower + '>', '</' + label_lower + '>'):
        token = token.replace(tag, '')
    return token

In [None]:
# Entity labels; their lowercased forms are the tag names looked up in the text
labels = [
    'First_Name',
    'Last_Name',
    'Phone_Number',
    'Age',
    'Full_Date',
    'Date_Part',
    'Health_Care_Unit',
    'Location',
]

In [None]:
import string

# Load the annotated file; every line is handled as one document.
# NOTE(review): open() uses the platform default encoding here -- confirm
# that it matches the encoding of the data file.
with open(PATH) as f:
    documents = f.read().splitlines()

# X holds one token list per document, Y the matching list of labels
X = []
Y = []

# Per label: number of tokens that follow the first token of an entity
nr_of_inside_labels = dict.fromkeys(labels, 0)

for doc in documents:
    curr_X = []
    curr_Y = []

    # Tags are matched in lowercase
    doc = doc.lower()

    # Pad the angle brackets so every tag becomes its own token when splitting
    doc = doc.replace(">", "> ")
    doc = doc.replace("<", " <")

    words = doc.split()

    # Skip empty lines and lines that consist of a single token
    if len(words) <= 1:
        continue

    named_entity = False
    inside_entity = False
    # Label of the entity that is currently open. It is tracked separately
    # from the loop variable below: a tag-shaped token matching no known label
    # would otherwise leave that variable on the last entry of `labels` and
    # mislabel the rest of the open entity.
    entity_label = None

    # Mark every token as 'O' or with the label of its enclosing entity
    for word in words:

        # Token has the shape of a tag
        if '<' in word[0] and '>' in word[-1]:

            # Find the label this tag belongs to, if any
            for label in labels:
                label_lower = label.lower()

                # Start of entity
                if '<' + label_lower + '>' in word:
                    word = clean_label_string(word, label_lower)
                    named_entity = True
                    entity_label = label
                    break
                # End of entity
                elif '</' + label_lower + '>' in word:
                    word = clean_label_string(word, label_lower)
                    named_entity = False
                    inside_entity = False
                    entity_label = None
                    break

        # A recognised tag is empty once stripped and is not emitted as a token
        if not word:
            continue

        # First token of a named entity
        if named_entity and not inside_entity:
            curr_Y.append(entity_label)
            curr_X.append(word)
            inside_entity = True

        # Later token of the same named entity
        elif named_entity and inside_entity:
            nr_of_inside_labels[entity_label] += 1

            curr_Y.append(entity_label)
            curr_X.append(word)

        # Outside of any named entity
        else:
            curr_Y.append('O')
            curr_X.append(word)

    X.append(curr_X)
    Y.append(curr_Y)

In [None]:
# Show, per label, how many tokens were counted as non-first tokens of an entity
print("Number of inside labels: ", nr_of_inside_labels)

In [None]:
# Save the token and label sequences under the default file name
write_to_csv(filename="cleaned", X=X, Y=Y)