# Data cleaning and preprocessing

In [None]:
import os
import sys
from dotenv import load_dotenv,find_dotenv

# Locate the project's .env file once and reuse the result: its directory is
# put on sys.path and its variables are loaded into the environment.
_dotenv_file = find_dotenv()
sys.path.append(os.path.dirname(_dotenv_file))
load_dotenv(_dotenv_file)

# Location of the raw data file, read from the DATA_PATH environment variable
# (None when the variable is not set).
PATH = os.getenv('DATA_PATH')


In [None]:
from py_scripts.file_handler import write_csv_file

In [None]:
def clean_label_string(token, label_lower):
    """Return *token* with the opening and closing tag of one label removed.

    Every occurrence of ``<label_lower>`` and ``</label_lower>`` is deleted;
    the rest of the token is returned unchanged.
    """
    for tag in ('<' + label_lower + '>', '</' + label_lower + '>'):
        token = token.replace(tag, '')
    return token

In [None]:
# Entity labels recognised in the corpus; the tags in the text are the
# lower-cased form of these names (e.g. <first_name> ... </first_name>).
labels = [
    'First_Name',
    'Last_Name',
    'Phone_Number',
    'Age',
    'Full_Date',
    'Date_Part',
    'Health_Care_Unit',
    'Location',
]

In [None]:
def remove_duplicates(X, Y, keep_labels=True, entity_labels=None):
    """Drop repeated sentences, optionally keeping repeats that hold entities.

    The first occurrence of every sentence is always kept. A later occurrence
    of an already-seen sentence is kept only when ``keep_labels`` is true and
    its tag sequence contains at least one ``B-<label>`` tag.

    Args:
        X: Sequence of sentences, each a sequence of hashable tokens.
        Y: Sequence of tag sequences, aligned with ``X``.
        keep_labels: Keep repeated sentences that contain a ``B-`` tag.
        entity_labels: Label names used to build the ``B-<label>`` tags.
            Defaults to the module-level ``labels`` list.

    Returns:
        A ``(X_unique, Y_unique)`` tuple of lists, in the original order.
    """
    if entity_labels is None:
        entity_labels = labels
    begin_tags = {"B-" + label for label in entity_labels}

    # Sentences are lists (unhashable), so a tuple copy is used as the set
    # key; this gives O(1) membership tests instead of scanning a list.
    seen = set()
    X_unique = []
    Y_unique = []

    for x, y in zip(X, Y):
        key = tuple(x)
        if key not in seen:
            seen.add(key)
        elif not keep_labels or begin_tags.isdisjoint(y):
            # Repeated sentence that must not be kept.
            continue
        X_unique.append(x)
        Y_unique.append(y)

    return X_unique, Y_unique

In [None]:
import string
import re

# Zero-width position between '.', ',' or ':' and a following ASCII letter.
_PUNCT_BEFORE_LETTER = re.compile(r'(?<=[.,:])(?=[a-zA-Z])')


def _split_into_tokens(doc):
    """Lower-case one raw line, pad tags/operators with spaces and split it."""
    doc = doc.lower()

    # A space after every '>' and before every '<' makes each entity tag
    # (e.g. '<age>' or '</age>') a whitespace-separated token of its own.
    doc = doc.replace(">", "> ")
    doc = doc.replace("<", " <")

    # Detach operators and brackets from the neighbouring text.
    doc = doc.replace("=", " = ")
    doc = doc.replace("*", " * ")
    doc = doc.replace("+", " + ")

    doc = doc.replace("(", " (")
    doc = doc.replace(")", ") ")

    doc = doc.replace("->", " ->")

    # Add a space after a dot, comma or colon that is directly followed by
    # an ASCII letter.
    doc = _PUNCT_BEFORE_LETTER.sub(' ', doc)

    return doc.split()


def preprocessing(IOB=False, punctuation=string.punctuation, no_duplicates=False,
                  path=None, entity_labels=None):
    """Turn the tagged corpus file into parallel token and label sequences.

    Each line of the file is lower-cased, split into tokens and labelled.
    Tokens between ``<label>`` and ``</label>`` receive that label, all other
    tokens receive ``'O'``. Tags themselves are not emitted as tokens. Lines
    with at most one token (blank lines included) are skipped.

    Args:
        IOB: When true, the first token of an entity is tagged ``B-<label>``
            and the following ones ``I-<label>``; otherwise every entity
            token is tagged with the plain label.
        punctuation: Characters stripped from both ends of every token.
        no_duplicates: When true, the result is passed through
            ``remove_duplicates(X, Y, keep_labels=True)``, which decides on
            labelled repeats with its own label list.
        path: File to read. Defaults to the module-level ``PATH``.
        entity_labels: Label names whose lower-cased tags are recognised.
            Defaults to the module-level ``labels`` list.

    Returns:
        A ``(X, Y)`` tuple: ``X`` is a list of token lists and ``Y`` the
        aligned list of label lists.
    """
    if path is None:
        path = PATH
    if entity_labels is None:
        entity_labels = labels

    # Tag -> label lookup. After tokenisation a tag is always a whole token,
    # so exact matching is enough. setdefault keeps the first label if two
    # labels share the same lower-cased tag.
    start_tags = {}
    for name in entity_labels:
        start_tags.setdefault('<' + name.lower() + '>', name)
    end_tags = {'</' + name.lower() + '>' for name in entity_labels}

    # NOTE(review): the file is opened with the platform default encoding;
    # confirm the corpus encoding and pass encoding= explicitly if needed.
    with open(path) as f:
        documents = f.read().splitlines()

    X = []
    Y = []

    for doc in documents:
        words = _split_into_tokens(doc)

        # Skip blank lines and lines that consist of a single token.
        if len(words) <= 1:
            continue

        curr_X = []
        curr_Y = []

        # Label of the entity currently open (None when outside an entity).
        # It is only changed by a recognised start/end tag, so an unknown
        # tag-shaped token inside an entity can no longer alter the label.
        current_label = None
        inside_entity = False

        for word in words:
            if word in start_tags:
                current_label = start_tags[word]
                continue
            if word in end_tags:
                current_label = None
                inside_entity = False
                continue

            word = word.strip()
            word = word.strip(punctuation)

            # Nothing left once the punctuation is stripped.
            if len(word) == 0:
                continue

            if current_label is None:
                curr_Y.append('O')
            elif not IOB:
                curr_Y.append(current_label)
                inside_entity = True
            elif inside_entity:
                curr_Y.append('I-' + current_label)
            else:
                curr_Y.append('B-' + current_label)
                inside_entity = True
            curr_X.append(word)

        X.append(curr_X)
        Y.append(curr_Y)

    if no_duplicates:
        X, Y = remove_duplicates(X, Y, keep_labels=True)

    return X, Y

In [None]:
# Characters stripped from token ends: all of string.punctuation except
# '-' and '>'.
punctuation = string.punctuation.replace('-', '').replace('>', '')

# Parse the corpus once. The de-duplicated variant is derived from that
# result with the same call preprocessing(no_duplicates=True) makes
# internally, so the file is not read and tokenised a second time.
X, Y = preprocessing(IOB=True, punctuation=punctuation)
X_no_duplicates, Y_no_duplicates = remove_duplicates(X, Y, keep_labels=True)

print('Number of sentences: ',len(X))
print('Number of sentences without duplicates: ',len(X_no_duplicates))

In [None]:
# Hand both datasets (full and de-duplicated) to write_csv_file under their
# respective file names.
for _filename, _tokens, _tags in (
    ("clean_iob", X, Y),
    ("clean_iob_no_duplicates", X_no_duplicates, Y_no_duplicates),
):
    write_csv_file(filename=_filename, X=_tokens, Y=_tags)