# Exploration of the data

In [None]:
import pandas as pd
from tabulate import tabulate
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Relative path of the text file that extract_named_entities() reads further down.
file_path = "../data/data.txt"

### Extract the named entities

Extract the named entities for each label in the data into an array.

Print them in a dataframe, together with the number of entities found for each label.

Also print a dataframe with the unique entities for each label.

In [None]:
def remove_duplicates(named_entities):
    """Return a de-duplicated copy of ``named_entities``.

    Args:
        named_entities: Mapping of label -> ``[values, count]`` where
            ``values`` is a list of hashable entity values.

    Returns:
        A new dict mapping every label to
        ``[unique_values, len(unique_values)]``. Unique values keep the order
        in which they first appear, so the result is the same on every run
        (a ``set`` round-trip is not, because string hashing is randomized
        per process). The mapping passed in, and the lists inside it, are
        left unmodified.
    """
    unique_entities = {}
    for label, entry in named_entities.items():
        # dict.fromkeys drops repeats while preserving insertion order.
        unique_values = list(dict.fromkeys(entry[0]))
        unique_entities[label] = [unique_values, len(unique_values)]

    return unique_entities

In [None]:
def named_entities_to_dataframe(named_entities, cols):
    """Turn the entity mapping into a dataframe with one row per key.

    The keys of ``named_entities`` become the index, and the items of each
    value are spread across the columns named in ``cols``.
    """
    frame = pd.DataFrame.from_dict(
        data=named_entities,
        orient="index",
        columns=cols,
    )
    return frame

In [None]:
def extract_named_entities(file_path):
    """Collect the tagged entity values found in an annotated text file.

    The text is scanned left to right for ``<Label>value</...>`` spans. The
    value is everything between the opening tag and the next ``</``; the name
    inside the closing tag is not checked. Only tags whose name is one of the
    known labels are kept.

    Scanning stops at the first tag that is never closed (a ``<`` without a
    following ``>``, or an opening tag without a following ``</``), so a
    stray ``<`` at the end of the file can neither hang the loop nor record
    a bogus value.

    Args:
        file_path: Path of the text file to read.

    Returns:
        dict mapping each label that occurs to ``[values, count]``, where
        ``values`` lists every occurrence in file order and ``count`` is the
        number of occurrences.
    """
    # NOTE(review): the file is decoded with the platform default encoding;
    # pass an explicit encoding here once the data file's encoding is confirmed.
    with open(file_path, 'r') as f:
        text = f.read()

    labels = {
        'First_Name', 'Last_Name', 'Phone_Number', 'Age',
        'Full_Date', 'Date_Part', 'Health_Care_Unit', 'Location',
    }

    found_entities = {}
    start_index = text.find("<")
    while start_index != -1:
        end_index = text.find(">", start_index)
        if end_index == -1:
            # "<" that never closes: nothing further can be parsed.
            break

        closing_start = text.find("</", end_index)
        if closing_start == -1:
            # Opening tag without any closing tag after it.
            break

        named_entity = text[start_index + 1:end_index]
        named_entity_value = text[end_index + 1:closing_start]

        # Resume after the closing tag; if that tag is itself unterminated,
        # this is the last entity that can be read.
        closing_end = text.find(">", closing_start)
        start_index = text.find("<", closing_end) if closing_end != -1 else -1

        if named_entity not in labels:
            continue

        entry = found_entities.setdefault(named_entity, [[], 0])
        entry[0].append(named_entity_value)
        entry[1] += 1

    return found_entities

In [None]:
def print_table(title, table):
    """Print ``table`` under an upper-cased ``title``, padded with blank lines."""
    heading = title.upper()
    spacer = "\n\n"
    print(spacer, heading, spacer, table)

In [None]:
def print_values(label, df):
    """Print every value stored for ``label``, shortest first.

    Args:
        label: Index label of the row to print.
        df: Dataframe whose "Value" column holds a list of values per label.
    """
    # Read from the dataframe that was passed in rather than from a
    # notebook-level global, so the function works with any dataframe.
    values = df.loc[label, "Value"]
    for val in sorted(values, key=len):
        print(val)

In [None]:
def print_unique(df_all, df_unique):
    """Print, per label, the total count, the unique count and the unique share.

    Both dataframes are expected to have a "Count" column; the share is the
    unique count divided by the total count, as a percentage rounded to one
    decimal.
    """
    total_counts = df_all.loc[:, "Count"]
    unique_counts = df_unique.loc[:, "Count"]
    percent_unique = round((unique_counts / total_counts) * 100, 1)

    summary = {
        'Label': df_all.axes[0],
        'Entities': total_counts,
        'Unique Entities': unique_counts,
        '% Unique': percent_unique,
    }

    rendered = tabulate(summary, headers="keys")
    print_table("unique entities", rendered)

In [None]:
def print_most_common(df):
    """Print the single most frequent entity of every label.

    For each row of ``df`` the "Value" list is counted, and the top entity is
    reported with its count and its share of all values for that label
    (percentage, two decimals).
    """
    rows = []
    for label in df.axes[0]:
        entities = df.loc[label, "Value"]
        top_entity, top_count = Counter(entities).most_common(1)[0]
        share = round(top_count / len(entities) * 100, 2)
        rows.append([label, top_entity, top_count, share])

    rendered = tabulate(rows, headers=["Label", "Entity", "Count", "Percentage"])
    print_table("most common entity", rendered)

In [None]:
def print_most_common_six(df):
    """Print up to six of the most frequent entities of every label."""
    rows = []
    for label in df.axes[0]:
        top_six = Counter(df.loc[label, "Value"]).most_common(6)
        # Keep only the entity, not its frequency.
        rows.append([label, [entity for entity, _ in top_six]])

    rendered = tabulate(rows, headers=["Label", "Entities"])
    print_table("six most common entities", rendered)

In [None]:
def print_wordcloud(label1, label2):
    """Show word clouds for two labels side by side.

    The values are read from the notebook-level ``df_all`` dataframe; each
    word is sized by how often it occurs among the label's values.
    """
    frequencies = [
        dict(Counter(df_all.loc[label, "Value"]))
        for label in (label1, label2)
    ]

    clouds = [
        WordCloud(
            width=800,
            height=800,
            background_color='white',
            min_font_size=10,
        ).generate_from_frequencies(frequency)
        for frequency in frequencies
    ]

    fig, axes = plt.subplots(1, 2, figsize=(14, 7), gridspec_kw={'width_ratios': [1, 1]})
    for axis, cloud in zip(axes, clouds):
        axis.imshow(cloud)
        axis.axis("off")

    print("\n\nWORD CLOUDS")

    plt.show()

In [None]:
# Parse the data file and build a dataframe that keeps every occurrence.
# df_all is created before de-duplication so it reflects the full counts.
named_entities = extract_named_entities(file_path)
df_all = named_entities_to_dataframe(named_entities, cols=['Value', 'Count'])


# Same layout, but with repeated values collapsed per label.
named_entities_unique_values = remove_duplicates(named_entities)
df_unique = named_entities_to_dataframe(named_entities_unique_values, cols=['Value', 'Count'])

In [None]:
# Raw dataframes: all occurrences, then unique values only.
print_table("dataframe with all entities", df_all)
print_table("dataframe with unique entities", df_unique)

# Summary tables: unique share, most common entity, six most common entities.
print_unique(df_all, df_unique)
print_most_common(df_all)
print_most_common_six(df_all)

# Word clouds for first and last names.
print_wordcloud("First_Name", "Last_Name")

In [None]:
# List the unique "Date_Part" values, shortest first.
print_values("Date_Part", df_unique)

In [None]:
# List the unique "Full_Date" values, shortest first.
print_values("Full_Date", df_unique)