# Exploration of the data

In [None]:
import csv
import re
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
from tabulate import tabulate
from wordcloud import WordCloud

# Input text file read by the extraction and statistics cells below.
# The relative path is resolved against the kernel's working directory.
file_path = "../data/data.txt"

### Extract the named entities

Extract the named entities for each label in the data into a list.

Print them in a dataframe, together with the number of entities found for each label.

Also print a dataframe with the unique entities for each label.

In [None]:
def remove_duplicates(named_entities):
    """Return a de-duplicated copy of an entity mapping.

    Parameters
    ----------
    named_entities : dict
        Maps each label to a two-item sequence ``[values, count]`` where
        ``values`` is the list of entity strings collected for that label.

    Returns
    -------
    dict
        A new dict with the same labels. Each entry is
        ``[unique_values, len(unique_values)]``; unique values keep the order
        in which they first appear. The input mapping is left untouched, so
        the cell that calls this can be re-run without losing the full lists.
    """
    unique_entities = {}
    for label, entry in named_entities.items():
        # dict.fromkeys drops repeats while keeping first-seen order, which
        # makes the result reproducible between runs (iteration order of a
        # set of strings is not).
        unique_values = list(dict.fromkeys(entry[0]))
        unique_entities[label] = [unique_values, len(unique_values)]

    return unique_entities

In [None]:
def named_entities_to_dataframe(named_entities, cols):
    """Build a DataFrame with one row per label from an entity mapping.

    The dict keys become the row index and each dict value is spread across
    the columns named in ``cols``.
    """
    frame = pd.DataFrame.from_dict(
        data=named_entities,
        orient="index",
        columns=cols,
    )
    return frame

In [None]:
def extract_named_entities(file_path, labels=None, encoding=None):
    """Collect the text of every ``<Label>...</Label>`` annotation in a file.

    Parameters
    ----------
    file_path : str
        Path of the annotated text file.
    labels : iterable of str, optional
        Tag names to extract. Defaults to the eight labels used in this
        notebook. Tags with any other name are ignored.
    encoding : str, optional
        Passed to ``open``; ``None`` keeps the platform default.

    Returns
    -------
    dict
        Maps each label that occurs at least once to ``[values, count]``,
        where ``values`` lists the enclosed texts in order of appearance
        (duplicates kept) and ``count`` equals ``len(values)``.

    Notes
    -----
    Only well-formed pairs with matching opening and closing tag names are
    extracted. A stray ``<`` in the running text (for example "< 5"), an
    unknown tag, or an unterminated tag at the end of the file is skipped
    instead of corrupting the following annotation or stalling the scan.
    Annotations are assumed not to nest; an annotation inside another one is
    returned as part of the outer value.
    """
    if labels is None:
        labels = ['First_Name', 'Last_Name', 'Phone_Number', 'Age', 'Full_Date', 'Date_Part', 'Health_Care_Unit', 'Location']
    labels = list(labels)

    with open(file_path, 'r', encoding=encoding) as f:
        text = f.read()

    found_entities = {}
    if not labels:
        # An empty alternation would match "<>...</>"; nothing is requested.
        return found_entities

    # \1 forces the closing tag to repeat the opening tag's name; DOTALL lets
    # a value span line breaks, and the lazy quantifier stops at the first
    # matching closing tag.
    tag_pattern = re.compile(
        r"<(%s)>(.*?)</\1>" % "|".join(re.escape(label) for label in labels),
        re.DOTALL,
    )

    for match in tag_pattern.finditer(text):
        label, value = match.group(1), match.group(2)
        entry = found_entities.setdefault(label, [[], 0])
        entry[0].append(value)
        entry[1] += 1

    return found_entities

In [None]:
def print_table(title, table):
    """Print ``table`` under an upper-cased ``title``, set off by blank lines."""
    heading = title.upper()
    pieces = ("\n\n", heading, "\n\n", str(table))
    print(" ".join(pieces))

In [None]:
def print_values(label, df):
    """Print every value stored for ``label``, shortest first, one per line.

    Parameters
    ----------
    label : str
        Index key of the row to print.
    df : pandas.DataFrame
        Frame whose "Value" column holds a list of strings per label.
    """
    # Read from the frame the caller passed in rather than a notebook-level
    # global, so the function works with any frame of the same layout.
    values = df.loc[label, "Value"]
    for val in sorted(values, key=len):
        print(val)

In [None]:
def print_unique(df_all, df_unique):
    """Print, per label, the total count, the unique count and their ratio.

    Both frames are expected to carry a "Count" column. The ratio is the
    unique count divided by the total count, expressed as a percentage and
    rounded to one decimal.
    """
    total_counts = df_all.loc[:, "Count"]
    unique_counts = df_unique.loc[:, "Count"]
    percent_unique = round((unique_counts / total_counts) * 100, 1)

    summary = {
        'Label': df_all.axes[0],
        'Entities': total_counts,
        'Unique Entities': unique_counts,
        '% Unique': percent_unique,
    }

    rendered = tabulate(summary, headers="keys")
    print_table("unique entities", rendered)

In [None]:
def print_most_common(df):
    """Print the single most frequent entity for every label in ``df``.

    Each table row holds the label, its most frequent value, how often that
    value occurs, and that frequency as a percentage of all values for the
    label (rounded to two decimals).
    """
    def summarise(label):
        # One table row for one label.
        entities = df.loc[label, "Value"]
        entity, occurrences = Counter(entities).most_common(1)[0]
        share = round(occurrences / len(entities) * 100, 2)
        return [label, entity, occurrences, share]

    rows = [summarise(label) for label in df.axes[0]]

    print_table("most common entity", tabulate(rows, headers=["Label", "Entity", "Count", "Percentage"]))

In [None]:
def print_most_common_six(df):
    """Print, for every label in ``df``, its six most frequent entities.

    Only the entity texts are listed (most frequent first); their counts are
    not shown.
    """
    rows = []
    for label in df.axes[0]:
        ranked = Counter(df.loc[label, "Value"]).most_common(6)
        top_entities = [pair[0] for pair in ranked]
        rows.append([label, top_entities])

    rendered = tabulate(rows, headers=["Label", "Entities"])
    print_table("six most common entities", rendered)

In [None]:
def print_n_count(label, n, df=None):
    """Print the ``n`` most frequent entities of ``label`` with their counts.

    Parameters
    ----------
    label : str
        Index key of the row whose "Value" list is counted.
    n : int
        Number of entities to show.
    df : pandas.DataFrame, optional
        Frame to read from. When omitted, the notebook-level ``df_all`` is
        used, so existing two-argument calls behave as before.
    """
    if df is None:
        df = df_all
    entities = df.loc[label, "Value"]

    # most_common() returns (entity, count) pairs sorted by descending count,
    # with ties kept in first-seen order.
    top_n = Counter(entities).most_common()[:n]

    print_table(f"{n} most common: {label}", tabulate(top_n, headers=["Entity", "Count"]))
        

In [None]:
def print_wordclouds(label1, label2):
    """Show word clouds for two labels side by side.

    The values of each label are read from the notebook-level ``df_all``
    frame and weighted by how often they occur.
    """
    clouds = []
    for label in (label1, label2):
        frequencies = dict(Counter(df_all.loc[label, "Value"]))
        cloud = WordCloud(width=800, height=800, background_color='white',
                          min_font_size=10)
        clouds.append(cloud.generate_from_frequencies(frequencies))

    fig, axes = plt.subplots(1, 2, figsize=(14, 7), gridspec_kw={'width_ratios': [1, 1]})
    for axis, cloud in zip(axes, clouds):
        axis.imshow(cloud)
        axis.axis("off")

    print("\n\nWORD CLOUDS")

    plt.show()

In [None]:
def print_n_most_common_from_csv(n, file_path, min_count=10000, encoding=None):
    """Print the ``n`` highest-count rows of a ``;``-separated name file.

    Parameters
    ----------
    n : int
        Number of rows to show.
    file_path : str
        CSV file whose first column is a name and whose second column is an
        integer count.
    min_count : int, optional
        Rows with a smaller count are dropped before sorting. Defaults to
        10000.
    encoding : str, optional
        Passed to ``open``; ``None`` keeps the platform default.

    Notes
    -----
    Blank lines, rows with fewer than two columns, and rows whose second
    column is not an integer (such as a header line) are skipped.
    """
    headers = ['Name', 'Count']
    rows = []
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation requires for file objects.
    with open(file_path, 'r', newline='', encoding=encoding) as file:
        for row in csv.reader(file, delimiter=';'):
            if len(row) < 2:
                continue
            try:
                count = int(row[1])
            except ValueError:
                continue
            if count >= min_count:
                rows.append(row)

    rows.sort(key=lambda row: int(row[1]), reverse=True)
    print_table(f"{n} most common in Sweden", tabulate(rows[:n], headers=headers))

In [None]:
# Parse the annotated file once and tabulate every occurrence per label.
named_entities = extract_named_entities(file_path)
# df_all is built before de-duplication so it reflects the full value lists.
df_all = named_entities_to_dataframe(named_entities, cols=['Value', 'Count'])


# Same layout as df_all, but each label keeps only its distinct values and
# "Count" is the number of distinct values.
named_entities_unique_values = remove_duplicates(named_entities)
df_unique = named_entities_to_dataframe(named_entities_unique_values, cols=['Value', 'Count'])

In [None]:
# Raw frames first: all occurrences, then distinct values only.
print_table("dataframe with all entities", df_all)
print_table("dataframe with unique entities", df_unique)

# Per-label summaries: share of unique values, top entity, top six entities.
print_unique(df_all, df_unique)
print_most_common(df_all)
print_most_common_six(df_all)

# Frequency tables (top six with counts) for two individual labels.
print_n_count("First_Name", 6)
print_n_count("Location", 6)


In [None]:
# Word clouds of first names and last names, sized by frequency in df_all.
print_wordclouds("First_Name", "Last_Name")

In [None]:
# List the distinct Date_Part values, shortest first.
print_values("Date_Part", df_unique)

In [None]:
# List the distinct Health_Care_Unit values, shortest first.
print_values("Health_Care_Unit", df_unique)

Source of the name statistics: SCB (Statistics Sweden), https://www.scb.se/hitta-statistik/sverige-i-siffror/namnsok/

In [None]:
# Ten highest-count entries in the last-name file.
print_n_most_common_from_csv(10, "../data/last_names.csv")

In [None]:
# Ten highest-count entries in the women's first-name file.
print_n_most_common_from_csv(10, "../data/first_names_women.csv")

In [None]:
# Ten highest-count entries in the men's first-name file.
print_n_most_common_from_csv(10, "../data/first_names_men.csv")

## Observations...

When going through the data set, we noticed quite a few abbreviations being used in place of ordinary words, such as:

- "hö" instead of "höger"
- "vä" instead of "vänster"
- "fr" instead of "från"
- "pat" instead of "patient"
- "beh" instead of "behandlas"
- "bed" instead of "bedömning" (?)
- "avd" instead of "avdelning"
- "bakt" instead of "bakterier / bakterie"
- "rel" instead of "relativt"
- "perm" instead of "permission"
- "mkt" instead of "mycket"
- "stud" instead of "student / studerande"
- "vb" instead of "vid behov" (?)
- "ang" instead of "angående"
- "enl" instead of "enligt"
- "enh" instead of "enhet" (?)

Also, there are instances of many medical-specific abbreviations:

- "UL" = "Ultraljud"
- "ADL" = "Aktiviteter i dagliga livet"
- "VAS" = "Visuell analog skala" (used for indicating perceived pain on a scale from 0 to 10)
- "KOL" = "kroniskt obstruktiv lungsjukdom"
- "CIDP" = "Kronisk inflammatorisk demyeliniserande polyneuropati"
- "AF" = "andningsfrekvens"




In [None]:
def extract_words(file_name, length):
    """Return all purely alphabetic words of exactly ``length`` characters.

    Each line is stripped and split on single spaces. Matching words are
    lower-cased and returned in file order, duplicates included.
    """
    with open(file_name, 'r') as file:
        lines = file.readlines()

    return [
        token.lower()
        for line in lines
        for token in line.strip().split(' ')
        if len(token) == length and token.isalpha()
    ]

In [None]:
def check_and_remove(word_list, file_name):
    """Remove from ``word_list`` every word that occurs in a term file.

    Parameters
    ----------
    word_list : list of str
        Candidate words. Modified in place: all occurrences of a word found
        in the file are removed; the order of the remaining words is kept.
    file_name : str
        Text file with one term per line. The term is the text before the
        first ``;`` (or the whole line when there is none), compared after
        stripping surrounding whitespace and lower-casing.

    Returns
    -------
    list of str
        The words that were found, each once, in the order they first appear
        in the file.
    """
    candidates = set(word_list)  # constant-time membership checks
    found_words = []
    already_found = set()
    with open(file_name, 'r') as file:
        for line in file:
            # strip() also drops the trailing newline that remains when a
            # line contains no ';' separator.
            term = line.split(';')[0].strip().lower()
            if term in candidates and term not in already_found:
                already_found.add(term)
                found_words.append(term)

    # Slice assignment keeps the caller's list object, so callers that hold a
    # reference to word_list see the removals.
    word_list[:] = [word for word in word_list if word not in already_found]
    return found_words

In [None]:
# Collect all three-letter alphabetic words from the data file.
words = extract_words(file_path, 3)
print(len(words))
unique_words = list(set(words))
print("unique:", len(unique_words))
# check_and_remove removes the matched words from unique_words in place, so
# the list printed afterwards holds only the words not present in terms.csv.
found_words = check_and_remove(unique_words, '../data/terms.csv')
print("found in terms:", len(found_words))
print("unique after: ", len(unique_words))
print(unique_words)

# use a vocabulary of common words...

In [None]:
def plot_sentence_lengths(file_name):
    """Plot a histogram of line lengths for a text file.

    Every line of the file is treated as one sentence; its length is the
    number of characters left after stripping surrounding whitespace. The
    histogram uses 50 bins over the range 0 to 900.
    """
    with open(file_name, 'r') as file:
        sentence_lengths = [len(line.strip()) for line in file]

    plt.hist(sentence_lengths, bins=50, range=[0, 900])
    plt.xlabel("Sentence Length")
    plt.ylabel("Number of Sentences")
    plt.title("Histogram of Sentence Lengths")
    plt.show()

In [None]:
# Distribution of line lengths (in characters) in the data file.
plot_sentence_lengths(file_path)

In [None]:
def extract_first_word(file_name):
    """Return the first whitespace-separated word of every line, sorted.

    Lines that are empty or contain only whitespace contribute an empty
    string, so the result has one item per line of the file.
    """
    first_words = []
    with open(file_name, 'r') as file:
        for line in file:
            tokens = line.split()
            first_words.append(tokens[0] if tokens else '')

    first_words.sort()
    return first_words

In [None]:
# The first word of each line is used as that line's category.
# NOTE(review): blank lines yield '' and are counted as a category too.
categories = extract_first_word(file_path)
unique_categories = sorted(list(set(categories)))
print("total categories:",len(categories))
print("unique categories:", len(unique_categories))
print(unique_categories)