# Baseline: Dictionary search

This notebook explores another type of baseline for the system.

In this notebook, a dictionary is created from the training data that maps each named entity to its true label.

The dictionary is then used to look up the label of each token in the test data. If a token is not found in the dictionary, it is predicted as 'O'.

In [None]:
#Entity classes that may be stored in the dictionary
labels = [
    'First_Name',
    'Last_Name',
    'Phone_Number',
    'Age',
    'Full_Date',
    'Date_Part',
    'Health_Care_Unit',
    'Location',
]

## Importing 

In [None]:
import os
import numpy as np
from dotenv import load_dotenv, find_dotenv
import sys

#Add the directory that contains the .env file to the import path so that
#project-local packages (e.g. py_scripts) can be imported from this notebook.
#NOTE(review): presumably that directory is the project root, and find_dotenv()
#returns a usable path only when a .env file is found - confirm.
sys.path.append(os.path.dirname(find_dotenv()))

In [None]:
#Import the file_handler.py file (project-local helper module)
from py_scripts.file_handler import read_csv_file

#Read the data.
#X and Y are indexed in parallel as X[i][j] / Y[i][j] further down, i.e. they are
#nested sequences - presumably the tokens of each sentence and the label of each
#token (TODO confirm against read_csv_file).
X, Y = read_csv_file("clean.csv")

In [None]:
#Splitting the data into train (80 %) and test (20 %)
from sklearn.model_selection import train_test_split

#Fix the seed so that the split - and therefore the dictionary built from it and
#the reported scores - is identical on every run of the notebook.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

## Creating dictionary from training data

In [None]:
#Create a dictionary/mapping of the labels from the training data.
#Only tokens whose label is one of the entity classes in `labels` are stored.
mapping = {}

for tokens, tags in zip(X_train, Y_train):
    for token, tag in zip(tokens, tags):
        if tag in labels:
            #setdefault keeps the first label seen for a token, so a token that
            #occurs with several entity labels is mapped to the earliest one
            mapping.setdefault(token, tag)

In [None]:
#Predict the labels for the test data using the dictionary.
#Y_pred has one list per test sequence, with one predicted label per token;
#tokens that are not in the dictionary are predicted as "O".
Y_pred = [
    [mapping.get(token, "O") for token in tokens]
    for tokens in X_test
]

In [None]:
def flatten(list):
    """Return a single flat list with the items of every sub-sequence, in order.

    `list` is an iterable of iterables; only one level of nesting is removed.
    """
    flat = []
    for inner in list:
        #extend consumes the inner iterable item by item
        flat.extend(inner)
    return flat

In [None]:
#Evaluate the model
from sklearn.metrics import classification_report

#classification_report expects (y_true, y_pred), so the gold labels go first.
#Passing the predictions first swaps precision and recall and reports the
#support of the predictions instead of the support of the gold labels.
print(classification_report(flatten(Y_test), flatten(Y_pred), zero_division=1))