# Summary notebook

In [1]:
import os
import numpy as np
from dotenv import load_dotenv, find_dotenv
import matplotlib.pyplot as plt
import sys
import csv

sys.path.append(os.path.dirname(find_dotenv()))

load_dotenv(find_dotenv())

True

In [2]:
# Read in data from results/similarity/similarity_scores_{dataset}_data_{percentage}.csv
# and evaluate

def read_files(dataset):
    # Get path to similarity folder
    file_path = os.path.join(os.environ.get("RESULT_DIR"), 'similarity')

    # Read all files in the directory
    all_files = os.listdir(file_path)

    # Create a list of all files that contains the name of the dataset
    dataset_files = [file for file in all_files if dataset in file]

    #the file name will be for example similarity_scores_test_data_25.csv
    #we want to get the number 25
    data_sizes = [file.split(".")[0].split("_")[-1] for file in dataset_files]
    
    #Read all csv files
    result_data = {}
    for i in range(len(dataset_files)):
        file = dataset_files[i]
        with open(os.path.join(file_path, file), "r") as f:
            reader = csv.reader(f)
            
            #first get the first row as header
            header = next(reader)

            #create a dictionary with the header as keys and column as values
            result = {}

            for h in header:
                result[h] = []

            #read the rest of the rows
            for row in reader:
                for h, v in zip(header, row):
                    result[h].append(v)
            
            #add the result to the dictionary
            result_data[data_sizes[i]] = result

    return result_data

In [3]:
#Create a plot for the results of the different models
#the plot will show the f1 score for the different entities on different data sizes
def plot_results(results, title):
    #get the different entity types
    entities = results["25"]["entity"].copy()
 
    #get the different data sizes
    data_sizes = list(results.keys())

    #sort data sizes, start with the smallest
    data_sizes.sort(key=int)

    #create a dictionary with the entity types as keys and the f1 scores as values
    entity_scores = {}
    for entity in entities:
        entity_scores[entity] = []

    #add the f1 scores for each entity type for each data size
    for size in data_sizes:
        for entity in entities:
            entity_scores[entity].append(float(results[size]["f1"][entities.index(entity)]))
    #create a plot for each entity type
    for entity in entities:
        plt.plot(data_sizes, entity_scores[entity], label=entity)

    plt.xlabel("Data size")
    plt.ylabel("F1 score")
    plt.title(title)
    plt.legend()
    plt.show()

In [4]:
# Create three different plots for each Metric
# The plots will show the Mean, Avg_max and Avg_min scores for the different Metrics
def plot_similarity_scores(results, index, title):
    
    # get the data set sizes
    data_sizes = list(results.keys())
    
    score_titles = ['Mean', 'Max', 'Min']

    #create a dictionary with the metrics as keys and the scores as values
    metric_scores = {}
    for metric in score_titles:
        metric_scores[metric] = []

    #add the scores for each metric
    for size in data_sizes:
        for metric in score_titles:
            print(score_titles.index(metric))
            metric_scores[metric].append(float(results[size][metric][index]))
    
    print(metric_scores)
    #create a plot for each metric
    for score in score_titles:
        print(data_sizes,metric_scores[score])
        plt.plot(data_sizes, metric_scores[score], label=score)

    plt.xlabel("Data size")
    plt.ylabel("Score")
    plt.title(title)
    plt.legend()
    plt.show()

## Test dataset

In [9]:
test_res = read_files("test")
print(test_res)
metrics = test_res['25']['Metric']
print(metrics)

for index in range(0,3):
    print(f'Metric: {metrics[index]}')
    plot_similarity_scores(test_res, index, metrics[1])

{'50': {'Metric': ['Cos_sim', 'BLEU', 'Euclidean'], 'Mean': ['0.2801', '0.0197', '6.09'], 'Max': ['0.827', '0.3863', '27.973'], 'Min': ['-0.0444', '0.0', '1.0']}, '25': {'Metric': ['Cos_sim', 'BLEU', 'Euclidean'], 'Mean': ['0.2775', '0.0195', '6.0701'], 'Max': ['0.8011', '0.3484', '21.1327'], 'Min': ['-0.0362', '0.0', '1.0']}}
['Cos_sim', 'BLEU', 'Euclidean']
Metric: Cos_sim
0
1
2
0
1
2
{'Mean': [0.2801, 0.2775], 'Max': [0.827, 0.8011], 'Min': [-0.0444, -0.0362]}
['50', '25'] [0.2801, 0.2775]
['50', '25'] [0.827, 0.8011]
['50', '25'] [-0.0444, -0.0362]
Metric: BLEU
0
1
2
0
1
2
{'Mean': [0.0197, 0.0195], 'Max': [0.3863, 0.3484], 'Min': [0.0, 0.0]}
['50', '25'] [0.0197, 0.0195]
['50', '25'] [0.3863, 0.3484]
['50', '25'] [0.0, 0.0]
Metric: Euclidean
0
1
2
0
1
2
{'Mean': [6.09, 6.0701], 'Max': [27.973, 21.1327], 'Min': [1.0, 1.0]}
['50', '25'] [6.09, 6.0701]
['50', '25'] [27.973, 21.1327]
['50', '25'] [1.0, 1.0]
