# Summary notebook

In [None]:
import os
import numpy as np
from dotenv import load_dotenv, find_dotenv
import matplotlib.pyplot as plt
import sys
import csv

# Locate the .env file once and reuse the path for both steps below.
dotenv_path = find_dotenv()

# Make the directory that holds the .env file importable
# (presumably the project root -- confirm against the repository layout).
sys.path.append(os.path.dirname(dotenv_path))

# Load the variables defined in the .env file into the environment.
load_dotenv(dotenv_path)

In [None]:
# Read in data from results/similarity/similarity_scores_{dataset}_data_{percentage}.csv
# and evaluate

def read_files(dataset):
    """Load the similarity-score CSV files of one dataset.

    Looks in the ``similarity`` folder below the directory named by the
    ``RESULT_DIR`` environment variable and reads, column-wise, every CSV
    file whose name contains ``dataset`` (for example
    ``similarity_scores_test_data_25.csv``).

    Args:
        dataset: Substring identifying the dataset in the file name,
            e.g. ``"test"`` or ``"val"``.

    Returns:
        A dict mapping the data size taken from the file name (the text
        after the last underscore and before the first dot, kept as a
        string such as ``"25"``) to a dict that maps each CSV header to the
        list of values in that column. Values are the raw strings read from
        the file. A file without a header row maps to an empty dict.
    """
    # Get path to similarity folder
    file_path = os.path.join(os.environ.get("RESULT_DIR"), 'similarity')

    result_data = {}
    for file in os.listdir(file_path):
        # Keep only CSV files of the requested dataset. Matching on the
        # dataset name alone would also pick up sub-folders or other
        # artefacts (plots, notes, ...) whose name happens to contain it.
        if dataset not in file or not file.lower().endswith(".csv"):
            continue

        # the file name will be for example similarity_scores_test_data_25.csv
        # we want to get the number 25
        data_size = file.split(".")[0].split("_")[-1]

        # newline="" lets the csv module do its own line-ending handling.
        with open(os.path.join(file_path, file), "r", newline="") as f:
            reader = csv.reader(f)

            # First row is the header; an empty file yields no columns
            # instead of raising StopIteration.
            header = next(reader, [])

            # One list per column, keyed by its header
            result = {h: [] for h in header}

            # Distribute every remaining row over the columns
            for row in reader:
                for h, v in zip(header, row):
                    result[h].append(v)

        result_data[data_size] = result

    return result_data

In [None]:
# Create one figure per metric.
# Each figure shows the Mean, Max and Min scores of that metric as three lines over the data sizes.
def plot_similarity_scores(results, index, title):
    """Plot the Mean, Max and Min score of one metric against data size.

    Args:
        results: Mapping of data size (a string that parses as an int) to
            a dict holding the 'Mean', 'Max' and 'Min' columns as lists.
        index: Position of the metric of interest inside each column list.
        title: Title shown above the figure.
    """
    # Order the sizes numerically so the x-axis runs from small to large
    ordered_sizes = sorted(results, key=int)

    series_names = ('Mean', 'Max', 'Min')

    # For every series, collect its value at each data size as a float
    series = {
        name: [float(results[size][name][index]) for size in ordered_sizes]
        for name in series_names
    }

    # One line per series, all drawn into the same figure
    for name, values in series.items():
        plt.plot(ordered_sizes, values, label=name)

    plt.xlabel("Data size")
    plt.ylabel("Score")
    plt.title(title)
    plt.legend()
    plt.show()

## Test dataset

In [None]:
# Load all similarity files of the test dataset, keyed by data size.
test_res = read_files("test")

# Metric names are read from the 'Metric' column of the size-25 file
# (assumes every size lists the metrics in the same order -- TODO confirm).
metrics = test_res['25']['Metric']

# One figure per metric, titled with the metric name.
for index, metric in enumerate(metrics):
    plot_similarity_scores(results=test_res, index=index, title=metric)

## Val dataset

In [None]:
# Load all similarity files of the val dataset, keyed by data size.
# NOTE(review): the variable is still called test_res although it now holds
# the "val" results; consider renaming once no other cell relies on the name.
test_res = read_files("val")

# Metric names are read from the 'Metric' column of the size-25 file
# (assumes every size lists the metrics in the same order -- TODO confirm).
metrics = test_res['25']['Metric']

# One figure per metric, titled with the metric name.
for index, metric in enumerate(metrics):
    plot_similarity_scores(results=test_res, index=index, title=metric)