# Evaluation using different data sizes

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from dotenv import load_dotenv, find_dotenv
import sys
import csv

# Add the directory that contains the located .env file to the import path
# (presumably the project root, so project modules can be imported - confirm).
sys.path.append(os.path.dirname(find_dotenv()))

# Load the variables defined in the .env file into the process environment;
# read_files below reads RESULT_DIR from os.environ.
load_dotenv(find_dotenv())

In [None]:
# Read every CSV result file from $RESULT_DIR/{model_name}/
# (for example kb_bert_1.csv) so that the model can be evaluated

def read_files(dir):
    """Load every CSV result file of one model into nested dictionaries.

    The directory ``$RESULT_DIR/<dir>`` is scanned for files whose name ends
    in ".csv". The data size is taken from the file name: the text before the
    first "." is split on "_" and the last piece is used, so "kb_bert_1.csv"
    yields "1".

    Args:
        dir: Name of the model's sub-directory inside RESULT_DIR. The name
            shadows the builtin but is kept so keyword callers keep working.

    Returns:
        A dict mapping each data size (a string) to a dict that maps every
        CSV header to the list of values in that column. All values are kept
        as strings. A completely empty file yields an empty column dict.

    Raises:
        RuntimeError: If the RESULT_DIR environment variable is not set.
    """
    result_dir = os.environ.get("RESULT_DIR")
    if result_dir is None:
        # Fail with a readable message instead of the TypeError that
        # os.path.join(None, ...) would raise.
        raise RuntimeError("The RESULT_DIR environment variable is not set")

    # Directory that holds the result files of this model
    model_path = os.path.join(result_dir, dir)

    model_data = {}
    for file in os.listdir(model_path):
        # Only the CSV files contain results
        if not file.endswith(".csv"):
            continue

        # The file name will be for example kb_bert_1.csv,
        # we want to get the number 1
        data_size = file.split(".")[0].split("_")[-1]

        # newline="" lets the csv module handle line endings itself, which
        # is required for quoted fields that contain line breaks.
        with open(os.path.join(model_path, file), "r", newline="") as f:
            reader = csv.reader(f)

            # The first row is the header; an empty file has none, so fall
            # back to "no columns" instead of leaking StopIteration.
            header = next(reader, None)
            if header is None:
                header = []

            # One list per column, keyed by the header name
            result = {h: [] for h in header}

            # Distribute the values of every remaining row over the columns
            for row in reader:
                for h, v in zip(header, row):
                    result[h].append(v)

        model_data[data_size] = result

    return model_data



In [None]:
# Load all result CSVs found in the "kb_bert" directory and print the parsed
# dictionaries for inspection.
kb_bert_res = read_files("kb_bert")
print(kb_bert_res)


In [None]:
#Create a plot for the results of the different models
#the plot will show the f1 score for the different entities on different data sizes
def plot_results(results, title):
    """Plot the F1 score of every entity type against the data size.

    One line is drawn per entity type, with the data sizes on the x axis in
    ascending numeric order.

    Args:
        results: Mapping from data size (a string that parses as an int,
            e.g. "25") to a column dictionary with at least the parallel
            lists "entity" and "f1", in the shape returned by read_files.
        title: Title shown above the plot.

    Raises:
        ValueError: If results is empty, or if a data size key or an F1
            value cannot be converted to a number.
    """
    if not results:
        raise ValueError("results is empty, there is nothing to plot")

    # Sort data sizes numerically, smallest first. The keys are strings, so
    # a plain sort would put "100" before "25".
    data_sizes = sorted(results, key=int)

    # The "25" run is the reference for the entity types when it exists
    # (the historical behaviour); otherwise use the smallest run so other
    # sets of data sizes work too.
    reference_size = "25" if "25" in results else data_sizes[0]

    # dict.fromkeys drops duplicate names while keeping their order
    entities = list(dict.fromkeys(results[reference_size]["entity"]))

    # Entity type -> list of F1 scores, one score per data size
    entity_scores = {entity: [] for entity in entities}

    for size in data_sizes:
        # Look scores up by entity name rather than by the row position in
        # the reference file, so runs whose rows are ordered differently
        # still contribute the right value. The first row wins on duplicates.
        f1_by_entity = {}
        for name, score in zip(results[size]["entity"], results[size]["f1"]):
            f1_by_entity.setdefault(name, score)

        for entity in entities:
            score = f1_by_entity.get(entity)
            # An entity missing from a run becomes NaN, which matplotlib
            # leaves out of the line instead of plotting a wrong value.
            if score is None:
                entity_scores[entity].append(float("nan"))
            else:
                entity_scores[entity].append(float(score))

    # Draw one line per entity type
    for entity in entities:
        plt.plot(data_sizes, entity_scores[entity], label=entity)

    plt.xlabel("Data size")
    plt.ylabel("F1 score")
    plt.title(title)
    plt.legend()
    plt.show()


In [None]:
# Plot the per-entity F1 curves for the kb_bert results loaded earlier.
plot_results(kb_bert_res, "kb_bert")