## Obtener de cada modelo el tiempo de ejecución y los parámetros

In [None]:
# Import the project model loader, the pathlib/os helpers and pandas
from src.experiments.Common import load_best_model
from pathlib import Path
import pandas as pd
import os

# Folder (relative to the project root) where the baseline outputs are stored.
base_path = "models/Baselines"

# Dataset family -> list of subsets to process for that family.
datasets = {
    "restaurants": ["gijon", "barcelona", "madrid", "paris", "newyorkcity"],
    "pois": ["barcelona", "madrid", "paris", "newyorkcity", "london"],
    "amazon": ["fashion", "digital_music"],
}

# Models that are loaded through load_best_model rather than read from the
# baseline log.
models_cold = [
    "MOSTPOP2ITM",
    "BOW2ITM",
    "USEM2ITM",
    "BERT2ITM",
]

In [5]:
def process_baseline_file(filepath):
    """Extract per-model training times from the TEST table of a baseline log.

    The file is expected to contain a line reading ``TEST:`` followed by a
    pipe-separated table: one line that is skipped, a header line, a ruler
    line that is skipped, and then one row per model. The unnamed first
    column holds the model name and the ``Train (s)`` column the training
    time.

    Args:
        filepath: Path of the text log to parse.

    Returns:
        A DataFrame with columns ``Model`` (names mapped to the ones used in
        the paper where a mapping exists) and ``Train_time`` (numeric; values
        that cannot be parsed as numbers become NaN).

    Raises:
        ValueError: If the file has no ``TEST:`` line.
        IndexError: If the TEST section is too short to contain a header.
        KeyError: If the table lacks the model or ``Train (s)`` column.
    """

    def process_section(lines):
        # lines[0] and lines[2] are skipped; lines[1] carries the column names.
        columns = [col.strip() for col in lines[1].split("|")]
        # Every non-blank line after the ruler is a data row.
        data = [
            [item.strip() for item in line.split("|")]
            for line in lines[3:]
            if line.strip()
        ]
        return pd.DataFrame(data, columns=columns)

    # The log contains non-ASCII model names (e.g. "EASEᴿ"), so do not rely on
    # the platform's default encoding.
    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()

    # Only the TEST section is needed; tolerate surrounding whitespace and a
    # missing trailing newline on the marker line.
    test_start = next(
        (idx for idx, line in enumerate(lines) if line.strip() == "TEST:"),
        None,
    )
    if test_start is None:
        raise ValueError(f"'TEST:' section not found in {filepath}")

    test_df = process_section(lines[test_start + 1:])
    test_df = test_df.rename(columns={"": "Model", "Train (s)": "Train_time"})

    # Map experiment names to the names used in the paper; unknown names are
    # kept unchanged.
    model_paper_names = {"GridSearch_EASEᴿ": "EASEᴿ", "GridSearch_BPR": "BPR", "online_ibpr": "IBPR"}
    test_df["Model"] = test_df["Model"].map(lambda name: model_paper_names.get(name, name))

    result = test_df[["Model", "Train_time"]].copy()
    # The table is parsed as text; convert the times to numbers so they can be
    # combined with numeric times coming from other sources.
    result["Train_time"] = pd.to_numeric(result["Train_time"], errors="coerce")
    return result

In [None]:
# One dict per (dataset, subset) with the training time of every model
time_data = []
# One dict per (dataset, subset, model) with the model's parameter count
param_data = []

for dataset, subsets in datasets.items():
    for subset in subsets:
        # Folder holding the baseline outputs for this dataset/subset
        path = f"/media/nas/pperez/code/TAVtext/{base_path}/{dataset}/{subset}/"
        # Among the entries whose name contains "CornacExp", take the most
        # recently modified one
        paths = sorted(Path(path).iterdir(), key=os.path.getmtime)
        cornac_logs = [str(f) for f in paths if "CornacExp" in f.name]
        if not cornac_logs:
            raise FileNotFoundError(f"No 'CornacExp' file found in {path}")
        log_file = cornac_logs[-1]
        # Training times of the baselines, read from the log
        model_times = process_baseline_file(log_file)
        # Collect the remaining models and add them in a single concat
        cold_rows = []
        for model in models_cold:
            # Load the best model found for this dataset/subset
            model_class = load_best_model(model=model, dataset=dataset, subset=subset)
            # Number of parameters of the loaded model
            param_data.append({"Set":dataset, "Subset":subset, "Model": model, "Params":model_class.MODEL.count_params()})
            # Training time: sum of the "e_time" column of the model's log.csv
            # (presumably one row per epoch - TODO confirm)
            train_time = pd.read_csv(model_class.MODEL_PATH+"log.csv")["e_time"].sum()
            cold_rows.append({"Model": model, "Train_time": train_time})
        # DataFrame.append was removed in pandas 2.0; pd.concat works on both
        # old and new versions
        model_times = pd.concat(
            [model_times, pd.DataFrame(cold_rows, columns=["Model", "Train_time"])],
            ignore_index=True,
        )
        # Build one wide row: Set, Subset, then one column per model
        row = {"Set": dataset, "Subset": subset}
        row.update(zip(model_times["Model"].tolist(), model_times["Train_time"].tolist()))
        time_data.append(row)

time_results = pd.DataFrame(time_data)
# Wide table of parameter counts: one row per (Set, Subset), one column per model
param_results = pd.DataFrame(param_data).pivot_table(index=["Set", "Subset"], columns=["Model"], values=["Params"])

In [21]:
# Write both summary tables as Excel workbooks in the current working directory.
for results_frame, workbook_name in (
    (time_results, "time_results.xlsx"),
    (param_results, "param_results.xlsx"),
):
    results_frame.to_excel(workbook_name)