In [2]:
# Relative folder holding the baseline outputs. Later cells interpolate it into
# the results.csv / all_results.xlsx paths and pass it as the Experiment save_dir.
base_path = "models/Baselines"

### Collect Results

In [3]:
# Importar pandas
import pandas as pd
import os

# Dataset families and the subsets whose baseline results are collected.
# Every subset must provide a results.csv containing (at least) an "F1@1" column.
datasets = {"restaurants": ["gijon", "barcelona", "madrid", "paris", "newyorkcity"],
            "pois": ["barcelona", "madrid", "paris", "newyorkcity", "london"],
            "amazon": ["fashion", "digital_music"]}

# Root folder that contains one <dataset>/<subset>/results.csv per subset and
# receives the merged spreadsheet.
results_root = f"/media/nas/pperez/code/TAVtext/{base_path}"

subset_frames = []

for dataset, subsets in datasets.items():
    for subset in subsets:
        path = f"{results_root}/{dataset}/{subset}/results.csv"
        df = pd.read_csv(path)
        # Rank the rows of this subset by F1@1 (best first); "Position" is the 1-based rank.
        df = df.sort_values("F1@1", ascending=False).reset_index(drop=True)
        df.insert(0, "Position", df.index + 1)
        df["Set"] = dataset
        df["Subset"] = subset
        subset_frames.append(df)

# Concatenate by column label. The previous approach flattened every frame to
# plain tuples and relabelled them with the columns of the first file, which
# silently misaligned values whenever a results.csv had its columns in a
# different order (and failed when the number of columns differed).
all_data = pd.concat(subset_frames, ignore_index=True)
column_names = all_data.columns  # kept in case another cell still reads it
all_data.to_excel(f"{results_root}/all_results.xlsx", index=False)

### Tests

In [9]:
from src.Common import print_b, print_e
from src.datasets.text_datasets.RestaurantDataset import RestaurantDataset
from src.datasets.text_datasets.AmazonDataset import AmazonDataset
from src.datasets.text_datasets.POIDataset import POIDataset

from cornac.eval_methods import BaseMethod
from cornac.data.text import BaseTokenizer
from cornac.data import ReviewModality

import tensorflow as tf
import pandas as pd
import numpy as np
import nvgpu
import json

# Index of the GPU reporting the lowest "mem_used_percent" in nvgpu.gpu_info().
gpu = int(np.argmin([info["mem_used_percent"] for info in nvgpu.gpu_info()]))

def load_set(dataset, subset, model = "ATT2ITM"):
    """Build the text dataset and the Cornac evaluation method for one subset.

    The best run of ``model`` for ``(dataset, subset)`` is looked up in
    ``models/best_models.csv``. The ``cfg.json`` stored with that run supplies
    the dataset configuration used to instantiate the text dataset. The
    dataset's ``ALL_DATA`` pickle is then split into train / validation / test
    interactions according to its ``dev`` and ``test`` flag columns.

    Args:
        dataset: Dataset family; one of "restaurants", "pois" or "amazon".
        subset: Subset name inside the family.
        model: Model name used for the best-model lookup and the model folder.

    Returns:
        Tuple ``(text_dataset, eval_method)``: the instantiated project dataset
        and a ``BaseMethod`` built from the three splits.

    Raises:
        IndexError: if best_models.csv has no row for the requested combination.
        ValueError: if ``dataset`` is not one of the supported families.
    """
    best_models = pd.read_csv("models/best_models.csv")
    wanted = (best_models.dataset == dataset) & (best_models.subset == subset) & (best_models.model == model)
    matching_md5 = best_models.loc[wanted, "model_md5"]
    if matching_md5.empty:
        # Same exception type the bare `.values[0]` used to raise, now with context.
        raise IndexError(f"No best model registered for model={model!r}, dataset={dataset!r}, subset={subset!r}")
    best_model = matching_md5.values[0]

    model_path = f"models/{model}/{dataset}/{subset}/{best_model}"
    # cfg.json only needs to be read once (it used to be opened and parsed twice).
    with open(f'{model_path}/cfg.json') as f:
        model_config = json.load(f)
    dts_cfg = model_config["dataset_config"]

    print_b(f"Loading best model: {best_model}")

    if dataset == "restaurants":
        text_dataset = RestaurantDataset(dts_cfg)
    elif dataset == "pois":
        text_dataset = POIDataset(dts_cfg)
    elif dataset == "amazon":
        text_dataset = AmazonDataset(dts_cfg)
    else:
        raise ValueError(f"Unknown dataset {dataset!r}; expected 'restaurants', 'pois' or 'amazon'")

    all_data = pd.read_pickle(f"{text_dataset.DATASET_PATH}ALL_DATA")
    # Ratings are rescaled by 1/10 (presumably stored as 10..50 -- TODO confirm).
    all_data["rating"] /= 10
    all_data = all_data[["userId", "id_item", "rating", "dev", "test", "text"]]

    # Training rows are those flagged neither dev nor test.
    train_data = all_data[(all_data["dev"] == 0) & (all_data["test"] == 0)]

    # pd.factorize returns (per-row codes, unique values). The user map must pair
    # each unique user with its own position; zipping the uniques with the
    # per-row codes (as was done before) gave user i the code of row i, so
    # different users could end up sharing one id_user.
    _, train_users = pd.factorize(train_data["userId"])
    user_map = pd.DataFrame({"userId": train_users, "id_user": range(len(train_users))})

    # Drop users unseen in training from the validation and test splits.
    val_data = all_data[(all_data["dev"] == 1) & (all_data["userId"].isin(train_users))]
    test_data = all_data[(all_data["test"] == 1) & (all_data["userId"].isin(train_users))]

    uir_columns = ["id_user", "id_item", "rating"]
    pair_columns = ["id_user", "id_item"]
    train_data = train_data.merge(user_map, on="userId")[uir_columns]
    # For validation/test keep only the last occurrence of each (user, item) pair.
    val_data = val_data.merge(user_map, on="userId")[uir_columns].drop_duplicates(subset=pair_columns, keep='last')
    test_data = test_data.merge(user_map, on="userId")[uir_columns].drop_duplicates(subset=pair_columns, keep='last')

    # Instantiate a Base evaluation method using the provided train, validation and test sets.
    eval_method = BaseMethod.from_splits(
        train_data=train_data.to_records(index=False),
        val_data=val_data.to_records(index=False),
        test_data=test_data.to_records(index=False),
        verbose=False,
        rating_threshold=3,
    )
    # Original author's note: the call above removes repeated (USER, ITEM) pairs.

    # Unused variant with a review-text modality, kept for reference:
    # max_vocab = 3000
    # max_doc_freq = 0.5
    # tokenizer = BaseTokenizer()
    # reviews = all_data.drop_duplicates(subset=["userId", "id_item"], keep='last', inplace=False).merge(user_map)[["id_user", "id_item", "text"]].to_records(index=False).tolist()
    # eval_method = BaseMethod.from_splits(train_data=train_data.to_records(index=False), review_text=rm, val_data=val_data.to_records(index=False), test_data=test_data.to_records(index=False),  verbose=True, rating_threshold=3)

    return text_dataset, eval_method

In [None]:
from cornac.metrics import Recall, Precision, FMeasure
from cornac.experiment import Experiment
import cornac

seed = 2048

# F1, recall and precision, each evaluated at cut-offs 1, 5 and 10
# (grouped by metric, in that order).
metrics = [
    metric_cls(k=cutoff)
    for metric_cls in (FMeasure, Recall, Precision)
    for cutoff in (1, 5, 10)
]

# Baselines with library defaults; only the seed is fixed where accepted.
models = [
    cornac.models.MostPop(),
    cornac.models.BPR(seed=seed),
    cornac.models.EASE(seed=seed),
]

model = "ATT2ITM"
datasets = {"restaurants": ["gijon", "barcelona"]}

# Run one experiment per (dataset, subset); outputs go under base_path.
for dataset, subsets in datasets.items():
    for subset in subsets:
        text_dataset, eval_method = load_set(dataset, subset)
        experiment_kwargs = dict(
            eval_method=eval_method,
            show_validation=False,
            models=models,
            metrics=metrics,
            save_dir=f"{base_path}/{dataset}/{subset}",
            verbose=True,
        )
        test_result = Experiment(**experiment_kwargs).run()

### GridSearch

In [None]:

from cornac.metrics import Recall, Precision, FMeasure
from cornac.hyperopt import GridSearch, Discrete
from cornac.experiment import Experiment
import cornac

seed = 2048

dataset = "restaurants"
subset = "barcelona"

# F1, recall and precision at cut-offs 1, 5 and 10.
metrics = [
    FMeasure(k=1), FMeasure(k=5), FMeasure(k=10),
    Recall(k=1), Recall(k=5), Recall(k=10),
    Precision(k=1), Precision(k=5), Precision(k=10)
    ]

_, eval_method = load_set(dataset, subset)

# Base models whose hyper-parameters are explored by the grid searches below.
md_bpr = cornac.models.BPR(seed=seed, verbose=True) #  k=50, max_iter=200, learning_rate=0.001, lambda_reg=0.001, verbose=True
md_ease = cornac.models.EASE(seed=seed, verbose=True)

# Each GridSearch selects its configuration by F1@1 using eval_method.
models = [
    GridSearch(
        model=md_bpr, space=[
            Discrete("k", [25, 50, 75]),
            Discrete("max_iter", [50, 100, 200]),
            Discrete("learning_rate", [1e-4, 5e-4, 1e-3]),
        ], metric=FMeasure(k=1), eval_method=eval_method),
    GridSearch(
        model=md_ease, space=[
            Discrete("posB", [True, False]),
        ], metric=FMeasure(k=1), eval_method=eval_method),
    ]

# Put everything together into an experiment and run it.
grid_experiment = Experiment(
    eval_method=eval_method,
    models=models,
    metrics=metrics,
    user_based=False,
    save_dir=f"{base_path}/{dataset}/{subset}",
    verbose=True
)
grid_experiment.run()

# Experiment.run() has no return value, so chaining `.run()` onto the constructor
# left test_result as None. The results are stored on the experiment object.
test_result = grid_experiment.result

print(test_result)

### Test with RatioSplit
Check whether the models learn better with Cornac's `RatioSplit` than with our own pre-split data.

In [103]:
from locale import setlocale, LC_TIME
import pandas as pd
import numpy as np
import cornac
import os

city = "madrid"

# Time locale set to Spanish; presumably required so that "%B" below matches
# Spanish month names in the review dates -- confirm against the raw data.
setlocale(LC_TIME, 'es_ES.UTF-8')

seed=2032
data_path = f"/media/nas/datasets/tripadvisor/restaurants/{city}/reviews.pkl"
data = pd.read_pickle(data_path)

# Parse the textual dates and derive a Unix timestamp in seconds.
data["date"] = pd.to_datetime(data["date"], format='%d de %B de %Y')
data["timestamp"] = data["date"].values.astype(np.int64) // 10 ** 9

# Sort chronologically (oldest first) and keep only the latest review of each
# (user, restaurant) pair.
data = (
    data.sort_values("date")
    .reset_index(drop=True)
    .drop_duplicates(subset=["userId", "restaurantId"], keep='last')
)

# (user, item, rating) and (user, item, text) triples; ratings are divided by 10.
user_ids, item_ids = data["userId"], data["restaurantId"]
feedback = list(zip(user_ids, item_ids, data["rating"]/10))
reviews = list(zip(user_ids, item_ids, data["text"].values.tolist()))

# With cold_start disabled, unknown users/items are excluded from evaluation.
# Note that the split uses its own fixed seed (123), independent of `seed`.
cold_start = False
eval_method = cornac.eval_methods.RatioSplit(
    data=feedback,
    test_size=0.1,
    val_size=0.1,
    exclude_unknowns=not cold_start,
    verbose=False,
    seed=123,
    rating_threshold=3,
)

In [105]:
from cornac.metrics import Recall, Precision, FMeasure, NDCG, RMSE, MSE
from cornac.hyperopt import GridSearch, Discrete
from cornac.experiment import Experiment

# F1, recall and precision at cut-offs 1, 5 and 10, followed by NDCG with its
# default cut-off and at 1 and 10.
metrics = (
    [FMeasure(k=cutoff) for cutoff in (1, 5, 10)]
    + [Recall(k=cutoff) for cutoff in (1, 5, 10)]
    + [Precision(k=cutoff) for cutoff in (1, 5, 10)]
    + [NDCG(), NDCG(k=1), NDCG(k=10)]
)

# Base models for the (currently disabled) grid searches listed below.
md_bpr = cornac.models.BPR(seed=seed, verbose=True)
md_ease = cornac.models.EASE(seed=seed, verbose=True)

models = [
    cornac.models.MostPop(),
    # Disabled grid searches:
    # GridSearch( model=md_bpr, space=[ Discrete("k", [25, 50]), Discrete("max_iter", [50, 100]), Discrete("learning_rate", [1e-4, 5e-4, 1e-3]), ], metric=NDCG(), eval_method=eval_method),
    # GridSearch( model=md_ease, space=[ Discrete("posB", [True, False]), ], metric=NDCG(), eval_method=eval_method),
    # Author's note -- best parameter settings: {'k': 25, 'learning_rate': 0.0005, 'max_iter': 50}
    cornac.models.BPR(seed=seed, k=25, learning_rate=0.0005, max_iter=50),
    cornac.models.EASE(seed=seed, posB=True),
    # Author's note -- best parameter settings: {'k': 30, 'learning_rate': 5e-06, 'max_iter': 10}
    # (not applied here: MF runs with library defaults).
    cornac.models.MF(seed=seed),
    cornac.models.WBPR(seed=seed),
    # Other candidates, currently disabled:
    #cornac.models.MMMF(seed=seed),  # Best parameter settings: {'k': 5, 'learning_rate': 0.001, 'max_iter': 50}
    #cornac.models.NeuMF(seed=seed),
    #cornac.models.FM(seed=seed),
    #cornac.models.HPF(seed=seed),
    #cornac.models.NMF(seed=seed),
    #cornac.models.PMF(seed=seed),
    #cornac.models.SKMeans(seed=seed),
    #cornac.models.SVD(seed=seed),
    #cornac.models.WMF(seed=seed),
]

# The experiment object is kept so that its per-model results can be read
# from experiment.result after the run.
experiment = Experiment(
    eval_method=eval_method,
    models=models,
    metrics=metrics,
    show_validation=False,
    user_based=False,
    verbose=True,
)

experiment.run()


TEST:
...
        |   F1@1 |  F1@10 |   F1@5 | NDCG@-1 | NDCG@1 | NDCG@10 | Precision@1 | Precision@10 | Precision@5 | Recall@1 | Recall@10 | Recall@5 | Train (s) | Test (s)
------- + ------ + ------ + ------ + ------- + ------ + ------- + ----------- + ------------ + ----------- + -------- + --------- + -------- + --------- + --------
MostPop | 0.0135 | 0.0128 | 0.0147 |  0.1552 | 0.0159 |  0.0334 |      0.0159 |       0.0074 |      0.0096 |   0.0127 |    0.0578 |   0.0378 |    0.0003 |  31.7415
BPR     | 0.0135 | 0.0128 | 0.0147 |  0.1553 | 0.0159 |  0.0334 |      0.0159 |       0.0074 |      0.0096 |   0.0127 |    0.0578 |   0.0378 |    2.2495 |  39.6140
EASEᴿ   | 0.0220 | 0.0168 | 0.0204 |  0.1658 | 0.0248 |  0.0472 |      0.0248 |       0.0097 |      0.0132 |   0.0209 |    0.0768 |   0.0531 |    4.1852 |  39.0607
MF      | 0.0001 | 0.0002 | 0.0001 |  0.0987 | 0.0001 |  0.0003 |      0.0001 |       0.0001 |      0.0001 |   0.0001 |    0.0007 |   0.0003 |    0.1750 |  41.5808
WBPR 

In [106]:
# Count how many training interactions each user index has.
train_user_idx = eval_method.train_set.uir_tuple[0]
user, times = np.unique(train_user_idx, return_counts=True)
user_train_items = pd.DataFrame(list(zip(user, times)), columns=["user", "train_rvws"])

# For every evaluated model, average its per-user metric values over users
# that share the same number of training interactions, and export one
# spreadsheet per model (named after the model).
for result in experiment.result:
    result_model = result.model_name
    result_data = result.metric_user_results
    result_metrics = list(result_data.keys())

    per_user = pd.DataFrame(result_data).reset_index().rename(columns={"index":"user"})
    per_user = per_user.merge(user_train_items, how="left")
    model_user_results = per_user.groupby("train_rvws")[result_metrics].mean().reset_index()
    model_user_results.to_excel(f"{result_model}.xlsx")

In [101]:
# Expanding (running) mean of every metric column over the rows of the current
# model_user_results table.
# NOTE(review): the target is the same "<result_model>.xlsx" path written by the
# per-model export loop, so this overwrites that file for whichever model
# `result_model` currently names -- confirm this is intended.
expanding_means = model_user_results[result_metrics].expanding().mean()
expanding_means.to_excel(f"{result_model}.xlsx")

