This notebook is used to create the meta-dataset for all reczilla meta-learner experiments.

In [80]:
# define the results file here. this file should be created by the script process_inbox.py
results_csv = "/Users/duncan/research/active_projects/reczilla/results/results.csv"

# name of the resulting meta-dataset
meta_dataset_name = "metadata-v0"

# define a function that takes a row as input an returns True if the row should be included in the meta-dataset and false otherwise
def include_row(row):
    if row["experiment_name"].startswith("full-experiment-") and ("test" not in row["experiment_name"]):
        return True
    else:
        return False


In [81]:
import pandas as pd
import numpy as np

df = pd.read_csv(results_csv, sep=";")

## Filter the results based on the function "include_row"

In [None]:
### keep experiments indicated by function include_row
experiment_prefix = "full-experiment-"
keep_rows = df.apply(include_row, axis=1)
df_expt = df.loc[keep_rows, :]

### if there are any rows with no metrics, drop them...
metric_col_list = [c for c in df_expt.columns if c.startswith("test_metric_") or c.startswith("val_metric_")]
df_expt.loc[:, "all_na_metrics"] = df_expt[metric_col_list].isna().all(axis=1)
df_expt = df_expt.loc[~df_expt["all_na_metrics"], :]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


### sanity check: make sure that for each alg_name + hyperparameters_source combination all hyperparams are the same. 

**Note:** if any warnings are printed here, we need to investigate. 

In [None]:
param_list = [c for c in df_expt.columns if c.startswith("param_")]

for param_name in param_list:
    x = df_expt.groupby(["alg_name", "hyperparameters_source"])[param_name].agg(
        {"one_unique_param": lambda x: len(x.unique()) == 1}
    ).reset_index()
    if not all(x["one_unique_param"]):
        print(f"WARNING: for param {param_name}, params aren't uniquely defined:")
        print(x[~x["one_unique_param"]])

# use a line like this to diagnose issues:
# df_expt[(df_expt["alg_name"] == "ItemKNNCF_asymmetric") & (df_expt["hyperparameters_source"] == "random_0")][["experiment_name"] + param_list]

is deprecated and will be removed in a future version
  """


### final cleanup:
- define columns "alg_family" (e.g., UserKNN) and "parameterized_alg_name" (e.g., UserKNN_hparams_1). 
- param cols
- metric cols
- runtime
- dataset name
- dataset split name
- dataset source from gcloud

In [None]:
keep_cols = [
    "alg_name",
    "dataset_name",
    "split_name",
    "original_split_path",
    "hyperparameters_source", 
    "time_on_val",
    "time_on_test",
    "time_on_train",
] + metric_col_list + param_list

final_df = df_expt.loc[:, keep_cols]

final_df.rename(columns={"alg_name": "alg_family"}, inplace=True)

# assign a unique identifier for each alg+hyperparam pair
final_df.loc[:, "alg_param_name"] = ""

sep = ":"

# for all algs except for KNN, this is straightforward:
knn_rows = final_df["alg_family"].str.contains("KNN")
final_df.loc[~knn_rows, "alg_param_name"] = final_df.loc[~knn_rows, :].apply(lambda x: x["alg_family"] + sep + x["hyperparameters_source"], axis=1)

# for KNN rows, the alg family is UserKNN or ItemKNN, not including similarity...
knn_basename = final_df.loc[knn_rows, "alg_family"].apply(lambda x: x.split("_")[0])
knn_sim = final_df.loc[knn_rows, "alg_family"].apply(lambda x: x.split("_")[1])
final_df.loc[knn_rows, "alg_family"] = knn_basename  # either UserKNN or ItemKNN
final_df.loc[knn_rows, "alg_param_name"] = knn_basename + sep + knn_sim + "_" + final_df.loc[knn_rows, "hyperparameters_source"]


# final_df.loc[:, "alg_param_name"] = final_df.apply(lambda x: x["alg_family"] + ":" + x["hyperparameters_source"], axis=1)

In [None]:
# write the dataset to file
out_file = f"./meta_datasets/{meta_dataset_name}.pkl"

final_df.to_pickle(out_file)

# to read this file, do:
# final_df_2 = pd.read_pickle(out_file)

### Basic numbers

In [None]:
print(f"total number of parameterized algs: {len(final_df['alg_param_name'].unique())}")

total number of parameterized algs: 2406
