# Create config files

In [None]:
import yaml
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
# Random seed; written under the 'seed' key of the generated config files below.
SEED = 2023

: 

In [None]:
pip list --format=freeze > ./requirements.txt

In [3]:
# Fixed settings that are serialised to config/fixed_config_general.yaml.
# This variant carries no 'load_col' entry (compare the context config).
parameter_dict = {

    # environment
    'seed': SEED,
    'reproducibility': True,
    'data_path': 'dataset/collections/',
    'checkpoint_dir': 'saved/',
    'show_progress': True,
    'save_dataset': False,
    'log_wandb': True,
    'save_dataloaders': False,

    # data
    'field_separator': '\t',
    'seq_separator': ' ',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',

    # training
    'epochs': 50,
    'train_batch_size': 2048, # 2048
    'learner': 'adam',
    'learning_rate': 0.001, # 0.001
    'train_neg_sample_args': {'distribution': 'popularity',
                              'sample_num': 5,
                              'dynamic': False,
                              'candidate_num': 0},
    'eval_step': 1,
    'stopping_step': 3000, # 15
    'loss_decimal_place': 4,

    # evaluation
    'eval_args': {'group_by': 'user',
                #   'order': 'MY',
                #   'split': {'MY':'dataset/collections/azuki/split_indices.pkl'},
                  'mode': 'pop100'},
    'metrics': ['Recall', 'MRR', 'NDCG', 'Hit', 'MAP', 'Precision', 'GAUC'],
    'topk': [10, 20, 50],
    'valid_metric': 'Recall@50', # for early stopping
    'eval_batch_size': 4096, # 4096
    'metric_decimal_place': 4

}

# Convert parameter_dict to a YAML file.
# Create the target folder first: open(..., 'w') does not create missing
# directories, so a checkout without `config/` would raise FileNotFoundError.
config_path = r'config/fixed_config_general.yaml'
os.makedirs(os.path.dirname(config_path), exist_ok=True)
with open(config_path, 'w') as file:
    # yaml.dump writes to the stream and returns None, so nothing is kept.
    yaml.dump(parameter_dict, file)

In [4]:
# Fixed settings that are serialised to config/fixed_config_context.yaml,
# the file loaded by the "Run models" example in this notebook.
# Differs from the general config by the extra 'load_col' entry.
parameter_dict = {

    # environment
    'seed': SEED,
    'reproducibility': True,
    'data_path': 'dataset/collections/',
    'checkpoint_dir': 'saved/',
    'show_progress': True,
    'save_dataset': False,
    'log_wandb': True,
    'save_dataloaders': False,

    # data
    'field_separator': '\t',
    'seq_separator': ' ',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    # columns to load per file type (interaction / item / user)
    'load_col' : {'inter': ['user_id', 'item_id'],
                  'item': ['item_id', 'img', 'txt', 'price', 'txn'],
                  'user': ['user_id', 'num_txn', 'avg_price', 'hold_period']},

    # training
    'epochs': 50,
    'train_batch_size': 2048, # 2048
    'learner': 'adam',
    'learning_rate': 0.001, # 0.001
    'train_neg_sample_args': {'distribution': 'popularity',
                              'sample_num': 5,
                              'dynamic': False,
                              'candidate_num': 0},
    'eval_step': 1,
    'stopping_step': 3000, # 15
    'loss_decimal_place': 4,

    # evaluation
    'eval_args': {'group_by': 'user',
                #   'order': 'MY',
                #   'split': {'MY':'dataset/collections/azuki/split_indices.pkl'},
                  'mode': 'pop100'},
    'metrics': ['Recall', 'MRR', 'NDCG', 'Hit', 'MAP', 'Precision', 'GAUC'],
    'topk': [10, 20, 50],
    'valid_metric': 'Recall@50', # for early stopping
    'eval_batch_size': 4096, # 4096
    'metric_decimal_place': 4

}

# Convert parameter_dict to a YAML file.
# Create the target folder first: open(..., 'w') does not create missing
# directories, so a checkout without `config/` would raise FileNotFoundError.
config_path = r'config/fixed_config_context.yaml'
os.makedirs(os.path.dirname(config_path), exist_ok=True)
with open(config_path, 'w') as file:
    # yaml.dump writes to the stream and returns None, so nothing is kept.
    yaml.dump(parameter_dict, file)

# Example: Run models

In [6]:
from logging import getLogger
from recbole.config import Config
from recbole.utils import init_seed, init_logger
from recbole.data import create_dataset, data_preparation
from recbole.utils import get_model, get_trainer
# from recbole.trainer import HyperTuning
# from recbole.quick_start import objective_function
from recbole.data.dataset import Dataset

In [7]:
# Models to run in the loop below.
# Other model-name lists kept for reference:
#   ['BPR', 'DMF', 'NeuMF', 'NGCF', 'LightGCN']
#   ['FM', 'NFM', 'DeepFM', 'AFM', 'WideDeep', 'AutoInt', 'DCN']
MODEL_names = ['FM']

# Datasets each model is run on.
DATASET_names = ['bayc']

In [8]:
if __name__ == '__main__':

    for MODEL in tqdm(MODEL_names):
        # One single-column DataFrame per dataset is appended here.
        # Note: the list is rebuilt for every model and is not written
        # anywhere by this cell.
        test_result_list = []

        for DATASET in DATASET_names:

            # Build the run configuration from the context config file.
            config = Config(
                model=MODEL,
                dataset=DATASET,
                config_file_list=['config/fixed_config_context.yaml'],
            )

            # Initialise the random seed from the config.
            init_seed(config['seed'], config['reproducibility'])

            # Initialise logging and grab the root logger.
            init_logger(config)
            logger = getLogger()

            # Create and filter the dataset (atomic files -> Dataset), then log it.
            dataset = create_dataset(config)
            logger.info(dataset)

            # Split the dataset (Dataset -> train/valid/test dataloaders).
            train_data, valid_data, test_data = data_preparation(config, dataset)

            # Look up the model class, instantiate it and move it to the device.
            model_class = get_model(config['model'])
            model = model_class(config, train_data.dataset).to(config['device'])
            logger.info(model)

            # Look up the trainer class and instantiate it for this model.
            trainer_class = get_trainer(config['MODEL_TYPE'], config['model'])
            trainer = trainer_class(config, model)

            # (1) training
            best_valid_score, best_valid_result = trainer.fit(train_data, valid_data)

            # (2) testing
            # Needed when ItemCoverage metrics are computed: this sets
            # item_nums in eval_collector.
            trainer.eval_collector.data_collect(train_data)

            # Evaluate on the test split and print the metrics.
            test_result = trainer.evaluate(test_data)
            print('FINAL TEST RESULT')
            print(test_result)

            # Store the metrics as a one-column frame named after the dataset.
            result_frame = pd.DataFrame.from_dict(test_result, orient='index', columns=[DATASET])
            test_result_list.append(result_frame)
                

  0%|          | 0/1 [00:01<?, ?it/s]


AttributeError: module 'numpy' has no attribute 'float'.
`np.float` was a deprecated alias for the builtin `float`. To avoid this error in existing code, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

# Example: Hyperparameter optimization (HPO)

In [20]:
def objective_function(config_dict=None, config_file_list=None):
    """Train and evaluate one configuration and report its scores.

    The model and dataset are not parameters: they are read from the
    notebook-level variables ``MODEL`` and ``DATASET`` at call time, so both
    must be set before this function is invoked.

    Args:
        config_dict: Optional settings forwarded to ``Config`` as ``config_dict``.
        config_file_list: Optional config file paths forwarded to ``Config``
            as ``config_file_list``.

    Returns:
        dict with the keys ``model``, ``best_valid_score``,
        ``valid_score_bigger``, ``best_valid_result`` and ``test_result``.
    """
    run_config = Config(
        model=MODEL,
        dataset=DATASET,
        config_dict=config_dict,
        config_file_list=config_file_list,
    )
    init_seed(run_config['seed'], run_config['reproducibility'])

    # Build the dataset and split it into train / valid / test parts.
    full_dataset = create_dataset(run_config)
    train_split, valid_split, test_split = data_preparation(run_config, full_dataset)

    # Instantiate the model on the configured device, then its trainer.
    model_name = run_config['model']
    model_class = get_model(model_name)
    net = model_class(run_config, train_split.dataset).to(run_config['device'])
    trainer_class = get_trainer(run_config['MODEL_TYPE'], run_config['model'])
    fitter = trainer_class(run_config, net)

    # (1) training
    best_score, best_result = fitter.fit(train_split, valid_split, verbose=False)

    # (2) testing
    test_scores = fitter.evaluate(test_split)

    return dict(
        model=model_name,
        best_valid_score=best_score,
        valid_score_bigger=run_config['valid_metric_bigger'],
        best_valid_result=best_result,
        test_result=test_scores,
    )

In [21]:
# Models and datasets swept by the HPO loop below.
MODEL_names = ['NGCF']
DATASET_names = ['meebits']
# NOTE(review): in the HPO loop ITEM_CUT is only used to label output files;
# it is not passed to the tuner — confirm that is intended.
ITEM_CUT_list = [3]

# Folder that receives the per-run CSV summaries.
result_path = './result/'
# Create the folder if it is missing. exist_ok=True replaces the separate
# exists() check, avoiding the check-then-create race.
os.makedirs(result_path, exist_ok=True)

In [1]:
# HyperTuning is used below, but its import is commented out in the notebook's
# import cell; import it here so this cell does not raise NameError.
from recbole.trainer import HyperTuning

# Requires MODEL_names, DATASET_names, ITEM_CUT_list, result_path and
# objective_function to be defined by the earlier cells (run them first).
for MODEL in MODEL_names:
    for DATASET in tqdm(DATASET_names):
        # One single-column frame of test metrics per ITEM_CUT value.
        HPO_test_result_list = []
        for ITEM_CUT in ITEM_CUT_list:

            # NOTE(review): ITEM_CUT only appears in output file names here; it
            # is not passed to HyperTuning or objective_function, so each
            # iteration runs the same search — confirm intent.
            # NOTE(review): 'config/fixed_config_baseline.yaml' is not generated
            # by this notebook (it writes fixed_config_general.yaml and
            # fixed_config_context.yaml) — verify the file exists.
            hp = HyperTuning(objective_function=objective_function, algo='exhaustive',
                                max_evals=50, params_file=f'hyper/{MODEL}.hyper', fixed_config_file_list=['config/fixed_config_baseline.yaml'])

            # run
            hp.run()
            # export result to the file
            hp.export_result(output_file=f'hyper/{MODEL}_{DATASET}_{ITEM_CUT}.result')
            # print best parameters
            print('best params: ', hp.best_params)
            # save best parameters (yaml.dump returns None when given a stream)
            with open(f'hyper/{MODEL}_{DATASET}_{ITEM_CUT}.best_params', 'w') as file:
                yaml.dump(hp.best_params, file)
            # print best result
            best_result = hp.params2result[hp.params2str(hp.best_params)]
            print('best result: ')
            print(best_result)

            HPO_test_result_list.append(pd.DataFrame.from_dict(best_result['test_result'], orient='index', columns=[f'{DATASET}_{ITEM_CUT}']))

        # Skip the export when ITEM_CUT_list is empty: pd.concat rejects an
        # empty list and ITEM_CUT would be undefined for the file name.
        if HPO_test_result_list:
            # The file name uses the last ITEM_CUT of the inner loop, while the
            # CSV holds one column per ITEM_CUT value.
            pd.concat(HPO_test_result_list, axis=1).to_csv(result_path + f'{MODEL}_{DATASET}_{ITEM_CUT}.csv', index=True)

NameError: name 'MODEL_names' is not defined