In [1]:
import argparse
import ast
from audioop import rms
import logging
import os
import sys
from itertools import product
from time import localtime, sleep, strftime, time
import time  # keep AFTER the `from time import ...` line: later cells call time.time() on the module

import numpy as np
import setproctitle # to set the name of process
import torch
import torch.utils
from tensorboardX import SummaryWriter 
from torch import multiprocessing as mp # 多线程工作

from dataset import get_data_queue_cf, get_data_queue_cf_nonsparse, get_data_queue_efficiently, get_data_queue_negsampling_efficiently
from models import (CML, DELF, DMF, FISM, GMF, MLP, SVD, JNCF_Cat, JNCF_Dot, SVD_plus_plus, SPACE, BaseModel, Virtue_CF)
from controller import sample_arch_cf, sample_arch_cf_signal, sample_arch_cf_test
from train_eval import (evaluate_cf, evaluate_cf_efficiently, evaluate_cf_efficiently_implicit, get_arch_performance_cf_signal_param_device, get_arch_performance_single_device, train_single_cf, train_single_cf_efficiently,get_arch_performance_implicit_single_device)
from sklearn.utils import shuffle

import GPUtil
import socket
import math
import scipy.sparse as sp


%load_ext autoreload
%autoreload 2

In [2]:
# Notebook configuration, expressed as an argparse parser so the same options
# can be reused from a script. Inside the notebook the parser is fed an empty
# argv (see the parse_args([]) call below), so `args` holds the defaults.
parser = argparse.ArgumentParser(description="Run.")
parser.add_argument('--lr', type=float, default=0.05, help='init learning rate')
parser.add_argument('--arch_lr', type=float, default=0.05, help='learning rate for arch encoding')
parser.add_argument('--controller_lr', type=float, default=1e-1, help='learning rate for controller')
parser.add_argument('--weight_decay', type=float, default=1e-5, help='weight decay')
parser.add_argument('--update_freq', type=int, default=1, help='frequency of updating architeture')
parser.add_argument('--opt', type=str, default='Adagrad', help='choice of opt')
parser.add_argument('--use_gpu', type=int, default=1, help='whether use gpu')
parser.add_argument('--minibatch', type=int, default=1, help='whether use minibatch')
parser.add_argument('--gpu', type=int, default=0, help='gpu device id')
parser.add_argument('--train_epochs', type=int, default=2000, help='num of training epochs')
parser.add_argument('--search_epochs', type=int, default=1000, help='num of searching epochs')
parser.add_argument('--save', type=str, default='save/', help='experiment name')
parser.add_argument('--seed', type=int, default=1, help='random seed')
parser.add_argument('--grad_clip', type=float, default=5, help='gradient clipping')
parser.add_argument('--train_portion', type=float, default=0.5, help='portion of training data')
parser.add_argument('--valid_portion', type=float, default=0.25, help='portion of validation data')
parser.add_argument('--dataset', type=str, default='ml-100k', help='dataset')
parser.add_argument('--mode', type=str, default='random_single', help='search or single mode')
parser.add_argument('--process_name', type=str, default='AutoCF@wenyan', help='process name')
parser.add_argument('--embedding_dim', type=int, default=2, help='dimension of embedding')
parser.add_argument('--controller', type=str, default='PURE', help='structure of controller')
parser.add_argument('--controller_batch_size', type=int, default=4, help='batch size for updating controller')
parser.add_argument('--unrolled', action='store_true', default=True, help='use one-step unrolled validation loss')
# `--unrolled` is a store_true flag whose default is already True, so on its own
# it can never switch the option off. `--no-unrolled` writes False into the same
# destination; the default and the existing flag are unchanged.
parser.add_argument('--no-unrolled', dest='unrolled', action='store_false', default=True, help='disable one-step unrolled validation loss')
parser.add_argument('--max_batch', type=int, default=65536, help='max batch during training')
parser.add_argument('--device', type=int, default=0, help='GPU device')
parser.add_argument('--multi', type=int, default=0, help='using multi-training for single architecture')
parser.add_argument('--if_valid', type=int, default=1, help='use validation set for tuning single architecture or not')
parser.add_argument('--breakpoint', type=str, default='save/log.txt', help='the log file storing existing results')
parser.add_argument('--arch_file', type=str, default='src/arch.txt', help='all arches')
parser.add_argument('--remaining_arches', type=str, default='src/arch.txt', help='')
parser.add_argument('--arch_assign', type=str, default='[0,3]', help='')
parser.add_argument('--data_type', type=str, default='implicit', help='explicit or implicit(default)')
parser.add_argument('--loss_func', type=str, default='bprloss', help='Implicit loss function')
parser.add_argument('--mark', type=str, default='')  # NOTE(review): purpose of --mark is not documented in this notebook

# Parse an empty argv on purpose: the kernel's own sys.argv is ignored and
# every option takes its default.
args = parser.parse_args([])
# Select the 'spawn' start method for worker processes; force=True replaces
# any start method that was set earlier in this kernel.
mp.set_start_method('spawn', force=True)

In [10]:
# Resolve the dataset dimensions for args.dataset, store them on `args`, and
# build the train / valid / test queues.
data_start = time.time()
data_path = args.dataset + '/'

# (num_users, num_items) for every user-item dataset this notebook knows.
dataset_sizes = {
    'ml-100k': (943, 1682),  # default dataset
    'ml-1m': (6040, 3952),
    'ml-10m': (71567, 65133),  # commented-out alternative in the original: 715 / 653
    'ml-20m': (138493, 131262),
    'amazon-book': (11899, 16196),
    # commented-out alternative in the original: 26829 / 20344
    # original note: "31668, item_set: 1237259"
    'yelp': (31668, 38048),
    'yelp2': (15496, 12666),
    # yelp generated from: https://www.kaggle.com/yelp-dataset/yelp-dataset
    'yelp-10k': (9357 - 1, 4299),
    'yelp-50k': (42919 - 1, 9033),
    'yelp-100k': (80388 - 1, 11223),
    'yelp-1m': (551747 - 1, 28085),
    'yelp-10m': (1483546 - 1, 90315),
    'yelp-all': (1483546 - 1, 90315),
}

# (num_ps, num_qs, num_rs) for the datasets that use dim = 3 instead of 2.
three_way_sizes = {
    'youtube_small': (600, 14340, 5),
    'youtube': (15088, 15088, 5),
}

dim = 2
if args.dataset in three_way_sizes:
    num_ps, num_qs, num_rs = three_way_sizes[args.dataset]
    dim = 3

# The youtube datasets and any unknown name never defined num_users /
# num_items. On a fresh kernel that surfaced as a NameError below; on a reused
# kernel the sizes of a previously loaded dataset were silently picked up.
# Fail with an explicit message instead.
if args.dataset not in dataset_sizes:
    raise ValueError(
        "dataset {!r} has no num_users/num_items entry; expected one of {}".format(
            args.dataset, sorted(dataset_sizes)))

num_users, num_items = dataset_sizes[args.dataset]
args.num_users = num_users
args.num_items = num_items

if args.data_type == 'implicit':
    # Implicit recommendation: the path mainly used by this notebook.
    train_queue_pair, valid_queue, test_queue = get_data_queue_negsampling_efficiently(data_path, args)
else:
    # Explicit recommendation: a single train queue.
    train_queue, valid_queue, test_queue = get_data_queue_efficiently(data_path, args)
# print(train_queue)
# logging.logging.info('prepare data finish! [%f]' % (time()-data_start))
stored_arches = {}  # starts empty; nothing in the visible cells fills it

In [11]:
# Show the current contents of the `args` Namespace as this cell's output.
args

Namespace(arch_assign='[0,3]', arch_file='src/arch.txt', arch_lr=0.05, breakpoint='save/log.txt', controller='PURE', controller_batch_size=4, controller_lr=0.1, data_type='implicit', dataset='ml-100k', device=0, embedding_dim=2, gpu=0, grad_clip=5, if_valid=1, loss_func='bprloss', lr=0.05, mark='', max_batch=65536, minibatch=1, mode='random_single', multi=0, num_items=1682, num_users=943, opt='Adagrad', process_name='AutoCF@wenyan', remaining_arches='src/arch.txt', save='save/', search_epochs=1000, seed=1, train_epochs=2000, train_portion=0.5, unrolled=True, update_freq=1, use_gpu=1, valid_portion=0.25, weight_decay=1e-05)

In [23]:
# Initialise the search bookkeeping and load the slice of architecture
# encodings this run is responsible for.
search_start = time.time()
performance = {}
best_arch, best_rmse = None, 100000
arch_batch_size = 1
# Never search for more epochs than SPACE (original note: SPACE=135).
args.search_epochs = min(args.search_epochs, SPACE)

# One architecture encoding per line of args.remaining_arches
# (default 'src/arch.txt'). The context manager closes the file handle, which
# the bare open(...).readlines() call left open.
with open(args.remaining_arches, 'r') as arch_file:
    remaining_arches_encoding = [line.strip() for line in arch_file]

# Hard-coded here, so whatever value the parser produced is overridden.
args.arch_assign = '[0,3]'
if args.arch_assign:
    print("args.arch_assign: {}".format(args.arch_assign))
    # literal_eval only accepts Python literals, so a malformed or hostile
    # string cannot execute code the way eval() would.
    start, end = ast.literal_eval(args.arch_assign)
    remaining_arches_encoding = remaining_arches_encoding[start:end]
# An empty arch_assign keeps the full list unchanged.
arch_count = 0
print("remaining_arches_encoding: {}".format(remaining_arches_encoding))

args.arch_assign: [0,3]
remaining_arches_encoding: ['ri_mat_mat_max_i', 'rr_mlp_mlp_concat_i', 'ri_mlp_mat_min_mlp']


In [24]:
# One unrolled step of what was a search loop: take the next encoding, turn it
# into an architecture dict and register it in `performance`.
# The original loop had `break` guards before and after this step
# (arch_count >= len(remaining_arches_encoding), and
# len(performance) >= len(remaining_arches_encoding)); in this cell they were
# reduced to no-ops, so they are only recorded here.

# Encoding layout: u/i.enc _ u.emb _ i.emb _ interaction _ pre
arch_encoding = remaining_arches_encoding[arch_count]
arch_single = sample_arch_cf()
(
    arch_single['cf'],
    arch_single['emb']['u'],
    arch_single['emb']['i'],
    arch_single['ifc'],
    arch_single['pred'],
) = arch_encoding.split('_')

performance[str(arch_single)] = 0
arch_count += 1

In [32]:
# Per-architecture setup: start a timer, pick the GPU ids for this machine and
# build the (learning rate, rank) grid.
arch_start = time.time()
hostname = socket.gethostname()
print("hostname: {}".format(hostname))

# GPU ids per known host; any host not listed falls back to [0, 1, 2, 3].
avaliable_device_ids = {
    'rl3': [0, 1, 2, 3],
    'fib-dl': [0, 1, 2, 3],
    'fib-dl3': [2, 3, 5],
    'fib': [0, 1, 2, 3],
    'rl2': [0, 1, 2, 3],
    'abc': [0, 1, 2, 3, 4, 5, 6, 7],  # commented-out alternative in the original: [4,5,6,7]
}.get(hostname, [0, 1, 2, 3])

lr_candidates = [0.01, 0.02, 0.05, 0.1]
rank_candidates = [2, 4, 8, 16]
# Cartesian product, learning rate varying slowest.
hyper_parameters = [(lr, rank) for lr in lr_candidates for rank in rank_candidates]
run_one_model = 0

hostname: abc


In [38]:
# Ask GPUtil which GPUs are usable right now and cut the hyper-parameter
# indices into batches of one task per GPU.
#
# The original cell queried GPUtil, applied per-hostname overrides ('fib',
# 'abc') and then queried GPUtil again, so the first query and the overrides
# never had any effect. Only the effective query is kept.
avaliable_device_ids = GPUtil.getAvailable(order = 'first', limit = 8, maxLoad = 0.5, maxMemory = 0.2, includeNan=False, excludeID=[], excludeUUID=[])
print(avaliable_device_ids)
if not avaliable_device_ids:
    # Without this guard the split below fails with a bare ZeroDivisionError.
    raise RuntimeError('GPUtil reported no available GPU (maxLoad=0.5, maxMemory=0.2); cannot split tasks across devices')
assigned_device_ids = avaliable_device_ids
# (The original had a no-op `if run_one_model > 0` guard here, a leftover loop `break`.)

# One task per hyper-parameter combination; this was a hard-coded 16, which
# only matched a 4 x 4 grid.
num_tasks = len(hyper_parameters)
task_number = math.ceil(num_tasks / len(avaliable_device_ids))
print("task_number per GPU: {}".format(task_number))
# Batch boundaries: 0, k, 2k, ... plus the closing bound num_tasks.
task_split = list(range(0, num_tasks, len(avaliable_device_ids)))
task_split.append(num_tasks)
print("task_split: {}".format(task_split))
task_index = [list(range(task_split[i], task_split[i+1])) for i in range(task_number)]

[0, 1, 2, 3, 4]
task_number per GPU: 4
task_split: [0, 5, 10, 15, 16]


In [51]:
# For each batch of task indices: open a pool with one worker per task and
# assemble the argument list of every job. The jobs are only built and
# counted; the p.map call that would run them is commented out.
for tasks in task_index:
    print("tasks: {}".format(tasks))
    with mp.Pool(processes=len(tasks)) as p:
        print('Stage1, p: {}'.format(p))
        p.name = 'test'
        jobs = []
        for task_id in tasks:
            # Devices are handed out round-robin by task index.
            device_id = assigned_device_ids[task_id % len(assigned_device_ids)]
            if args.data_type == 'implicit':
                # Implicit data: an empty list is passed in the train_queue
                # position and train_queue_pair follows test_queue.
                job = [arch_single, num_users, num_items, [], valid_queue, test_queue, train_queue_pair, args, hyper_parameters[task_id], device_id, args.if_valid]
            else:
                job = [arch_single, num_users, num_items, train_queue, valid_queue, test_queue, args, hyper_parameters[task_id], device_id, args.if_valid]
            jobs.append(job)
        # rmse_list1 = p.map(get_single_model_performance, jobs)
        print(len(jobs))
        # print(jobs[0])
        run_one_model += 1
        p.close()
# Original note: the loop here has 16 (entries).


tasks: [0, 1, 2, 3, 4]
Stage1, p: <multiprocessing.pool.Pool state=RUN pool_size=5>
5
[{'cf': 'ri', 'emb': {'u': 'mat', 'i': 'mat'}, 'ifc': 'max', 'pred': 'i'}, 943, 1682, [], [tensor([373, 221, 863,  ...,  57, 790, 129]), tensor([264,   3, 482,  ..., 474, 287, 251]), tensor([5., 3., 5.,  ..., 5., 3., 5.]), tensor([[0., 3., 4.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [5., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 5., 0.,  ..., 0., 0., 0.]]), tensor([[0., 0., 0.,  ..., 5., 0., 0.],
        [3., 0., 0.,  ..., 0., 0., 5.],
        [4., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])], [tensor([  0,   0,   0,  ..., 942, 942, 942]), tensor([ 42, 128,  89,  ..., 187, 228, 126]), tensor([[5., 0., 0.,  ..., 0., 0., 0.],
        [4., 0., 0.,  ..., 0., 0., 0.],
       

In [52]:
# Show the list of per-batch task index lists as this cell's output.
task_index

[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15]]

In [36]:
# Inspect the current search state: the selected encoding with its decoded
# architecture, then the performance table and the hyper-parameter grid, one
# per line. The cell output is the task batches.
print(arch_encoding, arch_single)
print(performance, hyper_parameters, sep='\n')
task_index

ri_mat_mat_max_i {'cf': 'ri', 'emb': {'u': 'mat', 'i': 'mat'}, 'ifc': 'max', 'pred': 'i'}
{"{'cf': 'ri', 'emb': {'u': 'mat', 'i': 'mat'}, 'ifc': 'max', 'pred': 'i'}": 0}
[(0.01, 2), (0.01, 4), (0.01, 8), (0.01, 16), (0.02, 2), (0.02, 4), (0.02, 8), (0.02, 16), (0.05, 2), (0.05, 4), (0.05, 8), (0.05, 16), (0.1, 2), (0.1, 4), (0.1, 8), (0.1, 16)]


[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15]]