https://github.com/pcerdam/KerasGRU4Rec/blob/master/model/gru4rec.py

In [3]:
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm

import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True #allow_growth 옵션 : 런타임 할당에 따라 GPU 메모리 할당하려고 시도하는 옵션.
# 처음에 메모리를 거의 할당하지 않고 세션을 시작한 후, 그 후에 더 많은 GPU 메모리가 필요할 때 TF필요한 GPU 메모리 영역을 확장한다.

import keras
import keras.backend as K
from keras.models import Model
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.losses import categorical_crossentropy
from keras.layers import Input, Dense, Dropout, CuDNNGRU, Embedding

In [7]:
class SessionDataset :
    """Credit to yhs-968/pyGRU4REC."""
    
    def __init__(self, data, session_key = 'SessionId', item_key = 'ItemId', time_key = 'Time',
                n_samples = -1, itemmap = None, time_sort = False) :
        
        """
        Args:
            path: path of the data file
            session_key, item_key, time_key: name of the fields corresponding to the sessions, items, time
            n_samples: the number of samples to use. If -1, use the whole dataset.
            itemmap: mapping between item IDs and item indices
            time_sort: whether to sort the sessions by time or not
        """
        
        self.df = data
        self.session_key = session_key
        self.item_key = item_key
        self.time_sort = time_sort
        self.add_time_indices(itemmap=itemmap)
        self.df.sort_values([session_key, time_key], inplace = True)
        
        # new method
        self.click_offsets = self.get_click_offsets() #각 sessionID 별 시작 index
        self.session_idx_arr = self.order_session_idx()
        
    def get_click_offsets() :
        
        """
        Return the offsets of the beginning clicks of each session IDs,
        where the offset is calculated against the first click of the first session ID.
        """
        
        offsets = np.zeros(self.df[self.session_key].nunique() + 1, dtype = np.int32)
        offsets[1:] = self.df.groupby(self.session_key).size().cumsum()
        
        return offsets
    
    def order_session_idx(self) :
        """order the session indices"""
        
        if self.time_sort :
            # 각 세션별 시작 타임의 최솟값
            session_start_time = self.df.groupby(self.session_key)[self.time_key].min().values
            # 최솟값들 중에서 작은 것들 순서
            session_idx_arr = np.argsort(session_start_time)
        
        else :
            #0~
            session_idx_arr = np.arange(self.df[self.session_key].nunique())
            
        return session_idx_arr
    
    def add_time_indices(self, itemmap = None) :
        """ 
        Add item index column named "item_idx" to the df
        Args:
            itemmap (pd.DataFrame): mapping between the item Ids and indices
        """
        
        if itemmap is None :
            item_ids = self.df[self.item_key].unique() #unique item ids
            item2idx = pd.Series(data = np.arange(len(item_ids)),
                                index = item_ids) # make index
            itemmap = pd.DataFrame({self.item_key : item_ids,
                                   'item_idx' : item2idx[item_ids].values})
            
        self.itemmap = itemmap
        self.df = pd.merge(self.df, self.itemmap, on = self.item_key, how = 'inner')
        
    @property
    def items(self) :
        return self.itemmap.ItemId.unique()

In [8]:
class SessionDataLoader :
    """Credit to yhs-968/pyGRU4REC."""
    def __init__(self, dataset, batch_size = 50) :
        """
        A class for creating session-parallel mini-batches.
        Args:
            dataset (SessionDataset): the session dataset to generate the batches from
            batch_size (int): size of the batch
        """
        
        self.dataset = dataset
        self.batch_size = batch_size
        self.done_sessions_counter = 0
        
        
    def __iter__(self) :
        """ Returns the iterator for producing session-parallel training mini-batches.
        Yields:
            input (B,):  Item indices that will be encoded as one-hot vectors later.
            target (B,): a Variable that stores the target item indices
            masks: Numpy array indicating the positions of the sessions to be terminated
        """
        
        df = self.dataset.df #???????
        session_key = 'SessionId'
        item_key='ItemId'
        time_key='TimeStamp'
        self.n_items = df[item_key].nunique()+1
        click_offsets = self.dataset.click_offsets
        session_idx_arr = self.dataset.session_idx_arr
        
        iters = np.arange(self.batch_size) #batch size
        maxiter = iters.max()
        
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        mask = []
        
        finished = False
        
        while not finished:
            minlen = (end - start).min()
            # Item indices (for embedding) for clicks where the first sessions start
            idx_target = df.item_idx.values[start]
            for i in range(minlen - 1):
                # Build inputs & targets
                idx_input = idx_target
                idx_target = df.item_idx.values[start + i + 1]
                inp = idx_input
                target = idx_target
                yield inp, target, mask
                
            # click indices where a particular session meets second-to-last element
            start = start + (minlen - 1)
            # see if how many sessions should terminate
            mask = np.arange(len(iters))[(end - start) <= 1]
            self.done_sessions_counter = len(mask)
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]
        

In [10]:
def create_model() :
    emb_size = 50
    hidden_units = 100
    size = emb_size
    
    inputs = Input(batch_shape=(args.batch_size, 1, args.train_n_items)) #len(args.train_data['ItemId'].unique()) + 1
    #input : batch_size, 1, ItemId unique
    
    gru, gru_states = CuDNNGRU(hidden_units, stateful=True, return_state=True)(inputs) #Fast GRU implementation backed by cuDNN
    #stateful :  If True, the last state for each sample at index i in a batch will be used as initial state for the sample of index i in the following batch.
    #return_state :  Whether to return the last state in addition to the output.
    
    drop2 = Dropout(0.25)(gru)
    predictions = Dense(args.train_n_items, activation = 'softmax')(drop2)
    opt = keras.optimizers.Adam(lr = 0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay = 0, amsgrad = False)
    
    #loss function
    model.compile(loss = categorical_crossentropy, optimizer = opt)
    model.summary()
    
    filepath = './model_checkpint.h5'
    checkpoint = ModelCheckpoint(filepath, monitor = 'loss', verbose = 2, save_best_only=True, mode = 'min')
    #Save the model after every epoch
    #save_best_only : the latest best model according tothe quantity monitored will not be overwritten
    #mode : save min model
    
    return model

In [12]:
def get_state(model) :
    # get_value : Returns the value of a variable.
    # state_updates
    return [K.get_value(s) for s, _ in model.state_updates]

In [15]:
def get_metrics(model, args, train_generator_map, recall_k = 20, mrr_k = 20) :
    
    test_dataset = SessionDataset(args.test_data, itemmap=train_generator_map)
    test_generator = SessionDataLoader(test_dataset, batch_size=args.batch_size)
    
    n = 0
    rec_sum = 0; mrr_sum = 0
    
    with tqdm(total = args.test_samples_qty) as pbar :
        for feat, label, mask in test_generator :
            
            # to_categorical : Converts a class vector (integers) to binary class matrix. (one-hot)
            # - num_classes: total number of classes(데이터수 X 클래스 수)
            target_oh = to_categorical(label, num_classes = args.train_n_items)
            input_oh = to_categorical(feat, num_classes = args.train_n_items)
            input_oh = np.expand_dims(input_oh, axis = 1)
            
            pred = model.predict(input_oh, batch_size = args.batch_size)
            
            for row_idx in rnage(feat.shape[0]) :
                pred_row = pred[row_idx]
                label_row = target_oh[row_idx]
                
                rec_idx = pred_row.argsort()[-recall_k:][::-1]
                mrr_idx = pred_row.argsort()[-mrr_k:][::-1]
                tru_idx = label_row.argsort()[-1:][::-1]
                
                n += 1
                
                if tru_idx[0] in rec_idx :
                    rec_sum += 1
                    
                if tru_idx[0] in mrr_idx :
                    mrr_sum += 1/int((np.where(mrr_idx == tru_idx[0])[0] + 1))
                    
            pbar.set_description('Evaluation model')
            pbar.update(test_generator.done_sessions_counter)
            
    recall = rec_sum / n
    mrr = mrr_sum / n
    
    return (recall, recall_k), (mrr, mrr_k)

In [16]:
def train_model(model, args, save_weights = False) :
    train_dataset = SessionDataset(args.train_data)
    model_to_train = model
    batch_size = args.batch_size
    
    for epoch in range(1, 10):
        with tqdm(total=args.train_samples_qty) as pbar:
            loader = SessionDataLoader(train_dataset, batch_size=batch_size)
            for feat, target, mask in loader:
                
                real_mask = np.ones((batch_size, 1))
                for elt in mask:
                    real_mask[elt, :] = 0

                hidden_states = get_states(model_to_train)[0]
                hidden_states = np.multiply(real_mask, hidden_states)
                hidden_states = np.array(hidden_states, dtype=np.float32)
                model_to_train.layers[1].reset_states(hidden_states)

                input_oh = to_categorical(feat, num_classes=loader.n_items) 
                input_oh = np.expand_dims(input_oh, axis=1)

                target_oh = to_categorical(target, num_classes=loader.n_items)

                tr_loss = model_to_train.train_on_batch(input_oh, target_oh)

                pbar.set_description("Epoch {0}. Loss: {1:.5f}".format(epoch, tr_loss))
                pbar.update(loader.done_sessions_counter)
            
        if save_weights:
            print("Saving weights...")
            model_to_train.save('./GRU4REC_{}.h5'.format(epoch))
        
        (rec, rec_k), (mrr, mrr_k) = get_metrics(model_to_train, args, train_dataset.itemmap)

        print("\t - Recall@{} epoch {}: {:5f}".format(rec_k, epoch, rec))
        print("\t - MRR@{}    epoch {}: {:5f}".format(mrr_k, epoch, mrr))
        print("\n")

In [23]:
os.listdir('./data/preprocessed/')

['rsc15_train_tr.pkl',
 'rsc15_test.pkl',
 'rsc15_train_full.pkl',
 'rsc15_train_valid.txt']

# main

In [29]:
train_path = './data/preprocessed/rsc15_train_tr.pkl'
dev_path = './data/preprocessed/rsc15_train_valid.pkl'
test_path = './data/preprocessed/rsc15_test.pkl'
batch_size = 512

https://worthpreading.tistory.com/56

In [41]:
parser = argparse.ArgumentParser(description='Keras GRU4REC: session-based recommendations')
parser.add_argument('--resume', type=str, help='stored model path to continue training')
parser.add_argument('--train-path', type=str, default='./data/preprocessed/rsc15_train_tr.pkl')
parser.add_argument('--dev-path', type=str, default='./data/preprocessed/rsc15_train_valid.pkl')
parser.add_argument('--test-path', type=str, default='./data/preprocessed/rsc15_test.pkl')
parser.add_argument('--batch-size', type=str, default=512)

_StoreAction(option_strings=['--batch-size'], dest='batch_size', nargs=None, const=None, default=512, type=<class 'str'>, choices=None, help=None, metavar=None)

In [42]:
args = parser.parse_args()

usage: ipykernel_launcher.py [-h] [--resume RESUME] [--train-path TRAIN_PATH]
                             [--dev-path DEV_PATH] [--test-path TEST_PATH]
                             [--batch-size BATCH_SIZE]
ipykernel_launcher.py: error: unrecognized arguments: -f /run/user/1005/jupyter/kernel-1d23836a-d7a7-456a-8beb-53d20179f9ac.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [34]:
train_data = pd.read_pickle(train_path)
dev_data   = pd.read_pickle(dev_path)
test_data  = pd.read_pickle(test_path)

In [37]:
train_data.head()

Unnamed: 0,SessionId,ItemId,Time
0,1,214536502,1396835000.0
1,1,214536500,1396836000.0
2,1,214536506,1396836000.0
3,1,214577561,1396836000.0
4,2,214662742,1396847000.0


In [38]:
train_n_items = len(train_data['ItemId'].unique()) + 1

train_samples_qty = len(train_data['SessionId'].unique()) + 1
test_samples_qty = len(test_data['SessionId'].unique()) + 1

print('train data SessionID unique + 1', train_samples_qty)
print('tetst data SessionID unique + 1', test_samples_qty)

train data SessionID unique + 1 7953886
tetst data SessionID unique + 1 15325


In [40]:
#if resume training data -> resume = True
resume = False

if resume:
    try:
        model = keras.models.load_model(resume)
        print("Model checkpoint '{}' loaded!".format(resume))
    except OSError:
        print("Model checkpoint could not be loaded. Training from scratch...")
        model = create_model()
else:
    model = create_model()

TypeError: create_model() missing 1 required positional argument: 'args'

In [None]:
train_model(model, args, save_weights=True)