In [1]:
import torch
import torch.optim as optim
import pickle
from library_models import *
from library_data import *
import numpy as np
from torch.utils.tensorboard import SummaryWriter

In [4]:
class Fakeargs:
    """Empty attribute container used in place of a parsed command-line namespace."""


# Populate the stand-in namespace in one go; keyword order fixes attribute order.
args = Fakeargs()
vars(args).update(
    model="Jodie",
    network="public_dataset_ready_reduced",
    gpu=-1,
    embedding_dim=8,
    train_proportion=0.8,
    state_change=True,
    epoch=3,
)
# The CSV path is derived from the network name chosen above.
args.datapath = "data/%s.csv" % args.network

In [5]:
# LOAD NETWORK
# load_network(args) hands back ten objects; they are unpacked positionally below.
(
    user2id,
    user_sequence_id,
    user_timediffs_sequence,
    user_previous_itemid_sequence,
    item2id,
    item_sequence_id,
    item_timediffs_sequence,
    timestamp_sequence,
    feature_sequence,
    y_true,
) = load_network(args)

# Basic dataset sizes derived from the loaded sequences.
num_interactions = len(user_sequence_id)
num_features = len(feature_sequence[0])
num_users = len(user2id)
num_items = len(item2id) + 1  # one more than the number of item ids (reason not visible in this cell)
# Ratio of all labels to positive labels; the +1 presumably guards against a zero denominator.
true_labels_ratio = len(y_true) / (sum(y_true) + 1)
print(
    "*** Network statistics:\n  %d users\n  %d items\n  %d interactions\n  %d/%d true labels ***\n\n"
    % (num_users, num_items, num_interactions, sum(y_true), len(y_true))
)

# Define parameters of training.
# NOTE: this name is not read anywhere in this cell; the boundaries below use args.train_proportion.
train_proportion = 0.8

# SET TRAIN, VALIDATION, AND TEST BOUNDARIES
# Validation starts where training ends; test covers the slice from +0.1 to +0.2 of the data.
train_end_idx = int(num_interactions * args.train_proportion)
validation_start_idx = train_end_idx
test_start_idx = int(num_interactions * (args.train_proportion + 0.1))
test_end_idx = int(num_interactions * (args.train_proportion + 0.2))

# SET BATCHING TIMESPAN
# Timespan indicates how frequently the model is run and updated.
# All interactions in one timespan are processed simultaneously.
# Longer timespans mean more interactions are processed and the training time is reduced,
# however it requires more GPU memory.
# At the end of each timespan, the model is updated as well, so a longer timespan means
# less frequent model updates.
timespan = timestamp_sequence[-1] - timestamp_sequence[0]
tbatch_timespan = timespan / 500  # split the covered time range into 500 equal batches

# INITIALIZE MODEL PARAMETERS
# Both the model and the class weights are moved to the GPU unconditionally via .cuda().
model = JODIE(args, num_features, num_users, num_items).cuda()
weight = torch.Tensor([1, true_labels_ratio]).cuda()
crossEntropyLoss = nn.CrossEntropyLoss(weight=weight)
MSELoss = nn.MSELoss()

# INITIALIZE MODEL
learning_rate = 1e-3
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

# LOAD THE MODEL
# Restore the checkpoint for epoch args.epoch; load_model also returns the saved embeddings.
(
    model,
    optimizer,
    user_embeddings_dystat,
    item_embeddings_dystat,
    user_embeddings_timeseries,
    item_embeddings_timeseries,
    train_end_idx_training,
) = load_model(model, optimizer, args, args.epoch)



**** Loading public_dataset_ready_reduced network from file: data/public_dataset_ready_reduced.csv ****
Formating item sequence
Formating user sequence
Scaling timestamps
*** Network loading completed ***


*** Network statistics:
  39 users
  43 items
  253973 interactions
  22215/253973 true labels ***


*** Initializing the JODIE model ***
Initializing user and item embeddings
Initializing user and item RNNs
Initializing linear layers
*** JODIE initialization complete ***


Loading saved embeddings and model: ./saved_models/public_dataset_ready_reduced/checkpoint.Jodie.ep3.tp0.8.pth.tar


In [28]:
# Pick a reproducible random sample of distinct users and track their dynamic embeddings.
# To follow hand-picked users instead, assign random_users an explicit array of user ids.
NUM_TRACKED_USERS = 8  # number of users to follow
RANDOM_SEED = 0        # fixed seed so Restart & Run All selects the same users

user_ids_per_interaction = np.asarray(user_sequence_id)
interaction_labels = np.asarray(y_true)

# Sample without replacement from the ids that actually occur: every user (including the
# one with the highest id) is eligible, and no id can be drawn twice and silently collapse
# in the dictionaries built below.
candidate_users = np.unique(user_ids_per_interaction)
random_users = np.random.default_rng(RANDOM_SEED).choice(
    candidate_users,
    size=min(NUM_TRACKED_USERS, candidate_users.size),
    replace=False,
)

# Row indices of the interactions belonging to each sampled user, computed once and reused.
user_interaction_idx = {
    user: np.where(user_ids_per_interaction == user)[0] for user in random_users
}

# Labels of each interaction for this user.
user_labels = {user: interaction_labels[idx] for user, idx in user_interaction_idx.items()}
print([f"user: {user}: {sum(labels)}/{len(labels)}" for user, labels in user_labels.items()])

# Series of user embeddings, one row per interaction of that user.
user_embeddings = {
    user: user_embeddings_timeseries[idx, :] for user, idx in user_interaction_idx.items()
}

['user: 32: 0/53', 'user: 18: 0/54', 'user: 29: 0/65', 'user: 24: 0/75', 'user: 1: 0/20912', 'user: 12: 3293/4782', 'user: 6: 0/324', 'user: 16: 0/582']


In [29]:
# Create deltas for user embedding sequences: the Euclidean distance between each
# embedding and the one that follows it, giving one value per consecutive pair.
deltas_dict = {}
for user, embeddings in user_embeddings.items():
    # Convert to a host-side numpy array once, so the arithmetic below is pure numpy
    # instead of mixing a torch tensor with a numpy array.
    embeddings_np = embeddings.detach().cpu().numpy()
    # Row i holds ||e_i - e_{i+1}||; a user with fewer than two embeddings gets an empty series.
    deltas_dict[user] = np.linalg.norm(embeddings_np[:-1, :] - embeddings_np[1:, :], axis=1)


In [30]:
# Load attack category labels for this dataset.
# NOTE: pickle.load can execute arbitrary code while deserialising; only use a trusted file here.
with open("data/reduced_dataset_attack_labels", "rb") as fp:
    attack_labels = pickle.load(fp)

# For every tracked user, collect the attack category of each of that user's interactions.
cats = {
    user: [attack_labels[i] for i in np.where(user_sequence_id == user)[0]]
    for user in random_users
}

In [31]:
# Prepare metadata for TensorBoard: one label per embedding row, in the same user order
# (and within-user order) as the rows of user_embeddings.
# Labels carry no trailing newline: add_embedding writes one metadata entry per line itself,
# so an embedded "\n" would put a blank line after every label and the metadata file would
# have twice as many rows as there are embedding points.
metada = [
    f"user_{user}_{i}_label_{user_labels[user][i]}_{cats[user][i]}"
    for user, embeddings in user_embeddings.items()
    for i in range(embeddings.shape[0])
]

In [32]:
# Stack the per-user embedding series row-wise into one tensor, in dictionary order.
embeddings = torch.cat(list(user_embeddings.values()), dim=0)

In [33]:
# Log the stacked embeddings (with their labels) for the TensorBoard projector.
writer = SummaryWriter()
writer.add_embedding(embeddings, metadata=metada)

# Log each user's delta series as a scalar curve tagged with the user id.
for user, deltas in deltas_dict.items():
    tag = str(user)
    for step, delta in enumerate(deltas):
        writer.add_scalar(tag, delta, step)

In [34]:
# Close the TensorBoard SummaryWriter created in the previous cell once all logging is done.
writer.close()