In [1]:
import tensorflow as tf
import numpy as np
import random
from sklearn.model_selection import KFold
from load_data import load_toy, load_blogcatalog
from keras.layers import Dense, Dropout
from copy import deepcopy
from tensorflow.keras.models import Sequential
import sys

2024-01-03 18:39:35.470664: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:

dataset_name = "BlogCatalog"
data_dir = "../Data/" + dataset_name

total_graph = load_blogcatalog(data_dir)
#total_graph = load_toy(data_dir)


In [3]:

def get_similarity(total_graph):
    """Construct a dict of similar nodes, i.e. ones that are within distance 3 of each other"""
    similarity_dict = {i:set() for i in range(total_graph['N_nodes'])}
    for i in range(total_graph['N_nodes']):
        if i%1000==0:
            print(i)
        nb = total_graph['edges'][i]
        similarity_dict[i].update(nb)
        for n in nb:
            new_neighbors = total_graph['edges'][n]
            similarity_dict[i].update(new_neighbors)
            #for j in new_neighbors:
            #    similarity_dict[i].update(total_graph['edges'][j])       
    return similarity_dict

sim_dict = get_similarity(total_graph)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


In [11]:
def get_node_features(total_graph, mode="degree"):
    if mode=="degree":
        node_features = np.zeros((total_graph['N_nodes'],2))
        for i in range(total_graph['N_nodes']):
            neighbors = total_graph['edges'][i]
            degree = len(neighbors)
            node_features[i,0] = degree
            second_neighbors_count = 0
            for n in neighbors:
                second_neighbors_count += len(total_graph['edges'][n])
            node_features[i,1] = second_neighbors_count
    elif mode=="degree_dist":
        node_features = total_graph['adj_matrix']  
    elif mode=="node_nr":
        node_features = np.zeros((total_graph['N_nodes'],total_graph['N_nodes']))
        for i in range(total_graph['N_nodes']):
            node_features[i,i] = 1
    return node_features


def concat(a, b):
    return tf.concat([a, b], axis=0)



def get_positive_samples(batch_nodes, edge_dict):
    """perform a random walk of length 5"""
    pos_samples = [0]*len(batch_nodes)
    for i,node in enumerate(batch_nodes):
        v = node
        for w in range(5):
            v = random.choice(edge_dict[v])
        pos_samples[i] = v
    return pos_samples


def get_negative_samples(total_graph, batch_nodes, Q):
    batch_size = len(batch_nodes)
    neg_ind = np.zeros((batch_size, Q), dtype=int)     
    for j in range(Q):
        for i, v in enumerate(batch_nodes):
            found_negative_edge = False
            while not found_negative_edge:
                node = random.choice(batch_nodes)
                if total_graph['adj_matrix'][node, v]==0 and total_graph['adj_matrix'][v, node]==0:
                    neg_ind[i, j] = node
                    found_negative_edge = True
    return neg_ind


def compute_neighborhoods(edge_dict, N_nodes, nb_size):
    neighborhoods = [[] for i in range(N_nodes)]
    for v, neighbors in edge_dict.items():
        nb = len(neighbors)
        sample_size = tf.minimum(nb_size, nb)
        if sample_size==1:
            sample_neighborhood = [edge_dict[v][0]]
        else:
            neighborhood_ind = tf.random.uniform((sample_size,), maxval=nb-1, dtype=tf.int32)
            sample_neighborhood = tf.gather(neighbors, neighborhood_ind).numpy().tolist()
        neighborhoods[v] = sample_neighborhood
    return neighborhoods


def batch_forward(node_features, batch_nodes, K, model, neighborhoods):
    eps = 1e-9
    B = [[] for k in range(K)]
    B[K-1] = batch_nodes[:]
    for k in range(K-1, 0, -1):
        B[k-1] = B[k][:]
        for node in B[k][:]:
            B[k-1].extend(neighborhoods[node])

    h = node_features 
    N_nodes = h.shape[0]
    for k in range(K):
        h_updated = tf.TensorArray(dtype=tf.float32, size=N_nodes)
        for v in B[k]:        
            neighborhood = neighborhoods[v]
            hv = tf.reshape(h[v], (1, -1))
            hN = tf.gather(h, neighborhood)
            conc = tf.concat([hN, hv], axis=0)
            aggregated = tf.reduce_mean(conc, axis=0, keepdims=True) #tf.cast(tf.reduce_mean(conc, axis=0, keepdims=True), dtype=tf.float32)
            #layer = model.layers[k]
            #hv = layer(aggregated)
            hv = model(aggregated)
            h_updated = h_updated.write(v, hv)

        h = tf.squeeze(h_updated.stack())
        h_updated.mark_used()
        # Some implementations don't use normalize and relu at final step?
        #if k<K-1:
        #    h = h / (tf.norm(h, axis=1, keepdims=True)+eps)
    # we return entire h, containing 0 for the nodes outside batch
    return h


def compute_loss(Z_tot, Z_pos, Q, batch_nodes, negative_samples_ind):
    Z = tf.gather(Z_tot, batch_nodes)
    dot = tf.reduce_sum(tf.math.multiply(Z, Z_pos), axis=1)
    term1 = -tf.math.log(tf.math.sigmoid(dot))
    term2 = 0
    for i in range(Q):
        Z_neg = tf.gather(Z_tot, negative_samples_ind[:,i])
        term2 -= tf.math.log(tf.math.sigmoid(-tf.reduce_sum(tf.math.multiply(Z, Z_neg), axis=1)))
    #loss = term1 + term2
    return term1, term2


In [12]:

def split_into_random_batches(my_list, N):
    random.shuffle(my_list)  # Shuffle the list in-place
    return [my_list[i:i + N] for i in range(0, len(my_list), N)]


K = 1
def train_embedding(total_graph, epochs):
    node_features_og = get_node_features(total_graph, "degree_dist")    #total_graph['node_feats']
    feature_dim = node_features_og.shape[1]
    output_dim = 64
    fixed_nb_size = 10
    num_epochs = epochs
    batch_size = 256
    Q = 1
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, epsilon=1e-6) 

    # define model
    model = Sequential()
    model.add(tf.keras.Input(shape=(feature_dim,)))
    for k in range(K-1):
        model.add(Dense(units=output_dim, activation='relu', bias_initializer="zeros"))         
    model.add(Dense(units=output_dim, activation='relu', bias_initializer="zeros"))   

    # Build the model by passing an input vector
    model(node_features_og[0].reshape((1,feature_dim)))

    loss_over_epochs = []
    neighborhoods = compute_neighborhoods(total_graph['edges'], total_graph['N_nodes'], fixed_nb_size)
    for epoch in range(num_epochs):   
        batches = split_into_random_batches(total_graph['nodes'], batch_size)
        node_features = deepcopy(node_features_og)
        print(len(batches))
        for i, batch_nodes in enumerate(batches):
            pos_samples = get_positive_samples(batch_nodes, total_graph['edges'])  # has shape (N_nodes,)
            negative_samples_ind = get_negative_samples(total_graph, batch_nodes, Q)  if Q else None  # has shape (N_nodes, Q)  
            Z_pos = tf.gather(batch_forward(node_features, pos_samples, K, model, neighborhoods), pos_samples)
            with tf.GradientTape() as tape:
                # Forward pass
                Z_tot = batch_forward(node_features, batch_nodes, K, model, neighborhoods)
                # Calculate the loss
                t1, t2 = compute_loss(Z_tot, Z_pos, Q, batch_nodes, negative_samples_ind)
                loss = tf.reduce_sum(t1+t2)
                print(i/len(batches), loss, tf.reduce_sum(t1), tf.reduce_sum(t2))

            # Calculate gradients
            gradients = tape.gradient(loss, model.trainable_variables)
        
            # Update model weights
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        
        loss_over_epochs.append(loss)    
    
    return model, loss_over_epochs, neighborhoods


model, losses, neighborhoods = train_embedding(total_graph, 1)

41
0.0 tf.Tensor(127.121994, shape=(), dtype=float32) tf.Tensor(127.121994, shape=(), dtype=float32) tf.Tensor(0, shape=(), dtype=int32)
0.024390243902439025 tf.Tensor(96.80124, shape=(), dtype=float32) tf.Tensor(96.80124, shape=(), dtype=float32) tf.Tensor(0, shape=(), dtype=int32)
0.04878048780487805 tf.Tensor(61.3106, shape=(), dtype=float32) tf.Tensor(61.3106, shape=(), dtype=float32) tf.Tensor(0, shape=(), dtype=int32)
0.07317073170731707 tf.Tensor(37.44089, shape=(), dtype=float32) tf.Tensor(37.44089, shape=(), dtype=float32) tf.Tensor(0, shape=(), dtype=int32)
0.0975609756097561 tf.Tensor(21.919987, shape=(), dtype=float32) tf.Tensor(21.919987, shape=(), dtype=float32) tf.Tensor(0, shape=(), dtype=int32)
0.12195121951219512 tf.Tensor(10.892364, shape=(), dtype=float32) tf.Tensor(10.892364, shape=(), dtype=float32) tf.Tensor(0, shape=(), dtype=int32)
0.14634146341463414 tf.Tensor(10.049408, shape=(), dtype=float32) tf.Tensor(10.049408, shape=(), dtype=float32) tf.Tensor(0, shape=

In [13]:

NC_5folds = {}
kf = KFold(n_splits=5, shuffle=True)
nodes = np.array([i for i in range(total_graph['N_nodes'])])
for i, (train_index, test_index) in enumerate(kf.split(nodes)):  
    NC_5folds[i] = {"train":list(nodes[train_index]), "test":list(nodes[test_index])}


In [14]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.multioutput import MultiOutputClassifier
from  sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler


def onehot(y, nclasses):
    Y = np.zeros((y.shape[0], nclasses), dtype=int)
    for i in range(y.shape[0]):
        c = y[i]
        Y[i,c-1] =  1
    return Y


def precision_and_recall(Y_true, Y_pred, nclasses):
    # count true positives and false positives and false negatives
    TP_list = [0]*nclasses
    FP_list = [0]*nclasses
    FN_list = [0]*nclasses
    for j in range(nclasses):
       for i, pred in enumerate(Y_pred):
            if pred[j]==1 and Y_true[i][j]==1:
                TP_list[j] += 1
            elif pred[j]==1 and  Y_true[i][j]==0:
                FP_list[j] += 1
            elif pred[j]==0 and Y_true[i][j]==1:
                FN_list[j] += 1 

    return TP_list, FP_list, FN_list

def compute_f1_macro(Y_true, Y_pred, nclasses):
    TP_list, FP_list, FN_list = precision_and_recall(Y_true, Y_pred, nclasses)
    f1_scores = [0]*nclasses
    for k in range(nclasses):
        if TP_list[k]==0:
            continue
        f1_scores[k] = TP_list[k]/(TP_list[k]+0.5*(FP_list[k]+FN_list[k])) 
    return np.sum(f1_scores)/nclasses


def compute_f1_micro(Y_true, Y_pred, nclasses):
    TP_list, FP_list, FN_list = precision_and_recall(Y_true, Y_pred, nclasses)
    TP = np.sum(TP_list)
    FP = np.sum(FP_list)
    FN = np.sum(FN_list)
    print(TP, FP, FN)
    return TP/(TP + 0.5*(FN+FP))


N_classes = total_graph['N_classes']
mb = MultiLabelBinarizer(classes=[i for i in range(N_classes)])
f1_macro_list = []
f1_micro_list = []

# 5-fold cross validation
for i in range(5):
    print(i)
    node_features = get_node_features(total_graph, "degree_dist") 
    training_nodes = NC_5folds[i]['train']
    test_nodes = NC_5folds[i]['test']
    Z_train = tf.gather(batch_forward(node_features, training_nodes, K, model, neighborhoods), training_nodes).numpy()
    print("klar med z train")
    Z_test = tf.gather(batch_forward(node_features, test_nodes, K, model, neighborhoods), test_nodes).numpy()
    X_train = Z_train
    X_test = Z_test
    # For the datasets that only have one one label per node, it gives better results to not use multioutputclassifier
    if not total_graph['Multioutput']:
        Y_train_sequence = np.array([total_graph['groups'][node][0]  for node in training_nodes],dtype=int)
        Y_test_sequence = np.array([total_graph['groups'][node][0] for node in test_nodes], dtype=int)
        log_reg = LogisticRegression(multi_class="ovr", max_iter=200)
        Y_train = Y_train_sequence
        Y_test = Y_test_sequence
        log_reg.fit(X_train, Y_train)
        Y_pred = log_reg.predict(X_test)
        Y_pred = onehot(Y_pred, total_graph['N_classes'])
        Y_test = onehot(Y_test, total_graph['N_classes'])
    else:
        print("hej")
        Y_train_sequence = [total_graph['groups'][node]  for node in training_nodes]
        Y_test_sequence = [total_graph['groups'][node] for node in test_nodes]
        Y_train = mb.fit_transform(Y_train_sequence)
        Y_test = mb.fit_transform(Y_test_sequence)
        log_reg = MultiOutputClassifier(SGDClassifier(max_iter=200))   #multi_class="ovr",
        log_reg.fit(X_train, Y_train)
        Y_pred = log_reg.predict(X_test)
  
    f1_macro = compute_f1_macro(Y_test, Y_pred, N_classes)
    f1_micro = compute_f1_micro(Y_test, Y_pred, N_classes)

    f1_macro_list.append(f1_macro)
    f1_micro_list.append(f1_micro)
    print(f1_macro, f1_micro)
    sys.exit()
    
print(np.mean(f1_micro_list))
print(np.mean(f1_macro_list))

0
klar med z train
hej
59 7 2869
0.016345352399220553 0.0394121576486306


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
