In [None]:

import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import random
from sklearn.model_selection import KFold
from load_data import load_toy, load_blogcatalog
from copy import deepcopy
import sys

import torch_geometric

In [None]:

# Dataset selection: the name picks the sub-directory of ../Data/ to read from.
dataset_name = "BlogCatalog"
data_dir = "../Data/" + dataset_name

# `total_graph` is the dict-like graph description returned by the project-local
# loader in load_data. Later cells read the keys 'N_nodes', 'edges',
# 'adj_matrix', 'edges_list', 'groups', 'N_classes' and 'Multioutput' from it.
total_graph = load_blogcatalog(data_dir)
# Alternative dataset for quick experiments: total_graph = load_toy(data_dir)



def get_node_features(total_graph, mode="degree"):
    """Build the input feature matrix for every node of the graph.

    Parameters
    ----------
    total_graph : mapping
        Graph description. Depending on ``mode`` the keys ``'N_nodes'``,
        ``'edges'`` (indexable by node id, giving that node's neighbours)
        and ``'adj_matrix'`` are read.
    mode : str
        ``"degree"``      -> (N, 2) array: column 0 is the node degree,
                             column 1 the summed degree of its neighbours.
        ``"degree_dist"`` -> the stored ``total_graph['adj_matrix']`` itself
                             (no copy is made).
        ``"node_nr"``     -> (N, N) identity matrix (one-hot node id).

    Returns
    -------
    The feature matrix for the requested mode.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported names.
    """
    if mode == "degree":
        n_nodes = total_graph['N_nodes']
        edges = total_graph['edges']
        node_features = np.zeros((n_nodes, 2))
        for i in range(n_nodes):
            neighbors = edges[i]
            node_features[i, 0] = len(neighbors)
            # Second feature: total number of neighbours-of-neighbours
            # (counted with multiplicity, the node itself included).
            node_features[i, 1] = sum(len(edges[n]) for n in neighbors)
    elif mode == "degree_dist":
        node_features = total_graph['adj_matrix']
    elif mode == "node_nr":
        node_features = np.eye(total_graph['N_nodes'])
    else:
        # Previously an unknown mode fell through to the return statement and
        # failed with an unhelpful UnboundLocalError.
        raise ValueError(
            "Unknown feature mode %r; expected 'degree', 'degree_dist' or 'node_nr'" % (mode,)
        )
    return node_features



In [None]:


class GraphSage(torch.nn.Module):
    """GraphSAGE-style encoder using a mean aggregator over a sampled subgraph.

    The model owns ``K`` linear layers. ``forward`` expects a mini-batch
    object exposing ``x`` (node features), ``edge_index`` (2 x E tensor) and
    ``src_index`` (seed node ids, local to the batch). If the batch also
    carries ``dst_pos_index`` / ``dst_neg_index`` those nodes are treated as
    seeds as well, so that their embeddings are actually computed.
    """

    def __init__(self, input_dim, output_dim, K):
        """Create ``K`` linear layers: input_dim -> output_dim, then output_dim -> output_dim."""
        super(GraphSage, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(nn.Linear(input_dim, output_dim))
        for i in range(K-1):
            self.layers.append(nn.Linear(output_dim, output_dim))
        self.K = K

    def aggregate_neighbors(self, batch):
        """Collect, per layer, the nodes whose representation must be computed.

        Returns
        -------
        B : list of K+1 lists of node ids
            ``B[K]`` holds the seed nodes; ``B[k-1]`` is ``B[k]`` plus every
            neighbour of a node in ``B[k]``. Each list is free of duplicates
            (first-occurrence order is kept).
        neighborhoods : dict
            Maps a node id appearing in ``edge_index[0]`` to the list of node
            ids paired with it in ``edge_index[1]``. Nodes without such an
            edge have no entry.
        """
        subgraph_edge_indices = batch.edge_index

        # Convert once to Python lists instead of calling .item() per edge.
        neighborhoods = {}
        for key, value in zip(subgraph_edge_indices[0].tolist(),
                              subgraph_edge_indices[1].tolist()):
            neighborhoods.setdefault(key, []).append(value)

        # Seeds are the source nodes plus, when present, the positive and
        # negative destination nodes. Without the latter their output rows
        # stay all-zero unless they happen to be a sampled neighbour of a
        # source node.
        seeds = batch.src_index.reshape(-1).tolist()
        for attr in ("dst_pos_index", "dst_neg_index"):
            extra = getattr(batch, attr, None)
            if extra is not None:
                seeds.extend(extra.reshape(-1).tolist())

        B = [[] for k in range(self.K+1)]
        B[-1] = list(dict.fromkeys(seeds))
        for k in range(self.K, 0, -1):
            expanded = B[k][:]
            for node in B[k]:
                # .get: a node without outgoing edges simply adds nothing.
                expanded.extend(neighborhoods.get(node, []))
            # De-duplicate so that no node is processed more than once per layer.
            B[k-1] = list(dict.fromkeys(expanded))
        return B, neighborhoods

    def forward(self, batch):
        """Compute L2-normalised embeddings for the nodes required by ``batch``.

        Returns a tensor with one row per node in ``batch.x``. Rows of nodes
        that are not needed for the seeds are left at zero.
        """
        eps = 1e-9  # guards the normalisation against all-zero rows

        B, neighborhood_dict = self.aggregate_neighbors(batch)
        h = batch.x
        N_nodes = h.shape[0]
        for k in range(self.K):
            layer = self.layers[k]
            # Allocate next to the input so the model also works off-CPU.
            h_updated = torch.zeros((N_nodes, layer.out_features),
                                    dtype=layer.weight.dtype, device=h.device)
            for v in B[k]:
                # Mean over the node itself and its (possibly empty) neighbourhood.
                members = neighborhood_dict.get(v, []) + [v]
                index = torch.tensor(members, dtype=torch.long, device=h.device)
                aggregated = torch.mean(h[index], dim=0, keepdim=True)
                h_updated[v] = F.relu(layer(aggregated)).squeeze(0)
            h = h_updated / (torch.norm(h_updated, dim=1, keepdim=True) + eps)
        return h


def compute_neighborhoods(edge_dict, N_nodes, nb_size):
    """Sample a neighbourhood (with replacement) for every node.

    Parameters
    ----------
    edge_dict : dict
        Maps a node id (an int below ``N_nodes``) to its list of neighbours.
    N_nodes : int
        Length of the returned list.
    nb_size : int
        Maximum number of neighbours sampled per node.

    Returns
    -------
    list of lists
        Entry ``v`` holds the sampled neighbours of node ``v``:
        * no neighbours (or node absent from ``edge_dict``) -> ``[]``;
        * exactly one neighbour -> that neighbour repeated ``nb_size`` times;
        * otherwise ``min(nb_size, len(neighbors))`` uniform draws with
          replacement.
    """
    neighborhoods = [[] for _ in range(N_nodes)]
    for v, neighbors in edge_dict.items():
        nb = len(neighbors)
        if nb == 0:
            # torch.randint(0, 0, ...) raises; an isolated node keeps [].
            continue
        if nb == 1:
            # Test the neighbour count, not the sample size: with nb_size == 1
            # the old check always returned the first neighbour instead of
            # sampling one.
            sample_neighborhood = [neighbors[0]]*nb_size
        else:
            sample_size = min(nb_size, nb)
            neighborhood_ind = torch.randint(0, nb, (sample_size,))
            sample_neighborhood = [neighbors[i] for i in neighborhood_ind.tolist()]
        neighborhoods[v] = sample_neighborhood
    return neighborhoods


def compute_loss(Z, Z_pos, Z_neg):
    """Unsupervised GraphSAGE loss with negative sampling.

    For every row ``i`` the loss is
    ``-log sigmoid(z_i . z_pos_i) - sum_q log sigmoid(-z_i . z_neg_{q,i})``
    and the mean over rows is returned.

    Parameters
    ----------
    Z : tensor (B, D)
        Embeddings of the source nodes.
    Z_pos : tensor (B, D)
        Embeddings of the positive nodes, row-aligned with ``Z``.
    Z_neg : tensor (Q, B, D)
        Embeddings of ``Q`` negative nodes per source node.

    Returns
    -------
    Scalar tensor holding the mean loss.
    """
    pos_score = torch.sum(Z * Z_pos, dim=1)
    # logsigmoid is the numerically stable form of log(sigmoid(x)).
    term1 = -F.logsigmoid(pos_score)
    # (Q, B) scores of every negative sample against its source node.
    neg_score = torch.sum(Z.unsqueeze(0) * Z_neg, dim=2)
    # Sum over all Q negatives; the previous loop overwrote the term on each
    # iteration so only the last negative sample contributed.
    term2 = -F.logsigmoid(-neg_score).sum(dim=0)
    return torch.mean(term1+term2)



In [None]:
from torch_geometric.loader import LinkNeighborLoader, NeighborLoader
from torch_geometric.sampler import NegativeSampling


# One-hot node-id features and the graph container handed to the loader.
node_features_np = get_node_features(total_graph, mode="node_nr")
node_features_torch = torch.tensor(node_features_np).float()

# Transposed so that it can be passed to Data as edge_index
# (presumably 'edges_list' is a list of node pairs -- confirm in load_data).
node_links_torch = torch.tensor(total_graph['edges_list']).T
graph_data = torch_geometric.data.Data(node_features_torch, node_links_torch)

# parameters
num_epochs = 1
K = 1   # number of iterations (GraphSAGE layers / sampling hops)
Q = 2   # negative samples per positive link
batch_size = 128
input_size = node_features_np.shape[1]
output_dim = 64
nb_size = 10   # neighbours sampled per node and hop

# Create the model
model = GraphSage(input_size, output_dim, K)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

ns = NegativeSampling(mode="triplet", amount=Q)
# The loader does not depend on the epoch, so it is built once; shuffle=True
# still reshuffles every time it is iterated.
loader = LinkNeighborLoader(graph_data, batch_size=batch_size, num_neighbors=[nb_size]*K, neg_sampling=ns, shuffle=True, replace=True, subgraph_type="bidirectional")
num_batches = len(loader)
for epoch in range(num_epochs):
    for i, batch in enumerate(loader):
        # Clear gradients left over from the previous step; without this
        # every backward() call adds onto the old gradients.
        optimizer.zero_grad()

        negative_samples = batch.dst_neg_index
        Z_tot = model(batch)
        Z = Z_tot[batch.src_index]
        Z_pos = Z_tot[batch.dst_pos_index]
        # Shape (Q, n, output_dim) with n taken from the batch itself, so a
        # final batch smaller than batch_size no longer breaks the assignment.
        Z_neg = torch.stack([Z_tot[negative_samples[:, q]] for q in range(Q)])

        loss = compute_loss(Z, Z_pos, Z_neg)
        print(i/num_batches, loss.item())
        loss.backward()
        optimizer.step()


0.0 tensor(1.4868, grad_fn=<MeanBackward0>)
0.0002994011976047904 tensor(1.5142, grad_fn=<MeanBackward0>)
0.0005988023952095808 tensor(1.5283, grad_fn=<MeanBackward0>)
0.0008982035928143712 tensor(1.4996, grad_fn=<MeanBackward0>)
0.0011976047904191617 tensor(1.4718, grad_fn=<MeanBackward0>)
0.0014970059880239522 tensor(1.4837, grad_fn=<MeanBackward0>)
0.0017964071856287425 tensor(1.4346, grad_fn=<MeanBackward0>)
0.002095808383233533 tensor(1.3952, grad_fn=<MeanBackward0>)
0.0023952095808383233 tensor(1.3987, grad_fn=<MeanBackward0>)


KeyboardInterrupt: 

In [None]:

# Node-classification folds: NC_5folds[fold] holds the "train" and "test"
# node ids of one of five shuffled K-fold partitions of all node ids.
kf = KFold(n_splits=5, shuffle=True)
nodes = np.arange(total_graph['N_nodes'])
NC_5folds = {}
for i, (train_index, test_index) in enumerate(kf.split(nodes)):
    train_nodes_in_fold = list(nodes[train_index])
    test_nodes_in_fold = list(nodes[test_index])
    NC_5folds[i] = {"train": train_nodes_in_fold, "test": test_nodes_in_fold}


In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.multioutput import MultiOutputClassifier
from  sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler


def onehot(y, nclasses):
    """Encode integer labels as rows of a 0/1 indicator matrix.

    Label ``c`` sets column ``c - 1`` of its row, so a label of 0 wraps
    around to the last column through negative indexing.

    Parameters
    ----------
    y : numpy array of integer labels (its first dimension gives the row count).
    nclasses : number of columns of the result.

    Returns
    -------
    Integer array of shape ``(y.shape[0], nclasses)``.
    """
    encoded = np.zeros((y.shape[0], nclasses), dtype=int)
    for row, label in enumerate(y):
        encoded[row, label - 1] = 1
    return encoded


def precision_and_recall(Y_true, Y_pred, nclasses):
    """Count per-class true positives, false positives and false negatives.

    ``Y_true`` and ``Y_pred`` are row-aligned 0/1 indicator matrices; only
    the first ``nclasses`` columns are inspected.

    Returns
    -------
    (TP_list, FP_list, FN_list) : three lists of length ``nclasses``.
    """
    TP_list, FP_list, FN_list = ([0] * nclasses for _ in range(3))
    for j in range(nclasses):
        for i, pred in enumerate(Y_pred):
            predicted = pred[j]
            if predicted == 1:
                actual = Y_true[i][j]
                if actual == 1:
                    TP_list[j] += 1
                elif actual == 0:
                    FP_list[j] += 1
            elif predicted == 0 and Y_true[i][j] == 1:
                FN_list[j] += 1

    return TP_list, FP_list, FN_list

def compute_f1_macro(Y_true, Y_pred, nclasses):
    """Macro-averaged F1: the unweighted mean of the per-class F1 scores.

    A class without any true positive contributes a score of 0.
    """
    tp_counts, fp_counts, fn_counts = precision_and_recall(Y_true, Y_pred, nclasses)
    per_class_f1 = [
        tp / (tp + 0.5 * (fp + fn)) if tp != 0 else 0
        for tp, fp, fn in zip(tp_counts, fp_counts, fn_counts)
    ]
    return np.sum(per_class_f1) / nclasses


def compute_f1_micro(Y_true, Y_pred, nclasses):
    """Micro-averaged F1: F1 computed from the counts pooled over all classes.

    Prints the pooled TP, FP and FN counts as a side effect.

    Returns
    -------
    ``TP / (TP + 0.5 * (FP + FN))``, or ``0.0`` when there is no positive
    label or prediction at all (TP = FP = FN = 0). That case used to yield
    ``nan`` with a RuntimeWarning; 0 matches how the macro variant scores a
    class without true positives.
    """
    TP_list, FP_list, FN_list = precision_and_recall(Y_true, Y_pred, nclasses)
    TP = np.sum(TP_list)
    FP = np.sum(FP_list)
    FN = np.sum(FN_list)
    print(TP, FP, FN)
    denominator = TP + 0.5*(FN+FP)
    if denominator == 0:
        return 0.0
    return TP/denominator


N_classes = total_graph['N_classes']
mb = MultiLabelBinarizer(classes=list(range(N_classes)))
f1_macro_list = []
f1_micro_list = []


# 5-fold cross validation: embed the nodes of each fold with the trained
# model, fit a linear classifier on the training embeddings and score the
# test embeddings with macro / micro F1.
with torch.no_grad():
    for i in range(5):
        print(i)
        # get_node_features builds a fresh matrix on every call, so the extra
        # deepcopy of it was dropped.
        node_features_np = get_node_features(total_graph, mode="node_nr")
        node_features_torch = torch.tensor(node_features_np).float()
        training_nodes = NC_5folds[i]['train']
        test_nodes = NC_5folds[i]['test']
        # NOTE(review): this three-argument call does not match the
        # GraphSage.forward(batch) defined in this notebook, and no top-level
        # `neighborhoods` is defined in the cells shown -- it presumably stems
        # from an earlier model version still alive in the kernel. Confirm and
        # adapt before relying on a fresh Restart & Run All.
        Z_train = model(training_nodes, node_features_torch, neighborhoods)
        print("klar med z train")   # Swedish: "done with Z train"
        Z_test = model(test_nodes, node_features_torch, neighborhoods)
        X_train = Z_train
        X_test = Z_test
        # For the datasets that only have one label per node, it gives better results to not use MultiOutputClassifier
        if not total_graph['Multioutput']:
            Y_train_sequence = np.array([total_graph['groups'][node][0]  for node in training_nodes],dtype=int)
            Y_test_sequence = np.array([total_graph['groups'][node][0] for node in test_nodes], dtype=int)
            log_reg = LogisticRegression(multi_class="ovr", max_iter=200)
            Y_train = Y_train_sequence
            Y_test = Y_test_sequence
            log_reg.fit(X_train, Y_train)
            Y_pred = log_reg.predict(X_test)
            Y_pred = onehot(Y_pred, total_graph['N_classes'])
            Y_test = onehot(Y_test, total_graph['N_classes'])
        else:
            print("hej")
            Y_train_sequence = [total_graph['groups'][node]  for node in training_nodes]
            Y_test_sequence = [total_graph['groups'][node] for node in test_nodes]
            Y_train = mb.fit_transform(Y_train_sequence)
            # The binarizer is already fitted; only encode the test labels.
            Y_test = mb.transform(Y_test_sequence)
            log_reg = MultiOutputClassifier(SGDClassifier(max_iter=200))   #multi_class="ovr",
            log_reg.fit(X_train, Y_train)
            Y_pred = log_reg.predict(X_test)

        f1_macro = compute_f1_macro(Y_test, Y_pred, N_classes)
        f1_micro = compute_f1_micro(Y_test, Y_pred, N_classes)

        f1_macro_list.append(f1_macro)
        f1_micro_list.append(f1_micro)
        print(f1_macro, f1_micro)
        # The sys.exit() debugging stop that used to sit here aborted the cell
        # after the first fold, so the averages below were never reached.

    print(np.mean(f1_micro_list))
    print(np.mean(f1_macro_list))

0
klar med z train
hej
15 1 2887
0.005827505827505827 0.01028101439342015


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
