In [1]:

import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import random
from sklearn.model_selection import KFold
from load_data import load_toy, load_blogcatalog
from copy import deepcopy
import sys

import torch_geometric

In [2]:

# Choose the dataset and derive its directory relative to the notebook location.
dataset_name = "BlogCatalog"
data_dir = f"../Data/{dataset_name}"

# Load the graph structure that every later cell reads through `total_graph`.
# To experiment on the toy graph instead, call `load_toy(data_dir)` here.
total_graph = load_blogcatalog(data_dir)


def get_similarity(total_graph, max_distance=2):
    """Map every node to the set of nodes reachable within ``max_distance`` hops.

    The default of 2 hops reproduces the previous behaviour: a node's set
    holds its direct neighbours plus the neighbours of those neighbours.
    A node ends up in its own set whenever some walk of length
    ``<= max_distance`` leads back to it (e.g. over any undirected edge).

    Args:
        total_graph: Mapping with ``'N_nodes'`` (number of nodes, ids are
            ``0..N_nodes-1``) and ``'edges'`` (indexable by node id, yielding
            an iterable of neighbour ids).
        max_distance: Maximum number of hops to follow. Values below 1 give
            empty sets.

    Returns:
        dict: ``{node_id: set_of_similar_node_ids}`` for every node id.
    """
    edges = total_graph['edges']
    similarity_dict = {}
    for i in range(total_graph['N_nodes']):
        # Coarse progress indicator for large graphs.
        if i % 1000 == 0:
            print(i)
        reachable = set()
        expanded = {i}   # nodes whose adjacency has already been followed
        frontier = {i}
        for _ in range(max_distance):
            discovered = set()
            for v in frontier:
                discovered.update(edges[v])
            reachable |= discovered
            # Expanding a node twice cannot add anything new, so only newly
            # seen nodes are carried into the next hop.
            frontier = discovered - expanded
            if not frontier:
                break
            expanded |= frontier
        similarity_dict[i] = reachable
    return similarity_dict

def get_node_features(total_graph, mode="degree"):
    """Build the node feature matrix for the graph.

    Args:
        total_graph: Mapping describing the graph. Depending on ``mode`` it
            must provide ``'N_nodes'`` and ``'edges'`` (indexable by node id,
            yielding that node's neighbours) or ``'adj_matrix'``.
        mode: Which features to build:

            * ``"degree"`` - array of shape ``(N_nodes, 2)``; column 0 is the
              number of neighbours, column 1 is the sum of the neighbours'
              neighbour counts.
            * ``"degree_dist"`` - ``total_graph['adj_matrix']`` returned as-is
              (the same object, not a copy).
            * ``"node_nr"`` - ``(N_nodes, N_nodes)`` identity matrix, i.e. a
              one-hot encoding of the node id.

    Returns:
        The feature matrix for the requested mode.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    if mode == "degree":
        edges = total_graph['edges']
        n_nodes = total_graph['N_nodes']
        node_features = np.zeros((n_nodes, 2))
        for i in range(n_nodes):
            neighbors = edges[i]
            node_features[i, 0] = len(neighbors)
            node_features[i, 1] = sum(len(edges[n]) for n in neighbors)
        return node_features
    if mode == "degree_dist":
        return total_graph['adj_matrix']
    if mode == "node_nr":
        return np.eye(total_graph['N_nodes'])
    # Previously an unknown mode fell through to `return node_features` and
    # failed with an UnboundLocalError.
    raise ValueError(
        "Unknown mode %r; expected 'degree', 'degree_dist' or 'node_nr'" % (mode,)
    )


#sim_dict = get_similarity(total_graph)

In [11]:


class GraphSage(torch.nn.Module):
    """Mean-aggregator GraphSAGE-style encoder built from ``K`` linear layers.

    Each layer replaces a node's representation with
    ``ReLU(Linear(mean(representations of the node and its sampled neighbours))))``
    and the result is L2-normalised row-wise.

    Args:
        input_dim: Width of the input node features.
        output_dim: Width of every layer's output (the embedding size).
        K: Number of aggregation layers / hops; must be at least 1.
    """

    def __init__(self, input_dim, output_dim, K):
        super().__init__()
        if K < 1:
            raise ValueError("K must be at least 1")
        self.layers = nn.ModuleList()
        self.layers.append(nn.Linear(input_dim, output_dim))
        for _ in range(K - 1):
            self.layers.append(nn.Linear(output_dim, output_dim))
        self.K = K

    def forward(self, batch, neighborhoods):
        """Embed the nodes of a mini-batch.

        Args:
            batch: Object exposing ``batch`` (iterable of the node ids to
                embed) and ``x`` (feature tensor with one row per node,
                indexed by node id).
            neighborhoods: Indexable by node id, yielding the sampled
                neighbour ids of that node.

        Returns:
            Tensor of shape ``(x.shape[0], output_dim)``. Rows of the batch
            nodes hold their normalised embeddings; rows of nodes that were
            not needed for the batch are zero. ``batch.x`` is not modified.
        """
        eps = 1e-9
        num_layers = self.K  # use the model's own depth, not a notebook global

        # needed[k] lists the nodes whose representation must be produced by
        # layer k: the batch itself for the last layer, and for every earlier
        # layer additionally the neighbours the next layer will read.
        # dict.fromkeys removes duplicates while preserving order.
        needed = [[] for _ in range(num_layers)]
        needed[num_layers - 1] = list(dict.fromkeys(int(v) for v in batch.batch))
        for k in range(num_layers - 1, 0, -1):
            expanded = list(needed[k])
            for node in needed[k]:
                expanded.extend(int(u) for u in neighborhoods[node])
            needed[k - 1] = list(dict.fromkeys(expanded))

        # Align the feature dtype with the layer weights (e.g. float64 numpy
        # features fed into float32 linear layers).
        h = batch.x.to(self.layers[0].weight.dtype)
        for k in range(num_layers):
            layer = self.layers[k]
            # Write into a fresh tensor: the layer changes the feature width,
            # every node of this layer must read the *previous* layer's
            # values, and in-place edits of `h` would break autograd.
            h_next = h.new_zeros((h.shape[0], layer.out_features))
            nodes = needed[k]
            if nodes:
                aggregated = torch.stack([
                    h[torch.as_tensor(
                        [int(u) for u in neighborhoods[v]] + [v],
                        dtype=torch.long,
                        device=h.device,
                    )].mean(dim=0)
                    for v in nodes
                ])
                rows = torch.as_tensor(nodes, dtype=torch.long, device=h.device)
                h_next[rows] = torch.relu(layer(aggregated))
            # Row-wise L2 normalisation; eps guards all-zero rows.
            h = h_next / (torch.linalg.norm(h_next, dim=1, keepdim=True) + eps)
        return h


def compute_neighborhoods(edge_dict, N_nodes, nb_size):
    """Sample a bounded neighbourhood for every node.

    Nodes with at most ``nb_size`` neighbours keep all of them; larger
    neighbourhoods are reduced to ``nb_size`` distinct neighbours chosen
    uniformly at random with torch's global RNG (seed it with
    ``torch.manual_seed`` for reproducible samples).

    Args:
        edge_dict: Mapping ``node_id -> collection of neighbour ids``.
        N_nodes: Total number of nodes; node ids must lie in ``[0, N_nodes)``.
        nb_size: Maximum number of neighbours to keep per node.

    Returns:
        list: ``N_nodes`` lists; entry ``v`` is the sampled neighbourhood of
        node ``v`` (empty for nodes absent from ``edge_dict`` or without
        neighbours).
    """
    neighborhoods = [[] for _ in range(N_nodes)]
    for v, neighbors in edge_dict.items():
        candidates = list(neighbors)
        # Clamp so that empty neighbour lists and non-positive nb_size give
        # an empty sample instead of an error.
        sample_size = max(0, min(nb_size, len(candidates)))
        if sample_size == len(candidates):
            sampled = candidates
        else:
            # randperm draws without replacement, so no neighbour is repeated.
            chosen = torch.randperm(len(candidates))[:sample_size].tolist()
            sampled = [candidates[i] for i in chosen]
        neighborhoods[v] = sampled
    return neighborhoods



def custom_loss(Z, pos_samples, neg_samples):
    """Training loss over node embeddings (not implemented yet).

    Args:
        Z: Node embeddings produced by the model.
        pos_samples: Positive samples - format still to be defined.
        neg_samples: Negative samples - format still to be defined.

    Raises:
        NotImplementedError: Always. Failing here is clearer than returning
            ``None`` and crashing later on ``loss.backward()``.
    """
    # TODO: implement the loss (presumably an unsupervised positive/negative
    # sampling objective - confirm the intended formulation).
    raise NotImplementedError("custom_loss has not been implemented yet")



In [14]:
from torch_geometric.loader import LinkNeighborLoader


# Build the model inputs: node features in "degree_dist" mode and the edge list
# as tensors, wrapped in a torch_geometric Data object (read back below as
# `graph_data.x` and `graph_data.edge_index`).
node_features_np = get_node_features(total_graph, mode="degree_dist")
# NOTE(review): the tensor keeps the numpy dtype; confirm it matches the dtype
# of the model's linear layers.
node_features_torch = torch.tensor(node_features_np)
# Transposed edge list - presumably shape (2, num_edges); TODO confirm.
node_links_torch = torch.tensor(total_graph['edges_list']).T
graph_data = torch_geometric.data.Data(node_features_torch, node_links_torch)

print(graph_data.x.shape)
# Hyper-parameters
num_epochs = 1
K = 1   # number of iterations; passed to GraphSage and used to size num_neighbors below
batch_size = 128
input_size = node_features_np.shape[1]
output_dim = 64

# Create the model and its optimizer
model = GraphSage(input_size, output_dim, K)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    # A new loader is built at the start of every epoch; every edge of the graph
    # is used as a label edge and 10 neighbours are requested per hop.
    loader = LinkNeighborLoader(graph_data, batch_size=batch_size, num_neighbors=[10]*K, edge_label_index=graph_data.edge_index)  
    for subset in loader:
        optimizer.zero_grad()
        # Extract input and target from subset
        # NOTE(review): `input` is never assigned in the visible cells, so this
        # passes the Python builtin, and `neighborhoods` is not supplied although
        # GraphSage.forward requires it - this is the TypeError recorded in the
        # cell output. `subset` is currently unused.
        Z = model(input)
        # NOTE(review): `pos_samples` and `negative_samples` are not defined in
        # the visible cells, and custom_loss is not implemented yet, so this
        # step cannot run as written.
        loss = custom_loss(Z, pos_samples, negative_samples)
        loss.backward()
        optimizer.step()

torch.Size([10312, 10312])


TypeError: GraphSage.forward() missing 1 required positional argument: 'neighborhoods'

In [5]:

# Node-classification folds: fold index -> {"train": [...], "test": [...]} of
# node ids. The fixed random_state makes the shuffled split reproducible across
# kernel restarts, so reported scores can be regenerated.
NC_5folds = {}
kf = KFold(n_splits=5, shuffle=True, random_state=0)
nodes = np.arange(total_graph['N_nodes'])
for i, (train_index, test_index) in enumerate(kf.split(nodes)):
    # .tolist() stores plain Python ints rather than numpy scalars.
    NC_5folds[i] = {
        "train": nodes[train_index].tolist(),
        "test": nodes[test_index].tolist(),
    }


In [6]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.multioutput import MultiOutputClassifier
from  sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler


def onehot(y, nclasses):
    """One-hot encode a vector of class labels.

    Row ``r`` of the result has a single 1 in column ``y[r] - 1``; all other
    entries are 0. Because the column is ``label - 1``, a label of 0 is
    written to the last column through negative indexing.

    Args:
        y: Array of integer labels; only ``y.shape[0]`` and ``y[r]`` are used.
        nclasses: Number of columns of the encoding.

    Returns:
        Integer array of shape ``(y.shape[0], nclasses)``.
    """
    n_samples = y.shape[0]
    encoded = np.zeros((n_samples, nclasses), dtype=int)
    for row in range(n_samples):
        label = y[row]
        encoded[row, label - 1] = 1
    return encoded


def precision_and_recall(Y_true, Y_pred, nclasses):
    """Count per-class true positives, false positives and false negatives.

    Despite its name, this returns the raw counts from which precision and
    recall can be derived, not the ratios themselves.

    Args:
        Y_true: Indicator rows of the ground truth, indexed ``[sample][class]``.
        Y_pred: Indicator rows of the predictions, same layout as ``Y_true``.
        nclasses: Number of classes (columns) to evaluate.

    Returns:
        tuple: ``(TP, FP, FN)``, three lists of length ``nclasses``.
    """
    true_pos = [0] * nclasses
    false_pos = [0] * nclasses
    false_neg = [0] * nclasses
    for row, predicted in enumerate(Y_pred):
        for cls in range(nclasses):
            guess = predicted[cls]
            if guess == 1:
                truth = Y_true[row][cls]
                if truth == 1:
                    true_pos[cls] += 1
                elif truth == 0:
                    false_pos[cls] += 1
            elif guess == 0:
                # Only a missed positive counts; a 0/0 pair is a true negative.
                if Y_true[row][cls] == 1:
                    false_neg[cls] += 1
    return true_pos, false_pos, false_neg

def compute_f1_macro(Y_true, Y_pred, nclasses):
    """Macro-averaged F1: the unweighted mean of the per-class F1 scores.

    A class without any true positive contributes a score of 0, which also
    avoids dividing by zero for classes that never occur.

    Args:
        Y_true: Ground-truth indicator rows, indexed ``[sample][class]``.
        Y_pred: Predicted indicator rows, same layout as ``Y_true``.
        nclasses: Number of classes.

    Returns:
        Sum of the per-class F1 scores divided by ``nclasses``.
    """
    tp_counts, fp_counts, fn_counts = precision_and_recall(Y_true, Y_pred, nclasses)
    per_class = [
        tp / (tp + 0.5 * (fp + fn)) if tp != 0 else 0
        for tp, fp, fn in zip(tp_counts, fp_counts, fn_counts)
    ]
    return np.sum(per_class) / nclasses


def compute_f1_micro(Y_true, Y_pred, nclasses):
    """Micro-averaged F1: F1 computed from the counts pooled over all classes.

    Args:
        Y_true: Ground-truth indicator rows, indexed ``[sample][class]``.
        Y_pred: Predicted indicator rows, same layout as ``Y_true``.
        nclasses: Number of classes.

    Returns:
        ``TP / (TP + 0.5 * (FP + FN))`` over the pooled counts, or ``0.0``
        when there are no true positives, false positives or false negatives
        at all (previously NaN); this matches the macro variant, which scores
        classes without true positives as 0.
    """
    TP_list, FP_list, FN_list = precision_and_recall(Y_true, Y_pred, nclasses)
    TP = np.sum(TP_list)
    FP = np.sum(FP_list)
    FN = np.sum(FN_list)
    print(TP, FP, FN)
    denominator = TP + 0.5 * (FN + FP)
    if denominator == 0:
        # Nothing was predicted positive and nothing was positive.
        return 0.0
    return TP / denominator


# Evaluate the learned embeddings on node classification with 5-fold cross
# validation, reporting micro- and macro-averaged F1.
N_classes = total_graph['N_classes']
mb = MultiLabelBinarizer(classes=list(range(N_classes)))
f1_macro_list = []
f1_micro_list = []

# The node features do not depend on the fold, so build them once.
node_features = get_node_features(total_graph, "degree_dist")

# 5-fold cross validation
for i in range(len(NC_5folds)):
    print(i)
    training_nodes = NC_5folds[i]['train']
    test_nodes = NC_5folds[i]['test']
    # NOTE(review): `tf`, `batch_forward` and `neighborhoods` are not defined in
    # the visible cells (they look like leftovers from an earlier version of the
    # notebook). They must be defined, or these two lines ported to the torch
    # model, before this cell can run on a fresh kernel.
    Z_train = tf.gather(batch_forward(node_features, training_nodes, K, model, neighborhoods), training_nodes).numpy()
    print("klar med z train")
    Z_test = tf.gather(batch_forward(node_features, test_nodes, K, model, neighborhoods), test_nodes).numpy()
    X_train = Z_train
    X_test = Z_test
    # For the datasets that only have one label per node, it gives better
    # results to not use MultiOutputClassifier.
    if not total_graph['Multioutput']:
        Y_train_sequence = np.array([total_graph['groups'][node][0] for node in training_nodes], dtype=int)
        Y_test_sequence = np.array([total_graph['groups'][node][0] for node in test_nodes], dtype=int)
        log_reg = LogisticRegression(multi_class="ovr", max_iter=200)
        Y_train = Y_train_sequence
        Y_test = Y_test_sequence
        log_reg.fit(X_train, Y_train)
        Y_pred = log_reg.predict(X_test)
        # Convert both label vectors to indicator matrices for the F1 helpers.
        Y_pred = onehot(Y_pred, total_graph['N_classes'])
        Y_test = onehot(Y_test, total_graph['N_classes'])
    else:
        print("hej")
        Y_train_sequence = [total_graph['groups'][node] for node in training_nodes]
        Y_test_sequence = [total_graph['groups'][node] for node in test_nodes]
        Y_train = mb.fit_transform(Y_train_sequence)
        # The class list is fixed, so the test labels only need transforming;
        # the binarizer is not refit on test data.
        Y_test = mb.transform(Y_test_sequence)
        log_reg = MultiOutputClassifier(SGDClassifier(max_iter=200))   #multi_class="ovr",
        log_reg.fit(X_train, Y_train)
        Y_pred = log_reg.predict(X_test)

    f1_macro = compute_f1_macro(Y_test, Y_pred, N_classes)
    f1_micro = compute_f1_micro(Y_test, Y_pred, N_classes)

    f1_macro_list.append(f1_macro)
    f1_micro_list.append(f1_micro)
    print(f1_macro, f1_micro)
    # The debugging `sys.exit()` that aborted after the first fold was removed
    # so that every fold is evaluated and the averages below are reached.

print(np.mean(f1_micro_list))
print(np.mean(f1_macro_list))

KeyboardInterrupt: 