In [1]:
import os
import gc
import argparse
import time
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import lr_scheduler
from dgl import DGLGraph
from dgl.data import register_data_args, load_data
from dgl.nn.pytorch.conv import SAGEConv

import pickle as pkl
import pandas as pd
import numpy as np

from dataclasses import dataclass, field, asdict, make_dataclass
from typing import List, Callable
from collections import defaultdict
from itertools import product

from importlib import reload

import utils
import graph_model
import graph_utils
reload(utils)
reload(graph_model)
reload(graph_utils)

from utils import get_metrics_dict
from graph_utils import build_graph, get_train_val_test_masks
from graph_model import GraphSAGE

from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Data / model directories. The defaults are the original cluster locations; set the
# WIKI_DATA_DIR / WIKI_MODELS_DIR environment variables to run the notebook elsewhere
# without editing it.
# (Previous data location: "/scratch/mz2476/wiki/data/aligned_datasets/")
# os.path.join(path, "") guarantees a trailing separator, because later cells build
# file names with plain string concatenation (e.g. PATH_TO_DATA + "features.pkl").
PATH_TO_DATA = os.path.join(os.environ.get("WIKI_DATA_DIR", "/scratch/nh1724/"), "")
PATH_TO_MODELS = os.path.join(os.environ.get("WIKI_MODELS_DIR", "/scratch/nh1724/"), "")

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
device

'cuda'

In [4]:
# Read the three pickled dataframes (graph structure, text embeddings, labels/tokens)
# from PATH_TO_DATA, then combine them into one frame ordered by node_id.
def _read_pickle(file_name):
    """Unpickle and return the object stored at PATH_TO_DATA/file_name."""
    with open(os.path.join(PATH_TO_DATA, file_name), "rb") as handle:
        return pkl.load(handle)

wiki_graph_df = _read_pickle("graph_df.pkl")
wiki_feature_df = _read_pickle("text_embed_en.pkl")
wiki_label_df = _read_pickle("en_outlinks_tokens_df.pkl")

# Index-based joins; lsuffix renames left-hand columns that clash with the right-hand frame
joined_df = (
    wiki_feature_df
    .join(wiki_graph_df, lsuffix='1')
    .join(wiki_label_df, lsuffix='2')
    .sort_values(by='node_id')
)

In [5]:
joined_df.head(2)

Unnamed: 0,QID1,text_1000_embed,QID2,node_id,to_nodes,QID,title,raw_outlinks,outlinks,raw_tokens,tokens,mid_level_categories
0,Q6199,"[tensor(-0.0119), tensor(-0.0165), tensor(-0.0...",Q6199,0,"[10810, 31108, 1477, 32954, 33284, 3969, 6429,...",Q6199,Anarchism,"[[[Anti-authoritarianism|anti-authoritarian]],...","[Anti-authoritarianism, political philosophy, ...","[anarchism, is, an, anti, authoritarianism, an...","[anarchism, anti, authoritarianism, anti, auth...","[History_And_Society.History and society, Hist..."
1,Q38404,"[tensor(-0.0052), tensor(-0.0246), tensor(-0.0...",Q38404,1,"[29931, 9899, 5124, 26669, 6874, 1103, 1103, 1...",Q38404,Autism,"[[[Psychiatry]], [[Interpersonal relationship|...","[Psychiatry, Interpersonal relationship, commu...","[autism, is, developmental, disorder, characte...","[autism, developmental, disorder, characterize...","[STEM.Medicine, STEM.Biology, History_And_Soci..."


In [6]:
joined_df.shape

(33823, 12)

In [7]:
%%time
# Build the graph G from the joined dataframe; directed=True is forwarded to build_graph.
G = build_graph(joined_df, directed=True)

CPU times: user 17.8 s, sys: 305 ms, total: 18.1 s
Wall time: 18.1 s


In [8]:
# Load the LDA-learned topic distribution and check its shape.
# The `with` block closes the file handle deterministically, and os.path.join matches
# how the other pickles in this notebook are located.
with open(os.path.join(PATH_TO_DATA, "features.pkl"), "rb") as f:
    features = pkl.load(f)
features.shape

(33823, 45)

In [33]:
# Load the "top category only" variant of the LDA features and check its shape.
# The `with` block closes the file handle instead of leaving it to the garbage collector.
with open(os.path.join(PATH_TO_DATA, "features_top.pkl"), "rb") as f:
    features_top = pkl.load(f)
features_top.shape

(33823, 4)

In [34]:
# --- Targets ------------------------------------------------------------------
# Multi-hot encode each node's mid-level categories and keep them as a float tensor
mlb = MultiLabelBinarizer()
labels = torch.FloatTensor(mlb.fit_transform(joined_df.mid_level_categories))

# --- Train/val/test split -----------------------------------------------------
n_nodes = G.number_of_nodes()
train_mask, val_mask, test_mask = get_train_val_test_masks(n_nodes)
# Train mask as a float column vector; multiplying by it zeroes every non-train row
train_col = train_mask[:, None].float()

# --- Node data defined for all nodes ------------------------------------------
G.ndata['node_id'] = torch.arange(n_nodes)
embeds = np.stack(joined_df.text_1000_embed.values)
embeds = np.nan_to_num(embeds, nan=0.)  # replace NaNs in the text embeddings with 0
G.ndata['_text_embed'] = torch.FloatTensor(embeds)
G.ndata['_empty'] = torch.zeros(n_nodes, 0)  # zero-width placeholder: "no input features"

# LDA features for all nodes: full distribution and the top-category-only variant
lda_all = torch.FloatTensor(features)
G.ndata['_LDA'] = lda_all
G.ndata['_LDA_top'] = torch.FloatTensor(features_top)

# --- Node data kept ONLY for training nodes -----------------------------------
G.ndata['_topics'] = labels * train_col
G.ndata['_LDA_train'] = lda_all * train_col

In [10]:
from functools import partial

@dataclass(frozen=True)
class Args:
    """Immutable bundle of the hyperparameters used in this notebook.

    An instance is converted with ``asdict`` and unpacked into ``GraphSAGE(**...)``;
    the optimizer, scheduler and epoch settings are also read from it when training.
    The ``n_classes`` and ``num_nodes`` defaults are evaluated once, when this class
    is defined, from the ``labels`` and ``G`` objects that exist at that moment.
    """
    # --- model architecture ---
    embedding_dim    : int = 150
    n_hidden         : int = 150
    n_layers         : int = 2
    aggregator_type  : str = "mean" # ``mean``, ``gcn``, ``pool``, ``lstm``
    activation       : Callable = partial(F.leaky_relu, negative_slope=0.1)

    # --- problem size ---
    n_classes    : int = labels.shape[1]  # number of label columns
    num_nodes    : int = G.number_of_nodes()
    features_dim : int = 0  # run_grid_search overrides this with G.ndata["features"].shape[1]

    # --- optimization ---
    lr           : float = 0.01  # Adam learning rate
    weight_decay : float = 0.  # Adam weight decay
    dropout      : float = 0.1
    step_size    : int = 200  # StepLR step size
    n_epochs     : int = 300  # number of epochs passed to train_GraphSAGE
        
# Default configuration; run_grid_search builds its own Args per combination.
args = Args()

In [11]:
reload(graph_utils)

from graph_utils import predict, train_GraphSAGE, get_gpu_memory_map

In [12]:
# G.ndata["features"] = G.ndata['_text_embed'][:, 0:1]

# args = Args(
#             features_dim=G.ndata["features"].shape[1],
#             embedding_dim=10,
#             n_hidden=10,
#             n_layers=4,
#             aggregator_type="gcn",
#             lr=0.001,
#         )

# model = GraphSAGE(**asdict(args))


# model.to(device)
# labels = labels.to(device)
# G.ndata["features"] = G.ndata["features"].to(device)
# G.ndata["node_id"] = G.ndata["node_id"].to(device)
# train_mask = train_mask.to(device)
# val_mask = val_mask.to(device)

In [13]:
# list_features = [
#     ['_empty'],
#     ['_topics'],
#     ['_text_embed'],
#     ['_topics', '_text_embed']
# ]
# list_emb_dim = [100, 200, 300]
# list_n_hidden = [50, 100, 150]
# list_n_layers = [1, 2, 3]
# list_aggregator = [
# #     "mean",
# #     "gcn", 
# #     "pool",
#     "lstm"
# ]

In [14]:
# Assemble the model input by concatenating the selected node-data fields column-wise
features_names = ['_topics', '_text_embed']
selected_fields = [G.ndata[field_name] for field_name in features_names]
G.ndata["features"] = torch.cat(selected_fields, dim=1)
G.ndata["features"].shape

torch.Size([33823, 345])

In [35]:
# Search space read by run_grid_search (Cartesian product of the five lists).
# Alternatives that are currently switched off are kept as comments.
list_features = [
    ['_LDA_top'],
    # ['_empty'],
    # ['_LDA'],
    # ['_topics'],
    # ['_text_embed'],
    # ['_topics', '_text_embed'],
]
list_emb_dim = [200]
list_n_hidden = [200]
list_n_layers = [1]
# Other aggregator types, currently switched off: "mean", "pool", "lstm"
list_aggregator = ["gcn"]

def run_grid_search(G, labels, train_mask, val_mask, device, FNAME,
                    feature_sets=None, emb_dims=None, hidden_dims=None,
                    layer_counts=None, aggregators=None,
                    use_loss_reweighting=False):
    """Train one GraphSAGE model per hyperparameter combination and collect the metrics.

    Every combination in the Cartesian product of the five grids is trained with
    ``train_GraphSAGE``. After each run the accumulated metrics list is written to
    ``FNAME`` under ``PATH_TO_MODELS`` with ``torch.save``.

    Side effects: ``G.ndata["features"]`` is overwritten for every combination and
    ``G.ndata["node_id"]`` is moved to ``device``.

    Parameters
    ----------
    G : graph object
        Its ``ndata`` must hold ``"node_id"`` and every field named in the feature grid.
    labels : torch.Tensor
        Targets; indexed with ``train_mask`` and forwarded to ``train_GraphSAGE``.
    train_mask, val_mask : torch.Tensor
        Masks forwarded to ``train_GraphSAGE``.
    device : str or torch.device
        Where the model, labels, masks and node data are moved.
    FNAME : str
        File name of the saved metrics list, relative to ``PATH_TO_MODELS``.
    feature_sets, emb_dims, hidden_dims, layer_counts, aggregators : list, optional
        Explicit search grids. Each one left as ``None`` falls back to the
        notebook-level ``list_features``, ``list_emb_dim``, ``list_n_hidden``,
        ``list_n_layers`` and ``list_aggregator`` respectively.
    use_loss_reweighting : bool, default False
        If True, ``BCEWithLogitsLoss`` gets ``pos_weight = 1 / mean(train labels)``
        per class. NOTE(review): a class with no positive training example would
        give an infinite weight — confirm before enabling.

    Returns
    -------
    list
        One entry per combination: whatever ``train_GraphSAGE`` returned for it.

    Raises
    ------
    ValueError
        If an entry of the feature grid names no ``G.ndata`` field.
    """
    # Fall back to the notebook-level search space when no explicit grid is given
    feature_sets = list_features if feature_sets is None else feature_sets
    emb_dims = list_emb_dim if emb_dims is None else emb_dims
    hidden_dims = list_n_hidden if hidden_dims is None else hidden_dims
    layer_counts = list_n_layers if layer_counts is None else layer_counts
    aggregators = list_aggregator if aggregators is None else aggregators

    # These transfers do not depend on the combination, so do them once up front
    labels = labels.to(device)
    train_mask = train_mask.to(device)
    val_mask = val_mask.to(device)
    G.ndata["node_id"] = G.ndata["node_id"].to(device)

    metrics_list = []

    for features_names, embedding_dim, n_hidden, n_layers, aggregator_type\
        in product(feature_sets, emb_dims, hidden_dims, layer_counts, aggregators):

        print(50*"--")
        print("features_names, embedding_dim, n_hidden, n_layers, aggregator_type:\n", 
              features_names, embedding_dim, n_hidden, n_layers, aggregator_type)

        if not features_names:
            raise ValueError("each feature-grid entry must name at least one G.ndata field")
        if len(features_names) == 1:
            node_features = G.ndata[features_names[0]]
        else:
            node_features = torch.cat([G.ndata[name] for name in features_names], dim=1)
        G.ndata["features"] = node_features.to(device)

        args = Args(
            features_dim=G.ndata["features"].shape[1],
            embedding_dim=embedding_dim,
            n_hidden=n_hidden,
            n_layers=n_layers,
            aggregator_type=aggregator_type,
            n_epochs=400,
            dropout=0.2,
        )

        model = GraphSAGE(**asdict(args))
        model.to(device)

        pos_weights = 1 / labels[train_mask].mean(axis=0) if use_loss_reweighting else None
        criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weights)
        model_parameters = [p for p in model.parameters() if p.requires_grad]
        optimizer = torch.optim.Adam(model_parameters, lr=args.lr, weight_decay=args.weight_decay)
        exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.step_size, gamma=0.01)

        # "test" is forwarded to train_GraphSAGE unchanged; presumably a run/output name
        # (another cell creates PATH_TO_MODELS + "test") — TODO confirm.
        metrics = train_GraphSAGE(model, criterion, optimizer, exp_lr_scheduler, 
                        device, "test", asdict(args), args.n_epochs,
                        G, labels, train_mask, val_mask)
        metrics_list.append(metrics)
        # Re-save the accumulated results after every combination
        torch.save(metrics_list, os.path.join(PATH_TO_MODELS, FNAME))

        print(get_gpu_memory_map())
        # Drop this run's references first; otherwise gc / empty_cache cannot release
        # the memory still held by the model and optimizer.
        del model, model_parameters, criterion, optimizer, exp_lr_scheduler
        gc.collect()
        torch.cuda.empty_cache()

    return metrics_list

In [27]:
# Compare no input features ('_empty') with LDA features ('_LDA'), each with 1 and 2 layers.
# Best validation f1_micro in the recorded output: 0.647 ('_LDA', 1 layer).
# NOTE(review): the recorded output comes from a different search space than the one
# currently defined next to run_grid_search, which lists only '_LDA_top'.
d = run_grid_search(G, labels, train_mask, val_mask, device, "results.pth")

----------------------------------------------------------------------------------------------------
features_names, embedding_dim, n_hidden, n_layers, aggregator_type:
 ['_empty'] 200 200 1 gcn


  'precision', 'predicted', average, warn_for)


Epoch 99 | Train Loss: 0.05277 | Validation f1_micro 0.637
Epoch 199 | Train Loss: 0.03009 | Validation f1_micro 0.611
Epoch 299 | Train Loss: 0.02974 | Validation f1_micro 0.61
Epoch 399 | Train Loss: 0.02962 | Validation f1_micro 0.608

Training complete in 0m 40s
Best val f1_micro: 0.6430 

{0: 1113}
----------------------------------------------------------------------------------------------------
features_names, embedding_dim, n_hidden, n_layers, aggregator_type:
 ['_empty'] 200 200 2 gcn


  'precision', 'predicted', average, warn_for)


Epoch 99 | Train Loss: 0.06978 | Validation f1_micro 0.617
Epoch 199 | Train Loss: 0.04889 | Validation f1_micro 0.621
Epoch 299 | Train Loss: 0.04824 | Validation f1_micro 0.62
Epoch 399 | Train Loss: 0.04815 | Validation f1_micro 0.625

Training complete in 0m 43s
Best val f1_micro: 0.6330 

{0: 1239}
----------------------------------------------------------------------------------------------------
features_names, embedding_dim, n_hidden, n_layers, aggregator_type:
 ['_LDA'] 200 200 1 gcn


  'precision', 'predicted', average, warn_for)


Epoch 99 | Train Loss: 0.0542 | Validation f1_micro 0.644
Epoch 199 | Train Loss: 0.03088 | Validation f1_micro 0.616
Epoch 299 | Train Loss: 0.03068 | Validation f1_micro 0.618
Epoch 399 | Train Loss: 0.03031 | Validation f1_micro 0.614

Training complete in 0m 42s
Best val f1_micro: 0.6470 

{0: 1183}
----------------------------------------------------------------------------------------------------
features_names, embedding_dim, n_hidden, n_layers, aggregator_type:
 ['_LDA'] 200 200 2 gcn


  'precision', 'predicted', average, warn_for)


Epoch 99 | Train Loss: 0.06927 | Validation f1_micro 0.624
Epoch 199 | Train Loss: 0.04849 | Validation f1_micro 0.627
Epoch 299 | Train Loss: 0.04788 | Validation f1_micro 0.621
Epoch 399 | Train Loss: 0.04764 | Validation f1_micro 0.624

Training complete in 0m 42s
Best val f1_micro: 0.6330 

{0: 1287}


In [36]:
# Search using only the top-category LDA feature ('_LDA_top').
# Best validation f1_micro in the recorded output: 0.646.
d = run_grid_search(G, labels, train_mask, val_mask, device, "results_lda_top.pth")

----------------------------------------------------------------------------------------------------
features_names, embedding_dim, n_hidden, n_layers, aggregator_type:
 ['_LDA_top'] 200 200 1 gcn


  'precision', 'predicted', average, warn_for)


Epoch 99 | Train Loss: 0.05297 | Validation f1_micro 0.641
Epoch 199 | Train Loss: 0.03025 | Validation f1_micro 0.61
Epoch 299 | Train Loss: 0.02991 | Validation f1_micro 0.615
Epoch 399 | Train Loss: 0.02968 | Validation f1_micro 0.61

Training complete in 0m 41s
Best val f1_micro: 0.6460 

{0: 1165}


In [22]:
%debug

ERROR:root:No traceback has been produced, nothing to debug.


In [23]:
# features_names, embedding_dim, n_hidden, n_layers, aggregator_type:
#  ['_empty'] 100 150 2 mean
# Epoch 99 | Train Loss: 0.05154 | Validation f1_micro 0.601
# Epoch 199 | Train Loss: 0.008532 | Validation f1_micro 0.598
# Epoch 299 | Train Loss: 0.008263 | Validation f1_micro 0.594
# Epoch 399 | Train Loss: 0.008067 | Validation f1_micro 0.592
# Training complete in 0m 43s
# Best val f1_micro: 0.6050 

# features_names, embedding_dim, n_hidden, n_layers, aggregator_type:
#  ['_empty'] 100 150 1 gcn
# Epoch 99 | Train Loss: 0.06535 | Validation f1_micro 0.626
# Epoch 199 | Train Loss: 0.03704 | Validation f1_micro 0.624
# Epoch 299 | Train Loss: 0.03665 | Validation f1_micro 0.625
# Epoch 399 | Train Loss: 0.03643 | Validation f1_micro 0.624

# Training complete in 0m 24s
# Best val f1_micro: 0.6410 


# features_names, embedding_dim, n_hidden, n_layers, aggregator_type:
#  ['_empty'] 300 150 1 gcn
# Epoch 99 | Train Loss: 0.0509 | Validation f1_micro 0.643
# Epoch 199 | Train Loss: 0.02739 | Validation f1_micro 0.606
# Epoch 299 | Train Loss: 0.02689 | Validation f1_micro 0.603
# Epoch 399 | Train Loss: 0.02658 | Validation f1_micro 0.604

# Training complete in 0m 31s
# Best val f1_micro: 0.6430 

In [24]:
# Make sure the "test" directory under PATH_TO_MODELS exists. makedirs with
# exist_ok=True also creates missing parents and does not fail when the directory
# is already there, which avoids the check-then-create race of exists() + mkdir().
os.makedirs(os.path.join(PATH_TO_MODELS, "test"), exist_ok=True)

In [25]:
# NOTE(review): this cell raises NameError — model, criterion, optimizer and
# exp_lr_scheduler are only created as locals inside run_grid_search.
# The call also passes one positional argument fewer than the call inside
# run_grid_search, which passes `labels` between G and train_mask.
# TODO: build the model/optimizer in this cell, or remove the cell.
d = train_GraphSAGE(model, criterion, optimizer, exp_lr_scheduler, 
                    device, "test", asdict(args), args.n_epochs,
                    G, train_mask, val_mask)

NameError: name 'model' is not defined

In [None]:
# import subprocess

# def get_gpu_memory_map():
#     """Get the current gpu usage.

#     Returns
#     -------
#     usage: dict
#         Keys are device ids as integers.
#         Values are memory usage as integers in MB.
#     """
#     result = subprocess.check_output(
#         [
#             'nvidia-smi', '--query-gpu=memory.used',
#             '--format=csv,nounits,noheader'
#         ], encoding='utf-8')
#     # Convert lines into a dictionary
#     gpu_memory = [int(x) for x in result.strip().split('\n')]
#     gpu_memory_map = dict(zip(range(len(gpu_memory)), gpu_memory))
#     return gpu_memory_map

In [None]:
# def pretty_size(size):
# 	"""Pretty prints a torch.Size object"""
# 	assert(isinstance(size, torch.Size))
# 	return " × ".join(map(str, size))

# def dump_tensors(gpu_only=True):
# 	"""Prints a list of the Tensors being tracked by the garbage collector."""
# 	import gc
# 	total_size = 0
# 	for obj in gc.get_objects():
# 		try:
# 			if torch.is_tensor(obj):
# 				if not gpu_only or obj.is_cuda:
# 					print("%s:%s%s %s" % (type(obj).__name__, 
# 										  " GPU" if obj.is_cuda else "",
# 										  " pinned" if obj.is_pinned else "",
# 										  pretty_size(obj.size())))
# 					total_size += obj.numel()
# 			elif hasattr(obj, "data") and torch.is_tensor(obj.data):
# 				if not gpu_only or obj.is_cuda:
# 					print("%s → %s:%s%s%s%s %s" % (type(obj).__name__, 
# 												   type(obj.data).__name__, 
# 												   " GPU" if obj.is_cuda else "",
# 												   " pinned" if obj.data.is_pinned else "",
# 												   " grad" if obj.requires_grad else "", 
# 												   " volatile" if obj.volatile else "",
# 												   pretty_size(obj.data.size())))
# 					total_size += obj.data.numel()
# 		except Exception as e:
# 			pass        
# 	print("Total size:", total_size)

In [None]:
# torch.save({
#     'state_dict': model.state_dict(),
#     'options': options,
#         }, f'{PATH_TO_MODELS}/node_id.pt')
# print("Model saved.")

In [None]:
# load_pretrained = True

# if load_pretrained:
#     if device == 'cuda':
#         model_pt = torch.load(f'{PATH_TO_MODELS}/node_id_topics.pt')
#     else:
#         model_pt = torch.load(f'{PATH_TO_MODELS}/node_id_topics.pt', map_location=torch.device('cpu'))
#     options = model_pt['options']
    
#     model = GraphSAGE(**options)
#     model.load_state_dict(model_pt['state_dict'])
#     model.to(device)

# y_pred = (torch.exp(model(G)) > threshold).float()
# get_metrics_dict(labels[val_mask].cpu(), y_pred[val_mask].cpu())
    