# Environment Installation
To load the datasets and run the code, you need to install the following packages.
Main dependencies (with Python >= 3.9):
pytorch==1.13
torch_geometric==2.2.0
torch-scatter==2.1.1+pt113cpu
torch-sparse==0.6.17+pt113cpu
torch-spline-conv==1.2.2+pt113cpu

# Dataset Download 
Three datasets are available for link prediction: Cora, PubMed, and arXiv. Please download them from the following link:



In [1]:
from yacs.config import CfgNode as CN
import numpy as np
import torch
import random
import os, sys 
import pandas as pd 
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T

# Global configuration tree: one sub-node per dataset under ``cfg.dataset``.
# Each ``root`` is a machine-specific placeholder ('PATH_TO_DATASET'); point it
# at the local copy of the data before running the cells below.
cfg = CN()
cfg.dataset = CN()

# ------------------------------- Cora ----------------------------------- #
cfg.dataset.cora = CN()
cfg.dataset.cora.root = '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE'  # 'PATH_TO_DATASET'
# Prefix of the raw citation file; '.content' is appended when it is read.
cfg.dataset.cora.original = f"{cfg.dataset.cora.root}/dataset/cora_orig/cora"
# Tab-separated index mapping paper ids to extraction file names.
cfg.dataset.cora.papers = f"{cfg.dataset.cora.root}/dataset/cora_orig/mccallum/cora/papers"
# Directory prefix of the per-paper text files (note the trailing slash).
cfg.dataset.cora.extractions = f"{cfg.dataset.cora.root}/dataset/cora_andrew_mccallum/extractions/"
cfg.dataset.cora.lm_model_name = 'microsoft/deberta-base'

# ------------------------------ PubMed ---------------------------------- #
cfg.dataset.pubmed = CN()
cfg.dataset.pubmed.root = '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE'  # 'PATH_TO_DATASET'
cfg.dataset.pubmed.original = f"{cfg.dataset.pubmed.root}/dataset/PubMed_orig/data/"
# NOTE(review): presumably the abstract/title JSON dump -- confirm with its loader.
cfg.dataset.pubmed.abs_ti = f"{cfg.dataset.pubmed.root}/dataset/PubMed_orig/pubmed.json"

# ------------------------------- arXiv ---------------------------------- #
cfg.dataset.arxiv = CN()
cfg.dataset.arxiv.root = '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE'  # 'PATH_TO_DATASET'
# NOTE(review): presumably the title/abstract TSV -- confirm with its loader.
cfg.dataset.arxiv.abs_ti = f"{cfg.dataset.arxiv.root}/dataset/ogbn_arxiv_orig/titleabs.tsv"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
cfg

CfgNode({'dataset': CfgNode({'cora': CfgNode({'root': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE', 'original': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE/dataset/cora_orig/cora', 'papers': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE/dataset/cora_orig/mccallum/cora/papers', 'extractions': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE/dataset/cora_andrew_mccallum/extractions/', 'lm_model_name': 'microsoft/deberta-base'}), 'pubmed': CfgNode({'root': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE', 'original': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE/dataset/PubMed_orig/data/', 'abs_ti': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE/dataset/PubMed_orig/pubmed.json'}), 'arxiv': CfgNode({'root': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE', 'abs_ti': '/pfs/work7/workspace/scratch/cc7738-prefeature/TAPE/dataset/ogbn_arxiv_orig/titleabs.tsv'})})})

In [3]:
def seed_everything(SEED=0):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    Args:
        SEED: Integer seed handed to every generator (default ``0``).
    """
    # The three libraries keep independent generator state, so the order in
    # which they are seeded does not matter.
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    # Only touch the CUDA generator when a GPU is actually usable.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(SEED)

In [4]:
def load_ab_ti(path, fn):
    """Read the title and abstract lines from one extraction file.

    Args:
        path: Directory prefix; it is concatenated with ``fn`` as-is, so it
            must already end with a path separator.
        fn: File name appended to ``path``.

    Returns:
        A ``(title, abstract)`` tuple holding the complete matching lines
        (including their ``Title``/``Abstract`` prefix). A field that does not
        occur in the file is returned as ``''``; if a field occurs several
        times, the last occurrence is returned.
    """
    with open(path + fn) as handle:
        rows = handle.read().splitlines()

    # Keyed by the text before the first ':' of a line.
    found = {'Title': '', 'Abstract': ''}
    for row in rows:
        tag = row.partition(':')[0]
        if tag in found:
            found[tag] = row
    return found['Title'], found['Abstract']

In [5]:
def get_raw_text_cora(cfg, use_text=False, seed=0):
    """Load the Planetoid Cora graph and, optionally, the raw text per paper.

    Args:
        cfg: Config node providing ``cfg.dataset.cora.original`` (prefix of
            the ``.content`` file), ``cfg.dataset.cora.papers`` (tab-separated
            index whose first two columns are paper id and file name) and
            ``cfg.dataset.cora.extractions`` (directory prefix of the
            per-paper text files).
        use_text: When false, only the graph is loaded and none of the raw
            text files are read.
        seed: Unused by this function; kept so existing callers keep working.

    Returns:
        ``(data, text)``. ``data`` is the first graph of the Planetoid
        dataset. ``text`` is ``None`` when ``use_text`` is false; otherwise it
        is a list with one string per row of the ``.content`` file (in file
        order), made of the title line and the abstract line joined by a
        newline.

    Raises:
        KeyError: If a paper id from the ``.content`` file is missing from
            the papers index.
    """
    dataset = Planetoid('dataset', 'cora',
                        transform=T.NormalizeFeatures())
    data = dataset[0]
    print(data)

    # Return before touching any raw-text file: the graph-only path needs
    # neither the .content file nor the papers index.
    if not use_text:
        return data, None

    # The first column of the .content file holds the paper ids, in row order.
    idx_features_labels = np.genfromtxt(
        "{}.content".format(cfg.dataset.cora.original), dtype=np.dtype(str))
    data_citeid = idx_features_labels[:, 0]

    # Map paper id -> extraction file name.
    pid_filename = {}
    with open(cfg.dataset.cora.papers) as f:
        for line in f:
            # Drop the line ending so it cannot leak into the file name when
            # the name is the last column of the line.
            fields = line.rstrip('\n').split('\t')
            pid_filename[fields[0]] = fields[1]

    extractions_path = cfg.dataset.cora.extractions
    text = []
    no_ab_or_ti = 0
    for pid in data_citeid:
        ti, ab = load_ab_ti(extractions_path, pid_filename[pid])
        text.append(ti + '\n' + ab)
        if ti == '' or ab == '':
            no_ab_or_ti += 1

    # A missing paper raises above, so every id that reaches this point was found.
    whole, founded = len(data_citeid), len(text)
    print(f"found {founded}/{whole} papers, {no_ab_or_ti} no ab or ti.")
    return data, text
