# Introduction

Full text only

In [1]:
""" Put arXiv data into right form for the GCN """

import sys
sys.path.append('..')
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import arxiv_public_data.tests.intra_citation as ia 
import time, json, gzip
import pickle as pkl
from arxiv_public_data.oai_metadata import load_metadata


                                #Auxiliary
#-----------------------------------------------------------------------


def sync_G_with_metadata(G,m):
    """ Citation graph G is missing some articles
        (the ones for which we have no full-text data)
        So, I need to add these missing ones as isolated
        nodes into G.
        
        G = nx.Graph, citation graph
        m = meta data  (get from load_metadata function)
    """
    
    #Check if G subset of m. 
    #It should be; if not, remove the bad noes
    G_nodes = list(G.nodes())
    ids = [x['id'] for x in m]
    bad_nodes = set(G_nodes) - set(ids)
    for node in bad_nodes:
        G.remove_node(node)
        
    #Then find missing nodes & add them
    missing = set(ids) - set(G_nodes)
    for node in missing:
        if not G.has_node(node):
            G.add_node(node)
    
    #Check if they're sync'd up
    print('(#Nodes in citation graph G, #articles in meta-data) = ' + str((G.number_of_nodes(),len(m))))
    if G.number_of_nodes() == len(m):
        print('So the syncing worked')
    else:
        print('So the syncing did not work')
    return G


def clean_labels(labels):
    """ Some labels have multiple listings
        so I take the first one
        
        Input: list of strings
    """

    for i,label in enumerate(labels):

        #If multiple listings, take first
        label = label[0].split()[0]

        #Merge sub-classes
        label = label[:label.find('.')]

        labels[i] = label
    return labels


def labels2categorical(labels):
    """ labels are strings -- have form
        'hep-th' -- So need to covert to
        categoricals
    """
        
    #Create mapping
    classes = set(labels)
    class_labels = {}
    for i,x in enumerate(classes):
        class_labels[x] = i
    class_labels
    
    #change
    labels_categorical = []
    for label in labels:
        vec = np.zeros(len(classes))
        temp = class_labels[label]
        vec[temp] = 1
        labels_categorical.append(vec)
    return np.array(labels_categorical)


def load_titles(dirname):
    
    #Load the full feature matrix
    filename = dirname + '/title-embedding-usel-2019-03-19.pkl'
    out = []
    with open(filename, 'rb') as f:
        while True:
            try:
                out.extend(pkl.load(f))
            except EOFError as e:
                break
    title_vecs = np.array(out)
    return title_vecs
    


def load_abstracts(dirname):
    
    #Load the full feature matrix
    filename = dirname + '/abstract-embedding-usel-2019-03-19.pkl'
    out = []
    with open(filename, 'rb') as f:
        while True:
            try:
                out.extend(pkl.load(f))
            except EOFError as e:
                break
    abstract_vecs = np.array(out)
    return abstract_vecs


def load_fulltext(dirname):
    
    #Load the full feature matrix
    filename = dirname + '/fulltext-embedding-usel-2-headers-2019-04-05.pkl'
    out = []
    with open(filename, 'rb') as f:
        while True:
            try:
                out.extend(pkl.load(f))
            except EOFError as e:
                break
    fulltext_vecs = np.array(out)
    return fulltext_vecs


def load_labels(m):
    labels = [x['categories'] for x in m]
    labels_cl = clean_labels(labels)
    labels_cat = labels2categorical(labels_cl)
    return labels_cat


def save_data(dirname,vector_type, vector_train, vector_test, vector, G, m):
    """ Saves data in format required by Kipfs and Welling
    
        nodes_int = list, list of nodes labeled by integers
        dirname = string, where to save the data
        vector_type = string, = 'title', 'abstract', 'full-text'
    
    """

    #Save vectors
    dirname = 'data'
    fname = dirname + '/ind.arXiv-' + vector_type + '.x'
    pkl.dump(vector_train, open(fname,'wb'))

    fname = dirname + '/ind.arXiv-' + vector_type + '.tx'
    pkl.dump(vector_test, open(fname,'wb'))

    fname = dirname + '/ind.arXiv-' + vector_type + '.allx'
    pkl.dump(vector[:cutoff2], open(fname,'wb'))

    fname = dirname + '/ind.arXiv-' + vector_type + '.y'
    pkl.dump(np.array(labels_train), open(fname,'wb'))

    fname = dirname + '/ind.arXiv-' + vector_type + '.ty'
    pkl.dump(np.array(labels_test), open(fname,'wb'))

    fname = dirname + '/ind.arXiv-' + vector_type + '.ally'
    pkl.dump(np.array(labels_cat[:cutoff2]), open(fname,'wb'))

    nodes_as_ints = range(G.number_of_nodes())  #kipf welling label nodes as ints
    test_nodes = nodes_as_ints[cutoff2:]
    with open(dirname + '/ind.arXiv-' + vector_type + '.test.index','wt') as f:
        for node in test_nodes:
            f.write(str(node))
            f.write('\n')
            
            
    #Save graph in format required by Kipf-Welling
    #Also, need to save in same order as metadata
    #Note, I used protocal 4 with pickle, since there's 
    # a 4GB limit on what you can pickle without it.
    graph_dict = {}     
    for item in m:
        node = item['id']
        graph_dict[node] = list(G.neighbors(node))
    pkl.dump(graph_dict, open(dirname + '/ind.arXiv-' + vector_type + '.graph', 'wb'), protocol=4) 
    return
    


                                #Main
#-----------------------------------------------------------------------

if __name__ == "__main__":

    
    #Graph
    t1 = time.time()
    dirname = '/home/kokeeffe/research/arxiv-public-datasets/arxiv-data/output'
    fname = dirname + '/internal-citations.json.gz'
    q = json.load(gzip.open(fname, 'rt', encoding='utf-8'))
    G = ia.makegraph(q)
    
    #Meta data
    dirname = '/home/kokeeffe/research/arxiv-public-datasets/arxiv-data'
    m = load_metadata( dirname + '/oai-arxiv-metadata-2019-03-01.json.gz')
    
    #Make sure G is sync'd with the metadata (has same of the nodes)
    G = sync_G_with_metadata(G,m)

    t2 = time.time()
    print('Loading graph took ' + str((t2-t1)/60.0) + ' mins')


    #Load features
    t1 = time.time()
    dirname = '/home/kokeeffe/research/arxiv-public-datasets/arxiv-data/output/embeddings'
    title_vecs = load_titles(dirname)
    abstract_vecs = load_abstracts(dirname)
    #fulltext_vecs = load_fulltext(dirname)
    
    #Load labels
    dirname = '/home/kokeeffe/research/arxiv-public-datasets/arxiv-data'
    labels_cat = load_labels(m)
    t2 = time.time()
    print( 'Loading features & labels took ' + str((t2-t1)/60.0) + ' mins')

    #Split into test & train & ulabeled portion
    #For now, I'll assume that nothing is unlabeled
    #That means cutoff1 and cutoff2 are the same
    t1 = time.time()
    cutoff1 = int(0.9*title_vecs.shape[0]) 
    cutoff2 = int(0.9*title_vecs.shape[0])
    title_vec_train, title_vec_test = title_vecs[:cutoff1], title_vecs[cutoff2:]
    abstract_vec_train, abstract_vec_test = abstract_vecs[:cutoff1], abstract_vecs[cutoff2:]
    #fulltext_vec_train, fulltext_vec_test = fulltext_vecs[:cutoff1], fulltext_vecs[cutoff2:]
    labels_train, labels_test = labels_cat[:cutoff1], labels_cat[cutoff2:]

    #Save data
    dirname = 'data'
    save_data(dirname, 'title', title_vec_train, title_vec_test, title_vecs, G, m)
    save_data(dirname, 'abstract', abstract_vec_train, abstract_vec_test, abstract_vecs, G, m)

    #Combine title and abstract vecs
    title_vecs = np.concatenate((title_vecs, abstract_vecs), axis=1)
    title_vec_train, title_vec_test = title_vecs[:cutoff1], title_vecs[cutoff2:]
    save_data(dirname, 'title-abstract', title_vec_train, title_vec_test, title_vecs, G, m)
    
    t2 = time.time()
    print( 'Saving data took ' + str((t2-t1)/60.0) + ' mins')



(#Nodes in citation graph G, #articles in meta-data) = (1506500, 1506500)
So the syncing worked
Loading graph took 1.2567044099171956 mins
Loading features & labels took 0.4854541182518005 mins


OverflowError: cannot serialize a bytes object larger than 4 GiB