In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM so the dataset folder can be read as a local path.
drive.mount('/content/drive')
# Root folder of the dataset; the parsing cell below walks its sub-folders.
dataset_dir = '/content/drive/My Drive/abstracts'
 # dataset_dir contains one folder of abstract files per year.
 # The directory is structured like:
 #/abstracts/
 #          /1992
 #          /1993
 #          /1994
 #          /1995
 #          /1996
 #          /1997
 #          /1998
 #          /1999
 #          /2000
 #          /2001
 #          /2002
 #          /2003

Mounted at /content/drive


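Import the core libraries, download the NLTK data, and select the compute device (GPU if available, otherwise CPU):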

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import nltk
# NLTK is a natural-language toolkit; this notebook uses its tokenizer
# (nltk.word_tokenize) to split the author line of every abstract.

# Downloads the complete "book" collection (many corpora, see the log below).
# NOTE(review): presumably only the tokenizer models are needed by
# word_tokenize -- confirm before narrowing the download.
nltk.download("book")
### Select the compute device: a CUDA GPU when one is available, the CPU otherwise
# (torch.device("cuda") is where a specific GPU could be chosen if more than 1 is present)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('GPU availble' if device.type == "cuda" else 'GPU not availble')

print(f"SELECTED DEVICE: {device}")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/chat80.zip.
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package conll2000 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2000.zip.
[nltk_data]    | Downloading package conll2002 to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2002.zip.
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/dependency_treebank.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    

GPU not availble
SELECTED DEVICE: cpu


[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | 
[nltk_data]  Done downloading collection book


Install StellarGraph:

In [None]:
import sys
# When running on Colab, install the pinned StellarGraph release into the
# kernel's environment (%pip, unlike !pip, targets the running kernel).
if 'google.colab' in sys.modules:
  %pip install -q stellargraph[demos]==1.2.1

import stellargraph as sg

# Fail early when the installed StellarGraph does not match the release this
# notebook was written for. The AttributeError branch covers installs where
# sg.utils.validate_notebook_version cannot be resolved (presumably older
# releases without that helper -- see the linked issue); `from None` hides the
# chained AttributeError so only the explanatory message is shown.
try:
    sg.utils.validate_notebook_version("1.2.1")
except AttributeError:
    raise ValueError(
        f"This notebook requires StellarGraph version 1.2.1, but a different version {sg.__version__} is installed.  Please see <https://github.com/stellargraph/stellargraph/issues/1172>."
    ) from None

In [None]:
import networkx as nx
import pandas as pd
import numpy as np
import itertools
import os

import matplotlib.pyplot as plt

import stellargraph as sg
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, link_classification
from stellargraph.calibration import expected_calibration_error, plot_reliability_diagram
from stellargraph.calibration import IsotonicCalibration, TemperatureCalibration

from tensorflow import keras
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.calibration import calibration_curve
from sklearn.isotonic import IsotonicRegression

from stellargraph import globalvar
from stellargraph import datasets
from IPython.display import display, HTML
from  stellargraph import StellarGraph
import pandas as pd


%matplotlib inline

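Graph extraction: parse the author lists from the abstracts, build the co-authorship graph, compute node and edge features, and save them: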

In [None]:
import hashlib  #stdlib; only needed for the reproducible author codes below

nodes = []        #list of all nodes (author names, in order of first appearance)
nodes_feat = {}   #dictionary of all node features to use for networkx
edge_feats = {}   #dictionary of all edge features to use for networkx
source = []       #first end of the edges
target = []       #second end of the edges


def _author_code(name):
  """Return a reproducible signed 64-bit integer code for an author name.

  The built-in hash() is salted per interpreter process for strings, so it
  yields different codes after every kernel restart; a fixed digest keeps the
  "author_code" feature (and the arrays saved from it) stable across runs.
  """
  digest = hashlib.blake2b(name.encode("utf-8"), digest_size=8).digest()
  return int.from_bytes(digest, "big", signed=True)


def _extract_authors(tokens):
  """Collect the author names from the tokens of an "Authors:"/"Author:" line.

  The first two tokens (the label and the token after it, expected to be ":")
  are skipped. Tokens of one name are concatenated without a separator; "and"
  and "," end the current name. Everything from a token starting with "(" up
  to a token ending with ")" is ignored (not part of an author name).
  """
  found = []
  current = ""
  in_parens = False
  for tok in tokens[2:]:
    if tok[0] == "(":            #we are now inside two parenthesis
      in_parens = True
    if not in_parens:
      if tok != "and" and tok != ",":
        current += tok
      else:
        if current:
          found.append(current)
        current = ""
    if tok[-1] == ")":           #end of parenthesis
      in_parens = False
  if current:                    #add last author, if any
    found.append(current)
  return found


os.chdir(dataset_dir)
#sorted(): os.listdir() returns entries in arbitrary order, but the processing
#order decides the "year" stored on an edge (first collaboration seen) and the
#order of nodes/edges, so it has to be deterministic
for dir_name in sorted(os.listdir()):  #for cycle that iterates the folders of the years
  year_dir = dataset_dir + "/" + dir_name
  #skip anything that is not a year folder (e.g. .ipynb_checkpoints or stray files)
  if not (dir_name.isdigit() and os.path.isdir(year_dir)):
    continue
  year = int(dir_name)
  os.chdir(year_dir)  #kept from the original flow: the working directory follows the year folder
  print("Processing abstracts of year " + dir_name)
  for file_name in sorted(os.listdir()):  #for cycle that iterates the abstracts of the papers
    abstract = year_dir + "/" + file_name
    authors = []                          #list of all authors of one paper
    with open(abstract, "r") as file1:
      for line in file1:                  #for cycle that iterates the lines of the abstract file
        tokens = nltk.word_tokenize(line) #tokenize line
        if tokens and tokens[0] in ("Authors", "Author"):  #check if we are on the authors line
          authors = _extract_authors(tokens)
          break                           #authors are only in one row, we can skip the rest

    for name in authors:  #add new nodes
      #nodes and nodes_feat are filled together, so the dict gives the same
      #answer as scanning the nodes list, in constant time
      if name not in nodes_feat:
        nodes.append(name)
        nodes_feat[name] = {"author_code": _author_code(name)}

    for i, first in enumerate(authors):  #check every combination of two nodes
      for second in authors[i + 1:]:
        if (first, second) not in edge_feats and (second, first) not in edge_feats:  #add new edges
          source.append(first)
          target.append(second)
          edge_feats[(first, second)] = {"year": year, "collabs": 1}
        else:
          #the pair is already known under one of the two orientations:
          #count the collaboration only if it happened in the stored year
          #NOTE(review): collaborations in a later year than the first one are
          #ignored here (neither "year" nor "collabs" changes) -- confirm this is intended
          for key in ((first, second), (second, first)):
            feat = edge_feats.get(key)
            if feat is not None and feat.get("year") == year:
              feat["collabs"] = feat.get("collabs") + 1


author_codes = [] #list of hash codes of the authors names (filled once the features are computed)
nodes_index = []  #list of names of the nodes that stay in Gs

#split edge_feats into lists to use for StellarGraph; all three follow the
#insertion order of edge_feats
edges = [         #list of all edges as (u, v, attribute dict) for networkx
  (_pair[0], _pair[1], dict(year=_feat.get("year"), collabs=_feat.get("collabs")))
  for _pair, _feat in edge_feats.items()
]
years = [_feat.get("year") for _feat in edge_feats.values()]        #years corresponding to the edges
collabs = [_feat.get("collabs") for _feat in edge_feats.values()]   #collaborations corresponding to the edges

G = nx.MultiGraph()  #original graph
Gs = nx.MultiGraph() #graph without nodes with degree = 0

#both graphs start from the same nodes, node features and edges
for _graph in (G, Gs):
  _graph.add_nodes_from(nodes)
  nx.set_node_attributes(_graph, nodes_feat)
  _graph.add_edges_from(edges)

#nodes with at least one edge keep their place in the index, in G's node order ...
nodes_index.extend(n for n in G.nodes if G.degree(n) != 0)
#... while the nodes with degree = 0 are dropped from Gs
Gs.remove_nodes_from([n for n in G.nodes if G.degree(n) == 0])


print("Number of connected components: " + str(nx.number_connected_components(Gs)))

#graph features are calculated just once, to speed up the program
#node_feats[k] can be indexed by node; the position k of each measure is what
#the per-feature lists below rely on
node_feats = [
  nx.get_node_attributes(Gs, "author_code"),  #0
  nx.degree(Gs),                              #1
  nx.harmonic_centrality(Gs),                 #2
  nx.degree_centrality(Gs),                   #3
  nx.closeness_centrality(Gs),                #4
  nx.betweenness_centrality(Gs),              #5
  nx.load_centrality(Gs),                     #6
  nx.square_clustering(Gs),                   #7
  #nx.pagerank replaces nx.pagerank_numpy, which the recorded run reports as
  #deprecated (removal in NetworkX 3.0)
  nx.pagerank(Gs),                            #8
  nx.constraint(Gs),                          #9
]

dict_edge_load_cen = nx.edge_load_centrality(Gs)


def _node_column(position):
  """Return feature number `position` for every node, in Gs.nodes order."""
  values = node_feats[position]
  return [values[n] for n in Gs.nodes]


#list of graph features to use for StellarGraph, one value per node of Gs
author_codes = _node_column(0)
deg = _node_column(1)
har_cen = _node_column(2)
deg_cen = _node_column(3)
clos_cen = _node_column(4)
bet_cen = _node_column(5)
load_cen = _node_column(6)
square_clus = _node_column(7)
pagerank = _node_column(8)
constr = _node_column(9)

#edge load centrality, one value per edge IN THE ORDER OF source/target.
#These values end up in the same table as source, target, years and collabs,
#which all follow the order in which the edges were first created. Iterating
#Gs.edges instead walks the graph node by node, which is generally a different
#order and would attach the values to the wrong edges.
#The lookup tries (source, target) first and falls back to (target, source) in
#case the dictionary only holds the other orientation of the pair.
edge_load_cen = [
  dict_edge_load_cen[(s, t)] if (s, t) in dict_edge_load_cen else dict_edge_load_cen[(t, s)]
  for s, t in zip(source, target)
]

#node table: one row per node of Gs (indexed by author name), one column per feature;
#the column order is the order of the node feature vector
_node_columns = {
  "author_code": author_codes,
  "degree": deg,
  "harmonic_centrality": har_cen,
  "degree_centrality": deg_cen,
  "closeness_centrality": clos_cen,
  "betweenness_centrality": bet_cen,
  "load_centrality": load_cen,
  "square_clustering": square_clus,
  "pagerank": pagerank,
  "constraint": constr,
}
square_node_data = pd.DataFrame(_node_columns, index = nodes_index) #data structure for the nodes

#edge table: "source"/"target" name the two end nodes, the remaining columns are edge features
_edge_columns = {
  "source": source,
  "target": target,
  "year": years,
  "collab": collabs,
  "load_centrality": edge_load_cen,
}
square_edges = pd.DataFrame(_edge_columns) #data structure for the edges

G_stel = StellarGraph(square_node_data, square_edges) #build graph
print(G_stel.info())

#saving the data: every list (and the graph object) goes to its own .npy file
#NOTE(review): G_stel is not an array, so np.save presumably stores it as a
#pickled object and loading it back would need allow_pickle=True -- confirm
for _path, _payload in [
  ("/content/author_codes.npy", author_codes),
  ("/content/deg.npy", deg),
  ("/content/har_cen.npy", har_cen),
  ("/content/deg_cen.npy", deg_cen),
  ("/content/clos_cen.npy", clos_cen),
  ("/content/bet_cen.npy", bet_cen),
  ("/content/load_cen.npy", load_cen),
  ("/content/square_clus.npy", square_clus),
  ("/content/pagerank.npy", pagerank),
  ("/content/constr.npy", constr),
  ("/content/nodes_index.npy", nodes_index),
  ("/content/source.npy", source),
  ("/content/target.npy", target),
  ("/content/years.npy", years),
  ("/content/collabs.npy", collabs),
  ("/content/edge_load_cen.npy", edge_load_cen),
  ("/content/G_stel.npy", G_stel),
]:
  np.save(_path, _payload)

Processing abstracts of year 1992
Processing abstracts of year 1993
Processing abstracts of year 1994
Processing abstracts of year 1995
Processing abstracts of year 1996
Processing abstracts of year 1997
Processing abstracts of year 1998
Processing abstracts of year 1999
Processing abstracts of year 2000
Processing abstracts of year 2001
Processing abstracts of year 2002
Processing abstracts of year 2003
Number of connected components: 931


  node_feats.append(nx.pagerank_numpy(Gs))
NetworkX version 3.0.
  M = google_matrix(


StellarGraph: Undirected multigraph
 Nodes: 12061, Edges: 23834

 Node types:
  default: [12061]
    Features: float32 vector, length 10
    Edge types: default-default->default

 Edge types:
    default-default->default: [23834]
        Weights: all 1 (default)
        Features: float32 vector, length 3
