# Some fun with unstructured text

## Getting some data

In [1]:
import os
import urllib, urllib.request
import xml.etree.ElementTree as et 

In [2]:
# arXiv API query settings: the paging values below are substituted into the
# {start_index} / {max_results} placeholders of the endpoint template.
start_index, max_results = 0, 5
arxiv_endpoint_template = (
    'http://export.arxiv.org/api/query'
    '?search_query=all:graph%20machine%20learning'
    '&start={start_index}'
    '&max_results={max_results}'
)

In [3]:
# Fill the paging values into the endpoint template and download the feed.
url = arxiv_endpoint_template.format(start_index=start_index, max_results=max_results)
# Use the response as a context manager so the HTTP connection is closed
# even when reading or decoding fails.
with urllib.request.urlopen(url) as raw:
    data = raw.read().decode('utf-8')

In [4]:
# Parse the downloaded XML text; `xtree` is the root element of the feed.
xtree = et.fromstring(data)

In [5]:
# Prefix -> namespace URI map used by every find()/findall() lookup below.
ns = {'Atom': 'http://www.w3.org/2005/Atom'}

# One dict per direct child of the feed root whose tag ends in 'entry':
# scalar fields come from the text of the matching child element,
# 'categories' collects the `term` attribute of each category element and
# 'authors' the name text of each author element.
papers = [
    {
        'id': entry.find('Atom:id', ns).text,
        'title': entry.find('Atom:title', ns).text,
        'published': entry.find('Atom:published', ns).text,
        'summary': entry.find('Atom:summary', ns).text,
        'categories': [
            category.get('term')
            for category in entry.findall('Atom:category', ns)
        ],
        'authors': [
            author.find('Atom:name', ns).text
            for author in entry.findall('Atom:author', ns)
        ],
    }
    for entry in xtree
    if entry.tag.endswith('entry')
]
# Show the first parsed paper as the cell output.
papers[0]

{'id': 'http://arxiv.org/abs/2201.01288v1',
 'title': 'Automated Graph Machine Learning: Approaches, Libraries and Directions',
 'published': '2022-01-04T18:31:31Z',
 'summary': "  Graph machine learning has been extensively studied in both academic and\nindustry. However, as the literature on graph learning booms with a vast number\nof emerging methods and techniques, it becomes increasingly difficult to\nmanually design the optimal machine learning algorithm for different\ngraph-related tasks. To tackle the challenge, automated graph machine learning,\nwhich aims at discovering the best hyper-parameter and neural architecture\nconfiguration for different graph tasks/data without manual design, is gaining\nan increasing number of attentions from the research community. In this paper,\nwe extensively discuss automated graph machine approaches, covering\nhyper-parameter optimization (HPO) and neural architecture search (NAS) for\ngraph machine learning. We briefly overview existing libr

## Neo4j Setup

In [6]:
import pandas as pd
from neo4j import GraphDatabase, RoutingControl # Python database driver

In [7]:

# Connection settings for the Neo4j instance used by this notebook.
# NOTE: "DB_ULR" keeps its original spelling because the cell that creates the
# driver refers to the setting by this name.
DB_ULR = "neo4j://localhost:7687"
DB_USER = "neo4j"
# Prefer the NEO4J_PASSWORD environment variable so a real password does not
# have to be saved in the notebook; the previous literal stays as the fallback,
# which keeps the local demo setup working unchanged.
DB_PASS = os.environ.get("NEO4J_PASSWORD", "test1234")
DB_NAME = "demo"

In [8]:
# Create the driver from the configured URI and credentials, then check
# straight away that the server can be reached.
driver = GraphDatabase.driver(DB_ULR, auth=(DB_USER, DB_PASS))
driver.verify_connectivity()

In [9]:
# Create the working database when it is missing. The statement is sent to the
# 'system' database with write routing and takes no query parameters.
driver.execute_query(
    'create database {dbname} if not exists'.format(dbname=DB_NAME),
    parameters_=None,
    routing_=RoutingControl.WRITE,
    database_='system',
)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x15dc6abe0>, keys=[])

## Graph Creation

In [10]:
# This is not really required for this small sample:
# declare `id` as a node key on :Paper nodes (only created if it does not exist yet).
driver.execute_query(
    'create constraint if not exists for (p:Paper) require (p.id) is node key',
    None,
    routing_= RoutingControl.WRITE,
    database_= DB_NAME
)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x11339a790>, keys=[])

In [11]:
# Load the parsed papers into the graph: one :Paper node per entry, with one
# :Author node per listed author attached through :wrote_paper.
# MERGE is used instead of CREATE so the cell can be run again safely: with
# CREATE a second run either violates the node-key constraint on Paper.id or,
# without that constraint, duplicates every paper and author node. On a first
# run against an empty database the resulting graph is the same as before.
records, summary, keys = driver.execute_query(
    '''
    unwind $papers as paper
    merge (p:Paper{id:paper.id})
        set p.title=paper.title,
            p.published=datetime(paper.published),
            p.summary=paper.summary,
            p.categories=paper.categories
    foreach (author in paper.authors | 
        merge (p)<-[:wrote_paper]-(:Author{name: author})
    )
    return paper{.*}
    ''',
    papers = papers,
    routing_= RoutingControl.WRITE,
    database_= DB_NAME
)
# Echo the maps returned by the query, one record per paper.
for record in records:
    print(record)

<Record paper={'summary': "  Graph machine learning has been extensively studied in both academic and\nindustry. However, as the literature on graph learning booms with a vast number\nof emerging methods and techniques, it becomes increasingly difficult to\nmanually design the optimal machine learning algorithm for different\ngraph-related tasks. To tackle the challenge, automated graph machine learning,\nwhich aims at discovering the best hyper-parameter and neural architecture\nconfiguration for different graph tasks/data without manual design, is gaining\nan increasing number of attentions from the research community. In this paper,\nwe extensively discuss automated graph machine approaches, covering\nhyper-parameter optimization (HPO) and neural architecture search (NAS) for\ngraph machine learning. We briefly overview existing libraries designed for\neither graph machine learning or automated machine learning respectively, and\nfurther in depth introduce AutoGL, our dedicated and 

## Extract graph from summary

In [12]:
import openai
from retry import retry

In [None]:
# Take the key from the OPENAI_API_KEY environment variable so the secret is
# never stored in the notebook; the original placeholder remains as the
# fallback when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "sk-.....")

In [22]:
# Read id and summary of every :Paper node that has no `processed` property
# yet and turn the result into a DataFrame.
df_papers = driver.execute_query(
    '''
    match (p:Paper) where p.processed is null
    return p.id as id, p.summary as summary
    ''',
    parameters_=None,
    database_=DB_NAME,
    routing_=RoutingControl.READ,
    result_transformer_=lambda result: result.to_df(),
)
# Adjust the column display width option before showing the first rows.
pd.set_option('display.max_colwidth', 0)
df_papers.head()

Unnamed: 0,id,summary
0,http://arxiv.org/abs/2004.06846v1,"How to utilize deep learning methods for graph classification tasks has\nattracted considerable research attention in the past few years. Regarding\ngraph classification tasks, the graphs to be classified may have various graph\nsizes (i.e., different number of nodes and edges) and have various graph\nproperties (e.g., average node degree, diameter, and clustering coefficient).\nThe diverse property of graphs has imposed significant challenges on existing\ngraph learning techniques since diverse graphs have different best-fit\nhyperparameters. It is difficult to learn graph features from a set of diverse\ngraphs by a unified graph neural network. This motivates us to use a multiplex\nstructure in a diverse way and utilize a priori properties of graphs to guide\nthe learning. In this paper, we propose MxPool, which concurrently uses\nmultiple graph convolution/pooling networks to build a hierarchical learning\nstructure for graph representation learning tasks. Our experiments on numerous\ngraph classification benchmarks show that our MxPool has superiority over other\nstate-of-the-art graph representation learning methods.\n"
1,http://arxiv.org/abs/2103.00742v4,"Machine learning on graphs has been extensively studied in both academic and\nindustry. However, as the literature on graph learning booms with a vast number\nof emerging methods and techniques, it becomes increasingly difficult to\nmanually design the optimal machine learning algorithm for different\ngraph-related tasks. To solve this critical challenge, automated machine\nlearning (AutoML) on graphs which combines the strength of graph machine\nlearning and AutoML together, is gaining attention from the research community.\nTherefore, we comprehensively survey AutoML on graphs in this paper, primarily\nfocusing on hyper-parameter optimization (HPO) and neural architecture search\n(NAS) for graph machine learning. We further overview libraries related to\nautomated graph machine learning and in-depth discuss AutoGL, the first\ndedicated open-source library for AutoML on graphs. In the end, we share our\ninsights on future research directions for automated graph machine learning.\nThis paper is the first systematic and comprehensive review of automated\nmachine learning on graphs to the best of our knowledge.\n"
2,http://arxiv.org/abs/2201.01288v1,"Graph machine learning has been extensively studied in both academic and\nindustry. However, as the literature on graph learning booms with a vast number\nof emerging methods and techniques, it becomes increasingly difficult to\nmanually design the optimal machine learning algorithm for different\ngraph-related tasks. To tackle the challenge, automated graph machine learning,\nwhich aims at discovering the best hyper-parameter and neural architecture\nconfiguration for different graph tasks/data without manual design, is gaining\nan increasing number of attentions from the research community. In this paper,\nwe extensively discuss automated graph machine approaches, covering\nhyper-parameter optimization (HPO) and neural architecture search (NAS) for\ngraph machine learning. We briefly overview existing libraries designed for\neither graph machine learning or automated machine learning respectively, and\nfurther in depth introduce AutoGL, our dedicated and the world's first\nopen-source library for automated graph machine learning. Last but not least,\nwe share our insights on future research directions for automated graph machine\nlearning. This paper is the first systematic and comprehensive discussion of\napproaches, libraries as well as directions for automated graph machine\nlearning.\n"
3,http://arxiv.org/abs/2210.00437v1,"Graph coarsening is a widely used dimensionality reduction technique for\napproaching large-scale graph machine learning problems. Given a large graph,\ngraph coarsening aims to learn a smaller-tractable graph while preserving the\nproperties of the originally given graph. Graph data consist of node features\nand graph matrix (e.g., adjacency and Laplacian). The existing graph coarsening\nmethods ignore the node features and rely solely on a graph matrix to simplify\ngraphs. In this paper, we introduce a novel optimization-based framework for\ngraph dimensionality reduction. The proposed framework lies in the unification\nof graph learning and dimensionality reduction. It takes both the graph matrix\nand the node features as the input and learns the coarsen graph matrix and the\ncoarsen feature matrix jointly while ensuring desired properties. The proposed\noptimization formulation is a multi-block non-convex optimization problem,\nwhich is solved efficiently by leveraging block majorization-minimization,\n$\log$ determinant, Dirichlet energy, and regularization frameworks. The\nproposed algorithms are provably convergent and practically amenable to\nnumerous tasks. It is also established that the learned coarsened graph is\n$\epsilon\in(0,1)$ similar to the original graph. Extensive experiments\nelucidate the efficacy of the proposed framework for real-world applications.\n"
4,http://arxiv.org/abs/2302.02926v1,"Graph machine learning has been extensively studied in both academia and\nindustry. However, in the literature, most existing graph machine learning\nmodels are designed to conduct training with data samples in a random order,\nwhich may suffer from suboptimal performance due to ignoring the importance of\ndifferent graph data samples and their training orders for the model\noptimization status. To tackle this critical problem, curriculum graph machine\nlearning (Graph CL), which integrates the strength of graph machine learning\nand curriculum learning, arises and attracts an increasing amount of attention\nfrom the research community. Therefore, in this paper, we comprehensively\noverview approaches on Graph CL and present a detailed survey of recent\nadvances in this direction. Specifically, we first discuss the key challenges\nof Graph CL and provide its formal problem definition. Then, we categorize and\nsummarize existing methods into three classes based on three kinds of graph\nmachine learning tasks, i.e., node-level, link-level, and graph-level tasks.\nFinally, we share our thoughts on future research directions. To the best of\nour knowledge, this paper is the first survey for curriculum graph machine\nlearning.\n"


In [15]:
# System message sent with every chat completion request.
system = "You are a data science expert helping us extract relevant information."

# Instruction prefix for the chat model (process_gpt4 requests "gpt-4"); the
# paper abstract is appended directly after it. It asks for an entity section
# followed by a "relationships" section, which is the layout that
# parse_entities_and_relationships reads.
# NOTE(review): "Hospital/n" in the example looks like a typo for a line
# break - confirm before changing, since the prompt wording affects the output.
prompt = """#This is a research paper abstract. The task is to extract as many relevant entities to techniques, methods and applications.
#Also, return the type of an entity using the Wikipedia class system and the sentiment of the mentioned entity,
#where the sentiment value ranges from -1 to 1, and -1 being very negative, 1 being very positive
#Additionally, extract all relevant relationships between identified entities.
#The relationships should follow the Wikipedia schema type.
#The output of a relationship should be in a form of a triple Head, Relationship, Tail, for example
#Peter, WORKS_AT, Hospital/n
# An example "St. Peter is located in Paris" should have an output with the following format
entity
St. Peter, person, 0.0
Paris, location, 0.0

relationships
St.Peter, LOCATED_IN, Paris\n"""

In [16]:
def parse_entities_and_relationships(input_str):
    """Split a model reply into entity rows and relationship rows.

    The reply is expected to look like::

        entities
        <name>, <type>, <sentiment>

        relationships
        <head>, <RELATION>, <tail>

    The first line is always skipped (it is treated as the entity header).
    Rows before the ``relationships`` header are entities, rows after it are
    relationships. Rows that do not consist of exactly three ", "-separated
    fields (for example an "n/a" placeholder) are dropped.

    Args:
        input_str: Raw text returned by the model. ``None`` or an empty
            string yields two empty lists.

    Returns:
        A tuple ``(entities, relationships)``; each element is a list of
        three-item lists of strings with surrounding whitespace removed.
    """
    entities = []
    relationships = []
    if not input_str:
        return entities, relationships

    # Rows are appended to `target`; it switches once the header is seen.
    target = entities
    for raw_line in input_str.split("\n")[1:]:
        # Strip first: with CRLF line endings or trailing blanks the header
        # comparison would otherwise fail and every relationship row would be
        # filed as an entity.
        line = raw_line.strip()
        if not line:
            continue
        if line.rstrip(":").lower() == "relationships":
            target = relationships
            continue
        fields = [field.strip() for field in line.split(", ")]
        # Keep well-formed triples only; the model sometimes returns "n/a"
        # when it finds no entities or no relationships.
        if len(fields) != 3:
            continue
        target.append(fields)
    return entities, relationships

In [25]:
@retry(tries=3, delay=5)
def process_gpt4(text):
    """Ask the "gpt-4" chat model to extract entities and relationships.

    Sends the module-level ``system`` message plus ``prompt`` followed by
    ``text``, prints the raw reply and returns the result of
    ``parse_entities_and_relationships`` for it.

    Raises:
        Exception: when the reply does not contain the word "relationships".
            The ``retry`` decorator is configured with tries=3, delay=5.
    """
    chat_messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt + text},
    ]
    # temperature=0: try to be as deterministic as possible
    response = openai.ChatCompletion.create(
        model="gpt-4",
        temperature=0,
        messages=chat_messages,
    )
    reply = response.choices[0].message.content

    print(reply)

    if "relationships" in reply:
        return parse_entities_and_relationships(reply)

    raise Exception(
        "GPT is not being nice and isn't returning results in correct format"
    )

In [18]:
#entities, relationships = process_gpt4("""How to utilize deep learning methods for graph classification tasks has\nattracted considerable research attention in the past few years.""")

In [19]:
# Declare `name` as a node key on :Entity nodes (only created if it does not exist yet).
driver.execute_query(
    'create constraint if not exists for (n:Entity) require (n.name) is node key',
    None,
    routing_= RoutingControl.WRITE,
    database_= DB_NAME
)
# Same for `type` on :Relationship nodes; this result is the cell's displayed value.
driver.execute_query(
    'create constraint if not exists for (n:Relationship) require (n.type) is node key',
    None,
    routing_= RoutingControl.WRITE,
    database_= DB_NAME
)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x15f81fbb0>, keys=[])

In [26]:
# For every unprocessed paper: send its summary through process_gpt4, then in
# one write query stamp the paper as processed, merge the returned entities
# (with a MENTIONS relationship carrying the sentiment) and merge each
# relationship triple as source -> :Relationship node -> target.
for paper in df_papers.itertuples(index=False):
    print("processing: {id}".format(id=paper.id))
    entities, relationships = process_gpt4(paper.summary)
    driver.execute_query(
        '''
    MATCH (p:Paper{id:$id}) set p.processed=datetime()
    FOREACH (e in $entities |
        MERGE (entity:Entity {name: e[0]})
        ON CREATE SET entity.type = e[1] 
        MERGE (p)-[:MENTIONS{sentiment:toFloat(e[2])}]->(entity)
    )
    WITH p
    UNWIND $relationships AS relation
    MERGE (source:Entity {name: relation[0]})
    MERGE (target:Entity {name: relation[2]})
    MERGE (r:Relationship {type: relation[1]})
    MERGE (source)-[:RELATIONSHIP]->(r)-[:RELATIONSHIP]->(target)
    MERGE (p)-[mr:MENTIONS_RELATIONSHIP]->(r)
    ''',
        id=paper.id,
        entities=entities,
        relationships=relationships,
        routing_=RoutingControl.WRITE,
        database_=DB_NAME,
    )

processing: http://arxiv.org/abs/2004.06846v1
entities
Deep learning methods, technique, 0.5
Graph classification tasks, application, 0.5
Graph sizes, property, 0.0
Graph properties, property, 0.0
Average node degree, property, 0.0
Diameter, property, 0.0
Clustering coefficient, property, 0.0
Graph learning techniques, technique, 0.0
Hyperparameters, method, 0.0
Graph features, property, 0.0
Unified graph neural network, technique, 0.0
Multiplex structure, method, 0.0
A priori properties, method, 0.0
MxPool, technique, 0.5
Multiple graph convolution/pooling networks, technique, 0.5
Hierarchical learning structure, method, 0.5
Graph representation learning tasks, application, 0.5
Graph classification benchmarks, application, 0.0
State-of-the-art graph representation learning methods, technique, 0.5

relationships
Deep learning methods, USED_FOR, Graph classification tasks
Graph sizes, PROPERTY_OF, Graphs
Graph properties, PROPERTY_OF, Graphs
Average node degree, PROPERTY_OF, Graphs
Diam