In [21]:
%reset

import csv
import os
import re

import pandas as pd
import py2neo as neo
from lxml import etree #for XML manipulation
from py2neo import Graph, Node

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [22]:
#path to xml file as its saved outside my git repo.
#The location is machine-specific, so it can be overridden with the DBLP_XML_PATH
#environment variable; the default is the original relative location.
path = os.environ.get('DBLP_XML_PATH', '../../../DataFiles/dblp.xml')

In [23]:
#this function is a generator, so calling it does not run the body; records are
#produced one at a time while iterating. This is so that we don't have to store
#the data in memory, we can simply work with it and discard it
def iterate_xml(path):
    """Lazily yield every direct child element of the XML document root.

    The file is parsed incrementally with DTD validation enabled. Each yielded
    element is complete (its 'end' event has been seen), so its children and
    text are available.

    Record boundaries are tracked by nesting depth rather than by tag name, so
    a descendant that happens to share the record's tag cannot end the record
    early.

    Parameters
    ----------
    path : filename or file object accepted by ``etree.iterparse``.

    Yields
    ------
    element : the fully parsed child element of the root.

    Notes
    -----
    The root is cleared as soon as the consumer asks for the next element, so
    an element must be fully processed before advancing the generator; do not
    keep references to previously yielded elements.
    """
    doc = etree.iterparse(path, events=('start', 'end'), dtd_validation=True)
    # The first event is the 'start' of the root element; keep a handle on it
    # so that already-processed children can be released.
    _, root = next(doc)
    depth = 0  # nesting level below the root: 0 means "between records"
    for event, element in doc:
        if event == 'start':
            depth += 1
            continue
        # event == 'end'
        depth -= 1
        if depth == 0:
            yield element
            # drop everything parsed so far to keep memory usage flat
            root.clear()
        # the root's own 'end' event takes depth to -1 and is never yielded

Connect to the Neo4j server

In [24]:
#note, server must be running on localhost 
#sudo service neo4j start
#The password is read from the NEO4J_PASSWORD environment variable so that no
#credential is stored in the notebook (or in its git history).
neo4j_password = os.environ.get('NEO4J_PASSWORD')
if neo4j_password is None:
    raise RuntimeError('Set the NEO4J_PASSWORD environment variable before connecting to Neo4j')
graph = Graph('bolt://localhost:7687', auth=('neo4j', neo4j_password))

In [None]:
#ONLY NEED TO RUN ONCE
#Set Uniqueness Constraints on Nodes: one (label, property) pair per node type
for node_label, unique_property in (('Article', 'key'), ('Author', 'name')):
    graph.schema.create_uniqueness_constraint(node_label, unique_property)

In [25]:
#create headers on csv files
#make sure the output folder exists first (a fresh checkout may not have it)
os.makedirs('./data', exist_ok=True)

#one header line per output file; mode 'w' also truncates data from a previous run
csv_headers = {
    './data/articles.csv': 'key,title\n',
    './data/authors.csv': 'name,university\n',
    './data/published.csv': 'name,key\n',
}
for csv_path, header in csv_headers.items():
    with open(csv_path,'w') as file:
        file.write(header)

In [26]:
#reset generator
generator = iterate_xml(path)

#delete all nodes, used when testing out the program to start fresh each time
graph.delete_all()

#only used for the progress estimate below
#NOTE(review): presumably the line count of dblp.xml and ~9 lines per record - confirm
n_lines = 60511260
approx_n_events = n_lines/9


def _inner_xml(element, tag):
    """Return the serialized content between <tag> and </tag> of `element`.

    Returns None when `element` is None or when the serialized element does not
    match the plain ``<tag>...</tag>`` pattern on a single line (for example
    empty content, content spanning several lines, or attributes on the
    opening tag); callers treat None as "skip".
    """
    if element is None:
        return None
    xml = etree.tostring(element).decode('utf-8')
    match = re.search('<'+tag+'>(.+?)</'+tag+'>',xml)
    if match is None:
        return None
    return match.group(1)


i = 0
#Open each output file once (instead of once per record/author) and let the
#csv module quote fields: titles and names may contain commas or quotes, which
#would otherwise shift columns when the files are loaded.
with open('./data/articles.csv','a',newline='',encoding='utf-8') as articles_file, \
     open('./data/authors.csv','a',newline='',encoding='utf-8') as authors_file, \
     open('./data/published.csv','a',newline='',encoding='utf-8') as published_file:
    articles_csv = csv.writer(articles_file, lineterminator='\n')
    authors_csv = csv.writer(authors_file, lineterminator='\n')
    published_csv = csv.writer(published_file, lineterminator='\n')

    for event in generator:
        key = str(event.attrib['key'])

        #records without a usable title are skipped entirely
        title = _inner_xml(event.find('title'), 'title')
        if title is None:
            continue

        #only theses carry a school; everything else keeps the 'None' placeholder
        university = 'None'
        if event.tag in ('phdthesis', 'mastersthesis'):
            school = _inner_xml(event.find('school'), 'school')
            if school is not None:
                university = school

        #save article node data
        articles_csv.writerow([key, title])

        for author in event.findall('author'):
            name = _inner_xml(author, 'author')
            if name is None:
                continue

            #save author node data
            authors_csv.writerow([name, university])

            #save relationship data
            published_csv.writerow([name, key])

        i = i+1
        if i%100000==0:
            percent = (i/approx_n_events)*100
            print('Extracting: '+str(percent)+'%')

Extracting: 1.487326490970441%
Extracting: 2.974652981940882%
Extracting: 4.461979472911323%
Extracting: 5.949305963881764%
Extracting: 7.436632454852204%
Extracting: 8.923958945822646%
Extracting: 10.411285436793086%
Extracting: 11.898611927763527%
Extracting: 13.38593841873397%
Extracting: 14.873264909704409%
Extracting: 16.36059140067485%
Extracting: 17.84791789164529%
Extracting: 19.33524438261573%
Extracting: 20.822570873586173%
Extracting: 22.309897364556612%
Extracting: 23.797223855527054%
Extracting: 25.284550346497497%
Extracting: 26.77187683746794%
Extracting: 28.259203328438375%
Extracting: 29.746529819408817%
Extracting: 31.23385631037926%
Extracting: 32.7211828013497%
Extracting: 34.208509292320144%
Extracting: 35.69583578329058%
Extracting: 37.18316227426102%
Extracting: 38.67048876523146%
Extracting: 40.15781525620191%
Extracting: 41.645141747172346%
Extracting: 43.132468238142785%
Extracting: 44.619794729113224%
Extracting: 46.10712122008367%
Extracting: 47.594447711054

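At this point, copy the generated CSV files (`./data/articles.csv`, `./data/authors.csv`, `./data/published.csv`) to the Neo4j import folder, so that the `LOAD CSV ... FROM "file:///..."` statements below can find them.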

In [31]:
#import articles
#wipe the graph first, then create one Article node per row of articles.csv
load_articles_query = 'USING PERIODIC COMMIT 500 LOAD CSV WITH HEADERS FROM "file:///articles.csv" AS csvLine CREATE (a:Article {key: csvLine.key, title: csvLine.title})'
graph.delete_all()
graph.run(load_articles_query)

<py2neo.graph.Cursor at 0x7fb65ca92940>

In [None]:
#import authors; MERGE is used so a name that appears on several rows yields a single Author node
merge_authors_query = 'USING PERIODIC COMMIT 500 LOAD CSV WITH HEADERS FROM "file:///authors.csv" AS csvLine MERGE (a:Author {name: csvLine.name})'
graph.run(merge_authors_query)

In [33]:
#import relationships: link each existing Author to each existing Article it published
link_published_query = 'USING PERIODIC COMMIT 500 LOAD CSV WITH HEADERS FROM "file:///published.csv" AS csvLine MATCH (author:Author {name: csvLine.name}),(article:Article {key: csvLine.key}) CREATE (author)-[:PUBLISHED]->(article)'
graph.run(link_published_query)

<py2neo.graph.Cursor at 0x7fb65ca4b780>

In [35]:
#forgot to add universities so had to add them like this
#authors.csv has one row per (author, record) and most rows carry the 'None'
#placeholder, so a plain SET would let a later 'None' row overwrite a real
#university written by an earlier thesis row. The CASE only accepts 'None'
#when the author has no university yet.
set_university_query = (
    'USING PERIODIC COMMIT 500 '
    'LOAD CSV WITH HEADERS FROM "file:///authors.csv" AS csvLine '
    'MATCH (a:Author {name: csvLine.name}) '
    'SET a.university = CASE '
    'WHEN csvLine.university <> "None" OR a.university IS NULL THEN csvLine.university '
    'ELSE a.university END'
)
graph.run(set_university_query)

<py2neo.graph.Cursor at 0x7fb65ca4b630>