In [1]:
!pip install neo4j spacy
!python -m spacy download en_core_web_lg

Collecting neo4j
  Using cached neo4j-4.4.5.tar.gz (95 kB)
Collecting spacy
  Downloading spacy-3.3.1-cp38-cp38-win_amd64.whl (12.1 MB)
Collecting spacy-legacy<3.1.0,>=3.0.9
  Using cached spacy_legacy-3.0.9-py2.py3-none-any.whl (20 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.7-cp38-cp38-win_amd64.whl (18 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Using cached spacy_loggers-1.0.2-py3-none-any.whl (7.2 kB)
Collecting blis<0.8.0,>=0.4.0
  Downloading blis-0.7.8-cp38-cp38-win_amd64.whl (6.6 MB)
Collecting typer<0.5.0,>=0.3.0
  Using cached typer-0.4.2-py3-none-any.whl (27 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp38-cp38-win_amd64.whl (2.0 MB)
Collecting wasabi<1.1.0,>=0.9.1
  Using cached wasabi-0.9.1-py3-none-any.whl (26 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.6-cp38-cp38-win_amd64.whl (113 kB)
Collecting pathy>=0.3.5
  Using cached pathy-0.6.2-py3-none-any.whl (42 kB)
Collecting catalogue<2.1.0,

Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.3.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


# Data preprocessing

In [None]:
# https://www.gutenberg.org/ebooks/95 Prisoner of Zelda

# Fetch the data
target_url = 'https://www.gutenberg.org/files/95/95-0.txt'
import urllib.request
data = urllib.request.urlopen(target_url)
raw_data = data.read().decode('utf8').strip()

# Preprocess text into chapters 
import re
chapters = re.sub('[^A-z0-9 -]', ' ', raw_data).split('CHAPTER')[1:]
chapters[-1] = chapters[-1].split('End of the Project Gutenberg EBook')[0]

# Import into Neo4j

In [3]:
# import spacy and load an NLP model
import spacy
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser"])

In [4]:
# Import Neo4j and define cypher queries
import neo4j
host = 'bolt://localhost:7687'
user = 'neo4j'
password = 'yousef'

driver = neo4j.GraphDatabase.driver(host, auth=(user, password))

save_query ="""
MERGE (p1:Person{name:$name1})
MERGE (p2:Person{name:$name2})
MERGE (p1)-[r:RELATED]-(p2)
ON CREATE SET r.score = 1
ON MATCH SET r.score = r.score + 1"""

constraint_query="CREATE CONSTRAINT ON (p:Person) ASSERT p.name IS UNIQUE;"

In [5]:
# Run the analysis of the first chapter
c = chapters[0]
# Get involved
doc=nlp(c)

with driver.session() as session:
    #define constraint
    session.run(constraint_query)
    # Extract Person labels
    involved = list(set([ent.text for ent in doc.ents if ent.label_=='PERSON']))
    # Preprocess text
    decode = dict()
    for i,x in enumerate(involved):
        # Get mapping
        decode['$${}$$'.format(i)] = x
        # Preprocess text
        c = c.replace(x,' $${}$$ '.format(i))
        
    # Split chapter into words
    ws = c.split()
    l = len(ws)
    # Iterate through words
    for wi,w in enumerate(ws):
        # Skip if the word is not a person
        if not w[:2] == '$$':
            continue
        # Check next x words for any involved person
        x = 14
        for i in range(wi+1,wi+x):
            # Avoid list index error
            if i >= l:
                break
            # Skip if the word is not a person
            if not ws[i][:2] == '$$':
                continue
            # Store to Neo4j
            params = {'name1':decode[ws[wi]],'name2':decode[ws[i]]}
            session.run(save_query, params)
            print(decode[ws[wi]],decode[ws[i]])



Rudolf    Rose
Rassendyll Robert
Robert Robert  
Rudolf Rose
Rudolf Robert  
Robert   Rudolf   
Lady Burlesdon George II
Burlesdon Rassendyll
Rassendyll Garter
Garter Rudolf
Jacob Jacob       
Jacob        Rudolf
Bob Rose


# Graph Analysis

In [6]:
# Project the graph
graph_projection = """
CALL gds.graph.project('ch1', 'Person', {RELATED:{orientation:'UNDIRECTED'}})
"""

# Run pagerank and louvain algorithm
pagerank ="""
CALL gds.pageRank.write('ch1',{writeProperty:'pagerank'})
"""
louvain = """
CALL gds.louvain.write('ch1',{writeProperty:'louvain'})
"""

drop_graph = """
CALL gds.graph.drop('ch1')
"""

with driver.session() as session:
    session.run(graph_projection)
    session.run(pagerank)
    session.run(louvain)
    session.run(drop_graph)

# Results

In [7]:
# Import libraries
import pandas as pd

def read_query(query, params={}):
    with driver.session() as session:
        result = session.run(query, params)
        return pd.DataFrame([r.values() for r in result], columns=result.keys())

In [8]:
# Evaluate pagerank
read_query("""
MATCH (c:Person)
RETURN c.name AS character, c.pagerank AS score
ORDER BY score DESC LIMIT 5
""")

Unnamed: 0,character,score
0,Rudolf,1.623691
1,Rassendyll,1.31845
2,Rose,1.284447
3,Robert,1.216617
4,Jacob,0.962938


In [9]:
# Evaluate louvain
read_query("""
MATCH (c:Person)
RETURN c.louvain AS community, collect(c.name) AS members
ORDER BY size(members) DESC
""")

Unnamed: 0,community,members
0,4,"[Rassendyll, Robert, Robert , Burlesdon]"
1,9,"[Rudolf, Garter, Jacob, Jacob ]"
2,1,"[Rudolf , Rose, Bob]"
3,7,"[Lady Burlesdon, George II]"
