# Quora Questions with Neo4j

Here I'll be using the Neo4j apis to manipulate some text data.

In [1]:
# The usual imports
import os, re, matplotlib, seaborn
import functools as func
import itertools as it
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
seaborn.set()

First, I have to import some packages and establish a connection to the database.

In [2]:
from py2neo import Graph, Node

In [3]:
%load_ext cypher

In [5]:
graph = Graph("C:\\Users\\caleb\\Documents\\Neo4j\\quora_questions.graphdb",
              password="yCHgrQeDcDOeIVXU")

# Delete all nodes from the graph
graph.delete_all()

In [21]:
def create_nodes(nodes):
    nodes = list(nodes)
    batch_cnt = 10
    batchs = {i : [node for j, node in enumerate(nodes) if j % batch_cnt == i] for i in range(batch_cnt)}
    
    for i in batchs:
        graph.create(func.reduce(lambda n1, n2: n1 | n2, batchs[i]))

We begin by creating one node for every token in the tokens set.

In [6]:
all_tokens = pd.read_csv("data/tokens.csv")
all_tokens = list(all_tokens.tokens)

In [22]:
nodes = map(lambda t: Node("Word", text=t), all_tokens)
%time create_nodes(nodes)

Wall time: 3min 46s


Next, we can create an index on the Word nodes after asserting that the text values are also all unique.

In [24]:
graph.run("CREATE INDEX on :Word(text)")

<py2neo.database.Cursor at 0x18b4bcf8cc0>

In [50]:
def create_relationships(tups):
    tups = list(tups)
    batch_cnt = 100
    batchs = {i : [tup for j, tup in enumerate(tups) if j % batch_cnt == i] for i in range(batch_cnt)}    
    for key in batchs:
        progress = "\rProcessing batch %d out of %d" % (key + 1, batch_cnt) 
        batch = batchs[key]
        queries = []
        nodes = 1; edges = 1
        for tup in batch:
            val = tup[0]
            token1 = all_tokens[tup[1]]
            token2 = all_tokens[tup[2]]
            print(progress + " (%s, %s)" % (token1, token2), end="")
            query = [
            "MATCH (w%d:Word),(w%d:Word)"                  % (nodes, nodes + 1),
            'WHERE w%d.text = "%s" AND w%d.text = "%s"'    % (nodes, token1, nodes + 1, token2),
            "CREATE (w%d)-[o%d:OCCURS_WITH]->(w%d)"        % (nodes, edges, nodes),
            "SET o%d.weight = %d"                          % (edges, val),
            "WITH w%d, w%d"                                % (nodes, nodes + 1),
            "RETURN w%d"                                   % (nodes),
            ] 
            nodes += 2
            edges += 1
            query = '\n'.join(query)
            graph.run(query)

In [64]:
# Delete all relationships
query = """MATCH (n)
           DELETE n"""
graph.run(query)

<py2neo.database.Cursor at 0x202782cab70>

In [13]:
rels = pd.read_csv("data/relationships.csv")
tups = zip(rels.freqs, rels.word1, rels.word2)

In [56]:
create_relationships(tups)

Processing batch 1 out of 100 (lift, do)

KeyboardInterrupt: 

In [57]:
len(rels)

1741966

In [58]:
np.sum(rels.freqs)

6132054

Now I just need to check that I made all of the relationships correctly.

In [63]:
%%cypher http://neo4j:yCHgrQeDcDOeIVXU@localhost:7474/db/data
MATCH (n)
RETURN DISTINCT labels(n)

Code [200]: OK. Request fulfilled, document follows.

Neo.ClientError.Request.InvalidFormat:
Unable to deserialize request: Non-standard token 'NaN': enable JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS to allow
 at [Source: HttpInputOverHTTP@70885ca4; line: 1, column: 623099]



In [42]:
%%cypher http://neo4j:yCHgrQeDcDOeIVXU@localhost:7474/db/data
MATCH (n)-[r]-()
RETURN DISTINCT type(r)

Code [200]: OK. Request fulfilled, document follows.

Neo.ClientError.Request.InvalidFormat:
Unable to deserialize request: Non-standard token 'NaN': enable JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS to allow
 at [Source: HttpInputOverHTTP@378d8c96; line: 1, column: 743763]



In [43]:
%%cypher http://neo4j:yCHgrQeDcDOeIVXU@localhost:7474/db/data
MATCH (n)-[r]-()
RETURN DISTINCT n.text as text, count(r) as freq
ORDER BY freq DESC LIMIT 10

Code [200]: OK. Request fulfilled, document follows.

Neo.ClientError.Request.InvalidFormat:
Unable to deserialize request: Non-standard token 'NaN': enable JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS to allow
 at [Source: HttpInputOverHTTP@378d8c96; line: 1, column: 744137]



Wait - does this mean that it only made 3018 relationships?

In [44]:
%%cypher http://neo4j:yCHgrQeDcDOeIVXU@localhost:7474/db/data
MATCH (n)-[r]-()
RETURN sum(r.weight)

Code [200]: OK. Request fulfilled, document follows.

Neo.ClientError.Request.InvalidFormat:
Unable to deserialize request: Non-standard token 'NaN': enable JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS to allow
 at [Source: HttpInputOverHTTP@378d8c96; line: 1, column: 744337]



ok - so it made 150516 weights for some reason. This is not everything.

In [64]:
%%cypher http://neo4j:yCHgrQeDcDOeIVXU@localhost:7474/db/data 
start n=node(*)
match n-[r]-()
return n, count(r) as rel_count
order by rel_count desc

Code [200]: OK. Request fulfilled, document follows.

Neo.ClientError.Request.InvalidFormat:
Unable to deserialize request: Non-standard token 'NaN': enable JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS to allow
 at [Source: HttpInputOverHTTP@37cdf59a; line: 1, column: 599118]



## Load CSV

Here is a query that I wrote that worked once. I'm just keeping it here in case I want to write something similar later.

In [None]:
%%cypher http://neo4j:yCHgrQeDcDOeIVXU@localhost:7474/db/data
LOAD CSV WITH HEADERS FROM 'file:///data/tokens.csv' AS line
WITH line.token as token
MERGE (x:Token {text:token})

In [18]:
min([len(str(t)) for t in all_tokens])

1

In [24]:
rels.word2.apply(lambda x: len(str(x))).min()

1

In [None]:
# this would work if there were no null values

USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM 
'file:///data/relationships.csv' AS line
WITH line.freqs as freq, line.word1 as word1, line.word2 as word2
MERGE (w1:Token {text: word1})
MERGE (w2:Token {text: word2})
CREATE (w1)-[r:OCCURS_WITH]->(w2)
SET r.weight = freq

In [None]:
MATCH (w1)-[r1:OCCURS_WITH]->(w2)
CREATE (w2)-[r2:OCCURS_WITH]->(w1)
SET r2.weight = r1.weight