In [70]:
from py2neo import Graph
import pandas as pd

pd.set_option('display.max_colwidth', -1)

In [52]:
graph = Graph("bolt://localhost", auth=("neo4j", "neo"))

Check the package is installed:

In [53]:
query = """
CALL dbms.procedures() 
YIELD name WHERE name CONTAINS 'regr' 
RETURN *
"""

graph.run(query).to_data_frame()

Unnamed: 0,name
0,regression.linear.add
1,regression.linear.addM
2,regression.linear.clear
3,regression.linear.copy
4,regression.linear.create
5,regression.linear.delete
6,regression.linear.info
7,regression.linear.load
8,regression.linear.remove
9,regression.linear.removeM


Let's split training and test data:

In [54]:
split_data_train_query = """
MATCH (list:Listing)-[:IN_NEIGHBORHOOD]->(:Neighborhood {id:$neighborhoodId}) 
WHERE exists(list.bedrooms) AND exists(list.bathrooms)
AND exists(list.price) 
AND (:Review)-[:REVIEWS]->(list) 
WITH regression.linear.split(collect(id(list)), 0.75) AS trainingIDs
MATCH (list:Listing) WHERE id(list) in trainingIDs 
SET list:Train
"""

split_data_test_query = """
MATCH (list:Listing)-[n:IN_NEIGHBORHOOD]->(hood:Neighborhood {id:$neighborhoodId})
WHERE exists(list.bedrooms) AND exists(list.bathrooms)
AND exists(list.price) 
AND (:Review)-[:REVIEWS]->(list) 
AND NOT list:Train 
SET list:Test
"""

graph.run(split_data_train_query, {"neighborhoodId": "78704"}).summary().counters
graph.run(split_data_test_query, {"neighborhoodId": "78704"}).summary().counters

{}

In [55]:
correlation_query = """
MATCH (list) 
WHERE list:Test OR list:Train
WITH collect(size((list)<-[:REVIEWS]-()) * 1.0) AS reviews,
     collect(list.bedrooms + list.bathrooms) as rooms
RETURN regression.linear.correlation(reviews, rooms)
"""

graph.run(correlation_query).to_data_frame()

Unnamed: 0,"regression.linear.correlation(reviews, rooms)"
0,-0.124958


In [60]:
init_query = """
CALL regression.linear.create('mlr rental prices', 'Multiple', true, 2)
"""

graph.run(init_query).summary().counters

ClientError: Procedure Call Failed: Failed to invoke procedure `regression.linear.create`: Caused by: java.lang.IllegalArgumentException: Model mlr rental prices already exists, please remove it first

In [61]:
add_training_data_query = """
MATCH (list:Train)
WHERE NOT list:Seen 
CALL regression.linear.add('mlr rental prices', 
  [list.bedrooms + list.bathrooms, list.num_reviews], 
  list.price
) 
SET list:Seen RETURN count(list)
"""

graph.run(add_training_data_query).summary().counters

{'labels_added': 208}

In [62]:
train_model_query = """
CALL regression.linear.train('mlr rental prices')
"""

graph.run(train_model_query)

<py2neo.database.Cursor at 0x12057c438>

In [72]:
add_test_data_query = """
MATCH (list:Test) 
WHERE NOT list:Seen
CALL regression.linear.add('mlr rental prices', 
  [list.bedrooms + list.bathrooms, list.num_reviews],  
  list.price, 
  'test'
) 
SET list:Seen 
RETURN count(list)
"""

In [73]:
test_model_query = """
CALL regression.linear.test('mlr rental prices')
"""

graph.run(test_model_query).to_data_frame()

Unnamed: 0,framework,hasConstant,model,nTest,nTrain,numVars,state,testInfo,trainInfo
0,Multiple,True,mlr rental prices,0,1030,2,ready,"{'adjRSquared': nan, 'RSquared': nan, 'SSE': 0.0, 'SST': 0.0, 'MSE': -0.0}","{'RSquared': 0.5258024006485829, 'SSR': 18093522.237261645, 'SSE': 16317736.088951947, 'SST': 34411258.32621359, 'adjRSquared': 0.5248789389166424, 'parameters std error': [9.300995353372807, 2.606208579807965, 0.1307049259955841], 'parameters': [-11.466731296692997, 84.31764247038417, -0.7478547779553181], 'MSE': 15888.740106087582}"
