In [45]:
from py2neo import Graph
import pandas as pd

pd.set_option('display.max_colwidth', -1)

In [72]:
graph = Graph("bolt://localhost", auth=("neo4j", "neo"))

For this section we need to install the [ml-models](https://github.com/neo4j-graph-analytics/ml-models) procedures library. You can find installation instructions on the [releases pages](https://github.com/neo4j-graph-analytics/ml-models/releases/tag/1.0.1). 

Once you've done that, run the following code to check that the library is installed:

In [47]:
query = """
CALL dbms.procedures() 
YIELD name WHERE name STARTS WITH 'regression' 
RETURN *
"""

graph.run(query).to_data_frame()

Unnamed: 0,name
0,regression.linear.add
1,regression.linear.addM
2,regression.linear.clear
3,regression.linear.copy
4,regression.linear.create
5,regression.linear.delete
6,regression.linear.info
7,regression.linear.load
8,regression.linear.remove
9,regression.linear.removeM


Let's split training and test data:

In [48]:
split_data_train_query = """
MATCH (list:Listing)-[:IN_NEIGHBORHOOD]->(:Neighborhood) 
WHERE exists(list.bedrooms) AND exists(list.bathrooms)
AND exists(list.price) 
AND (:Review)-[:REVIEWS]->(list) 
WITH regression.linear.split(collect(id(list)), 0.75) AS trainingIDs
MATCH (list:Listing) WHERE id(list) in trainingIDs 
SET list:Train
"""

split_data_test_query = """
MATCH (list:Listing)-[n:IN_NEIGHBORHOOD]->(:Neighborhood)
WHERE exists(list.bedrooms) AND exists(list.bathrooms)
AND exists(list.price) 
AND (:Review)-[:REVIEWS]->(list) 
AND NOT list:Train 
SET list:Test
"""

graph.run(split_data_train_query).summary().counters
graph.run(split_data_test_query).summary().counters

{}

In [50]:
correlation_query = """
MATCH (list) 
WHERE list:Test OR list:Train
WITH collect(size((list)<-[:REVIEWS]-()) * 1.0) AS reviews,
     collect(list.bedrooms + list.bathrooms) as rooms
RETURN regression.linear.correlation(reviews, rooms)
"""

graph.run(correlation_query).to_data_frame()

Unnamed: 0,"regression.linear.correlation(reviews, rooms)"
0,0.026142


In [51]:
model_name = "rental-prices-4"

In [52]:
init_query = """
CALL regression.linear.create($modelName, 'Multiple', true, 2)
"""

graph.run(init_query, {"modelName": model_name}).summary().counters

{}

In [53]:
add_training_data_query = """
MATCH (list:Train)
WHERE NOT list:Seen 
CALL regression.linear.add($modelName, 
  [list.bedrooms + list.bathrooms, size((list)<-[:REVIEWS]-()) * 1.0], 
  list.price
) 
SET list:Seen 
RETURN count(list)
"""

graph.run(add_training_data_query, {"modelName": model_name}).summary().counters

{'labels_added': 36790}

In [54]:
train_model_query = """
CALL regression.linear.train($modelName)
"""

graph.run(train_model_query, {"modelName": model_name})

<py2neo.database.Cursor at 0x1156d6a90>

In [55]:
add_test_data_query = """
MATCH (list:Test) 
WHERE NOT list:Seen
CALL regression.linear.add($modelName, 
  [list.bedrooms + list.bathrooms, size((list)<-[:REVIEWS]-()) * 1.0],  
  list.price, 
  'test'
) 
SET list:Seen 
RETURN count(list)
"""

graph.run(add_test_data_query, {"modelName": model_name})

In [56]:
test_model_query = """
CALL regression.linear.test($modelName)
"""

graph.run(test_model_query, {"modelName": model_name}).to_data_frame()

Unnamed: 0,framework,hasConstant,model,nTest,nTrain,numVars,state,testInfo,trainInfo
0,Multiple,True,rental-prices-4,0,36790,2,ready,"{'adjRSquared': nan, 'RSquared': nan, 'SSE': 0.0, 'SST': 0.0, 'MSE': -0.0}","{'RSquared': 0.22357460363988557, 'SSR': 88897254.99853414, 'SSE': 308720602.9837708, 'SST': 397617857.98230493, 'adjRSquared': 0.22353239169564654, 'parameters std error': [1.2824171688063868, 0.5109515890197222, 0.011533589063208016], 'parameters': [16.223217245130986, 52.5881409596996, -0.034282521063706], 'MSE': 8392.111424790572}"


In [57]:
info_query = """
CALL regression.linear.info($modelName) 
"""

graph.run(info_query, {"modelName": model_name}).to_data_frame()

Unnamed: 0,framework,hasConstant,model,nTest,nTrain,numVars,state,testInfo,trainInfo
0,Multiple,True,rental-prices-4,0,36790,2,ready,"{'adjRSquared': nan, 'RSquared': nan, 'SSE': 0.0, 'SST': 0.0, 'MSE': -0.0}","{'RSquared': 0.22357460363988557, 'SSR': 88897254.99853414, 'SSE': 308720602.9837708, 'SST': 397617857.98230493, 'adjRSquared': 0.22353239169564654, 'parameters std error': [1.2824171688063868, 0.5109515890197222, 0.011533589063208016], 'parameters': [16.223217245130986, 52.5881409596996, -0.034282521063706], 'MSE': 8392.111424790572}"


Now let's add the neighborhood to the list of independent variables that we feed to our regression model. Neighborhoods are categorical variables so we'll need to create a [one hot encoding](https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f).

We can use the `algo.ml.oneHotEncoding` function to help us out. 

In [73]:
model_name = "rental-prices-6"

In [85]:
nh_count_query = """
MATCH (:Neighborhood)
RETURN count(*) AS count
"""

nh_count =  graph.run(nh_count_query).to_table()[0][0]


init_query = """
CALL regression.linear.create($modelName, 'Multiple', true, $numberOfVariables)
"""

graph.run(init_query, {"modelName": model_name, "numberOfVariables": 2 + nh_count}).summary().counters

{}

Before we create our new model let's remove the `Seen` label from our nodes so that we can process them again:

In [86]:
clear_seen_query = """
MATCH (s:Seen)
REMOVE s:Seen
"""

graph.run(clear_seen_query)

<py2neo.database.Cursor at 0x1158a0da0>

In [87]:
add_training_data_query = """
MATCH (nh:Neighborhood)
WITH collect(nh) AS neighborhoods
MATCH (list:Train)-[:IN_NEIGHBORHOOD]->(nh)
WHERE NOT list:Seen 
CALL regression.linear.add($modelName, 
  apoc.coll.flatten([
    [list.bedrooms + list.bathrooms, size((list)<-[:REVIEWS]-()) * 1.0],
    algo.ml.oneHotEncoding(neighborhoods, [nh])
  ]), 
  list.price
) 
SET list:Seen 
RETURN count(list)
"""

graph.run(add_training_data_query, {"modelName": model_name}).summary().counters

{'labels_added': 36790}

In [88]:
train_model_query = """
CALL regression.linear.train($modelName)
"""

graph.run(train_model_query, {"modelName": model_name})

<py2neo.database.Cursor at 0x1158a0588>

In [90]:
add_test_data_query = """
MATCH (nh:Neighborhood)
WITH collect(nh) AS neighborhoods
MATCH (list:Test)-[:IN_NEIGHBORHOOD]->(nh)
WHERE NOT list:Seen 
CALL regression.linear.add($modelName, 
  apoc.coll.flatten([
    [list.bedrooms + list.bathrooms, size((list)<-[:REVIEWS]-()) * 1.0],
    algo.ml.oneHotEncoding(neighborhoods, [nh])
  ]), 
  list.price
) 
SET list:Seen 
RETURN count(list)
"""

graph.run(add_test_data_query, {"modelName": model_name})

<py2neo.database.Cursor at 0x1156ff5f8>

In [91]:
test_model_query = """
CALL regression.linear.test($modelName)
"""

graph.run(test_model_query, {"modelName": model_name}).to_data_frame()

Unnamed: 0,framework,hasConstant,model,nTest,nTrain,numVars,state,testInfo,trainInfo
0,Multiple,True,rental-prices-6,0,39258,226,ready,"{'adjRSquared': nan, 'RSquared': nan, 'SSE': 0.0, 'SST': 0.0, 'MSE': -0.0}","{'RSquared': 0.43894050872094714, 'SSR': 174530584.85928622, 'SSE': 223087273.1230187, 'SST': 397617857.98230493, 'adjRSquared': 0.4354725371368575, 'parameters std error': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, ...], 'parameters': [1933160947024.3452, 55.174439596012235, -0.010862166411243379, -1933160947035.2078, -1933160947067.2014, -1933160946915.2056, -1933160947014.6123, -1933160947013.3208, -1933160946972.9734, -1933160946956.7898, -1933160946952.912, -1933160946977.8726, -1933160946892.5352, -1933160946992.573, -1933160947026.003, -1933160946920.7776, -1933160947033.0295, -1933160947004.9907, -1933160946929.8118, -1933160947052.7478, -1933160947006.2793, -1933160947058.3604, -1933160947046.312, -1933160947053.9624, -1933160947022.6423, -1933160947071.773, -1933160947064.875, -1933160946986.4749, -1933160947052.168, -1933160947037.1812, -1933160947021.1914, -1933160947067.864, -1933160946952.115, -1933160946914.5488, -1933160947015.292, -1933160947059.8057, -1933160946975.7998, -1933160947058.611, -1933160946961.2788, -1933160946937.1016, -1933160947001.3633, -1933160947058.1406, -1933160946996.9077, -1933160946987.5386, -1933160947058.0098, -1933160946995.38, -1933160947069.0007, -1933160947061.7097, -1933160947074.5142, -1933160947055.3613, -1933160946974.1426, -1933160947080.6074, -1933160947073.5188, -1933160946960.284, -1933160947082.1733, -1933160947036.6235, -1933160947054.6272, -1933160947037.928, -1933160946887.5054, -1933160947049.8994, -1933160947051.1592, -1933160946981.4597, -1933160947067.5964, -1933160947081.8743, -1933160946957.997, -1933160947046.283, -1933160947054.603, -1933160947053.3105, -1933160946941.567, -1933160947059.0737, -1933160946881.9106, -1933160947081.6172, -1933160947029.0896, -1933160947040.0947, -1933160946943.8286, -1933160947078.623, -1933160947057.4448, -1933160947066.9792, -1933160947079.9795, -1933160946920.9644, -1933160947056.9712, -1933160947068.5488, -1933160947047.2517, -1933160947042.5247, -1933160947065.5505, -1933160947043.4636, -1933160947073.4536, -1933160947065.827, -1933160947076.198, -1933160947061.3342, -1933160947074.2551, -1933160947001.9417, -1933160947073.8357, -1933160947074.8308, -1933160947059.3008, -1933160947081.0752, -1933160947061.145, -1933160947097.5498, -1933160947050.8086, -1933160947058.4495, ...], 'MSE': 6101.448817739756}"
