# Predictions

In this notebook we'll learn how to predict the prices of short-term rental listings using a linear regression model.

In [None]:
# Select the interactive "notebook" matplotlib backend for figures in this notebook.
%matplotlib notebook

from py2neo import Graph
import pandas as pd

import matplotlib 
import matplotlib.pyplot as plt
# Apply the 'fivethirtyeight' style sheet to every figure created below.
plt.style.use('fivethirtyeight')

# Never truncate long cell contents when rendering DataFrames. ``None`` is the
# supported "no limit" value; negative widths such as -1 were deprecated in
# pandas 1.0 and are rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [None]:
import os

# Connection settings are read from the environment so that real credentials
# never have to be committed with the notebook. The fallbacks are the values
# that used to be hard-coded here, so an unconfigured local setup behaves
# exactly as before.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "neo")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

For this section we need to install the [ml-models](https://github.com/neo4j-graph-analytics/ml-models) procedures library. You can find installation instructions on the [releases page](https://github.com/neo4j-graph-analytics/ml-models/releases/tag/1.0.1). 

Once you've done that, run the following code to check that the library is installed:

In [None]:
# List every installed procedure whose name starts with 'regression'; this is
# the check that the ml-models library is available.
query = """
CALL dbms.procedures() 
YIELD name WHERE name STARTS WITH 'regression' 
RETURN *
"""

installed_procedures = graph.run(query)
installed_procedures.to_data_frame()

We're going to predict the prices for listings within one New York neighborhood. We want to pick a neighborhood that has a good number of listings so let's first write a query to find a good neighborhood to use:

In [None]:
# Rank neighborhoods by how many IN_NEIGHBORHOOD relationships point at them
# and keep the ten with the most listings.
query = """
MATCH (nh:Neighborhood)<-[:IN_NEIGHBORHOOD]-()
RETURN nh.name AS nh, count(*) AS listings
ORDER BY listings DESC
LIMIT 10
"""

top_neighborhoods = graph.run(query)
top_neighborhoods.to_data_frame()

Before we do any predictions let's get an overview of the prices in Williamsburg. We can look at some summary statistics and then create a histogram to help us do this:

In [None]:
# Fetch the price of every listing in the chosen neighborhood and show
# summary statistics for the price column.
place = "Williamsburg"

query = """
MATCH (nh:Neighborhood {name: $name})<-[:IN_NEIGHBORHOOD]-(listing)
RETURN listing.price AS price
"""

price_rows = graph.run(query, {"name": place})
df = price_rows.to_data_frame()
df["price"].describe()

In [None]:
# Draw on a dedicated figure/axes pair instead of the implicit pyplot state so
# that re-running the cell starts a fresh plot, and label both axes so the
# chart can be read on its own.
fig, ax = plt.subplots()
ax.hist(df["price"].dropna(), bins=20, density=True, facecolor='g', alpha=0.75)
ax.set_title(f"Prices in {place}")
ax.set_xlabel("Price")
ax.set_ylabel("Density")
fig.tight_layout()

This looks like a classic long-tail distribution - the vast majority of listings are priced below $200 and then there are a few outliers at much higher price brackets.

Let's split training and test data:

In [None]:
# Start from a clean slate. Without this, re-running the cell samples a new
# training set on top of the labels left by the previous run, so listings end
# up tagged both :Train and :Test and the test set leaks into training.
# :Seen is cleared as well because it only tracks which Train/Test listings
# have already been fed to a model.
reset_split_query = """
MATCH (list:Listing)
WHERE list:Train OR list:Test
REMOVE list:Train, list:Test, list:Seen
"""

# Eligible listings have bedrooms, bathrooms and a price and at least one
# review. regression.linear.split is handed their ids with a 0.75 fraction and
# the ids it returns are labelled :Train.
split_data_train_query = """
MATCH (list:Listing)-[:IN_NEIGHBORHOOD]->(:Neighborhood {name: $name})
WHERE exists(list.bedrooms) AND exists(list.bathrooms)
AND exists(list.price)
AND (:Review)-[:REVIEWS]->(list)
WITH regression.linear.split(collect(id(list)), 0.75) AS trainingIDs
MATCH (list:Listing) WHERE id(list) in trainingIDs
SET list:Train
"""

# Every eligible listing that was not picked for training becomes test data.
split_data_test_query = """
MATCH (list:Listing)-[:IN_NEIGHBORHOOD]->(:Neighborhood {name: $name})
WHERE exists(list.bedrooms) AND exists(list.bathrooms)
AND exists(list.price)
AND (:Review)-[:REVIEWS]->(list)
AND NOT list:Train
SET list:Test
"""

# display() each result: a bare expression is only echoed when it is the last
# statement of the cell, so the training counters used to be silently dropped.
display(graph.run(reset_split_query).summary().counters)
display(graph.run(split_data_train_query, {"name": place}).summary().counters)
display(graph.run(split_data_test_query, {"name": place}).summary().counters)

In [None]:
# Ask the regression library for the correlation between the two numeric
# features (review count and bedrooms + bathrooms) across all listings that
# were assigned to the training or test set.
correlation_query = """
MATCH (list) 
WHERE list:Test OR list:Train
WITH collect(size((list)<-[:REVIEWS]-()) * 1.0) AS reviews,
     collect(list.bedrooms + list.bathrooms) as rooms
RETURN regression.linear.correlation(reviews, rooms)
"""

correlation_cursor = graph.run(correlation_query)
correlation_cursor.to_data_frame()

In [None]:
# Name of the regression model; passed to the queries below as $modelName.
model_name = "rental-prices-gc2"

In [None]:
# Create a 'Multiple' linear regression model with 2 independent variables.
# NOTE(review): the `true` flag presumably asks for an intercept term -
# confirm against the ml-models documentation.
init_query = """
CALL regression.linear.create($modelName, 'Multiple', true, 2)
"""

init_params = {"modelName": model_name}
graph.run(init_query, init_params).summary().counters

In [None]:
# Feed every :Train listing that is not yet :Seen to the model. The feature
# vector is [bedrooms + bathrooms, number of reviews] and the value passed
# alongside it is the listing price. Each listing is tagged :Seen afterwards
# so it is skipped if the cell runs again.
add_training_data_query = """
MATCH (list:Train)
WHERE NOT list:Seen 
CALL regression.linear.add($modelName, 
  [list.bedrooms + list.bathrooms, size((list)<-[:REVIEWS]-()) * 1.0], 
  list.price
) 
SET list:Seen 
RETURN count(list)
"""

training_params = {"modelName": model_name}
training_cursor = graph.run(add_training_data_query, training_params)
training_cursor.summary().counters

In [None]:
# Train the named model on the data points added so far and show what the
# procedure returns.
train_model_query = """
CALL regression.linear.train($modelName)
"""

train_cursor = graph.run(train_model_query, {"modelName": model_name})
train_cursor.to_data_frame()

In [None]:
# Same feature vector as for training, but taken from the :Test listings and
# passed with the extra 'test' argument. Listings are tagged :Seen once added.
add_test_data_query = """
MATCH (list:Test) 
WHERE NOT list:Seen
CALL regression.linear.add($modelName, 
  [list.bedrooms + list.bathrooms, size((list)<-[:REVIEWS]-()) * 1.0],  
  list.price, 
  'test'
) 
SET list:Seen 
RETURN count(list)
"""

test_data_params = {"modelName": model_name}
test_data_cursor = graph.run(add_test_data_query, test_data_params)
test_data_cursor.data()

In [None]:
# Run the model's test procedure and show what it returns.
test_model_query = """
CALL regression.linear.test($modelName)
"""

test_cursor = graph.run(test_model_query, {"modelName": model_name})
test_cursor.to_data_frame()

In [None]:
# Show the information the library reports for the named model.
info_query = """
CALL regression.linear.info($modelName) 
"""

info_cursor = graph.run(info_query, {"modelName": model_name})
info_cursor.to_data_frame()

Let's add some more features to our model. So far we've only added numerical properties - what if we want to add a categorical variable such as `propertyType`?

To work with these types of variables we'll need to create a [one hot encoding](https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f) of property types.

We can use the `algo.ml.oneHotEncoding` function to help us out. 

In [None]:
model_name = "rental-prices-propertyType"

# Number of one-hot columns = number of distinct, non-null property types.
# count(DISTINCT ...) skips nulls exactly like the collect(propertyType) that
# builds the category list for the one-hot encoding further down this cell.
# The previous "group, then count the groups" query counted listings without a
# propertyType as one extra category, which would make the declared number of
# variables one larger than the feature vectors actually supplied.
property_type_count_query = """
MATCH (l:Listing)
RETURN count(DISTINCT l.propertyType) AS count
"""

property_type_count = graph.run(property_type_count_query).to_table()[0][0]

init_query = """
CALL regression.linear.create($modelName, 'Multiple', true, $numberOfVariables)
"""

# Two numeric features (rooms, review count) plus one column per property type.
result = graph.run(init_query, {"modelName": model_name, "numberOfVariables": 2 + property_type_count})
display(result.summary().counters)

# Parameters shared by every model call in this cell.
model_params = {"modelName": model_name}

# Drop the :Seen markers left by the previous model so that this model is fed
# every training and test listing again.
clear_seen_query = """
MATCH (s:Seen)
REMOVE s:Seen
"""

seen_cleared = graph.run(clear_seen_query)
display(seen_cleared.summary().counters)

# Training rows: the two numeric features followed by the one-hot encoding of
# the listing's property type against the list of all property types.
add_training_data_query = """
match (l:Listing)
WITH l.propertyType AS propertyType, count(*) AS count
WITH collect(propertyType) AS propertyTypes

MATCH (list:Train)
WHERE NOT list:Seen 
CALL regression.linear.add($modelName, 
  apoc.coll.flatten([
    [list.bedrooms + list.bathrooms, size((list)<-[:REVIEWS]-()) * 1.0],
    algo.ml.oneHotEncoding(propertyTypes, [list.propertyType])
  ]), 
  list.price
) 
SET list:Seen 
RETURN count(list)
"""

training_added = graph.run(add_training_data_query, model_params)
display(training_added.summary().counters)

train_model_query = """
CALL regression.linear.train($modelName)
"""

trained = graph.run(train_model_query, model_params)
display(trained.to_data_frame())

# Test rows use the same feature layout and are passed with the "test" argument.
add_test_data_query = """
match (l:Listing)
WITH l.propertyType AS propertyType, count(*) AS count
WITH collect(propertyType) AS propertyTypes

MATCH (list:Test)
WHERE NOT list:Seen 
CALL regression.linear.add($modelName, 
  apoc.coll.flatten([
    [list.bedrooms + list.bathrooms, size((list)<-[:REVIEWS]-()) * 1.0],
    algo.ml.oneHotEncoding(propertyTypes, [list.propertyType])
  ]), 
  list.price, "test"
) 
SET list:Seen 
RETURN count(list)
"""

test_added = graph.run(add_test_data_query, model_params)
display(test_added.to_data_frame())

test_model_query = """
CALL regression.linear.test($modelName)
"""

tested = graph.run(test_model_query, model_params)
display(tested.to_data_frame())

We can do the same thing with roomType:

In [None]:
model_name = "rental-prices-roomType"

# Number of one-hot columns = number of distinct, non-null room types.
# count(DISTINCT ...) skips nulls exactly like the collect(roomType) that
# builds the category list for the one-hot encoding further down this cell.
# The previous "group, then count the groups" query counted listings without a
# roomType as one extra category, which would make the declared number of
# variables one larger than the feature vectors actually supplied.
count_query = """
MATCH (l:Listing)
RETURN count(DISTINCT l.roomType) AS count
"""

count = graph.run(count_query).to_table()[0][0]

init_query = """
CALL regression.linear.create($modelName, 'Multiple', true, $numberOfVariables)
"""

# Two numeric features (rooms, review count) plus one column per room type.
result = graph.run(init_query, {"modelName": model_name, "numberOfVariables": 2 + count})
display(result.summary().counters)

# Parameters shared by every model call in this cell.
model_params = {"modelName": model_name}

# Drop the :Seen markers left by the previous model so that this model is fed
# every training and test listing again.
clear_seen_query = """
MATCH (s:Seen)
REMOVE s:Seen
"""

seen_cleared = graph.run(clear_seen_query)
display(seen_cleared.summary().counters)

# Training rows: the two numeric features followed by the one-hot encoding of
# the listing's room type against the list of all room types.
add_training_data_query = """
match (l:Listing)
WITH l.roomType AS roomType, count(*) AS count
WITH collect(roomType) AS roomTypes

MATCH (list:Train)
WHERE NOT list:Seen 
CALL regression.linear.add($modelName, 
  apoc.coll.flatten([
    [list.bedrooms + list.bathrooms, size((list)<-[:REVIEWS]-()) * 1.0],
    algo.ml.oneHotEncoding(roomTypes, [list.roomType])
  ]), 
  list.price
) 
SET list:Seen 
RETURN count(list)
"""

training_added = graph.run(add_training_data_query, model_params)
display(training_added.summary().counters)

train_model_query = """
CALL regression.linear.train($modelName)
"""

trained = graph.run(train_model_query, model_params)
display(trained.to_data_frame())

# Test rows use the same feature layout and are passed with the "test" argument.
add_test_data_query = """
match (l:Listing)
WITH l.roomType AS roomType, count(*) AS count
WITH collect(roomType) AS roomTypes

MATCH (list:Test)
WHERE NOT list:Seen 
CALL regression.linear.add($modelName, 
  apoc.coll.flatten([
    [list.bedrooms + list.bathrooms, size((list)<-[:REVIEWS]-()) * 1.0],
    algo.ml.oneHotEncoding(roomTypes, [list.roomType])
  ]), 
  list.price, "test"
) 
SET list:Seen 
RETURN count(list)
"""

test_added = graph.run(add_test_data_query, model_params)
display(test_added.to_data_frame())

test_model_query = """
CALL regression.linear.test($modelName)
"""

tested = graph.run(test_model_query, model_params)
display(tested.to_data_frame())

Another feature that we could use is whether the host is a super host. Super hosts get benefits which give them greater visibility on Airbnb:

In [None]:
model_name = "rental-prices-roomType-superHost"

# Number of one-hot columns = number of distinct, non-null room types.
# count(DISTINCT ...) skips nulls exactly like the collect(roomType) that
# builds the category list for the one-hot encoding further down this cell.
# The previous "group, then count the groups" query counted listings without a
# roomType as one extra category, which would make the declared number of
# variables one larger than the feature vectors actually supplied.
count_query = """
MATCH (l:Listing)
RETURN count(DISTINCT l.roomType) AS count
"""

count = graph.run(count_query).to_table()[0][0]

init_query = """
CALL regression.linear.create($modelName, 'Multiple', true, $numberOfVariables)
"""

# Three numeric features (rooms, review count, superhost flag) plus one column
# per room type.
result = graph.run(init_query, {"modelName": model_name, "numberOfVariables": 3 + count})
display(result.summary().counters)

# Parameters shared by every model call in this cell.
model_params = {"modelName": model_name}

# Drop the :Seen markers left by the previous model so that this model is fed
# every training and test listing again.
clear_seen_query = """
MATCH (s:Seen)
REMOVE s:Seen
"""

seen_cleared = graph.run(clear_seen_query)
display(seen_cleared.summary().counters)

# Training rows: rooms, review count and a 1.0/0.0 superhost flag read from
# the host that HOSTS the listing, followed by the one-hot room type.
add_training_data_query = """
match (l:Listing)
WITH l.roomType AS roomType, count(*) AS count
WITH collect(roomType) AS roomTypes

MATCH (list:Train)<-[:HOSTS]-(host)
WHERE NOT list:Seen 
CALL regression.linear.add($modelName, 
  apoc.coll.flatten([
    [list.bedrooms + list.bathrooms, 
     size((list)<-[:REVIEWS]-()) * 1.0,
     CASE WHEN host.superhost THEN 1.0 ELSE 0.0 END
    ],
    algo.ml.oneHotEncoding(roomTypes, [list.roomType])
  ]), 
  list.price
) 
SET list:Seen 
RETURN count(list)
"""

training_added = graph.run(add_training_data_query, model_params)
display(training_added.summary().counters)

train_model_query = """
CALL regression.linear.train($modelName)
"""

trained = graph.run(train_model_query, model_params)
display(trained.to_data_frame())

# Test rows use the same feature layout and are passed with the "test" argument.
add_test_data_query = """
match (l:Listing)
WITH l.roomType AS roomType, count(*) AS count
WITH collect(roomType) AS roomTypes

MATCH (list:Test)<-[:HOSTS]-(host)
WHERE NOT list:Seen 
CALL regression.linear.add($modelName, 
  apoc.coll.flatten([
    [list.bedrooms + list.bathrooms, 
     size((list)<-[:REVIEWS]-()) * 1.0,
     CASE WHEN host.superhost THEN 1.0 ELSE 0.0 END
    ],
    algo.ml.oneHotEncoding(roomTypes, [list.roomType])
  ]), 
  list.price, "test"
) 
SET list:Seen 
RETURN count(list)
"""

test_added = graph.run(add_test_data_query, model_params)
display(test_added.to_data_frame())

test_model_query = """
CALL regression.linear.test($modelName)
"""

tested = graph.run(test_model_query, model_params)
display(tested.to_data_frame())

# Clustering Amenities

Listings have amenities, and presumably those amenities have some impact on the price of the listing. The following query shows the average price of listings that have different amenities:

In [None]:
# For each amenity offered by listings in the chosen neighborhood, count the
# listings that have it and average their price, keeping the 20 amenities with
# the highest average price.
# The IN_NEIGHBORHOOD relationship is written with its direction
# (listing -> neighborhood) to match the other queries in this notebook, and a
# stray tab inside avg(...) has been removed.
amenity_query = """
MATCH (a:Amenity)<-[:HAS]-(listing)-[:IN_NEIGHBORHOOD]->(:Neighborhood {name: $name})
RETURN a.name, count(*) AS count, avg(listing.price) AS averagePrice
ORDER BY averagePrice DESC
LIMIT 20
"""

graph.run(amenity_query, {"name": place}).to_data_frame()

There are lots of baby/child related amenities in these high priced listings, but if we wanted to find if a listing has any of them we'd have to create a manual list of them which isn't much fun. Perhaps we can cluster amenities so that the child friendly ones get grouped together?

We can use the Jaccard Similarity algorithm to help us do this. The following query will create an 'amenity similarity graph' - each amenity gets up to 3 'SIMILAR_AMENITY' relationships to other amenities based on their co-occurrence in listings.

In [None]:
# Describe each amenity by the ids of the listings that have it, then let the
# Jaccard procedure write SIMILAR_AMENITY relationships (topK 3, similarity
# cutoff 0.1) and report summary statistics of the computed similarities.
similar_amenities = """
MATCH (a:Amenity)<-[:HAS]-(listing)
WITH {item:id(a), categories: collect(id(listing))} as userData
WITH collect(userData) as data
CALL algo.similarity.jaccard(data, {topK:3, similarityCutoff:0.1, writeRelationshipType: "SIMILAR_AMENITY", write: true})
YIELD nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, stdDev, p25, p50, p75, p90, p95, p99, p999, p100
RETURN nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, p95
"""

similarity_stats = graph.run(similar_amenities)
similarity_stats.to_data_frame()

We can then run the Label Propagation clustering algorithm over this similarity graph to cluster the amenities.

In [None]:
# Stream Label Propagation results over the SIMILAR_AMENITY graph, group the
# amenity names by the label they received, and list every group that holds
# more than one amenity, largest group first.
amenity_clusters = """
call algo.labelPropagation.stream("Amenity", "SIMILAR_AMENITY", {iterations: 10})
YIELD nodeId, label
WITH label, collect(algo.getNodeById(nodeId).name) AS amenities
WHERE size(amenities) > 1
RETURN label, amenities
ORDER BY size(amenities) DESC
"""

cluster_rows = graph.run(amenity_clusters)
cluster_rows.to_data_frame()

Nice! All of our child friendly amenities have clustered together with label 88. Let's store the amenity clusters in our graph:

In [None]:
# Run Label Propagation again and persist the result: every label shared by
# more than one amenity becomes an AmenityCluster node, and each of its
# amenities is linked to it with IN_CLUSTER. MERGE is used for both the node
# and the relationship.
amenity_clusters = """
call algo.labelPropagation.stream("Amenity", "SIMILAR_AMENITY", {iterations: 10})
YIELD nodeId, label
WITH label, collect(algo.getNodeById(nodeId)) AS amenities
WHERE size(amenities) > 1
UNWIND amenities AS amenity
MERGE (cluster:AmenityCluster {label: label})
MERGE (amenity)-[:IN_CLUSTER]->(cluster)
"""

clusters_written = graph.run(amenity_clusters)
display(clusters_written.summary().counters)

# Give each cluster a readable name taken from its most widely used amenity.
# Sorting DESC puts the amenity attached to the most listings first, so
# collect(amenity)[0] really is the biggest one; the previous ascending sort
# picked the least common amenity of the cluster instead.
amenity_clusters_friendly_name = """
MATCH (cluster:AmenityCluster)<-[:IN_CLUSTER]-(amenity)
WITH cluster, amenity
ORDER BY size((amenity)<-[:HAS]-()) DESC
WITH cluster, collect(amenity)[0] AS biggestAmenity
SET cluster.name = "AmenityCluster " + biggestAmenity.name
"""

display(graph.run(amenity_clusters_friendly_name).summary().counters)

Now let's see if those clusters help predict the price of a listing:

In [None]:
model_name = "rental-prices-roomType-superHost-child3"

# Number of one-hot columns = number of distinct, non-null room types.
# count(DISTINCT ...) skips nulls exactly like the collect(roomType) that
# builds the category list for the one-hot encoding further down this cell.
# The previous "group, then count the groups" query counted listings without a
# roomType as one extra category, which would make the declared number of
# variables one larger than the feature vectors actually supplied.
count_query = """
MATCH (l:Listing)
RETURN count(DISTINCT l.roomType) AS count
"""

count = graph.run(count_query).to_table()[0][0]

init_query = """
CALL regression.linear.create($modelName, 'Multiple', true, $numberOfVariables)
"""

# Four numeric features (rooms, review count, superhost flag, number of
# amenities in the chosen cluster) plus one column per room type.
result = graph.run(init_query, {"modelName": model_name, "numberOfVariables": 4 + count})
display(result.summary().counters)

# Label of the amenity cluster used as a feature. 88 is the label the
# child-friendly amenities received in the clustering output discussed above.
# It is defined once here so the training and test queries cannot drift apart.
# NOTE(review): the label comes from a Label Propagation run and may differ
# on another run or dataset - check the cluster table before reusing it.
child_cluster_label = 88

# Parameters shared by the two queries that build feature vectors.
feature_params = {"modelName": model_name, "childCluster": child_cluster_label}

# Drop the :Seen markers left by the previous model so that this model is fed
# every training and test listing again.
clear_seen_query = """
MATCH (s:Seen)
REMOVE s:Seen
"""

display(graph.run(clear_seen_query).summary().counters)

# Training rows: rooms, review count, superhost flag and the number of the
# listing's amenities that belong to the chosen cluster, followed by the
# one-hot room type.
add_training_data_query = """
match (l:Listing)
WITH l.roomType AS roomType, count(*) AS count
WITH collect(roomType) AS roomTypes

MATCH (list:Train)<-[:HOSTS]-(host)
WHERE NOT list:Seen
CALL regression.linear.add($modelName,
  apoc.coll.flatten([
    [list.bedrooms + list.bathrooms,
     size((list)<-[:REVIEWS]-()) * 1.0,
     CASE WHEN host.superhost THEN 1.0 ELSE 0.0 END,
     size((list)-[:HAS]->()-[:IN_CLUSTER]->(:AmenityCluster {label: $childCluster})) * 1.0
    ],
    algo.ml.oneHotEncoding(roomTypes, [list.roomType])
  ]),
  list.price
)
SET list:Seen
RETURN count(list)
"""

display(graph.run(add_training_data_query, feature_params).summary().counters)

train_model_query = """
CALL regression.linear.train($modelName)
"""

display(graph.run(train_model_query, {"modelName": model_name}).to_data_frame())

# Test rows use the same feature layout and are passed with the "test" argument.
add_test_data_query = """
match (l:Listing)
WITH l.roomType AS roomType, count(*) AS count
WITH collect(roomType) AS roomTypes

MATCH (list:Test)<-[:HOSTS]-(host)
WHERE NOT list:Seen
CALL regression.linear.add($modelName,
  apoc.coll.flatten([
    [list.bedrooms + list.bathrooms,
     size((list)<-[:REVIEWS]-()) * 1.0,
     CASE WHEN host.superhost THEN 1.0 ELSE 0.0 END,
     size((list)-[:HAS]->()-[:IN_CLUSTER]->(:AmenityCluster {label: $childCluster})) * 1.0
    ],
    algo.ml.oneHotEncoding(roomTypes, [list.roomType])
  ]),
  list.price, "test"
)
SET list:Seen
RETURN count(list)
"""

display(graph.run(add_test_data_query, feature_params).to_data_frame())

test_model_query = """
CALL regression.linear.test($modelName)
"""

display(graph.run(test_model_query, {"modelName": model_name}).to_data_frame())

Hmm, only a marginal improvement.

# Exercise

* What happens if we use the other amenity clusters rather than just the child friendly one?
* Are there any other features that we can add that would improve the accuracy of our model? 