In [31]:
from py2neo import Graph
import pandas as pd

In [3]:
graph = Graph("bolt://localhost", auth=("neo4j", "neo"))

In [5]:
listings_file = "http://guides.neo4j.com/listings/data/listings.csv"
reviews_file = "http://guides.neo4j.com/listings/data/reviews.csv"

## Listings

In [45]:
constraint_query = """
CREATE CONSTRAINT ON (l:Listing)
ASSERT l.id IS UNIQUE
"""

import_query = """
LOAD CSV WITH HEADERS FROM $listingsFile AS row
WITH row WHERE row.id IS NOT NULL
MERGE (l:Listing {id: row.id})
SET l.name = row.name,
    l.price = toFloat(substring(row.price, 1)),
    l.weeklyPrice = toFloat(substring(row.weekly_price, 1)),
    l.cleaningFee = toFloat(substring(row.cleaning_fee, 1)),
    l.propertyType = row.property_type,
    l.accommodates = toInt(row.accommodates),
    l.bedrooms = toInt(row.bedrooms),
    l.bathrooms = toInt(row.bathrooms),
    l.availability365 = toInt(row.availability_365)
"""

graph.run(constraint_query)
graph.run(import_query, {"listingsFile": listings_file})

<py2neo.database.Cursor at 0x11c711128>

## Neighborhoods

In [46]:
constraint_query = """
CREATE CONSTRAINT ON (n:Neighborhood) 
ASSERT n.id IS UNIQUE
"""

import_query = """
LOAD CSV WITH HEADERS FROM $listingsFile AS row
WITH row WHERE row.id IS NOT NULL
MATCH (l:Listing {id: row.id})
MERGE (n:Neighborhood {id: coalesce(row.neighbourhood_cleansed, "NA")})
ON CREATE SET n.name = row.neighbourhood
MERGE (l)-[:IN_NEIGHBORHOOD]->(n);
"""

graph.run(constraint_query)
graph.run(import_query, {"listingsFile": listings_file})

<py2neo.database.Cursor at 0x11c711940>

## Amenities

In [47]:
constraint_query = """
CREATE CONSTRAINT ON (a:Amenity) 
ASSERT a.name IS UNIQUE;
"""

import_query = """
LOAD CSV WITH HEADERS FROM $listingsFile AS row
WITH row WHERE row.id IS NOT NULL
MATCH (l:Listing {id: row.id})
WITH l, split(replace(replace(replace(row.amenities, '{', ''), '}', ''), '\"', ''), ',') AS amenities
UNWIND amenities AS amenity
MERGE (a:Amenity {name: amenity})
MERGE (l)-[:HAS]->(a)
"""

graph.run(constraint_query)
graph.run(import_query, {"listingsFile": listings_file})

<py2neo.database.Cursor at 0x11c711d68>

## Hosts

In [48]:
constraint_query = """
CREATE CONSTRAINT ON (h:Host) 
ASSERT h.id IS UNIQUE
"""

import_query = """
LOAD CSV WITH HEADERS FROM $listingsFile AS row
WITH row WHERE row.host_id IS NOT NULL
MERGE (h:Host {id: row.host_id})
ON CREATE SET h.name      = row.host_name,
              h.about     = row.host_abot,
              h.superhost = CASE WHEN row.host_is_super_host = "t" THEN True ELSE False END,
              h.location  = row.host_location,
              h.image     = row.host_picture_url
WITH row, h
MATCH (l:Listing {id: row.id})
MERGE (h)-[:HOSTS]->(l);
"""

graph.run(constraint_query)
graph.run(import_query, {"listingsFile": listings_file})

<py2neo.database.Cursor at 0x11c715400>

## Reviews

In [51]:
user_constraint_query = """
CREATE CONSTRAINT ON (u:User) 
ASSERT u.id IS UNIQUE
"""

review_constraint_query = """
CREATE CONSTRAINT ON (r:Review) 
ASSERT r.id IS UNIQUE
"""


import_query = """
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM $reviewsFile AS row

// User
MERGE (u:User {id: row.reviewer_id})
SET u.name = row.reviewer_name

// Review
MERGE (r:Review {id: row.id})
SET r.date     = row.date,
    r.comments = row.comments
WITH row, u, r
MATCH (l:Listing {id: row.listing_id})
MERGE (u)-[:WROTE]->(r)
MERGE (r)-[:REVIEWS]->(l);
"""

graph.run(user_constraint_query)
graph.run(review_constraint_query)
graph.run(import_query, {"reviewsFile": reviews_file})

<py2neo.database.Cursor at 0x11c720908>

Let's see what we've imported. Run the following query to check how many nodes our database contains:

In [52]:
query = """
MATCH () 
RETURN COUNT(*) AS nodeCount
"""

graph.run(query).to_data_frame()

Unnamed: 0,nodeCount
0,248337


Let's drill down a bit. What types of nodes do we have?

In [38]:
result = {"label": [], "count": []}
for label in graph.run("CALL db.labels()").to_series():
    query = f"MATCH (:`{label}`) RETURN count(*) as count"
    count = graph.run(query).to_data_frame().iloc[0]['count']
    result["label"].append(label)
    result["count"].append(count)
pd.DataFrame(data=result).sort_values("count")

Unnamed: 0,label,count
2,Neighborhood,41
1,Amenity,42
3,Host,4633
0,Listing,5835
4,User,111834
5,Review,125952


And what types of relationships?

In [46]:
result = {"relType": [], "count": []}
for relationship_type in graph.run("CALL db.relationshipTypes()").to_series():
    query = f"MATCH ()-[:`{relationship_type}`]->() RETURN count(*) as count"
    count = graph.run(query).to_data_frame().iloc[0]['count']
    result["relType"].append(relationship_type)
    result["count"].append(count)
pd.DataFrame(data=result).sort_values("count")

Unnamed: 0,relType,count
1,IN_NEIGHBORHOOD,5835
2,HOSTS,5835
3,WROTE,62976
4,REVIEWS,62976
0,HAS,82561


Now let's explore the neighborhood data:

In [36]:
exploratory_query = """
MATCH (n:Neighborhood)<-[:IN_NEIGHBORHOOD]-(l:Listing)-[:HAS]->(a:Amenity) 
RETURN n.name AS neighborhood, l.name AS name, collect(a.name) AS amenities, l.price AS price 
LIMIT 25
"""

graph.run(exploratory_query).to_data_frame()

Unnamed: 0,amenities,name,neighborhood,price
0,"[Air Conditioning, Wireless Internet, Internet...",THE place in Austin for SXSW,,220.0
1,"[Internet, Wireless Internet, Air Conditioning...","Good room for F1, 5minutes to metro",,150.0
2,"[TV, Internet, Cat(s), Pets live on this prope...",SXSW Rental! 10 min from it all!,Oak Hill,300.0
3,"[Dryer, Smoke Detector, Carbon Monoxide Detect...",A skip to everything SXSW 2015!,,200.0
4,"[TV, Cable TV, Internet, Wireless Internet, Ai...",Beautiful SoCo House near downtown,,749.0
5,"[Shampoo, Washer, Heating, Pets Allowed, Smoki...",For ACL 1 Bedroom Studio House,,120.0
6,"[Essentials, Shampoo, Safety Card, Fire Exting...",Comfy Private Guest Room,,89.0
7,"[Carbon Monoxide Detector, Safety Card, Dryer,...",Zilker Hayloft - Near Downtown,,125.0
8,[],COZY little place ;),Hancock,35.0
9,[],Room for rent in co op.,,28.0


What are the most expensive places to live?

In [4]:
query = """
MATCH (l:Listing)-[:IN_NEIGHBORHOOD]->(n:Neighborhood)
WITH n, avg(l.price) AS averagePrice
RETURN n.id AS zip, n.name AS neighborhood, averagePrice
"""

(graph.run(query).to_data_frame()
 .sort_values("averagePrice", ascending=False)
 .head(10))

Unnamed: 0,averagePrice,neighborhood,zip
29,391.473684,Steiner Ranch,78732
6,316.593939,Barton Hills,78746
7,299.970822,Clarksville,78703
25,273.533333,,78725
2,266.29772,,78704
9,265.03937,,78702
16,258.25,,78734
30,257.252427,Northwest Hills,78731
36,251.645833,Downtown,78701
11,240.0,Oak Hill,78735
