# Data Science with Neo4j Using Yelp Data

### Module 0: Loading Yelp Data into Neo4j

### Part 1: Convert Yelp JSON files to CSV format

In [2]:
import csv
import json
import os

# Directory that holds the Yelp JSON input files and receives the generated CSVs.
# Set the YELP_DATA_DIR environment variable to point somewhere else; when it is
# unset (or empty) the original location is used. os.path.join(..., '') makes
# sure the value ends with a path separator, because later cells build file
# names with plain concatenation (data_dir + 'user.json').
# NOTE(review): presumably this must be Neo4j's import directory, since the load
# cells reference the CSVs as file:///<name>.csv - confirm.
data_dir = os.path.join(
    os.environ.get('YELP_DATA_DIR') or '/Users/gtenorio/neo4j_yelp/import/', '')

#### A. Convert Business streaming JSON file to CSV

In [11]:
# Register the 'custom' CSV dialect used by the writers in this notebook:
# every field is wrapped in double quotes and a backslash is the escape character
custom_dialect_params = {
    'escapechar': '\\',
    'quotechar': '"',
    'quoting': csv.QUOTE_ALL,
}
csv.register_dialect('custom', **custom_dialect_params)


# Converts the given value to a CSV formatted string
def toCSV(value):
    """Normalise a single JSON value before it is handed to a CSV writer.

    Args:
        value: any value decoded from JSON.

    Returns:
        * list  -> one string with the items joined by ';'. Items are passed
                   through ``str`` first, so lists holding numbers (or other
                   non-string items) no longer raise ``TypeError``.
        * str   -> the same text with every double quote and backslash removed.
        * other -> the value itself, unchanged.

    Note that this function does not add quotes; quoting is left to the
    CSV writer that receives the value.
    """
    # Represent a list of items as a semicolon delimited string
    if isinstance(value, list):
        return ';'.join(str(item) for item in value)

    # Drop the quote and escape characters from free text
    # (presumably so they cannot clash with the CSV dialect - confirm)
    if isinstance(value, str):
        return value.replace('"', '').replace('\\', '')

    return value


# Converts a dict to a list of CSV formatted values, optionally restricting
# the converted fields by passing in an ordered list of field names
def dictToCSV(obj, fields=None):
    """Return the values of ``obj`` as a list, each one passed through ``toCSV``.

    Args:
        obj: mapping to convert.
        fields: optional ordered iterable of keys to emit. When ``None``,
            all keys of ``obj`` are used, in the mapping's own order.

    Returns:
        A list with one converted value per requested key, in ``fields`` order.

    Raises:
        KeyError: if a requested key is not present in ``obj``.
    """
    if fields is None:
        fields = obj.keys()

    row = []
    for key in fields:
        row.append(toCSV(obj[key]))
    return row


# Takes a JSON file and writes it as a CSV file
def convertJSONFileToCSV(json_file, csv_file):
    """Convert a streaming JSON file (one object per line) into a CSV file.

    The keys of the first JSON object define the CSV header and the column
    order; every object, including that first one, is written as a data row.
    (Previously the first object was consumed only to obtain the header, so
    its data never reached the CSV.)

    Args:
        json_file: path of the line-delimited JSON input.
        csv_file: path of the CSV file to create or overwrite.

    Raises:
        KeyError: if a later object lacks one of the first object's keys.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # newline='' is what the csv module asks for on files it writes to, so the
    # writer's own line terminator is not translated a second time; UTF-8 is
    # stated explicitly instead of relying on the platform default.
    with open(json_file, 'r', encoding='utf-8') as jsonFile, \
         open(csv_file, 'w', newline='', encoding='utf-8') as csvFile:

        writer = csv.writer(csvFile, dialect='custom')
        fields = None

        for line in jsonFile:
            if not line.strip():
                continue                    # tolerate blank lines

            obj = json.loads(line)

            if fields is None:
                fields = list(obj.keys())
                writer.writerow(fields)     # write header

            writer.writerow(dictToCSV(obj, fields))

In [12]:
%%time
convertJSONFileToCSV(data_dir + "business.json", data_dir + 'business.csv')

CPU times: user 13.7 s, sys: 266 ms, total: 14 s
Wall time: 14.3 s


#### B. Convert User streaming JSON file to CSV.  Create separate Friend data.

In [13]:
def convertUserDataForNeo4jImport(users_file, out_dir):
    """Split the streaming user JSON file into two CSV files.

    Files written into ``out_dir``:
      * ``user.csv``        - user_id, name, yelping_since, review_count,
                              average_stars, fans
      * ``user_friend.csv`` - user_id, friends

    Every value goes through ``dictToCSV`` before it is written.

    Args:
        users_file: path of the JSON input, one user object per line.
        out_dir: directory that receives the two CSV files.

    Raises:
        KeyError: if a user object lacks one of the exported fields.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    userFields = ['user_id','name','yelping_since','review_count','average_stars','fans']
    friendFields = ['user_id', 'friends']

    # newline='' is what the csv module asks for on files it writes to, so the
    # writer's own line terminator is not translated a second time; UTF-8 is
    # stated explicitly instead of relying on the platform default.
    with open(users_file, 'r', encoding='utf-8') as usersJsonFile, \
         open(os.path.join(out_dir, 'user.csv'), 'w',
              newline='', encoding='utf-8') as usersCSVFile, \
         open(os.path.join(out_dir, 'user_friend.csv'), 'w',
              newline='', encoding='utf-8') as friendsCSVFile:

        usersWriter = csv.writer(usersCSVFile, dialect='custom')
        usersWriter.writerow(userFields)        # write header

        friendsWriter = csv.writer(friendsCSVFile, dialect='custom')
        friendsWriter.writerow(friendFields)    # write header

        for line in usersJsonFile:
            if not line.strip():
                continue                        # tolerate blank lines

            obj = json.loads(line)
            usersWriter.writerow(dictToCSV(obj, userFields))
            friendsWriter.writerow(dictToCSV(obj, friendFields))

In [14]:
%%time
users_json = data_dir + 'user.json'
convertUserDataForNeo4jImport(users_json, data_dir)

CPU times: user 1min 35s, sys: 2.87 s, total: 1min 38s
Wall time: 1min 39s


#### C. Convert Review streaming JSON file to CSV.  Create separate REVIEW_OF data.

In [15]:
def convertReviewDataForNeo4jImport(reviews_file, out_dir):
    """Split the streaming review JSON file into two CSV files.

    Files written into ``out_dir``:
      * ``review.csv``               - review_id, date, stars, useful
      * ``review_user_business.csv`` - user_id, review_id, business_id

    Every value goes through ``dictToCSV`` before it is written.

    Args:
        reviews_file: path of the JSON input, one review object per line.
        out_dir: directory that receives the two CSV files.

    Raises:
        KeyError: if a review object lacks one of the exported fields.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    reviewFields = ['review_id', 'date', 'stars', 'useful']
    rubFields = ['user_id', 'review_id', 'business_id']

    # newline='' is what the csv module asks for on files it writes to, so the
    # writer's own line terminator is not translated a second time; UTF-8 is
    # stated explicitly instead of relying on the platform default.
    with open(reviews_file, 'r', encoding='utf-8') as reviewsJsonFile, \
         open(os.path.join(out_dir, 'review.csv'), 'w',
              newline='', encoding='utf-8') as reviewsCSVFile, \
         open(os.path.join(out_dir, 'review_user_business.csv'), 'w',
              newline='', encoding='utf-8') as rubCSVFile:

        reviewsWriter = csv.writer(reviewsCSVFile, dialect='custom')
        reviewsWriter.writerow(reviewFields)    # write header

        rubWriter = csv.writer(rubCSVFile, dialect='custom')
        rubWriter.writerow(rubFields)           # write header

        for line in reviewsJsonFile:
            if not line.strip():
                continue                        # tolerate blank lines

            obj = json.loads(line)
            reviewsWriter.writerow(dictToCSV(obj, reviewFields))
            rubWriter.writerow(dictToCSV(obj, rubFields))

In [16]:
%%time
reviews_json = data_dir + 'review.json'
convertReviewDataForNeo4jImport(reviews_json, data_dir)

CPU times: user 2min 24s, sys: 4.93 s, total: 2min 29s
Wall time: 2min 33s


### Part 2: Load CSV files into Neo4j

In [17]:
# py2neo allows us to work with Neo4j from within Python
from py2neo import authenticate, Graph

# Set up authentication parameters. They are read from the environment so real
# credentials do not have to be stored in the notebook; the fallbacks are the
# original values, so an unconfigured environment behaves exactly as before.
neo4j_host     = os.environ.get('NEO4J_HOST', 'localhost:7474')
neo4j_user     = os.environ.get('NEO4J_USER', 'neo4j')
neo4j_password = os.environ.get('NEO4J_PASSWORD', 'neo4jneo4j')
authenticate(neo4j_host, neo4j_user, neo4j_password)

# Connect to authenticated graph database (same host:port as the credentials)
g = Graph("http://" + neo4j_host + "/db/data/")

In [18]:
# Each time this notebook is run, we start with an empty graph database
g.run("MATCH (n) DETACH DELETE n;")

# Uniqueness constraints on the key property of every node label.
# They are dropped first and then recreated, in that order.
# NOTE(review): the DROP statements presumably fail when a constraint does not
# exist yet (e.g. the first run against a fresh database) - confirm.
drop_constraint_queries = [
    "DROP CONSTRAINT ON (business:Business)   ASSERT business.id   IS UNIQUE;",
    "DROP CONSTRAINT ON (category:Category)   ASSERT category.name IS UNIQUE;",
    "DROP CONSTRAINT ON (city:City)           ASSERT city.name     IS UNIQUE;",
    "DROP CONSTRAINT ON (state:State)         ASSERT state.name    IS UNIQUE;",
    "DROP CONSTRAINT ON (user:User)           ASSERT user.id       IS UNIQUE;",
    "DROP CONSTRAINT ON (review:Review)       ASSERT review.id     IS UNIQUE;",
]

create_constraint_queries = [
    "CREATE CONSTRAINT ON (business:Business) ASSERT business.id   IS UNIQUE;",
    "CREATE CONSTRAINT ON (category:Category) ASSERT category.name IS UNIQUE;",
    "CREATE CONSTRAINT ON (city:City)         ASSERT city.name     IS UNIQUE;",
    "CREATE CONSTRAINT ON (state:State)       ASSERT state.name    IS UNIQUE;",
    "CREATE CONSTRAINT ON (user:User)         ASSERT user.id       IS UNIQUE;",
    "CREATE CONSTRAINT ON (review:Review)     ASSERT review.id     IS UNIQUE;",
]

for query in drop_constraint_queries + create_constraint_queries:
    cursor = g.run(query)

# Show the cursor of the last statement as the cell output
cursor


<py2neo.database.Cursor at 0x106eb3320>

#### A. Load Business Data into Neo4j

In [20]:
%%time

# Cypher statement that reads the business CSV and, for every row, merges a
# Business node (keyed by business_id) and sets its name, neighborhood,
# avg_rating and num_reviews. The semicolon-delimited categories column is
# split and unwound into Category nodes linked with IN_CATEGORY, and City and
# State nodes are merged and linked with IN_CITY / IN_STATE.
# The CSV location is supplied at run time through the {input_dir} parameter.
# NOTE(review): the City/State MERGEs come after the UNWIND, so they are
# evaluated once per category row; presumably a row whose category list unwinds
# to nothing would get no city/state links - confirm against the data.
# NOTE(review): {input_dir} looks like the older Cypher parameter syntax; newer
# Neo4j versions presumably expect $input_dir - confirm for the target server.
load_business = """

    // Load and commit every 50000 records
    USING PERIODIC COMMIT 50000 
    LOAD CSV WITH HEADERS FROM {input_dir} AS line                      
    WITH line 
        
    // Create Business nodes
    MERGE (business:Business {id: line.business_id})
    SET business.name         = line.name,
        business.neighborhood = line.neighborhood,
        business.avg_rating   = toFloat(line.stars),
        business.num_reviews  = toInteger(line.review_count)
        
    // Create Category nodes
    WITH line, business, split(line.categories, ";") as cat_list
    UNWIND cat_list as cat
    MERGE (category:Category {name: cat})
    MERGE (business)-[:IN_CATEGORY]->(category)
        
    // Create City and State nodes
    MERGE (city:City {name: line.city})
    MERGE (state:State {name: line.state})
    MERGE (business)-[:IN_CITY]->(city)
    MERGE (business)-[:IN_STATE]->(state)

    """

# Run the statement against the graph, binding input_dir to the CSV file URL
g.run(load_business, input_dir='file:///business.csv')

<py2neo.database.Cursor at 0x10862de10>

#### B. Load User Data into Neo4j

In [21]:
%%time

# Cypher statement that reads the user CSV and merges one User node per row,
# keyed by user_id. It sets name and yelping_since as read, and converts
# review_count, average_stars and fans with toInteger / toFloat into
# num_reviews, avg_review_rating and num_fans.
# No relationships are created by this statement.
load_user = """
        
    // Load and commit every 50000 records
    USING PERIODIC COMMIT 50000 
    LOAD CSV WITH HEADERS FROM {input_dir} AS line                      
    WITH line 
        
    // Create User nodes
    MERGE (user:User {id: line.user_id})
    SET user.name              = line.name,
        user.yelping_since     = line.yelping_since,
        user.num_reviews       = toInteger(line.review_count),
        user.avg_review_rating = toFloat(line.average_stars),
        user.num_fans          = toInteger(line.fans)
        
    """

# Run the statement against the graph, binding input_dir to the CSV file URL
g.run(load_user, input_dir='file:///user.csv')

CPU times: user 10.5 ms, sys: 7.33 ms, total: 17.8 ms
Wall time: 4min 20s


In [22]:
%%time

# Create FRIENDS_WITH relationship between Users
# For every CSV row the user is looked up by user_id, the semicolon-delimited
# friends column is split and unwound, and each friend id is looked up as a
# User node too. Both ends are MATCHed (not MERGEd), so no new User nodes are
# created here; only the directed (user)-[:FRIENDS_WITH]->(friend)
# relationship is merged.
load_friend = """

    // Load and commit every 50000 records
    USING PERIODIC COMMIT 50000 
    LOAD CSV WITH HEADERS FROM {input_dir} AS line                      
    WITH line  
        
    // Only load Users already in the graph
    MATCH (user:User {id: line.user_id})
        
    // Create FRIENDS_WITH relationship
    WITH line, user, split(line.friends, ";") as friend_list
    UNWIND friend_list as friend
    MATCH (f:User {id: friend})
    MERGE (user)-[:FRIENDS_WITH]->(f)
    
    """

# Run the statement against the graph, binding input_dir to the CSV file URL
g.run(load_friend, input_dir='file:///user_friend.csv')

CPU times: user 165 ms, sys: 107 ms, total: 272 ms
Wall time: 1h 26min 2s


#### C. Load Review Data

In [24]:
%%time

# First pass will create Review nodes only, not relationships
# Each CSV row is merged into one Review node keyed by review_id; date is
# stored as read, while stars and useful are converted with toInteger into
# rating and useful_votes_received.
load_review = """
        
    // Load and commit every 50000 records
    USING PERIODIC COMMIT 50000 
    LOAD CSV WITH HEADERS FROM {input_dir} AS line                      
    WITH line 
        
    // Create Review nodes
    MERGE (review:Review {id: line.review_id})
    SET review.date                   = line.date,
        review.rating                 = toInteger(line.stars),
        review.useful_votes_received  = toInteger(line.useful)
        
    """

# Run the statement against the graph, binding input_dir to the CSV file URL
g.run(load_review, input_dir='file:///review.csv')

CPU times: user 37.5 ms, sys: 27.5 ms, total: 64.9 ms
Wall time: 27min 26s


In [25]:
%%time

# Second pass creates relationships
# For every CSV row the Review, User and Business nodes are looked up by id.
# All three are MATCHed, so a row that references a node missing from the graph
# produces no relationships. For complete rows the statement merges
# (user)-[:WROTE]->(review) and (review)-[:REVIEW_OF]->(business).
load_review_rel = """
        
    // Load and commit every 50000 records
    USING PERIODIC COMMIT 50000 
    LOAD CSV WITH HEADERS FROM {input_dir} AS line                      
    WITH line 
        
    // Only care about Users and Businesses already in the graph
    MATCH (review:Review     {id:line.review_id})
    MATCH (user:User         {id:line.user_id})
    MATCH (business:Business {id:line.business_id})
               
    MERGE (user)-[:WROTE]->(review)
    MERGE (review)-[:REVIEW_OF]->(business)
 
    """

# Run the statement against the graph, binding input_dir to the CSV file URL
g.run(load_review_rel, input_dir='file:///review_user_business.csv')

CPU times: user 46.1 ms, sys: 32.2 ms, total: 78.3 ms
Wall time: 21min 59s
