In [1]:
import os

import matplotlib
#matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import pandas as pd
from neo4j import GraphDatabase
from tabulate import tabulate

In [2]:
# Credentials are taken from the environment so that real secrets do not have
# to be saved inside the notebook. The fallbacks are the original local
# development values; override them via NEO4J_USERNAME / NEO4J_PASSWORD for
# any shared or production database.
user = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "neo4j123")

In [3]:
# Bolt endpoint of the local Neo4j instance; the resulting driver is the one
# every later cell opens its sessions from.
bolt_uri = "bolt://localhost"
driver = GraphDatabase.driver(bolt_uri, auth=(user, password))

In [4]:
def print_result_summary(summary):
    """Print one line per non-zero update counter of a query summary.

    Parameters
    ----------
    summary
        Object exposing a ``counters`` attribute that carries the integer
        fields listed in ``report_layout`` below (in this notebook, the value
        returned by ``result.consume()``).

    Counters equal to zero are skipped, so a query that changed nothing
    prints nothing. Returns ``None``.
    """
    # (counter attribute, verb, noun) -- listed in the order lines are printed.
    report_layout = (
        ("nodes_created", "Created", "nodes"),
        ("nodes_deleted", "Deleted", "nodes"),
        ("relationships_created", "Created", "relationships"),
        ("relationships_deleted", "Deleted", "relationships"),
        ("properties_set", "Set", "properties"),
        ("labels_added", "Added", "labels"),
        ("labels_removed", "Removed", "labels"),
        ("constraints_added", "Added", "constraints"),
        ("constraints_removed", "Removed", "constraints"),
    )
    counters = summary.counters
    for field, verb, noun in report_layout:
        amount = getattr(counters, field)
        if amount > 0:
            # Thousands separators keep multi-million counts readable.
            print(f"{verb} {amount:,d} {noun}")


# Add constraints

In [5]:
# Uniqueness constraints on User.id and Status.id. IF NOT EXISTS makes the
# cell safe to re-run: without it, a second execution fails because the
# constraints are already present.
addConstraintsQ1 = "CREATE CONSTRAINT User_constraint IF NOT EXISTS FOR (user:User) REQUIRE user.id IS UNIQUE;"
addConstraintsQ2 = "CREATE CONSTRAINT Status_constraint IF NOT EXISTS FOR (status:Status) REQUIRE status.id IS UNIQUE;"

queries = [addConstraintsQ1, addConstraintsQ2]

# Run each statement on its own and report what it changed; nothing is
# printed for a constraint that already existed.
with driver.session() as session:
    for query in queries:
        result = session.run(query)
        summary = result.consume()
        print_result_summary(summary)

Added 1 constraints
Added 1 constraints


# List constraints

In [6]:
# Fetch every constraint the server reports and show it as a table.
query = "CALL db.constraints;"
with driver.session() as session:
    result = session.run(query)
    rows = [dict(entry) for entry in result]
    df = pd.DataFrame(rows)
    print(df)

                name                                        description  \
0  Status_constraint  CONSTRAINT ON ( status:Status ) ASSERT (status...   
1    User_constraint  CONSTRAINT ON ( user:User ) ASSERT (user.id) I...   

                                             details  
0  Constraint( id=6, name='Status_constraint', ty...  
1  Constraint( id=4, name='User_constraint', type...  


# Import user nodes

In [7]:
# One User node per CSV row: MERGE on the integer id, then (re)set the
# remaining columns, casting the count columns to integers.
importUserQ = """
USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM "file:///users20221027.csv" AS row
MERGE (u:User {id:toInteger(row.id)})
SET u.screen_name = row.screen_name,
u.name = row.name,
u.friends_count = toInteger(row.friends_count),
u.followers_count = toInteger(row.followers_count),
u.statuses_count = toInteger(row.statuses_count),
u.favorites_count = toInteger(row.favorites_count),
u.description = row.description;
"""
# Execute the import and report the update counters.
with driver.session() as session:
    print_result_summary(session.run(importUserQ).consume())

Created 1,511,659 nodes
Set 12,093,272 properties
Added 1,511,659 labels


In [8]:
# Sanity check: how many User nodes exist after the import.
query = """
MATCH (u:User) 
RETURN COUNT(u) AS User_count;
"""
with driver.session() as session:
    result = session.run(query)
    rows = [dict(entry) for entry in result]
    df = pd.DataFrame(rows)
    print(df)

   User_count
0     1511659


# Import user_friend relationships
This may take a very long time to run, depending on the number of relationships.

In [9]:
# For each CSV row, look up both users by id and MERGE a FOLLOWS edge from
# user to friend; rows whose endpoints are not both matched create nothing.
importUserFriendQ = """
USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM 'file:///userFriends20221027.csv' as row
MATCH (user:User{id: toInteger(row.user_id)})
MATCH (friend:User{id: toInteger(row.friend_id)})
MERGE (user)-[:FOLLOWS]->(friend);
"""
# Execute the import and report the update counters.
with driver.session() as session:
    print_result_summary(session.run(importUserFriendQ).consume())

Created 9,639,768 relationships


# Import status nodes

In [10]:
# One Status node per CSV row: MERGE on the integer id, then set the tweet
# attributes. The CSV's `label` column is stored as `is_spam`.
importStatusQ = """
USING PERIODIC COMMIT LOAD CSV WITH HEADERS FROM "file:///statuses20221027_labeled.csv" AS row
MERGE (s:Status {id:toInteger(row.id)})
SET s.user_id = toInteger(row.user_id),
s.is_retweet = toInteger(row.is_retweet),
s.retweeted_id = toInteger(row.retweeted_id),
s.is_retweeted = toInteger(row.is_retweeted),
s.retweet_count = toInteger(row.retweet_count),
s.favourite_count = toInteger(row.favourite_count),
s.quoted_id = toInteger(row.quoted_id),
s.hashtag_count = toInteger(row.hashtag_count),
s.text = row.text,
s.is_spam = toInteger(row.label);
"""
# Execute the import and report the update counters.
with driver.session() as session:
    print_result_summary(session.run(importStatusQ).consume())

Created 57,695 nodes
Set 634,645 properties
Added 57,695 labels


In [11]:
# Sanity check: how many Status nodes exist after the import.
query = """
MATCH (s:Status) 
RETURN COUNT(s) AS Status_count;
"""
with driver.session() as session:
    result = session.run(query)
    rows = [dict(entry) for entry in result]
    df = pd.DataFrame(rows)
    print(df)

   Status_count
0         57695


# Add RT relationships

In [12]:
# Link each retweet (is_retweet = 1) to the original tweet (is_retweet = 0)
# whose id equals the retweet's retweeted_id.
query = """
MATCH (rt:Status)
WHERE rt.is_retweet = 1
MATCH (t:Status)
WHERE t.is_retweet = 0 AND rt.retweeted_id=t.id
MERGE (rt)-[:RT]->(t);
"""
# Execute the statement and report the update counters.
with driver.session() as session:
    print_result_summary(session.run(query).consume())

Created 49,246 relationships


# Add TWEETS relationships

In [13]:
# Connect every status to its author: the User whose id equals the
# status's user_id.
query = """
MATCH (t:Status)
MATCH (u:User)
WHERE t.user_id = u.id
MERGE (u)-[:TWEETS]->(t);
"""
# Execute the statement and report the update counters.
with driver.session() as session:
    print_result_summary(session.run(query).consume())

Created 57,218 relationships


# Spam label of a tweet

In [14]:
# Promote the is_spam = 1 property to a Spam label on the status node.
query = """
MATCH (s:Status)
WHERE s.is_spam=1
SET s:Spam
"""
# Execute the statement and report the update counters.
with driver.session() as session:
    print_result_summary(session.run(query).consume())

Added 248 labels


# A retweet of a spam tweet is also spam

In [15]:
# Any status with an RT edge pointing at a Spam node gets the Spam label too.
query = """
MATCH (s:Status)-[:RT]->(:Spam)
SET s:Spam
"""
# Execute the statement and report the update counters.
with driver.session() as session:
    print_result_summary(session.run(query).consume())

Added 1,419 labels


In [16]:
# Sanity check: total number of nodes carrying the Spam label.
query = """
MATCH (s:Spam)
RETURN COUNT(s) AS Spam_count
"""
with driver.session() as session:
    result = session.run(query)
    rows = [dict(entry) for entry in result]
    df = pd.DataFrame(rows)
    print(df)

   Spam_count
0        1667


# A spammer is a user who tweets spam

In [17]:
# Step 1: every user with at least one TWEETS edge starts as is_spammer = 0.
setNonSpammer = """
MATCH (u:User)-[:TWEETS]->()
SET u.is_spammer=0;
"""
# Step 2: users with a TWEETS edge to a Spam node are overwritten with 1.
setSpammer = """
MATCH (u:User)-[:TWEETS]->(:Spam)
SET u.is_spammer=1;
"""
# Order matters: the spammer pass must run after the blanket reset to 0.
with driver.session() as session:
    for statement in (setNonSpammer, setSpammer):
        print_result_summary(session.run(statement).consume())

Set 57,218 properties
Set 1,665 properties


In [18]:
# Shut down the driver opened at the top of the notebook once all work is done.
driver.close()