In [4]:
# Install Neo4j driver
!pip install neo4j pandas

import urllib.request
import pandas as pd
from neo4j import GraphDatabase
import time

# Download dataset
url = "https://snap.stanford.edu/data/wiki-Vote.txt.gz"
urllib.request.urlretrieve(url, "wiki-Vote.txt.gz")
!gunzip wiki-Vote.txt.gz

# Neo4j Aura credentials
URI = "neo4j+s://155cc552.databases.neo4j.io"
AUTH = ("neo4j", "MYPASS") #USED MY PASS HERE

driver = GraphDatabase.driver(URI, auth=AUTH)

gzip: wiki-Vote.txt already exists; do you wish to overwrite (y or n)? n
	not overwritten


In [5]:
# Prepare CSV files
edges = []
nodes = set()

with open('wiki-Vote.txt', 'r') as f:
    for line in f:
        if line.startswith('#'):
            continue
        parts = line.strip().split()
        if len(parts) == 2:
            src, dst = int(parts[0]), int(parts[1])
            edges.append({'src': src, 'dst': dst})
            nodes.add(src)
            nodes.add(dst)

# Create DataFrames
nodes_df = pd.DataFrame({'id': sorted(list(nodes))})
edges_df = pd.DataFrame(edges)

print(f"Nodes: {len(nodes_df)}")
print(f"Edges: {len(edges_df)}")

nodes_df.to_csv('nodes.csv', index=False)
edges_df.to_csv('edges.csv', index=False)

Nodes: 7115
Edges: 103689


In [12]:
###Load data in neo4j
'''
// Clear existing data
MATCH (n) DETACH DELETE n;

// Create unique constraint
CREATE CONSTRAINT user_id IF NOT EXISTS
FOR (u:User) REQUIRE u.id IS UNIQUE;

// Load nodes from Google Drive CSV
LOAD CSV WITH HEADERS FROM 'https://drive.google.com/uc?id=18kfz6J9MiTSLDF5s9wGG-RZ52uGFr7Vc&export=download' AS row
CREATE (:User {id: toInteger(row.id)});

// Load edges from Google Drive CSV
LOAD CSV WITH HEADERS FROM 'https://drive.google.com/uc?id=1mTbuUcUh2-3BNirwZWb6OpFEe1d5htBP&export=download' AS row
MATCH (src:User {id: toInteger(row.src)}), (dst:User {id: toInteger(row.dst)})
CREATE (src)-[:VOTED_FOR]->(dst);
'''

#USING THAT

def load_data(driver):
    with driver.session() as session:

        # Clear existing data
        print("Clearing old data...")
        session.run("MATCH (n) DETACH DELETE n")

        # Create unique constraint (safe)
        print("Creating unique constraint...")
        session.run("""
            CREATE CONSTRAINT user_id IF NOT EXISTS
            FOR (u:User) REQUIRE u.id IS UNIQUE
        """)

        # Load nodes using new transaction batching
        print("Loading nodes.csv...")
        session.run("""
            LOAD CSV WITH HEADERS FROM
            'https://drive.google.com/uc?id=18kfz6J9MiTSLDF5s9wGG-RZ52uGFr7Vc&export=download'
            AS row
            CALL (row) {
                CREATE (:User {id: toInteger(row.id)})
            } IN TRANSACTIONS OF 1000 ROWS
        """)

        # Load edges using new transaction batching
        print("Loading edges.csv...")
        session.run("""
            LOAD CSV WITH HEADERS FROM
            'https://drive.google.com/uc?id=1mTbuUcUh2-3BNirwZWb6OpFEe1d5htBP&export=download'
            AS row
            CALL {
                WITH row
                MATCH (src:User {id: toInteger(row.src)}),
                      (dst:User {id: toInteger(row.dst)})
                CREATE (src)-[:VOTED_FOR]->(dst)
            } IN TRANSACTIONS OF 1000 ROWS
        """)

        print("Data successfully loaded")

        print("Verifying data counts...")
        result = session.run("""
            MATCH (u:User)
            WITH count(u) AS userCount
            MATCH ()-[r:VOTED_FOR]->()
            RETURN userCount, count(r) AS relCount
        """)

        record = result.single()
        user_count = record["userCount"]
        rel_count = record["relCount"]
        print(f"Import summary → Users: {user_count}, Relationships: {rel_count}")

load_data(driver)



Clearing old data...
Creating unique constraint...
Loading nodes.csv...
Loading edges.csv...




Data successfully loaded
Verifying data counts...
Import summary → Users: 7115, Relationships: 103689


In [11]:
import random
from neo4j.exceptions import ServiceUnavailable, SessionExpired

def run_metrics(driver, sample_size=500):
    results = {}

    try:
        with driver.session() as session:
            # 1. Count nodes and edges
            print("1. Counting nodes and edges...")
            start = time.time()
            try:
                nodes_result = session.run("MATCH (n:User) RETURN count(n) AS cnt")
                edges_result = session.run("MATCH ()-[r:VOTED_FOR]->() RETURN count(r) AS cnt")
                nodes = nodes_result.single()['cnt']
                edges = edges_result.single()['cnt']
                results['basic_count'] = time.time() - start
                print(f"   Nodes: {nodes}, Edges: {edges}")
                print(f"   Time: {results['basic_count']:.3f}s")
            except Exception as e:
                print(f"   Error: {e}")
                return None

            if nodes == 0:
                print("   No nodes found")
                return results

            actual_sample_size = min(sample_size, nodes)

            # 2. Get sample nodes
            print(f"2. Sampling {actual_sample_size} nodes...")
            try:
                node_ids = [r['nodeId'] for r in session.run(
                    "MATCH (n:User) RETURN n.id AS nodeId LIMIT $limit",
                    limit=actual_sample_size
                )]
                print(f"   Sampled {len(node_ids)} nodes")
            except Exception as e:
                print(f"   Error: {e}")
                node_ids = []

            if not node_ids:
                return results

            # 3. Weakly Connected Components
            print("3. Computing WCC (approx)...")
            start = time.time()
            max_component_size = 0
            processed = 0

            for nid in node_ids[:50]:
                try:
                    comp_size = session.run("""
                        MATCH (n:User {id: $nid})
                        CALL apoc.path.subgraphNodes(n, {relationshipFilter:'VOTED_FOR|<VOTED_FOR', maxNodes: 1000})
                        YIELD node
                        RETURN count(node) AS size
                    """, nid=nid).single()['size']
                    if comp_size > max_component_size:
                        max_component_size = comp_size
                    processed += 1
                except Exception as e:
                    continue

            results['wcc'] = time.time() - start
            print(f"   Largest WCC: {max_component_size} nodes")
            print(f"   Time: {results['wcc']:.3f}s")

            # 4. Strongly Connected Components
            print("4. Computing SCC (approx)...")
            start = time.time()
            max_scc_size = 0
            scc_sample = random.sample(node_ids, min(20, len(node_ids)))
            processed = 0

            for nid in scc_sample:
                try:
                    scc_result = session.run("""
                        MATCH (n:User {id: $nid})
                        CALL apoc.path.subgraphAll(n, {
                            relationshipFilter: 'VOTED_FOR',
                            maxNodes: 500
                        }) YIELD nodes
                        RETURN size(nodes) AS size
                    """, nid=nid)
                    scc_size = scc_result.single()['size']
                    if scc_size > max_scc_size:
                        max_scc_size = scc_size
                    processed += 1
                except Exception as e:
                    continue

            results['scc'] = time.time() - start
            print(f"   Largest SCC: {max_scc_size} nodes")
            print(f"   Time: {results['scc']:.3f}s")

            # 5. Triangle count
            print("5. Computing triangle counts...")
            start = time.time()
            try:
                triangle_result = session.run("""
                    MATCH (a:User)-[:VOTED_FOR]->(b:User)-[:VOTED_FOR]->(c:User)-[:VOTED_FOR]->(a:User)
                    RETURN count(*) AS tri_count
                """)
                triangle_count = triangle_result.single()['tri_count']
                results['triangles'] = time.time() - start
                print(f"   Triangles: {triangle_count}")
                print(f"   Time: {results['triangles']:.3f}s")
            except Exception as e:
                print(f"   Error: {e}")
                results['triangles'] = time.time() - start

            # 6. Clustering coefficient - fixed
            print("6. Computing Clustering Coefficient...")
            start = time.time()
            try:
                cc_result = session.run("""
                    MATCH (n:User)
                    WITH n LIMIT 50
                    MATCH (n)-[:VOTED_FOR]-(m)
                    WITH n, collect(m) AS neighbors
                    WHERE size(neighbors) >= 2
                    UNWIND range(0, size(neighbors)-2) AS i
                    UNWIND range(i+1, size(neighbors)-1) AS j
                    WITH n, neighbors, neighbors[i] AS n1, neighbors[j] AS n2
                    WHERE (n1)-[:VOTED_FOR]-(n2)
                    WITH n, count(*) AS connected_pairs, size(neighbors) AS k
                    RETURN avg(2.0 * connected_pairs / (k * (k-1))) AS avg_cc
                """)
                avg_cc = cc_result.single()['avg_cc'] or 0
                results['clustering'] = time.time() - start
                print(f"   Avg CC: {avg_cc:.4f}")
                print(f"   Time: {results['clustering']:.3f}s")
            except Exception as e:
                print(f"   Error: {e}")
                results['clustering'] = time.time() - start

            # 7. Diameter approximation - fixed
            print("7. Computing Diameter (approx)...")
            start = time.time()
            try:
                diameter_result = session.run("""
                    MATCH (start:User)
                    WITH start LIMIT 10
                    MATCH (end:User)
                    WITH start, end LIMIT 50
                    WHERE start <> end
                    MATCH path = shortestPath((start)-[:VOTED_FOR*]-(end))
                    WHERE path IS NOT NULL
                    RETURN max(length(path)) AS max_diameter
                """)
                diameter = diameter_result.single()['max_diameter'] or 0
                results['diameter'] = time.time() - start
                print(f"   Diameter: {diameter}")
                print(f"   Time: {results['diameter']:.3f}s")
            except Exception as e:
                print(f"   Error: {e}")
                # Alternative diameter approach
                try:
                    diameter_result = session.run("""
                        MATCH (n:User)
                        WITH n LIMIT 20
                        MATCH (n)-[:VOTED_FOR*1..10]-(m:User)
                        WHERE n <> m
                        RETURN max(length(path)) AS max_diameter
                    """)
                    diameter = diameter_result.single()['max_diameter'] or 0
                    results['diameter'] = time.time() - start
                    print(f"   Diameter (alt): {diameter}")
                    print(f"   Time: {results['diameter']:.3f}s")
                except:
                    results['diameter'] = time.time() - start

        return results

    except ServiceUnavailable as e:
        print(f"Service unavailable: {e}")
        return None
    except Exception as e:
        print(f"Error: {e}")
        return None

def test_conn(driver):
    try:
        with driver.session() as session:
            result = session.run("RETURN 1 AS test")
            return result.single()['test'] == 1
    except Exception as e:
        print(f"Connection failed: {e}")
        return False

print("Testing connection...")
if test_conn(driver):
    print("Connected! Running metrics...")
    results = run_metrics(driver)

    if results:
        print("=" * 50)
        print("RESULTS SUMMARY")
        print("=" * 50)
        for metric, exec_time in results.items():
            print(f"{metric:20s}: {exec_time:.3f}s")
    else:
        print("Failed to compute metrics")
else:
    print("Cannot connect to Neo4j")

Testing connection...
Connected! Running metrics...
1. Counting nodes and edges...
   Nodes: 7115, Edges: 103689
   Time: 0.584s
2. Sampling 500 nodes...
   Sampled 500 nodes
3. Computing WCC (approx)...
   Largest WCC: 7066 nodes
   Time: 17.542s
4. Computing SCC (approx)...
   Largest SCC: 7066 nodes
   Time: 7.249s
5. Computing triangle counts...
   Triangles: 131925
   Time: 1.444s
6. Computing Clustering Coefficient...
   Avg CC: 0.1611
   Time: 2.465s
7. Computing Diameter (approx)...
   Diameter: 2
   Time: 0.450s
RESULTS SUMMARY
basic_count         : 0.584s
wcc                 : 17.542s
scc                 : 7.249s
triangles           : 1.444s
clustering          : 2.465s
diameter            : 0.450s
