In [1]:
import os
import pandas as pd
import numpy as np
from neo4j import Query, GraphDatabase, RoutingControl, Result # Python database driver 5.13 +

## Database connection

In [3]:
# Connection settings. Each value can be overridden with an environment
# variable so real credentials never need to be saved in the notebook; the
# fallbacks are the local development defaults.
# NOTE: "DB_ULR" is a misspelling of "DB_URL"; the name is kept because the
# driver cell below refers to it.
DB_ULR = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
DB_USER = os.environ.get("NEO4J_USERNAME", "neo4j")
DB_PASS = os.environ.get("NEO4J_PASSWORD", "test1234")  # local dev default only
# Has to be "neo4j" on Neo4j Aura; kept configurable for testing on a local dev env.
DB_NAME = os.environ.get("NEO4J_DATABASE", "imdb")

In [4]:
# One driver object is shared by every query cell below.
driver = GraphDatabase.driver(
    DB_ULR,
    auth=(DB_USER, DB_PASS),
)
# Check the connection up front instead of discovering a problem in the first query.
driver.verify_connectivity()

## Utility functions

In [9]:
## Utility
def split_dataframe(df, chunk_size = 5000):
    """Split a DataFrame into consecutive, order-preserving row chunks.

    Args:
        df: DataFrame to split. Rows are selected by position.
        chunk_size: Maximum number of rows per chunk; must be a positive integer.

    Returns:
        A list of DataFrame slices. Every chunk holds ``chunk_size`` rows
        except possibly the last one. An empty ``df`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # Walking the start offsets (instead of computing len(df) // chunk_size + 1)
    # avoids emitting an empty trailing chunk when len(df) is an exact multiple
    # of chunk_size, which would otherwise cost callers a pointless extra batch.
    return [
        df.iloc[start:start + chunk_size]
        for start in range(0, len(df), chunk_size)
    ]

## Get random movies

In [25]:
# Sample Title ids in random order (capped at 1M by the query's LIMIT) and
# load them into a DataFrame with a single `id` column.
df_movies = driver.execute_query(
        ''' 
            match (t:Title)
            return t.id as id
            order by rand() limit 1_000_000
        ''',
        result_transformer_=Result.to_df,  # turn the result stream straight into a DataFrame
        routing_=RoutingControl.READ,
        database_=DB_NAME,
)
df_movies.head()

Unnamed: 0,id
0,tt0845853
1,tt31407163
2,tt9695204
3,tt3950610
4,tt2394166


## 1 Request at a time (1M transactions)

In [24]:
# Baseline: every query carries a single id, so each lookup is its own
# execute_query call.
for chunk in split_dataframe(df_movies, chunk_size=1):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            match (t:Title{id:row.id})       
            return t{.primaryTitle}
        ''',
        # $rows receives the chunk as a list of {"id": ...} dicts.
        parameters_={"rows": chunk.to_dict(orient="records")},
        routing_=RoutingControl.READ,
        database_=DB_NAME,
    )

## Batch size 10 (100k transactions)

In [20]:
# Ten ids per query: with the full 1M-row sample this loop runs about 100k times.
for chunk in split_dataframe(df_movies, chunk_size=10):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            match (t:Title{id:row.id})       
            return t{.primaryTitle}
        ''',
        # $rows receives up to 10 {"id": ...} dicts per call.
        parameters_={"rows": chunk.to_dict(orient="records")},
        routing_=RoutingControl.READ,
        database_=DB_NAME,
    )

## Batch size 100k (10 transactions)

In [22]:
# Large batches: up to 100k ids are sent with each query, so the whole sample
# is covered by only a handful of execute_query calls.
for chunk in split_dataframe(df_movies, chunk_size=100_000):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            match (t:Title{id:row.id})       
            return t{.primaryTitle}
        ''',
        # $rows receives the chunk as a list of {"id": ...} dicts.
        parameters_={"rows": chunk.to_dict(orient="records")},
        routing_=RoutingControl.READ,
        database_=DB_NAME,
    )

In [23]:
# Same 100k batching, but with a heavier two-hop pattern: from each sampled
# title, back along an ACTED_IN/DIRECTED/PRODUCED relationship and out again
# along another one. Each call returns a single count of the matched paths.
for chunk in split_dataframe(df_movies, chunk_size=100_000):
    records, summary, keys = driver.execute_query(
        ''' 
            unwind $rows as row
            match (t:Title{id:row.id})<-[:ACTED_IN|DIRECTED|PRODUCED]-()-[:ACTED_IN|DIRECTED|PRODUCED]->()
            return count(*) as network_size
        ''',
        # $rows receives the chunk as a list of {"id": ...} dicts.
        parameters_={"rows": chunk.to_dict(orient="records")},
        routing_=RoutingControl.READ,
        database_=DB_NAME,
    )