In [None]:
import pandas as pd

In [None]:
# from neo4j import GraphDatabase

In [None]:
from py2neo import Graph, Node, Relationship

In [None]:
from py2neo.bulk import merge_nodes, create_nodes, create_relationships

In [None]:
import gc
import os
from math import ceil
from time import time

## Connect to graph

In [None]:
# Connection settings for the Neo4j instance. Each value can be overridden with
# an environment variable so real credentials never need to be saved in the
# notebook; the fallbacks are the local development values.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
pword = os.environ.get("NEO4J_PASSWORD", "localadmin")

In [None]:
graph = Graph(uri, user=user, password=pword)

## Load up judges and add them to the graph

In [None]:
jdf = pd.read_csv('../data/judges_clean.csv')

In [None]:
jdf["start_date"] = pd.to_datetime(jdf["start_date"])
jdf["end_date"] = pd.to_datetime(jdf["end_date"])

In [None]:
jdf.describe()

In [None]:
jdf.head(n=10)

In [None]:
"" if pd.isnull(jdf.iloc[3]["end_date"]) else jdf.iloc[3]["end_date"].strftime("%Y-%m-%d")

In [None]:
def manual_commit_judges(start_row=0, end_row=None):
    """Create ``Judge`` nodes from a positional slice of the global ``jdf`` frame.

    Parameters
    ----------
    start_row : int
        Position of the first row of ``jdf`` to load.
    end_row : int or None
        Position one past the last row to load. ``None`` (the default) means
        "through the end of the frame"; it is resolved when the function is
        called, so it stays correct if ``jdf`` is reloaded after this cell ran.

    Each selected row becomes one node with the properties ``judge_id``,
    ``judge_position``, ``judge_female``, ``judge_start`` and ``judge_end``.
    Dates are stored as ``YYYY-MM-DD`` strings; a missing date is stored as "".
    """
    def _format_date(value):
        # NaT has no usable strftime, so missing dates are written as "".
        return "" if pd.isnull(value) else value.strftime("%Y-%m-%d")

    keys = ["judge_id", "judge_position", "judge_female", "judge_start", "judge_end"]
    data = [
        [
            row["ddl_judge_id"],
            row["judge_position"],
            row["female_judge"],
            _format_date(row["start_date"]),
            _format_date(row["end_date"]),
        ]
        for _, row in jdf[start_row:end_row].iterrows()
    ]
    # One auto-commit transaction for the whole slice.
    create_nodes(graph.auto(), data, labels={"Judge"}, keys=keys)

In [None]:
def load_judges(batch_size=10000, delete_first=False):
    """Load every row of the global ``jdf`` frame into the graph in batches.

    Parameters
    ----------
    batch_size : int
        Number of rows handed to ``manual_commit_judges`` per call.
    delete_first : bool
        When true, all existing ``Judge`` nodes (and their relationships) are
        deleted before loading starts.

    Prints a progress line per batch and the final ``Judge`` node count.
    """
    # Being cautious about memory, given the size of what is being passed around.
    gc.collect()

    if delete_first:
        graph.run("match (n: Judge) detach delete n")

    total_batches = ceil(len(jdf) / batch_size)
    batch_number = 0
    while batch_number < total_batches:
        print("Adding judges, batch: ", batch_number)
        first_row = batch_number * batch_size
        manual_commit_judges(start_row=first_row, end_row=first_row + batch_size)
        batch_number += 1

    judge_total = graph.nodes.match("Judge").count()
    print("Completed, number judges: ", judge_total)

In [None]:
# Not necessary, but this would be the Cypher analogue of the Python loader above.
# The last alias in the WITH clause must not be followed by a comma, and RETURN
# may only name aliases that the WITH clause declares.
load_judges_query = """
    load csv with headers from "file:///judges_clean.csv" as row
    with
        toInteger(row.ddl_judge_id) as judge_id,
        row.judge_position as judge_position,
        toInteger(row.state_code) as judge_state_code,
        toInteger(row.dist_code) as judge_dist_code,
        toInteger(row.court_no) as judge_court_no,
        row.female_judge as judge_female,
        apoc.date.parse(row.start_date, "ms", "dd-MM-yyyy") as judgeStartMs,
        apoc.date.parse(row.end_date, "ms", "dd-MM-yyyy") as judgeEndMs
    return judge_id, judge_position, judge_state_code, judge_dist_code, judge_court_no, judge_female limit 10;
"""

In [None]:
# _very_ necessary
# graph.run("CREATE CONSTRAINT idx_judge_id ON (judge:Judge) ASSERT judge.judge_id IS UNIQUE")

## Now load some cases

In [None]:
# Open the 2018 cases file as an iterator and pull only the first million rows.
# The chunk size is a row count, so it is passed as an int rather than a float.
case_reader = pd.read_csv('../data/cases/cases_2018.csv', iterator=True)
first_cases = case_reader.get_chunk(1_000_000)

In [None]:
first_cases.head()

In [None]:
cdf = first_cases
cdf["date_of_filing"] = pd.to_datetime(cdf["date_of_filing"])
cdf["date_of_decision"] = pd.to_datetime(cdf["date_of_decision"])

In [None]:
def manual_commit_cases(df, start_row=0, end_row=None):
    """Create ``Case`` nodes from a positional slice of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cases frame; the code reads the columns ``ddl_case_id``, ``year``,
        ``state_code``, ``dist_code``, ``court_no``, ``judge_position``,
        ``date_of_filing`` and ``date_of_decision``.
    start_row : int
        Position of the first row of ``df`` to load.
    end_row : int or None
        Position one past the last row to load. ``None`` (the default) means
        "through the end of ``df``". The bound is taken from the frame that is
        actually passed in, not from the judges frame.

    Dates are stored as ``YYYY-MM-DD`` strings; a missing date is stored as "".
    """
    def _format_date(value):
        # NaT has no usable strftime, so missing dates are written as "".
        return "" if pd.isnull(value) else value.strftime("%Y-%m-%d")

    keys = ["case_id", "year", "state_code", "dist_code", "court_no", "judge_position", "date_of_filing", "date_of_decision"]
    data = [
        [
            row["ddl_case_id"],
            row["year"],
            row["state_code"],
            row["dist_code"],
            row["court_no"],
            row["judge_position"],
            _format_date(row["date_of_filing"]),
            _format_date(row["date_of_decision"]),
        ]
        for _, row in df[start_row:end_row].iterrows()
    ]
    # One auto-commit transaction for the whole slice.
    create_nodes(graph.auto(), data, labels={"Case"}, keys=keys)

In [None]:
def commit_cases_df(cdf, batch_size=50000, delete_first=False):
    """Load every row of ``cdf`` into the graph as ``Case`` nodes, in batches.

    Parameters
    ----------
    cdf : pandas.DataFrame
        Cases frame, forwarded unchanged to ``manual_commit_cases``.
    batch_size : int
        Number of rows handed to ``manual_commit_cases`` per call.
    delete_first : bool
        When true, all existing ``Case`` nodes (and their relationships) are
        deleted before loading starts.
    """
    gc.collect()

    if delete_first:
        graph.run("match (n: Case) detach delete n")

    batch_count = ceil(len(cdf) / batch_size)
    for batch_number in range(batch_count):
        print("Loading cases, batch: ", batch_number)
        lower = batch_number * batch_size
        upper = lower + batch_size
        manual_commit_cases(cdf, start_row=lower, end_row=upper)

In [None]:
# NB
# graph.run("CREATE CONSTRAINT idx_case_id ON (case:Case) ASSERT case.case_id IS UNIQUE")

## Heavy lift: judge-case relationships

In [None]:
# time to add some relationships
df = pd.read_csv('../data/keys/judge_case_merge_key.csv')

In [None]:
len(df)

In [None]:
sdf = df[df.ddl_case_id.isin(cdf.ddl_case_id)]

In [None]:
# free up memory for what comes next
# del df
# del cdf
gc.collect()

In [None]:
sdf.head()

In [None]:
number_relationships = len(sdf)

In [None]:
major_batch_size = 10000
minor_batch_size = 20
number_in_graph = graph.run("match ()-[r:JUDGED]->() return count(r) as count").evaluate()

In [None]:
# Walk through sdf in major batches, each sent as many small transactions.
# The cursor starts at the number of JUDGED relationships already stored, so an
# interrupted load picks up roughly where it stopped, and then advances by the
# rows actually consumed. It is deliberately not re-derived from the graph count
# on every pass: the endpoints are looked up by key, so a row whose judge or
# case node is not found adds nothing, the count can stall, the same rows would
# be sent again as duplicates, and the loop might never finish.
start_index = number_in_graph
while start_index < number_relationships:
    end_index = start_index + major_batch_size
    print(f"Adding relationships from {start_index} to {end_index}")
    data = [
        (row["ddl_filing_judge_id"], {"type": "FILING_JUDGE"}, row["ddl_case_id"])
        for _, row in sdf[start_index:end_index].iterrows()
    ]

    for i in range(ceil(len(data) / minor_batch_size)):
        # One dot per ten minor batches as a lightweight progress indicator.
        if i % 10 == 0:
            print(".", end="")

        create_relationships(graph.auto(), data[i * minor_batch_size:(i + 1) * minor_batch_size],
                             "JUDGED", start_node_key=("Judge", "judge_id"), end_node_key=("Case", "case_id"))

    # Move past the rows just processed regardless of how many matched.
    start_index = end_index
    number_in_graph = graph.run("match ()-[r:JUDGED]->() return count(r) as count").evaluate()
    print("Completed a major addition, number in graph now: ", number_in_graph)

print("Complete! Number in graph: ", number_in_graph)

In [None]:
number_in_graph = graph.run("match ()-[r:JUDGED]->() return count(r) as count").evaluate()

In [None]:
print("In graph: ", number_in_graph, " and in frame: ", len(sdf))

## Load in the acts and sections

In [None]:
def add_acts_to_graph(df, row_keys, node_keys, start_row=0, end_row=None):
    """Create ``Act`` nodes from a positional slice of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one act per row.
    row_keys : list
        Column names to read from each row, in order.
    node_keys : list
        Property names given to the node, in the same order as ``row_keys``.
    start_row, end_row : int or None
        Bounds of the positional slice of ``df`` to load.
    """
    selected_rows = df[start_row:end_row]
    data = []
    for _, record in selected_rows.iterrows():
        data.append([record[column] for column in row_keys])
    print("Assembled list, adding to graph")
    create_nodes(graph.auto(), data, labels={"Act"}, keys=node_keys)

In [None]:
acts = pd.read_csv('../data/keys/act_key.csv')
acts = acts[3:] # first rows are NA and ' and "
acts.head()

In [None]:
act_df_keys = ["act", "count", "act_s"]
node_keys = ["act_id", "total_count", "act_sum"]

In [None]:
add_acts_to_graph(df=acts, row_keys=act_df_keys, node_keys=node_keys)

In [None]:
len(acts)

In [None]:
# 
# create index idx_act_id for (a:Act) on (a.act_id)

In [None]:
# Spelling variants of the Code of Criminal Procedure (presumably as they occur
# in the act data -- TODO confirm against act_key.csv).
# There are _a lot_ of these and all of them are central, so they distort
# things; the intent is to remove them.
# NOTE(review): no visible cell applies this list yet -- confirm where the
# removal is meant to happen.
criminal_procedure_variants = [
    "CODE OF CRIMINAL PROCEDURE, 1973",
    "Code of Criminal Procedure, 1973",
    "Code of Criminal Procedure 1973",
    "CODE OF CRIMINAL PROCEDURE",
    "Criminal Procedure Code",
    "Code of Criminal Procedure, 1973 1974",
    "CodeofCriminalProcedure",
    "Cr.P.C. ",
    "Code of Criminal Procedure",
    "2.Code of Criminal Procedure, 1973",
    "Cr.P.C.",
    "Cr.P.C",
    "Cr.P.c",
    "CR.P.C"
]

## Now do act-section relationships

In [None]:
# Window of acts_sections.csv to read: how many rows to skip first, then how
# many rows to load. Both are row counts, so both are ints.
read_start = 0
number_rows = 5_000_000

In [None]:
# Skip only data rows (file lines 1..read_start) so that the header on line 0
# always supplies the column names; a bare integer skiprows would discard the
# header as well once read_start is greater than zero.
act_case_df = pd.read_csv(
    '../data/acts_sections.csv',
    nrows=int(number_rows),
    skiprows=range(1, int(read_start) + 1),
)

In [None]:
act_case_df.head()

In [None]:
adf = act_case_df[act_case_df.ddl_case_id.isin(cdf.ddl_case_id)]

In [None]:
def count_rels(reltype):
    """Return the number of relationships of type ``reltype`` in the graph.

    ``reltype`` is interpolated directly into the query text, so only pass
    trusted relationship type names.
    """
    return graph.run(f"match ()-[r:{reltype}]->() return count(r) as count").evaluate()

In [None]:
print('Number of acts in DF: ', len(adf))
print('Number of relationships: ', count_rels('USES_ACT'))

In [None]:
def add_batch_act_relationships(relationship_type="USES_ACT", minor_batch_size=20, major_batch_size=1000, offset=0):
    """Add one major batch of Case -> Act relationships from the global ``adf``.

    The batch starts at the row position equal to the number of relationships
    of ``relationship_type`` already in the graph, plus ``offset``, and covers
    ``major_batch_size`` rows. The rows are sent in transactions of
    ``minor_batch_size`` relationships each.

    Parameters
    ----------
    relationship_type : str
        Relationship type to create; also stored in each relationship's
        ``type`` property.
    minor_batch_size : int
        Relationships per transaction.
    major_batch_size : int
        Rows of ``adf`` handled by one call.
    offset : int
        Extra rows to skip on top of the stored relationship count. Useful when
        the count lags behind the frame position, for example because earlier
        rows created no relationship. The default of 0 starts exactly at the
        stored count.
    """
    start_index = count_rels(relationship_type) + offset
    end_index = start_index + major_batch_size
    print(f"Adding relationships of type {relationship_type} from {start_index} to {end_index}")
    data = [
        (row["ddl_case_id"], {"type": relationship_type}, row["act"])
        for _, row in adf[start_index:end_index].iterrows()
    ]

    start_time = time()
    for i in range(ceil(len(data) / minor_batch_size)):
        # Lightweight progress output: a dot every other minor batch and the
        # elapsed whole seconds every fifth.
        if i % 2 == 0:
            print(".", end="")

        if i % 5 == 0:
            print("Elapsed time: ", int(time() - start_time))

        create_relationships(graph.auto(),
                             data[i * minor_batch_size:(i + 1) * minor_batch_size],
                             relationship_type,
                             start_node_key=("Case", "case_id"),
                             end_node_key=("Act", "act_id"))

    number_in_graph = count_rels(relationship_type)
    print("Completed a major addition, number in graph now: ", number_in_graph, " took: ", time() - start_time)

In [None]:
add_batch_act_relationships(minor_batch_size=1000, major_batch_size=100000)