In [None]:
import pandas as pd

In [None]:
from py2neo import Graph, Node, Relationship

In [None]:
from py2neo.bulk import merge_nodes, create_nodes, create_relationships, merge_relationships

In [None]:
from math import ceil
from time import time
import gc
import subprocess

In [None]:
from lg_utils import connect_to_graph, count_nodes, count_rels, load_csv, add_csv_to_graph, add_relationships

In [None]:
# Handle returned by lg_utils.connect_to_graph(); every later cell passes it as `graph`.
# NOTE(review): connection details (URI, credentials) are not visible here — confirm they
# are read from configuration rather than hardcoded inside lg_utils.
graph = connect_to_graph()

## Initial counts

In [None]:
# Snapshot of what is already in the graph before any loading step runs.
# Node counts, one per label.
number_judges = count_nodes(graph, "Judge")
number_cases = count_nodes(graph, "Case")
number_acts = count_nodes(graph, "Act")

# Relationship counts, one per relationship type.
number_judge_rel = count_rels(graph, "JUDGED")
number_act_rel = count_rels(graph, "USES_ACT")

# The "cases" slot must report number_cases, not the judge count.
print(
    f"In graph: judges: {number_judges}, cases: {number_cases}, acts: {number_acts}, "
    f"judge-case rels: {number_judge_rel}, case-act rels: {number_act_rel}"
)

## Load judges and add them to the graph

In [None]:
# Small preview of the cleaned judges CSV; in this notebook `jdf` is only displayed below.
# NOTE(review): number_rows=10 presumably limits the read to a 10-row sample, and
# datetime_keys presumably names the columns to parse as dates — confirm in lg_utils.load_csv.
jdf = load_csv('../data/judges_clean.csv', datetime_keys=["start_date", "end_date"], number_rows=10)

In [None]:
# Show the first rows of the judges preview to check column names and parsed values.
jdf.head()

In [None]:
# Judge node load.
# judge_graph_properties are the property names written on each Judge node and
# judge_df_keys are the CSV columns they are read from.
# NOTE(review): the two lists are presumably paired position-by-position inside
# add_csv_to_graph — confirm before reordering either one.
judge_graph_properties = [
    "judge_id",
    "judge_position",
    "judge_female",
    "judge_start",
    "judge_end",
]
judge_df_keys = [
    "ddl_judge_id",
    "judge_position",
    "female_judge",
    "start_date_str",
    "end_date_str",
]
judge_index = dict(index_name="idx_judge_id", index_key="judge_id")

# Guard flag: the load only runs when this is switched to True by hand.
add_judges = False
if add_judges:
    add_csv_to_graph(
        graph=graph,
        label="Judge",
        csv_file="../data/judges_clean.csv",
        merge_key=("Judge", "judge_id"),
        graph_keys=judge_graph_properties,
        df_keys=judge_df_keys,
        index_to_add=judge_index,
    )

## Load cases

In [None]:
# Small preview of the 2018 cases CSV; in this notebook `cdf` is only displayed below.
# NOTE(review): number_rows=10 presumably limits the read to a 10-row sample, and
# datetime_keys presumably names the columns to parse as dates — confirm in lg_utils.load_csv.
cdf = load_csv('../data/cases/cases_2018.csv', datetime_keys=["date_of_filing", "date_of_decision"], number_rows=10)

In [None]:
# Show the first rows of the cases preview to check column names and parsed values.
cdf.head()

In [None]:
# Case node load.
# case_graph_properties are the property names written on each Case node;
# df_column_keys are the CSV columns they are read from (e.g. the node property
# "case_id" comes from the "ddl_case_id" column).
case_graph_properties = [
    "case_id", "year", "state_code", "dist_code", "court_no", "judge_position", "date_of_filing", "date_of_decision"
]
df_column_keys = ["ddl_case_id", "year", "state_code", "dist_code", "court_no", "judge_position", "date_of_filing_str", "date_of_decision_str"]

add_cases = False  # set to True to run the load
if add_cases:
    add_csv_to_graph(
        graph=graph,
        csv_file="../data/cases/cases_2018.csv",
        label="Case",
        merge_key=("Case", "case_id"),
        graph_keys=case_graph_properties,
        # Pass the CSV column names explicitly, as the Judge and Act loads do; without
        # them the graph property names do not match the CSV headers.
        df_keys=df_column_keys,
        index_to_add={"index_name": "idx_case_id", "index_key": "case_id"},
    )

## Wire up judge-case relationships

In [None]:
# Case ids present in the 2018 cases file; later cells pass this column to add_relationships.
all_case_ids = pd.read_csv('../data/cases/cases_2018.csv', usecols=['ddl_case_id'])

# NOTE(review): count_total_rows is not imported or defined anywhere in the visible
# notebook, so this raises NameError on a fresh kernel — presumably it belongs in the
# lg_utils import; confirm and add it there.
number_relationships = count_total_rows('../data/judge_case_merge_key.csv')
# NOTE(review): this counts "JUDGED" relationships, while the judge-case load in this
# notebook passes relationship_type="FILING" — confirm which type name is intended.
number_in_graph = count_rels(graph, "JUDGED")
print("Number relationships in data: ", number_relationships, " and in graph: ", number_in_graph)

In [None]:
# Judge -> Case relationship load.
# graph_rel_keys lists (label, property) for the two endpoint node types, and rel_df_keys
# lists the join-CSV columns holding the matching ids.
# NOTE(review): the two lists are presumably paired position-by-position — confirm in
# lg_utils.add_relationships.
graph_rel_keys = [("Judge", "judge_id"), ("Case", "case_id")]
rel_df_keys = ["ddl_filing_judge_id", "ddl_case_id"]
# Passed as prop_dict below; presumably stored as a property on each created relationship.
prop_dict = { "type": "FILING_JUDGE" }

# Guard flag: the load only runs when this is switched to True by hand.
add_judge_rels = False
if add_judge_rels:
    # NOTE(review): relationship_type is "FILING" here, but every count in this notebook
    # queries "JUDGED" — confirm which name is intended.
    # NOTE(review): this call uses existence_id_series / existence_id_key, while the
    # case-act load uses target_id_series / target_id_key — confirm which keyword names
    # add_relationships actually accepts.
    # NOTE(review): df_read_start is only assigned in a later cell, so on a fresh
    # top-to-bottom run it is undefined at this point.
    add_relationships(
        graph=graph, 
        join_csv_file='../data/judge_case_merge_key.csv', 
        relationship_type="FILING",
        existence_id_series=all_case_ids.ddl_case_id, 
        existence_id_key="ddl_case_id",                     
        graph_keys=graph_rel_keys, 
        rel_keys=rel_df_keys, 
        prop_dict=prop_dict,
        df_start=df_read_start)

In [None]:
# Re-count "JUDGED" relationships after the (optional) judge-case load; this rebinds
# number_in_graph, which a later cell reads.
number_in_graph = count_rels(graph, "JUDGED")
print("Number in graph: ", number_in_graph)

## Load in the acts and sections

In [None]:
# Act node load.
# act_properties are the property names written on each Act node and act_df_keys are
# the CSV columns they are read from.
act_properties = ["act_id", "total_count", "act_sum"]
act_df_keys = ["act", "count", "act_s"]
act_index = {"index_name": "idx_act_id", "index_key": "act_id"}

# Guard flag: the load only runs when this is switched to True by hand.
add_acts = False
max_iter = None  # forwarded to add_csv_to_graph unchanged
if add_acts:
    acts_raw = pd.read_csv('../data/keys/act_key.csv')
    # Drop the first three rows (NA, ' and ") and keep the result under its own name,
    # so the raw frame stays intact and the trimmed frame is the one written out.
    acts = acts_raw.iloc[3:]
    acts.to_csv('../data/acts.csv')

    add_csv_to_graph(
        graph=graph,
        csv_file="../data/acts.csv",
        label="Act",
        merge_key=("Act", "act_id"),
        graph_keys=act_properties,
        df_keys=act_df_keys,
        index_to_add=act_index,
        max_iter=max_iter
    )

In [None]:
# Spelling variants of the Code of Criminal Procedure as they appear in the act names.
# There are _a lot_ of these and they are all highly central, which distorts the graph,
# so they are collected here for removal.
# Entries are exact strings; note that "Cr.P.C. " (with a trailing space) is listed
# separately from "Cr.P.C.".
# NOTE(review): this list is not referenced by any other cell visible in this notebook —
# confirm where the removal actually happens.
criminal_procedure_variants = [
    "CODE OF CRIMINAL PROCEDURE, 1973",
    "Code of Criminal Procedure, 1973",
    "Code of Criminal Procedure 1973",
    "CODE OF CRIMINAL PROCEDURE",
    "Criminal Procedure Code",
    "Code of Criminal Procedure, 1973 1974",
    "CodeofCriminalProcedure",
    "Cr.P.C. ",
    "Code of Criminal Procedure",
    "2.Code of Criminal Procedure, 1973",
    "Cr.P.C.",
    "Cr.P.C",
    "Cr.P.c",
    "CR.P.C"
]

## Now do case-act relationships

In [None]:
# Compare the number of case-act rows in the source CSV with what is already in the graph.
# NOTE(review): count_total_rows is not imported or defined anywhere in the visible
# notebook — presumably it belongs in the lg_utils import; confirm and add it there.
total_act_section_rels = count_total_rows('../data/acts_sections.csv')
# count_rels takes the graph handle first, as in every other call in this notebook.
rels_in_graph = count_rels(graph, 'USES_ACT')
print('Number of total relationships: ', total_act_section_rels, ' and in graph: ', rels_in_graph)

In [None]:
# Starting offset handed to add_relationships as df_start (presumably the CSV row at
# which reading resumes — confirm in lg_utils).
# NOTE(review): this first assignment is overwritten two lines below, and at this point
# number_in_graph holds the "JUDGED" count, not the "USES_ACT" count — confirm intent.
df_read_start = number_in_graph
last_stop = 60020612 # offset where the previous run stopped; updated by hand for now
# NOTE(review): rows_per_df is not defined anywhere in the visible notebook, so this line
# raises NameError on a fresh kernel — confirm where it should come from.
df_read_start = int(last_stop + rows_per_df)

In [None]:
# Case -> Act relationship load.
# These two names are rebound here; the judge-case cell used them with different values.
# graph_rel_keys lists (label, property) for the two endpoint node types, and rel_df_keys
# lists the join-CSV columns holding the matching ids.
graph_rel_keys = [("Case", "case_id"), ("Act", "act_id")]
rel_df_keys = ["ddl_case_id", "act"]

# Guard flag: the load only runs when this is switched to True by hand.
add_case_act_rels = False
if add_case_act_rels:
    # NOTE(review): this call uses target_id_series / target_id_key, while the judge-case
    # load uses existence_id_series / existence_id_key — confirm which keyword names
    # add_relationships actually accepts.
    # df_start comes from the resume-offset cell above.
    add_relationships(
        graph=graph, 
        join_csv_file='../data/acts_sections.csv', 
        relationship_type="USES_ACT",
        target_id_series=all_case_ids.ddl_case_id, 
        target_id_key="ddl_case_id",                     
        graph_keys=graph_rel_keys, 
        rel_keys=rel_df_keys, 
        df_start=df_read_start)

## Loading remainder of entities and relationships

What we have:

* States and districts (for building subgraphs): state_code, dist_code
* Gender properties (likewise usable for subgraphs): female_defendant, female_adv_defendant, female_adv_pet
* Type (type_name), purpose (purpose_name), disposition (disp_name)
* Sections: the section file holds only the raw section text, _not_ the act each section belongs to, so that link will have to be reconstructed
* 