In [1]:
import pandas as pd
import spacy
import tqdm
import tqdm.notebook
from gensim.parsing.preprocessing import preprocess_string, DEFAULT_FILTERS



In [2]:
# Path to the case-body records; the next cell reads it as JSON Lines
# (one record per line).
fname = "../data/subset/casebody.json"

In [3]:
# Parse the JSON Lines file (one record per line) into a DataFrame.
casebody_data = pd.read_json(
    fname,
    orient="records",
    lines=True,
)

In [4]:
# Preview the first rows to check which columns were loaded.
casebody_data.head()

Unnamed: 0,id,judges,head_matter,corrections,opinions,attorneys,parties
0,1879972,[],Curtis HOWARD and Billy HOWARD v. STATE of Ark...,,"[{'text': 'George Rose Smith, Justice. The two...","[James M. Simpson, for appellant Curtis Howard...",
1,1360455,[],Empire Rice Mill Company v. Stone.\nOpinion de...,,"[{'text': 'McCullotjch, C. J. Appbllánt instit...","[Cooley & Adams and Rogers, Rarber ■& Henry, f...",[Empire Rice Mill Company v. Stone.]
2,1360511,[],Autrey v. State.\nOpinion delivered November 1...,,"[{'text': 'Hart, J. (after stating the facts)....","[G. G. Billings, for appellant., J. 8. Utley, ...",[Autrey v. State.]
3,1360553,[],Shaw v. State.\nOpinion delivered September 25...,,"[{'text': 'Hart, J. Gid Shaw prosecuted this a...","[M. S. Cobh, for appellant., J. 8. Utley, Atto...",[Shaw v. State.]
4,1360493,[],Mitchell v. State.\nOpinion delivered October ...,,"[{'text': 'Hart, J. Pearl Mitchell' prosecutes...","[Sullins é Ivie, for appellant., J. S. TJtley,...",[Mitchell v. State.]


In [5]:
# Load the spaCy pipeline; the export loop further down reads each token's
# part-of-speech tag and lemma from it.
nlp = spacy.load("en_core_web_sm")

In [7]:
def _join_opinion_texts(opinions):
    """Join the "text" of every opinion dict with single spaces.

    Returns an empty string when the cell does not hold a list of opinions
    (e.g. a missing value), and treats a missing or null "text" as empty.
    """
    if not isinstance(opinions, (list, tuple)):
        return ""
    return " ".join((opinion.get("text") or "") for opinion in opinions)


# One string per case: head matter, a newline, then all opinion texts.
# fillna("") keeps every element a string even when head_matter is null;
# without it the concatenation yields NaN for that row, and the spaCy
# pipeline downstream expects strings.
casetext = (
    casebody_data["head_matter"].fillna("")
    + "\n"
    + casebody_data["opinions"].map(_join_opinion_texts)
)


In [10]:
# Drop the DataFrame binding; casetext has already been derived from it
# (presumably done to release memory before the spaCy pass).
del casebody_data

In [8]:
# First 100 case texts, kept as a small sample. Note that the export loop
# below iterates over the full casetext, not over this sample.
sample_casetext = casetext[:100]

In [23]:
# Write one line per case to caselines.txt. Each line holds the lowercased
# lemmas of the alphabetic ADJ/ADV/NOUN/PRON/PROPN tokens, passed through
# gensim's DEFAULT_FILTERS except the last one, joined by single spaces.
#
# The file is opened with "w" because it is only written, and the encoding is
# pinned to UTF-8 because the case text contains non-ASCII characters (visible
# in the head() preview), which a platform-default codec may fail to encode.
with open("../data/subset/caselines.txt", "w", encoding="utf-8") as outfile:
    tags = frozenset(["ADJ", "ADV", "NOUN", "PRON", "PROPN"])
    docs = nlp.pipe(casetext, batch_size=10, n_process=-1)
    # tqdm.notebook.tqdm replaces the deprecated tqdm.tqdm_notebook alias.
    for doc in tqdm.notebook.tqdm(docs, total=len(casetext)):
        lemmas = " ".join(
            tok.lemma_.lower()
            for tok in doc
            if tok.pos_ in tags and tok.is_alpha
        )
        filtered = preprocess_string(lemmas, DEFAULT_FILTERS[:-1])
        outfile.write(" ".join(filtered) + "\n")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for doc in tqdm.tqdm_notebook(nlp.pipe(casetext,batch_size=10, n_process=-1), total=len(casetext)):


  0%|          | 0/265773 [00:00<?, ?it/s]

In [None]:
# Start a detached Neo4j container named caselaw-neo4j with the APOC and
# Graph Data Science plugins, exposing ports 7474 and 7687 and mounting the
# data, logs, import and plugins directories from ./neo4j.
#
# The password is read from $NEO4J_PASSWORD rather than being hard-coded in
# the notebook; export it before running (the command aborts with a message
# if the variable is unset). Bind-mount paths are quoted so a working
# directory containing spaces does not split the arguments.
#
# NOTE(review): neo4j:latest is unpinned, while the Cypher below uses
# `CREATE CONSTRAINT ... ON ... ASSERT` and `USING PERIODIC COMMIT`, which
# Neo4j 5 no longer accepts - consider pinning a 4.x tag; confirm.
docker run \
    --name caselaw-neo4j \
    -p7474:7474 -p7687:7687 \
    -d \
    -v "$PWD/neo4j/data:/data" \
    -v "$PWD/neo4j/logs:/logs" \
    -v "$PWD/neo4j/import:/var/lib/neo4j/import" \
    -v "$PWD/neo4j/plugins:/plugins" \
    -e NEO4J_apoc_export_file_enabled=true \
    -e NEO4J_apoc_import_file_enabled=true \
    -e NEO4J_apoc_import_file_use__neo4j__config=true \
    --env NEO4J_AUTH="neo4j/${NEO4J_PASSWORD:?set NEO4J_PASSWORD first}" \
    --env NEO4JLABS_PLUGINS='["apoc", "graph-data-science"]' \
    neo4j:latest
                            
                            
// Jurisdictions: enforce unique ids, then create one node per CSV row
// (name and url are only set when the node is first created).
CREATE CONSTRAINT jurisdictionId IF NOT EXISTS
ON (j:Jurisdiction) ASSERT j.id IS UNIQUE;

LOAD CSV WITH HEADERS FROM 'file:///jurisdictions.csv' AS row
WITH row
MERGE (j:Jurisdiction {id: toInteger(row.id)})
  ON CREATE SET
    j.name = row.name_long,
    j.url = row.url;

// Courts: enforce unique ids, then create one node per CSV row
// (name and url are only set when the node is first created).
CREATE CONSTRAINT courtId IF NOT EXISTS
ON (c:Court) ASSERT c.id IS UNIQUE;

LOAD CSV WITH HEADERS FROM 'file:///courts.csv' AS row
WITH row
MERGE (c:Court {id: toInteger(row.id)})
  ON CREATE SET
    c.name = row.name,
    c.url = row.url;

// Reporters: enforce unique ids, then create one node per CSV row
// (name and url are only set when the node is first created).
CREATE CONSTRAINT reporterId IF NOT EXISTS
ON (r:Reporter) ASSERT r.id IS UNIQUE;

LOAD CSV WITH HEADERS FROM 'file:///reporters.csv' AS row
WITH row
MERGE (r:Reporter {id: toInteger(row.id)})
  ON CREATE SET
    r.name = row.full_name,
    r.url = row.url;

// Volumes: enforce unique ids, then create one node per CSV row
// (barcode and url are only set when the node is first created).
// The url now comes from the CSV row; it was previously assigned from the
// node's own (still unset) url property, so it was never stored.
// NOTE(review): assumes volumes.csv has a `url` column like the other CSVs - confirm.
// NOTE(review): Volume.id is taken from row.volume_number here, while the
// cases loader matches Volume on row.volume_id - confirm both hold the same value.
CREATE CONSTRAINT volumeId IF NOT EXISTS on (vol:Volume) ASSERT vol.id IS UNIQUE;
LOAD CSV WITH HEADERS FROM 'file:///volumes.csv' AS row
WITH row
MERGE (volume:Volume {id: toInteger(row.volume_number)})
ON CREATE SET volume.barcode = row.barcode,
volume.url = row.url;

// Cases: enforce unique ids, then batch-load cases.csv and link each case to
// its reporter, court, volume and jurisdiction. Rows whose volume, court,
// reporter or jurisdiction cannot be MATCHed are skipped.
// ON CREATE SET applies to the MERGE it directly follows, so it sits on the
// Caselaw node MERGE: the properties are set when the node is created rather
// than when the last relationship happens to be created.
// The relationship type UNDER_JURISIDICTION keeps its existing spelling
// because the queries in this notebook match on that exact name.
CREATE CONSTRAINT caselawId IF NOT EXISTS on (cas:Caselaw) ASSERT cas.id IS UNIQUE;
:auto USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM 'file:///cases.csv' AS row
WITH row
MATCH (volume:Volume {id: toInteger(row.volume_id)})
MATCH (court:Court {id: toInteger(row.court_id)})
MATCH (reporter:Reporter {id: toInteger(row.reporter_id)})
MATCH (jurisdiction:Jurisdiction {id: toInteger(row.jurisdiction_id)})
MERGE (caselaw:Caselaw {id: toInteger(row.id)})
ON CREATE SET caselaw.decision_date = datetime(row.decision_date),
caselaw.name = row.name,
caselaw.url = row.url
MERGE (caselaw) -[:REPORTED_BY]->(reporter)
MERGE (caselaw) -[:HEARD_BY]->(court)
MERGE (caselaw) -[:IN_VOLUME]->(volume)
MERGE (caselaw) -[:UNDER_JURISIDICTION]->(jurisdiction);

:auto USING PERIODIC COMMIT 1000
// Citations: for every (src, dst) row whose two cases are both already
// loaded, ensure a CITED relationship from the src case to the dst case.
LOAD CSV WITH HEADERS FROM 'file:///citations.csv' AS row
WITH row
MATCH (citing:Caselaw {id: toInteger(row.src)})
MATCH (cited:Caselaw {id: toInteger(row.dst)})
MERGE (citing)-[:CITED]->(cited);





//Queries
// The 25 cases with the most distinct citing cases, together with the
// jurisdiction and court of each.
MATCH path = (jur:Jurisdiction)<-[:UNDER_JURISIDICTION]-(cited:Caselaw)<-[:CITED]-(citing),
      (cited)-[:HEARD_BY]->(court:Court)
RETURN jur.name AS Jur,
       court.name AS court,
       cited.name AS caselaw,
       count(DISTINCT citing) AS citations
ORDER BY citations DESC
LIMIT 25;



// Per jurisdiction, the first five entries of its cases collected after
// sorting by distinct citing cases (descending); only five jurisdictions are
// returned, taken in descending name order.
MATCH path = (jur:Jurisdiction)<-[:UNDER_JURISIDICTION]-(cited:Caselaw)<-[:CITED]-(citing)
WITH jur, cited, count(DISTINCT citing) AS citations
ORDER BY citations DESC
WITH jur.name AS Jur,
     collect({caselaw: cited.name, citations: citations}) AS top_k
RETURN Jur, top_k[0..5]
ORDER BY Jur DESC
LIMIT 5;



// Stream a degree score for every Caselaw node over the CITED relationships,
// projected with REVERSE orientation, and list case names by score (descending).
// NOTE(review): with the reversed projection the score presumably counts
// incoming citations (times cited) - confirm against the GDS documentation.
CALL gds.alpha.degree.stream({
  nodeProjection: 'Caselaw',
  relationshipProjection: {
    Cited: {
      type: 'CITED',
      orientation: 'REVERSE'
    }
  }
})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score AS citations
ORDER BY citations DESC


// Top cases by jurisdiction and court: for each (jurisdiction name, court
// name) pair, the first ten case ids collected after sorting cases by their
// number of distinct citing cases (descending).
MATCH path = (jur:Jurisdiction)<-[:UNDER_JURISIDICTION]-(cited:Caselaw)<-[:CITED]-(citing),
      (cited)-[:HEARD_BY]->(crt:Court)
WITH jur, cited, crt, count(DISTINCT citing) AS citations
ORDER BY citations DESC
WITH jur.name AS Jur,
     crt.name AS court,
     collect(cited.id) AS top_k
RETURN Jur, court, top_k[0..10]
ORDER BY Jur DESC;

// Count of cases by jurisdiction and court, largest first.
// count(DISTINCT c1) counts each Caselaw node once per (jurisdiction name,
// court name) pair, without first collecting ids into per-case lists.
MATCH (j1:Jurisdiction)<-[:UNDER_JURISIDICTION]-(c1:Caselaw)-[:HEARD_BY]->(co:Court)
WITH j1.name AS Jur, co.name AS court, count(DISTINCT c1) AS num_cases
RETURN Jur, court, num_cases
ORDER BY num_cases DESC;


// TODO: given a single state, find the states and their corresponding citation counts


// Count of cases in a court and the other courts they cite to.
// CITED is matched in its stored direction (citing case -> cited case), so
// only cases that c1 cites are counted, not cases that cite c1.
// NOTE(review): the final RETURN groups by (source court name, count), so
// dest_court lists the courts that share the same count - confirm that this
// grouping is intended rather than one row per court pair.
MATCH (j1:Court)<-[:HEARD_BY]-(c1:Caselaw)-[:CITED]->(c2:Caselaw)-[:HEARD_BY]->(j2:Court)
WITH j1, j2, count(DISTINCT c2) as cids
ORDER BY cids DESC
RETURN j1.name as src_court, collect(j2.name) as dest_court, cids as cited_cases;

// Most-cited case per jurisdiction: after sorting cases by distinct citing
// cases (descending), take the first collected case name and citation count
// for each jurisdiction name; shows five jurisdictions in ascending name order.
MATCH path = (jur:Jurisdiction)<-[:UNDER_JURISIDICTION]-(cited:Caselaw)<-[:CITED]-(citing)
WITH jur, cited, count(DISTINCT citing) AS citations
ORDER BY citations DESC
WITH jur.name AS Jur,
     collect(cited.name)[0] AS caselaw,
     collect(citations)[0] AS citations
RETURN Jur, caselaw, citations
ORDER BY Jur ASC
LIMIT 5;