In [1]:
import ast
import csv
import gzip
import json
import lzma
import os
import zipfile

import pandas as pd
import requests
import tqdm
from neomodel import (config, StructuredNode, StringProperty, IntegerProperty,
    UniqueIdProperty, RelationshipTo, DateTimeProperty, db)

In [2]:
# Download endpoint and local folder layout for the case.law bulk exports.

base_url = "https://case.law/download/bulk_exports/latest/by_jurisdiction/case_text_open/"

# The API token is read from the environment so it is never saved with the
# notebook; when CASELAW_API_TOKEN is unset no Authorization header is built.
_api_token = os.environ.get("CASELAW_API_TOKEN", "")
headers = {"Authorization": f"Token {_api_token}"} if _api_token else {}

base_folder = "./data"
folder_names = ["ark", "ill", "nc", "nm"]  # one sub-folder per export

# exist_ok=True makes re-running this cell harmless.
for item in folder_names:
    os.makedirs(f"{base_folder}/{item}", exist_ok=True)

def get_url_loc(file_name, base_url=base_url, base_folder=base_folder):
    """Return the download URL and the local path for one export file.

    Args:
        file_name: Path of the archive relative to the export root,
            e.g. ``"ark/ark_text.zip"``.
        base_url: URL prefix the relative path is appended to.
        base_folder: Local directory the relative path is appended to.

    Returns:
        A ``(url, file_loc)`` tuple.
    """
    # URLs always use "/", so join them with posixpath; os.path.join would
    # insert the platform separator (a backslash on Windows).
    import posixpath

    url = posixpath.join(base_url, file_name)
    file_loc = os.path.join(base_folder, file_name)
    return url, file_loc
    
    
# Archive names relative to the export root, one text and one XML archive per folder.
text_file_names = ["{0}/{0}_text.zip".format(name) for name in folder_names]
xml_file_names = ["{0}/{0}_xml.zip".format(name) for name in folder_names]

# (url, local path) pairs for every archive.
text_file_url_locs = [get_url_loc(name) for name in text_file_names]
xml_file_url_locs = [get_url_loc(name) for name in xml_file_names]

In [3]:
def load_cases(fnames, field="id"):
    """Read every case record from the given bulk-export zip archives.

    Each archive must contain a member whose path ends in ``/data.jsonl.xz``.
    That member is decompressed and parsed as one JSON document per line.

    Args:
        fnames: Iterable of paths to the zip archives.
        field: Key to pull out of each record. When falsy (e.g. ``None``)
            the whole record is kept.

    Returns:
        A ``pandas.DataFrame`` built from the collected records (or field
        values), in archive order and then line order.

    Raises:
        FileNotFoundError: If an archive has no ``/data.jsonl.xz`` member.
        KeyError: If ``field`` is missing from a record.
    """
    # tqdm.tqdm_notebook is deprecated; tqdm.notebook.tqdm is its replacement.
    from tqdm.notebook import tqdm as notebook_tqdm

    cases = []
    for fname in fnames:
        with zipfile.ZipFile(fname, 'r') as zip_archive:
            xz_path = next(
                (path for path in zip_archive.namelist() if path.endswith('/data.jsonl.xz')),
                None,
            )
            if xz_path is None:
                # Without the default above, a bare StopIteration would escape here.
                raise FileNotFoundError(
                    "no '/data.jsonl.xz' member found in archive {}".format(fname)
                )
            with zip_archive.open(xz_path) as xz_archive, lzma.open(xz_archive) as jsonlines:
                # Label each bar with its archive so the per-file bars are distinguishable.
                for line in notebook_tqdm(jsonlines, desc=os.path.basename(str(fname))):
                    record = json.loads(line.decode('utf-8'))
                    if field:
                        record = record[field]
                    cases.append(record)
    print("Number of Cases: {}".format(len(cases)))
    return pd.DataFrame(cases)

# Split the (url, local path) pairs; only the local paths are needed here,
# the URLs are bound to the throwaway name `_`.
_, text_fnames= zip(*text_file_url_locs)
# field=None keeps every record whole instead of extracting a single key.
cases = load_cases(text_fnames, field=None)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm.tqdm_notebook(jsonlines):


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Number of Cases: 358706


In [6]:
# Preview the first rows of the loaded case records.
cases.head()

Unnamed: 0,id,url,name,name_abbreviation,decision_date,docket_number,first_page,last_page,citations,volume,reporter,court,jurisdiction,cites_to,frontend_url,preview,casebody
0,1869772,https://api.capapi.org/v1/cases/1869772/,"James Joseph STANDLEY, Jr. v. STATE of Arkansas",Standley v. State,1987-11-09,,517,518,"[{'cite': '293 Ark. 517', 'type': 'official'},...","{'volume_number': '293', 'barcode': '320440785...","{'id': 368, 'full_name': 'Arkansas Reports', '...","{'name_abbreviation': 'Ark.', 'name': 'Arkansa...","{'name': 'Ark.', 'id': 34, 'slug': 'ark', 'nam...",[{'cite': '738 S.W.2d 423'}],https://cite.capapi.org/ark/293/517/,[],"{'status': 'ok', 'data': {'judges': [], 'head_..."
1,1869757,https://api.capapi.org/v1/cases/1869757/,Keith Melvin DUBRAY v. STATE of Arkansas,Dubray v. State,1987-11-16,,545,546,"[{'cite': '293 Ark. 545', 'type': 'official'},...","{'volume_number': '293', 'barcode': '320440785...","{'id': 368, 'full_name': 'Arkansas Reports', '...","{'name_abbreviation': 'Ark.', 'name': 'Arkansa...","{'name': 'Ark.', 'id': 34, 'slug': 'ark', 'nam...",[{'cite': '739 S.W.2d 166'}],https://cite.capapi.org/ark/293/545/,[],"{'status': 'ok', 'data': {'judges': [], 'head_..."
2,1876624,https://api.capapi.org/v1/cases/1876624/,Avery Nathan RICHARDSON v. STATE of Arkansas,Richardson v. State,1985-10-28,,259,259,"[{'cite': '287 Ark. 259', 'type': 'official'},...","{'volume_number': '287', 'barcode': '320440785...","{'id': 368, 'full_name': 'Arkansas Reports', '...","{'name_abbreviation': 'Ark.', 'name': 'Arkansa...","{'name': 'Ark.', 'id': 34, 'slug': 'ark', 'nam...","[{'cite': '265 Ark. 964'}, {'cite': '697 S.W.2...",https://cite.capapi.org/ark/287/259/1876624/,[],"{'status': 'ok', 'data': {'judges': ['Purtle, ..."
3,1879926,https://api.capapi.org/v1/cases/1879926/,Sammy Joe ELMORE v. STATE of Arkansas,Elmore v. State,1984-07-09,,184,184,"[{'cite': '283 Ark. 184', 'type': 'official'},...","{'volume_number': '283', 'barcode': '320440785...","{'id': 368, 'full_name': 'Arkansas Reports', '...","{'name_abbreviation': 'Ark.', 'name': 'Arkansa...","{'name': 'Ark.', 'id': 34, 'slug': 'ark', 'nam...",[{'cite': '672 S.W.2d 48'}],https://cite.capapi.org/ark/283/184/,[],"{'status': 'ok', 'data': {'judges': [], 'head_..."
4,1879972,https://api.capapi.org/v1/cases/1879972/,Curtis HOWARD and Billy HOWARD v. STATE of Ark...,Howard v. State,1984-09-10,CR 84-7,221,223,"[{'cite': '283 Ark. 221', 'type': 'official'},...","{'volume_number': '283', 'barcode': '320440785...","{'id': 368, 'full_name': 'Arkansas Reports', '...","{'name_abbreviation': 'Ark.', 'name': 'Arkansa...","{'name': 'Ark.', 'id': 34, 'slug': 'ark', 'nam...","[{'cite': '650 S.W.2d 561'}, {'cite': '279 Ark...",https://cite.capapi.org/ark/283/221/,[],"{'status': 'ok', 'data': {'judges': [], 'head_..."


In [None]:
# Load the citation table (one JSON record per line) and index it by column 0.
citation_graph = pd.read_json(
    "data/sub_citations.json",
    lines=True,
    orient="records",
).set_index(0)
citation_graph.head()

In [None]:
# Load the per-case metadata (one JSON record per line) and index it by case id.
citation_meta = pd.read_json(
    "data/sub_metadata.json",
    orient="records",
    lines=True,
).set_index("id")
citation_meta.head(1)

In [None]:
# This is Cypher, not Python. It is kept as a string so the cell is valid in a
# Python kernel; run it in the Neo4j browser / cypher-shell, or hand it to the
# database once the graph below has been populated.
# Rows are ordered by the stored decision date: ordering by the formatted
# "dd MMMM yyyy" text would sort alphabetically by day, not chronologically.
CITED_CASE_DATES_QUERY = """
MATCH (c1:Case)-[cite1:cited]->(c2:Case)
RETURN c1.uid AS case_id,
apoc.temporal.format(c1.decision_date_original,"dd MMMM yyyy") AS case_date,
c2.uid AS cited_id,
apoc.temporal.format(c2.decision_date_original,"dd MMMM yyyy") AS cited_case_date
ORDER BY c1.decision_date_original DESC
LIMIT 100;
"""

In [None]:
# The bolt URL embeds the Neo4j password, so it is taken from the environment
# instead of being written into the notebook.
try:
    config.DATABASE_URL = os.environ["NEO4J_BOLT_URL"]
except KeyError:
    raise RuntimeError(
        "Set NEO4J_BOLT_URL, e.g. bolt://<user>:<password>@localhost:7687"
    ) from None

class Jurisdiction(StructuredNode):
    """Graph node for a jurisdiction: a ``uid`` plus a required, uniquely indexed ``name``."""
    uid = UniqueIdProperty()
    name = StringProperty(unique_index=True, required=True)

class Court(StructuredNode):
    """Graph node for a court: a ``uid`` plus a required, uniquely indexed ``name``."""
    uid = UniqueIdProperty()
    name = StringProperty(unique_index=True, required=True)

class Reporter(StructuredNode):
    """Graph node for a reporter: a ``uid`` plus a required, uniquely indexed ``name``."""
    uid = UniqueIdProperty()
    name = StringProperty(unique_index=True, required=True)
    
    
class Case(StructuredNode):
    """Graph node for a single case and its outgoing relationships.

    Properties: ``uid``, ``frontend_url``, ``name_abbreviation`` and
    ``decision_date_original``. Relationships point to other ``Case`` nodes
    (``cited``) and to the case's ``Jurisdiction`` (``belongs_to``),
    ``Court`` (``heard_at``) and ``Reporter`` (``reported_by``).
    """
    uid = UniqueIdProperty()
    frontend_url = StringProperty()
    name_abbreviation = StringProperty()
    decision_date_original = DateTimeProperty()

    # Outgoing edges; the second argument is the relationship type stored in the graph.
    cites_to = RelationshipTo('Case', 'cited')
    jurisdiction = RelationshipTo('Jurisdiction', 'belongs_to')
    court = RelationshipTo('Court', 'heard_at')
    reporter = RelationshipTo('Reporter', 'reported_by')

In [None]:
def create_nodes(obj, nodes):
    """Instantiate and save one ``obj`` per property dict.

    Args:
        obj: Class called as ``obj(**node)``; the instance must provide ``save()``.
        nodes: Iterable of property dicts, each containing a ``"uid"`` key.

    Returns:
        A list of ``{"id": <uid>, "node": <result of save()>}`` dicts, in input order.

    Raises:
        KeyError: If a property dict has no ``"uid"`` key.
    """
    # tqdm.tqdm_notebook is deprecated; tqdm.notebook.tqdm is its replacement.
    from tqdm.notebook import tqdm as notebook_tqdm

    output_nodes = []
    for node in notebook_tqdm(nodes):
        saved = obj(**node).save()
        output_nodes.append({"id": node["uid"], "node": saved})
    return output_nodes


In [None]:
def stoi(strid):
    """Convert ``strid`` to ``int``, returning ``None`` when it cannot be converted.

    Args:
        strid: Value to convert, typically a string id.

    Returns:
        The integer value, or ``None`` if ``int()`` rejects the input.
    """
    try:
        return int(strid)
    except (TypeError, ValueError, OverflowError):
        # TypeError covers None and other non-numeric objects, OverflowError
        # covers infinities; previously only ValueError was handled.
        return None

In [None]:
# Attach each case's cited ids (column 1 of citation_graph); assignment aligns on the index.
citation_meta["cites_to"] = citation_graph[1]
# The stored values are interpreted as milliseconds since the Unix epoch.
citation_meta["decision_date_original"] = pd.to_datetime(
    citation_meta.decision_date_original, unit='ms', origin="unix")
citations_uids = frozenset(citation_meta.index)


def _known_citations(cited):
    """Keep only the cited ids that are themselves rows of ``citation_meta``."""
    # A case with no row in citation_graph gets NaN from the index alignment
    # above; NaN is a float and not iterable, so treat it as "cites nothing".
    if cited is None or isinstance(cited, float):
        return []
    return [uid for uid in cited if uid in citations_uids]


citation_meta["cites_to"] = citation_meta["cites_to"].map(_known_citations)

In [None]:
# Column pairs (name column, id column) for each lookup entity, plus the Case property columns.
jurisdiction_cols = ["jurisdiction__name", "jurisdiction_id"]
court_cols = ["court__name_abbreviation", "court_id"]
reporter_cols = ["reporter__short_name", "reporter_id"]
cases_cols = ["frontend_url", "name_abbreviation", "decision_date_original"]


def _lookup_nodes(columns):
    """Return one ``{"name", "uid"}`` dict per distinct (name, id) pair in ``citation_meta``."""
    distinct_pairs = citation_meta[columns].value_counts().index.tolist()
    return [{"name": name, "uid": uid} for name, uid in distinct_pairs]


jurisdiction_nodes = _lookup_nodes(jurisdiction_cols)
court_nodes = _lookup_nodes(court_cols)
reporter_nodes = _lookup_nodes(reporter_cols)

# One property dict per case; the index ("id") is exposed under the key "uid".
case_nodes = (
    citation_meta[cases_cols]
    .reset_index()
    .rename(columns={"id": "uid"})
    .to_dict(orient="records")
)


In [None]:
# NOTE(review): the node-creating call below is commented out, so this cell only
# works while `jurisdictions` (the list returned by create_nodes) is still in the
# kernel; on a fresh kernel it raises NameError. Presumably disabled to avoid
# saving the same nodes to Neo4j again on a re-run — confirm.
# jurisdictions = create_nodes(Jurisdiction, jurisdiction_nodes)
# Index the {"id", "node"} records by id so nodes can be looked up with .loc.
jurisdictions = pd.DataFrame(jurisdictions).set_index('id')

In [None]:
# NOTE(review): the node-creating call below is commented out, so this cell only
# works while `courts` (the list returned by create_nodes) is still in the
# kernel; on a fresh kernel it raises NameError. Presumably disabled to avoid
# saving the same nodes to Neo4j again on a re-run — confirm.
# courts = create_nodes(Court, court_nodes)
# Index the {"id", "node"} records by id so nodes can be looked up with .loc.
courts = pd.DataFrame(courts).set_index('id')

In [None]:
# NOTE(review): the node-creating call below is commented out, so this cell only
# works while `reporters` (the list returned by create_nodes) is still in the
# kernel; on a fresh kernel it raises NameError. Presumably disabled to avoid
# saving the same nodes to Neo4j again on a re-run — confirm.
# reporters = create_nodes(Reporter, reporter_nodes)
# Index the {"id", "node"} records by id so nodes can be looked up with .loc.
reporters = pd.DataFrame(reporters).set_index('id')

In [None]:
# NOTE(review): with the create_nodes call commented out, `cases` on a fresh
# kernel is still the raw DataFrame returned by load_cases, not the list of
# {"id", "node"} records. The line below would then run, but the result has no
# "node" column and make_relationships would fail when it reads case['node'].
# The name `cases` is also reused for two different things — consider renaming.
# cases = create_nodes(Case, case_nodes)
# Index the {"id", "node"} records by id so nodes can be looked up with .loc.
cases = pd.DataFrame(cases).set_index('id')

In [None]:
def make_relationships(citation_meta, cases, jurisdictions, courts, reporters):
    """Connect every case node to its cited cases, jurisdiction, court and reporter.

    Args:
        citation_meta: DataFrame indexed by case id with the columns
            ``cites_to`` (list of cited case ids), ``jurisdiction_id``,
            ``court_id`` and ``reporter_id``.
        cases: DataFrame indexed by case id with a ``node`` column.
        jurisdictions: DataFrame indexed by jurisdiction id with a ``node`` column.
        courts: DataFrame indexed by court id with a ``node`` column.
        reporters: DataFrame indexed by reporter id with a ``node`` column.

    Returns:
        ``True`` once every row of ``cases`` has been processed.

    Raises:
        KeyError: If an id referenced in ``citation_meta`` is missing from
            the corresponding lookup frame.
    """
    # tqdm.tqdm_notebook is deprecated; tqdm.notebook.tqdm is its replacement.
    from tqdm.notebook import tqdm as notebook_tqdm

    # (id column in citation_meta, lookup frame, relationship attribute on the case node)
    lookups = (
        ("jurisdiction_id", jurisdictions, "jurisdiction"),
        ("court_id", courts, "court"),
        ("reporter_id", reporters, "reporter"),
    )

    # iterrows() has no length, so pass the total to get a real progress bar.
    for idx, row in notebook_tqdm(cases.iterrows(), total=len(cases)):
        case = row['node']

        # relate citations
        cited_ids = citation_meta.loc[idx, "cites_to"]
        for cited_node in cases.loc[cited_ids, "node"]:
            case.cites_to.connect(cited_node)

        # relate jurisdiction, court and reporter
        for id_column, table, relation in lookups:
            target_uid = citation_meta.loc[idx, id_column]
            getattr(case, relation).connect(table.loc[target_uid, 'node'])
    return True

In [None]:
# Connect every saved case node to its cited cases, jurisdiction, court and reporter.
make_relationships(citation_meta, cases, jurisdictions, courts, reporters)

In [None]:
# Load the exported graph embeddings and index them by node id.
graph_embeddings_df = (
    pd.read_csv("data/graph_embeddings.csv")
    .set_index("nodeId")
)
graph_embeddings_df

In [None]:
import numpy as np

In [None]:
# Each "embedding" cell holds its vector as text. Parse it with
# ast.literal_eval, which accepts only Python literals, rather than eval,
# which would execute whatever code the CSV happens to contain.
embedding_matrix = np.array(
    graph_embeddings_df["embedding"].map(ast.literal_eval).tolist()
)

In [None]:
# Display the dimensions of the embedding matrix.
embedding_matrix.shape

In [None]:
# Store each node's `.id` attribute next to the node so rows can be matched by it.
cases["neo_id"] = cases["node"].map(lambda node: node.id)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Cosine similarity of the first embedding row (kept 2-D by the [:1, :] slice)
# against every row of the matrix.
sims = cosine_similarity(embedding_matrix[:1, :], embedding_matrix)

In [None]:
# Positions of the embeddings sorted from most to least similar.
ranked_positions = np.argsort(sims.ravel())[::-1]
# Node ids of the ten most similar embeddings.
top_node_ids = graph_embeddings_df.iloc[ranked_positions[:10]].index
# Case ids whose node is among those ten; the result follows the row order of
# `cases`, not the similarity ranking.
cases[cases.neo_id.isin(top_node_ids)].index

In [None]:
# Inspect the metadata row for case id 1523381.
citation_meta.loc[1523381]