Annotation script designed to assist hand-annotation of court case paragraphs with links to the constitutional provisions they reference.

In [None]:
import json


# root of the legal-linking data directory
base_path = 'path/to/legal-linking/data'

# assumes that the full case dataset has already been scraped
data_path = 'path/to/ussc_out_full.json'

# list of cases to annotate
# NOTE: every entry needs its own trailing comma; two adjacent string literals
# are silently concatenated into a single (non-existent) URL.
urls_to_code = [
    'https://www.law.cornell.edu/supremecourt/text/381/479',
    'https://www.law.cornell.edu/supremecourt/text/11-393',
    'https://www.law.cornell.edu/supremecourt/text/418/683',
    'https://www.law.cornell.edu/supremecourt/text/384/436',
    'https://www.law.cornell.edu/supremecourt/text/491/397',
]

# output path
coded_path = 'path/to/legal-linking/data/validation/hand_coded_cases.json'

# location of the constitution index (a JSON file)
index_path = '/home/rbshaffer/Desktop/constitution_files/constitution.json'

# parse the index directly from the open file handle
with open(index_path) as index_file:
    index_content = json.load(index_file)

In [None]:
# list every index ID next to its link (a reference sheet for coding),
# flagging entries whose 'text' field is empty
for entry_id, entry in index_content.items():
    print(entry_id, entry['link'])
    if not entry['text']:
        print('NO TEXT')

In [None]:
# annotation loop


def _find_case(dataset_path, target_url):
    """Return the first case in the dataset whose source URL is target_url.

    The dataset is read as one JSON document per line. Each non-empty
    document is indexed as a sequence of paragraph dicts, and the case URL is
    taken from the first paragraph's ['meta']['source_url'].

    Returns None when no line matches.
    """
    with open(dataset_path) as f:
        # iterate lazily; readlines() would hold the whole dataset in memory
        for line in f:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue

            json_case = json.loads(line)
            if not json_case:
                continue

            if json_case[0]['meta']['source_url'] == target_url:
                return json_case

    return None


def _prompt_for_match_ids(index):
    """Interactively collect the index IDs that match the paragraph on screen.

    Each entered ID must be a key of `index`; the corresponding 'link' is
    shown and must be confirmed with a bare Enter before the ID is kept.
    An empty entry ends the collection once it is confirmed with a second
    bare Enter.

    Returns the confirmed IDs, without duplicates, in the order entered.
    """
    match_ids = []
    while True:
        id_val = input('Input match ID: ')

        if not id_val:
            # empty entry: confirm the list is complete before moving on
            more = input('Press enter if these are all the matches:' + repr(match_ids))
            if more:
                continue
            break

        if id_val not in index:
            print('Error! Match ID not found.')
            continue

        incorrect = input('Press enter if this is the correct match: ' + index[id_val]['link'])
        # keep each ID once and in entry order, so the stored matches are
        # deterministic (a set round-trip would scramble them)
        if not incorrect and id_val not in match_ids:
            match_ids.append(id_val)

    return match_ids


# annotated cases, in the order they were coded; must exist before the
# append at the end of each iteration
coded = []

for url_to_code in urls_to_code:

    # find the case given by the URL in the configured dataset
    case_to_code = _find_case(data_path, url_to_code)

    if case_to_code is None:
        print('The following case was not found:', url_to_code)
        continue

    # for each case paragraph, display the text, then prompt for links
    for i, par in enumerate(case_to_code):
        print(i, par['text'])

        # each match is stored as a three-element list whose last item is the ID
        par['matches'] = [['', '', match_id]
                          for match_id in _prompt_for_match_ids(index_content)]

    coded.append(case_to_code)

In [None]:
# group the hand-coded paragraphs by the case they belong to
cases = {}

with open(coded_path) as coded_file:
    for line in coded_file:
        paragraph = json.loads(line)
        # key on the source URL; a new list is created the first time a case is seen
        cases.setdefault(paragraph['meta']['source_url'], []).append(paragraph)

In [None]:
# write the outputs of each file separately
output_dir = '/home/rbshaffer/Desktop/constitution_files/'

# Output file name for each case, keyed by the case's source URL.  Selecting
# the file by URL (rather than by position in `coded`) keeps every case in
# its own file even when an earlier case was skipped as "not found".
output_names = {
    'https://www.law.cornell.edu/supremecourt/text/381/479': 'griswold_connecticut.json',
    'https://www.law.cornell.edu/supremecourt/text/11-393': 'nfib_sebelius.json',
    'https://www.law.cornell.edu/supremecourt/text/418/683': 'us_nixon.json',
    'https://www.law.cornell.edu/supremecourt/text/384/436': 'miranda_arizona.json',
    'https://www.law.cornell.edu/supremecourt/text/491/397': 'texas_johnson.json',
}

written_urls = set()

for case in coded:
    # every paragraph of a case carries the case URL; read it from the first
    case_url = case[0]['meta']['source_url']

    if case_url not in output_names:
        print('No output file configured for case:', case_url)
        continue

    # one JSON document per line, one line per paragraph
    with open(output_dir + output_names[case_url], 'w') as f:
        for par in case:
            f.write(json.dumps(par) + '\n')

    written_urls.add(case_url)

# report expected cases that were never coded instead of failing on an index
for case_url in output_names:
    if case_url not in written_urls:
        print('No coded case available for:', case_url)