In [87]:

def parser(path, filename, save_path, output_name):
    """Flatten an xz-compressed JSON-lines case dump into a DataFrame and a CSV.

    Every non-blank line of the input file is decoded as one JSON case record,
    reduced to a flat dict of selected fields, and the resulting table is
    written to ``save_path + str(output_name)`` (index omitted) and returned.

    Parameters
    ----------
    path : str
        Directory that contains ``filename``.
    filename : str
        Name of the ``.xz`` file holding one JSON record per line.
    save_path : str
        Prefix of the output file. It is concatenated with ``output_name``
        verbatim, so a directory must end with a path separator.
    output_name : str
        Name of the CSV file to write.

    Returns
    -------
    pandas.DataFrame
        One row per case record, columns in the order they are extracted.

    Notes
    -----
    * The process working directory is left untouched; the input is opened
      through ``os.path.join(path, filename)``.
    * A field that is absent from a record is stored as ``np.nan`` instead of
      aborting the whole run. The per-opinion columns (``opinion_type``,
      ``opinion_author``, ``opinion_text``) are lists and become empty lists
      when a record has no opinions.
    * ``plaintiff`` / ``defendant`` are the text before / after the first
      ``'v.'`` in ``name_abbreviation``; names without ``'v.'`` get a NaN
      defendant.
    """
    import os
    import lzma
    import numpy as np
    import pandas as pd
    try:
        import simplejson as json
    except ImportError:
        # simplejson is optional; the stdlib module provides the same loads().
        import json

    def get_cases(path, filename):
        """Return the list of JSON records stored in the compressed file."""
        cases = []
        # Build the full path rather than chdir-ing, which would silently
        # change the working directory for everything that runs afterwards.
        with lzma.open(os.path.join(path, filename), 'rt', encoding='utf-8') as infile:
            for line in infile:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                cases.append(json.loads(line))
        return cases

    def lookup(record, *keys):
        """Walk ``record`` along ``keys``; return ``np.nan`` if any step is missing."""
        node = record
        for key in keys:
            try:
                node = node[key]
            except (KeyError, IndexError, TypeError):
                # KeyError: absent dict key; IndexError: list too short;
                # TypeError: intermediate value is None or not subscriptable.
                return np.nan
        return node

    def parse_cases(case):
        """Reduce one raw case record to a flat dict of output columns."""
        elem_dict = {}
        elem_dict["id"] = lookup(case, 'id')
        elem_dict["name"] = lookup(case, 'name')
        elem_dict["name_abbreviation"] = lookup(case, 'name_abbreviation')
        elem_dict["parties"] = lookup(case, 'casebody', 'data', 'parties')

        # "A v. B" -> plaintiff "A", defendant "B"; no 'v.' -> defendant NaN.
        short_name = elem_dict["name_abbreviation"]
        sides = short_name.split('v.') if isinstance(short_name, str) else []
        elem_dict["plaintiff"] = sides[0].strip() if len(sides) > 0 else np.nan
        elem_dict["defendant"] = sides[1].strip() if len(sides) > 1 else np.nan

        elem_dict["decision_date"] = lookup(case, 'decision_date')
        elem_dict["docket_number"] = lookup(case, 'docket_number')

        # Keep the first citation flagged 'official'; NaN when there is none.
        citations = lookup(case, 'citations')
        if not isinstance(citations, list):
            citations = []
        official = next(
            (c for c in citations
             if isinstance(c, dict) and c.get('type') == 'official'),
            None,
        )
        elem_dict["citation_type"] = lookup(official, 'type')
        elem_dict["citation_num"] = lookup(official, 'cite')

        elem_dict["reporter"] = lookup(case, 'reporter', 'full_name')
        elem_dict["court_id"] = lookup(case, 'court', 'id')
        elem_dict["court_name"] = lookup(case, 'court', 'name')
        elem_dict["jurisdiction_id"] = lookup(case, 'jurisdiction', 'id')
        elem_dict["jurisdiction_name"] = lookup(case, 'jurisdiction', 'name_long')

        # The first two attorney entries are stored as plaintiff / defendant
        # counsel respectively (positional, as in the original extraction).
        elem_dict["plaintiff_attorneys"] = lookup(case, 'casebody', 'data', 'attorneys', 0)
        elem_dict["defendants_attorneys"] = lookup(case, 'casebody', 'data', 'attorneys', 1)
        elem_dict["judges"] = lookup(case, 'casebody', 'data', 'judges')

        opinions = lookup(case, 'casebody', 'data', 'opinions')
        if not isinstance(opinions, list):
            opinions = []
        elem_dict["opinion_type"] = [lookup(opinion, 'type') for opinion in opinions]
        elem_dict["opinion_author"] = [lookup(opinion, 'author') for opinion in opinions]
        elem_dict["head_matter"] = lookup(case, 'casebody', 'data', 'head_matter')
        elem_dict["opinion_text"] = [lookup(opinion, 'text') for opinion in opinions]
        return elem_dict

    # Parse every record, then build the frame once from the list of dicts.
    parsed_cases = [parse_cases(case) for case in get_cases(path, filename)]
    cases_df = pd.DataFrame(parsed_cases)
    cases_df.to_csv(save_path + str(output_name), index=False)
    return cases_df

In [88]:
# Parse the New Hampshire export and write the flattened table to NH.csv.
test = parser(
    path="/Users/samk/Documents/GitHub/UNH_MS_Work/data/raw/New Hampshire-20180831-text/data/",
    filename='data.jsonl.xz',
    save_path='/Users/samk/Documents/GitHub/UNH_MS_Work/data/processed/',
    output_name="NH.csv",
)

In [83]:
# Preview the first five parsed cases.
test.head(n=5)

Unnamed: 0,citation_num,citation_type,court_id,court_name,decision_date,defendant,defendants_attorneys,docket_number,head_matter,id,...,jurisdiction_name,name,name_abbreviation,opinion_author,opinion_text,opinion_type,parties,plaintiff,plaintiff_attorneys,reporter
0,144 N.H. 131,official,8797,New Hampshire Supreme Court,1999-08-30,,"Tober Law Offices, P.A., of Portsmouth (Stephe...",No. LD-97-009,"No. LD-97-009\nFeld’s Case\nAugust 30, 1999\nT...",105092,...,New Hampshire,Feld’s Case,Feld’s Case,"[JOHNSON, J.]","[JOHNSON, J.\nThe Supreme Court Committee on P...",[majority],[Feld’s Case],Feld’s Case,"Griffith & Associates, PLLC, of Wilton (John P...",New Hampshire Reports
1,144 N.H. 13,official,8797,New Hampshire Supreme Court,1999-07-13,Bennett,"Janice S. Peterson, public defender, of Keene,...",No. 97-387,No. 97-387\nThe State of New Hampshire v. Eric...,105095,...,New Hampshire,The State of New Hampshire v. Eric Bennett,State v. Bennett,"[BRODERICK, j., THAYER, J.,]","[BRODERICK, j.\nAfter a jury trial in Superior...","[majority, concurrence]",[The State of New Hampshire v. Eric Bennett],State,"Philip T. McLaughlin, attorney general (John C...",New Hampshire Reports
2,144 N.H. 138,official,8797,New Hampshire Supreme Court,1999-08-30,,"Shaines & McEachern, P.A., of Portsmouth (Paul...",No. LD-97-008,"No. LD-97-008\nRoberge’s Case\nAugust 30, 1999...",105099,...,New Hampshire,Roberge’s Case,Roberge’s Case,[PER CURIAM.],[MEMORANDUM OPINION\nPER CURIAM.\nThe Supreme ...,[majority],[Roberge’s Case],Roberge’s Case,"Griffith & Associates, PLLC, of Wilton (John P...",New Hampshire Reports
3,144 N.H. 107,official,8797,New Hampshire Supreme Court,1999-08-03,,"Wiggin & Nourie, P.A., of Manchester (Scott A....",No. 97-399,"No. 97-399\nAppeal of Donald E. Savage, Jr. (N...",105102,...,New Hampshire,"Appeal of Donald E. Savage, Jr. (New Hampshire...",Appeal of Savage,"[BRODERICK, J.]","[BRODERICK, J.\nThe petitioner, Donald E. Sava...",[majority],"[Appeal of Donald E. Savage, Jr. (New Hampshir...",Appeal of Savage,"Fitzgerald, & Sessler, P.A., of Laconia (Shawn...",New Hampshire Reports
4,144 N.H. 44,official,8797,New Hampshire Supreme Court,1999-07-21,,"Philip T. McLaughlin, attorney general (Dougla...",No. 97-522,"No. 97-522\nAppeal of William H. Morgan, R.PH....",105108,...,New Hampshire,"Appeal of William H. Morgan, R.PH. (New Hampsh...",Appeal of Morgan,"[HORTON, J.]","[HORTON, J.\nThe petitioner, William H. Morgan...",[majority],"[Appeal of William H. Morgan, R.PH. (New Hamps...",Appeal of Morgan,"William, II. Loftus, P.C., of Lebanon (William...",New Hampshire Reports


In [91]:
# Show the plaintiff-attorney text of the row labelled 1.
test.loc[1, 'plaintiff_attorneys']

'Philip T. McLaughlin, attorney general (John C. Kissinger, assistant attorney general, on the brief, and Patrick E. Donovan, assistant attorney general, orally), for the State.'

In [89]:
# list_of_citations=[]
# for i in range(cases_df.shape[0]):
#    # phrases starting with 'see', case-insensitive
#    list_see_citations = (re.findall("see\s.*\d{4}\)", cases_df['opinion_text'][i][0], flags=re.IGNORECASE))

#    # phrases with 'also', case-insensitive
#    list_also_citations = (re.findall("[^s].also\s.*\d{4}\)", cases_df['opinion_text'][i][0], flags=re.IGNORECASE))

#    # phrases that have the 'v.' and end in a year with format 'XXXX)', it also grabs up to 50 characters preceding the 'v.', or a period, whichever comes first
#    list_versus_citations = (re.findall("\.+\s.{0,50}[v]\..{0,100}[0-9]{4}\)", cases_df['opinion_text'][i][0], flags=re.IGNORECASE|re.MULTILINE))

#    if len(list_see_citations)!=0:
#        print(i, list_see_citations)
#        print(i, list_also_citations)
#        print(i, list_versus_citations)
#        print()
#        print()
#        list_of_citations.append([i,list_see_citations, list_also_citations])
#    else:
#        pass
