# Build gold standard from WikiSource


Goal:

```
category = {
    'name_of_categoryA': [opinion_id_1, opinion_id_2, ...],
    'name_of_categoryB': [opinion_id_1, opinion_id_2, ...],
    }
```

seed: opinion_id_1
-> recommendation is relevant if `opinion_id_X` shares one of the categories of `opinion_id_1`.


In [113]:
import os
import numpy as np
import pandas as pd
import json
import re
import pickle
import itertools
from tqdm import tqdm

from datasets.courtlistener_data_loader import get_opinions_from_dump_dir

In [2]:
OUT_DIR = 'out'

In [3]:
# Read articles
pages = []

for fn in os.listdir(OUT_DIR):
    if fn.endswith('.json'):
        name = fn.replace('.json', '').replace('Category:', '')
        with open(os.path.join(OUT_DIR, fn)) as f:
            for page in json.load(f):
                p = {
                    'id': page['pageid'],
                    'title': page['title'],
                    'category': name,
                }
                
                if 'meta' in page:
                    p['has_meta'] = True
                    m = page['meta']
                    
                    for field in ['case no', 'casename', 'decided', 'party1', 'party2']:
                        if field in m:
                            p[field] = m[field]
                else:
                    p['has_meta'] = False
                    
                pages.append(p)
                #print(page['title'])
                #print()
                
#            break


In [4]:
df = pd.DataFrame(pages)

In [5]:
print(f'Total rows: {len(df)}')

Total rows: 2939


In [6]:
df.head()

Unnamed: 0,id,title,category,has_meta,case no,casename,decided,party1,party2
0,704876,Ableman v. Booth (59 U.S. 479),United States Supreme Court decisions on slavery,True,35-In error to the supreme,Ableman v. Booth,,Ableman,Booth
1,706392,Ableman v. Booth (62 U.S. 506),United States Supreme Court decisions on slavery,True,,Ableman v. Booth (62 U.S. 506),"March 7, 1859",Stephen V. R. Ableman,Sherman M. Booth
2,669193,The Antelope (23 U.S. 66),United States Supreme Court decisions on slavery,True,,The Antelope,,The Vice-Consuls of Spain and Portugal,
3,670219,The Antelope (25 U.S. 546),United States Supreme Court decisions on slavery,True,,The Antelope,,The Antelope,
4,26512,Dred Scott v. Sandford,United States Supreme Court decisions on slavery,True,,,"March 6, 1857",Dred Scott,Sandford


In [7]:
 df['title'].value_counts()

Schechter Poultry Corporation v. United States                              6
Marks v. United States (430 U.S. 188)                                       5
Trinova Corporation v. Michigan Department of Treasury                      5
Bouie v. City of Columbia                                                   5
United States v. Verdugo-Urquidez                                           5
                                                                           ..
Oyama v. California                                                         1
Florida Lime & Avocado Growers, Inc. v. Paul                                1
Public Employees Retirement System of Ohio v. Betts/Opinion of the Court    1
United States v. Classic                                                    1
United States v. The Nuestra Senora De Regla                                1
Name: title, Length: 2414, dtype: int64

In [8]:
 df['casename'].value_counts()

[[../]]                                                                                                             8
Schechter Poultry Corporation v. United States                                                                      6
Trinova Corporation v. Michigan Department of Treasury                                                              5
Griffin v. County School Board of Prince Edward County                                                              4
McKesson Corporation v. Division of Alcoholic Beverages and Tobacco Department of Business Regulation of Florida    4
                                                                                                                   ..
Cornell v. Green                                                                                                    1
Fitts v. McGhee                                                                                                     1
Federal Trade Commission v. Dean Foods Company          

In [9]:
 df['has_meta'].value_counts()

True     2527
False     412
Name: has_meta, dtype: int64

In [10]:
df['decided'].value_counts()


                    1199
March 5, 1990         13
June 27, 1990         10
June 21, 1990         10
May 1, 1989           10
                    ... 
February 2, 1903       1
June 1, 1925           1
Jan 7, 1935            1
March 2, 1903          1
March 6, 1968          1
Name: decided, Length: 683, dtype: int64

In [11]:
df['category'].value_counts()[:20]

United States Supreme Court decisions on treaties                    318
United States Supreme Court decisions on due process                 297
United States Supreme Court decisions on civil rights                204
United States Supreme Court decisions on the First Amendment         186
United States Supreme Court decisions on securities                  177
United States Supreme Court decisions on admiralty                   167
United States Supreme Court decisions on the Fourteenth Amendment    155
United States Supreme Court decisions on antitrust                   153
United States Supreme Court decisions on civil procedure             137
United States Supreme Court decisions on labor                       119
United States Supreme Court decisions on evidence                    118
United States Supreme Court original jurisdiction decisions          102
United States Supreme Court decisions on the Fifth Amendment          92
United States Supreme Court decisions on intellectu

# Load CL data

In [12]:
opinions_dump_dirs = [
    # '/Volumes/data/repo/data/courtlistener/ca1',
    '/Volumes/data/repo/data/courtlistener/scotus',
]

In [13]:
with open('cl_dump.pickle', 'rb') as f:
    opinions, id2oid, oid2id, texts, cits_by_source, cits_by_target, cits = pickle.load(f)
   

In [9]:
opinions = {}
id2oid = {}
oid2id = {}
cits_by_source = {}
cits_by_target = {}
cits = {}
texts = []

for dump_dir in opinions_dump_dirs:
    opinions, id2oid, oid2id, texts, cits_by_source, cits_by_target, cits = get_opinions_from_dump_dir(
        dump_dir, opinions, id2oid, oid2id, texts, cits_by_source, cits_by_target, cits, limit=0,
        extract_citations=False, text_from_html_warning=False
    )


In [14]:
print(f'Loaded {len(opinions)} CL opinions')

Loaded 64003 CL opinions


In [205]:
with open('cl_dump.pickle', 'wb') as f:
    pickle.dump((opinions, id2oid, oid2id, texts, cits_by_source, cits_by_target, cits), f)
    

In [67]:
opinions[2659301]

{'resource_uri': 'https://www.courtlistener.com:80/api/rest/v3/opinions/2659301/',
 'id': 2659301,
 'absolute_url': '/opinion/2659301/mccutcheon-v-federal-election-commn/',
 'cluster': 'https://www.courtlistener.com:80/api/rest/v3/clusters/2659301/',
 'author': None,
 'joined_by': [],
 'author_str': '',
 'per_curiam': False,
 'date_created': '2014-04-02T19:57:22.894687Z',
 'date_modified': '2017-12-06T01:21:40.569322Z',
 'type': '010combined',
 'sha1': '3069dff3fe4df0da07c53d6f6a94181ddd6e9d44',
 'page_count': 94,
 'download_url': 'http://www.supremecourt.gov/opinions/13pdf/12-536_e1pf.pdf',
 'local_path': 'pdf/2014/04/02/mccutcheon_v._federal_election_commn.pdf',
 'plain_text': "(Slip Opinion)              OCTOBER TERM, 2013                                       1\n\n                                       Syllabus\n\n         NOTE: Where it is feasible, a syllabus (headnote) will be released, as is\n       being done in connection with this case, at the time the opinion is issued.\n  

In [68]:
opinions[87245]

{'resource_uri': 'https://www.courtlistener.com:80/api/rest/v3/opinions/87245/',
 'id': 87245,
 'absolute_url': '/opinion/87245/ableman-v-booth/',
 'cluster': 'https://www.courtlistener.com:80/api/rest/v3/clusters/87245/',
 'author': 'https://www.courtlistener.com:80/api/rest/v3/people/3166/',
 'joined_by': [],
 'author_str': '',
 'per_curiam': False,
 'date_created': '2010-04-28T16:01:03Z',
 'date_modified': '2017-03-24T03:44:17.007702Z',
 'type': '010combined',
 'sha1': 'cbd8444ef7579929d6f8b18a75a5b6c2bb5a2c81',
 'page_count': None,
 'download_url': None,
 'local_path': None,
 'plain_text': '',
 'html': '<p class="case_cite">62 U.S. 506</p>\n    <p class="case_cite">21 How. 506</p>\n    <p class="case_cite">16 L.Ed. 169</p>\n    <p class="parties">STEPHEN V. R. ABLEMAN, PLAINTIFF IN ERROR,<br>v.<br>SHERMAN M. BOOTH;<br>AND THE UNITED STATES, PLAINTIFF IN ERROR,<br>v.<br>SHERMAN M. BOOTH.</p>\n    <p class="date">December Term, 1858</p>\n    <div class="prelims">\n      <p class="ind

In [69]:
a = df.iloc[1, :]
a

id                                                    706392
title                         Ableman v. Booth (62 U.S. 506)
category    United States Supreme Court decisions on slavery
has_meta                                                True
case no                                                     
casename                      Ableman v. Booth (62 U.S. 506)
decided                                        March 7, 1859
party1                                 Stephen V. R. Ableman
party2                                      Sherman M. Booth
Name: 1, dtype: object

In [70]:
match = opinions[87245]
match.keys()

dict_keys(['resource_uri', 'id', 'absolute_url', 'cluster', 'author', 'joined_by', 'author_str', 'per_curiam', 'date_created', 'date_modified', 'type', 'sha1', 'page_count', 'download_url', 'local_path', 'plain_text', 'html', 'html_lawbox', 'html_columbia', 'html_with_citations', 'extracted_by_ocr', 'opinions_cited'])

In [71]:
opinions[87245]['html']

'<p class="case_cite">62 U.S. 506</p>\n    <p class="case_cite">21 How. 506</p>\n    <p class="case_cite">16 L.Ed. 169</p>\n    <p class="parties">STEPHEN V. R. ABLEMAN, PLAINTIFF IN ERROR,<br>v.<br>SHERMAN M. BOOTH;<br>AND THE UNITED STATES, PLAINTIFF IN ERROR,<br>v.<br>SHERMAN M. BOOTH.</p>\n    <p class="date">December Term, 1858</p>\n    <div class="prelims">\n      <p class="indent">THESE two cases were brought up from the Supreme Court of the State of Wisconsin by a writ of error issued under the 25th section of the judiciary act.</p>\n      <p class="indent">The facts are stated in the opinion of the court.</p>\n      <p class="indent">They were argued by <i>Mr. Black</i> (Attorney General) for the plaintiffs in error, no counsel appearing for the defendant.</p>\n      <p class="indent">Mr. Chief Justice TANEY delivered the opinion of the court.</p>\n    </div>\n    <div class="num" id="p1">\n      <span class="num">1</span>\n      <p class="indent">The plaintiff in error in the

# Find matching CL opinions (try differnt slugs)

In [15]:
def casename_to_slug(casename):
    # casename)
    return re.sub('[^0-9a-zA-Z\s]+', '-', re.sub(r'\((.*?)\)', '', casename))\
        .lower()\
        .replace('.', '-')\
        .replace(' ', '-').replace('--', '-').strip('-')

def slug_from_opinion(opinion):
    match = re.search(r'\/([0-9]+)\/([a-z-]+)', opinion['absolute_url'])
    
    if match:
        return match.group(2)
    else:
        return

In [16]:
slug2oid = {}
slug_duplicates = set()

for oid, opinion in opinions.items():
    slug = slug_from_opinion(opinion)
    
    if slug:
        if slug in slug2oid:
            slug_duplicates.add(slug)
        slug2oid[slug] = oid
        
print(f'slug_duplicates = {len(slug_duplicates)} / {len(opinions)}')

slug_duplicates = 4810 / 64003


In [143]:
df['oid'] = ''

In [164]:
df['found'] = 0
df['oid'] = ''

not_found_idxs = set()

found = 0
found_duplicates = 0

for idx, row in df.iterrows(): # tqdm(df.iterrows())
    row2oid(idx)

```
# simple
found = 1431 / 2939
found_duplicates = 301 / 2939
not found with slug = 997
##
found = 1574 / 2939
found_duplicates = 311 / 2939
not found with slug = 997
##
found = 1589 / 2939
found_duplicates = 313 / 2939
not found with slug = 983
## combinations
found = 1993 / 2939 (67.81%)
found_duplicates = 374 / 2939
not found with slug = 1143
```

In [165]:
print(f'found = {found} / {len(df)} ({round(found/len(df), 4)*100}%)')
print(f'found_duplicates = {found_duplicates} / {len(df)}')
print(f'not found with slug = {len(not_found_idxs)}')

found = 1993 / 2939 (67.81%)
found_duplicates = 374 / 2939
not found with slug = 1143


In [166]:
# Categories for that we have the corresponding CL opinion
df[df['found'] > 0]['category'].value_counts()[:20]

United States Supreme Court decisions on treaties                    230
United States Supreme Court decisions on due process                 195
United States Supreme Court decisions on admiralty                   147
United States Supreme Court decisions on securities                  145
United States Supreme Court decisions on the First Amendment         121
United States Supreme Court decisions on antitrust                   116
United States Supreme Court decisions on the Fourteenth Amendment    114
United States Supreme Court decisions on civil rights                 95
United States Supreme Court original jurisdiction decisions           91
United States Supreme Court decisions on evidence                     90
United States Supreme Court decisions on civil procedure              90
United States Supreme Court decisions on the Fifth Amendment          75
United States Supreme Court decisions on the Fourth Amendment         40
United States Supreme Court decisions on intellectu

In [167]:
df['found'].value_counts()[:20]

1    1993
0     946
Name: found, dtype: int64

In [163]:

"""
partial party names
Bank v. Sherman Hickling => Bank v. Sherman
Bass Ratcliff Gretton v. State Tax Commission => BASS, ETC., LTD. v. Tax Comm
"""

combs = []    
query_operators = [
    #'A', 'B', 'C', 'D'
    lambda x: x,
    lambda x: x.replace('company', 'co'),
    lambda x: x.replace('-c-', '-co-'),        
    lambda x: x.replace('corporation', 'corp'),
    lambda x: x.replace('city-of-', ''),
    lambda x: x.replace('insurance', 'ins'),  
    lambda x: x.replace('vermont', 'vt'),
    lambda x: x.replace('-southern-california-gas-co', ''),
    lambda x: x.replace('-inv-', '-investment-'),
    lambda x: x.replace('administrator-of-', ''),
    lambda x: x.replace('--', '-'),      
    lambda x: x.replace('base-ball', 'baseball'),
    #lambda x: x.replace('-v-', '-inc-v-'),
    lambda x: x.replace('federal-trade-commission', 'ftc'),
    #lambda x: x[:72],
    lambda x: x.replace('-railway-co', '-r-co'),
    lambda x: x.replace('-railroad-co', '-r-co'),
    lambda x: x.replace('-texas', '-tex'),
]

for i, op in enumerate(query_operators):
    combs.append([op])
    els = [list(x) for x in itertools.combinations(query_operators, i)]
    combs.extend(els)

def get_queries(slug, combs):
    queries = set()

    for comb in combs:
        if len(comb) > 0:
            comb_q = slug
            for operator in comb:
                comb_q = operator(comb_q)
            queries.add(comb_q)
    return queries


def row2oid(idx, debug=False):
    global slug2oid, slug_duplicates, not_found_idxs, found, found_duplicates, df
    row = df.iloc[idx, :]
    
    if isinstance(row['casename'], str):
        slug = casename_to_slug(row['casename'])
    else:
        slug = casename_to_slug(row['title'])

    queries = get_queries(slug, combs)

    if debug:
        print(f'Queries: {queries}')
    for query in queries:        
        if query in slug2oid:
            found += 1
            oid = str(slug2oid[query])
            
            if df.at[idx, 'found'] == 0:  # already found
                df.at[idx, 'oid'] = oid
            else:                
                df.at[idx, 'oid'] += ',' + oid
                
            df.at[idx, 'found'] += 1
            
            if debug:
                print('Found!')
            if query in slug_duplicates:
                found_duplicates += 1
                if debug:
                    print('--- duplicate')
            return True
        else:
            not_found_idxs.add(idx)
    return False


idx = 2911
row = df.iloc[idx, :]

print(row.values)

row2oid(idx, True)

[890328 'United States v. Carolene Products Company'
 'United States Supreme Court decisions on the Commerce Clause' True ''
 'United States v. Carolene Products Company' 'April 25, 1938'
 'United States' 'Carolene Products Company' 0 '']
Queries: {'united-states-v-carolene-products-company', 'united-states-v-carolene-products-co'}
Found!


True

In [162]:
# Gold labels without CL opinion
df[df['found'] == 0]

Unnamed: 0,id,title,category,has_meta,case no,casename,decided,party1,party2,found,oid
11,746419,Bank v. Sherman Hickling,United States Supreme Court decisions on antit...,True,,Bank v. Sherman Hickling,,Bank,Sherman Hickling,0,
12,870807,Bass Ratcliff Gretton v. State Tax Commission,United States Supreme Court decisions on antit...,True,,Bass Ratcliff Gretton v. State Tax Commission,"Nov 17, 1924",Bass Ratcliff Gretton,State Tax Commission,0,
14,862426,Birge-Forbers Company v. Heye,United States Supreme Court decisions on antit...,True,,Birge-Forbers Company v. Heye,"Jan 12, 1920",Birge-Forbers Company,Heye,0,
32,850570,Continental Commercial Trust Savings Bank v. C...,United States Supreme Court decisions on antit...,False,,,,,,0,
33,752501,County of Sherman v. Simonds,United States Supreme Court decisions on antit...,True,,County of Sherman v. Simonds,"by the commissioners of Sherman county, in the...",County of Sherman,Simonds,0,
42,866799,Federal Base Ball Club of Baltimore v. Nationa...,United States Supreme Court decisions on antit...,True,,Federal Base Ball Club of Baltimore v. Nationa...,"May 29, 1922",Federal Base Ball Club of Baltimore,National League of Professional Base Ball Clubs,0,
46,804200,Friedlander v. Texas P. Railway Company,United States Supreme Court decisions on antit...,True,,Friedlander v. Texas P. Railway Company,,Friedlander,Texas P. Railway Company,0,
49,810159,Gisborn v. Charter Oak Life Insurance Company ...,United States Supreme Court decisions on antit...,True,,Gisborn v. Charter Oak Life Insurance Company ...,,Gisborn,Charter Oak Life Insurance Company of Hartford,0,
50,870800,Gorham Manufacturing Company v. State Tax Comm...,United States Supreme Court decisions on antit...,True,,Gorham Manufacturing Company v. State Tax Comm...,"Nov 17, 1924",Gorham Manufacturing Company,State Tax Commission of New Tork,0,
55,942577,Haywood v. National Basketball Association,United States Supreme Court decisions on antit...,True,,Haywood v. National Basketball Association,,Haywood,National Basketball Association,0,


In [170]:
df['oid'].value_counts()

          946
99838      14
103388      9
97980       8
105335      5
         ... 
88917       1
106660      1
112308      1
112603      1
108274      1
Name: oid, Length: 1547, dtype: int64

In [171]:
df[df['oid'] == '99838']


Unnamed: 0,id,title,category,has_meta,case no,casename,decided,party1,party2,found,oid
2654,862994,Oklahoma v. Texas (252 U.S. 372),United States Supreme Court original jurisdict...,True,,Oklahoma v. Texas,,Oklahoma,Texas,1,99838
2655,863723,Oklahoma v. Texas (253 U.S. 465),United States Supreme Court original jurisdict...,True,,Oklahoma v. Texas (253 U.S. 465),,Oklahoma,Texas,1,99838
2656,864019,Oklahoma v. Texas (254 U.S. 280),United States Supreme Court original jurisdict...,True,,Oklahoma v. Texas (254 U.S. 280),,Oklahoma,Texas,1,99838
2657,865594,Oklahoma v. Texas (256 U.S. 602),United States Supreme Court original jurisdict...,True,,Oklahoma v. Texas (256 U.S. 602),,Oklahoma,Texas (256 U.S. 602),1,99838
2658,865178,Oklahoma v. Texas (256 U.S. 70),United States Supreme Court original jurisdict...,True,,Oklahoma v. Texas (256 U.S. 70),"April 11, 1921",Oklahoma,Texas (256 U.S. 70),1,99838
2659,866112,Oklahoma v. Texas (257 U.S. 616),United States Supreme Court original jurisdict...,True,,Oklahoma v. Texas (257 U.S. 616),,Oklahoma,Texas (257 U.S. 616),1,99838
2660,866650,Oklahoma v. Texas (258 U.S. 574),United States Supreme Court original jurisdict...,True,,Oklahoma v. Texas (258 U.S. 574),"May 1, 1922",Oklahoma,Texas (258 U.S. 574),1,99838
2661,866970,Oklahoma v. Texas (259 U.S. 565),United States Supreme Court original jurisdict...,True,,Oklahoma v. Texas (259 U.S. 565),,Oklahoma,Texas (259 U.S. 565),1,99838
2662,867535,Oklahoma v. Texas (260 U.S. 606),United States Supreme Court original jurisdict...,True,,Oklahoma v. Texas (260 U.S. 606),"Jan 15, 1923",Oklahoma,Texas (260 U.S. 606),1,99838
2663,867915,Oklahoma v. Texas (261 U.S. 340),United States Supreme Court original jurisdict...,True,,Oklahoma v. Texas (261 U.S. 340),,Oklahoma,Texas (261 U.S. 340),1,99838


# Build gold df

All oids that belong to the same group

Category => OIDs
```json
{
    "Category XY": [1, 2, 3]
}
```

OID => OIDs
```json
{
    1: [2, 3],
    2: [1, 3],
    3: [2, 1],
    
}
```

In [185]:
# categories with more than 20 matching oids
cats = df[df['found'] > 0]['category'].value_counts() #[:20]
cats = list(cats.keys())

In [203]:
# Find oids for each category
cat2oids = {}
oid2relevant_oids = {}

for cat in cats:
    oids = list(set(df[(df['category'] == cat) & (df['found'] > 0)]['oid'].values)) # TODO maybe convert to int
    oids = [int(i) for i in oids]
    cat2oids[cat] = oids
    
    for oid in oids:
        # every other id except current id
        relevant_oids = oids.copy()
        relevant_oids.remove(oid)
        
        if oid in oid2relevant_oids:
            oid2relevant_oids[oid] += relevant_oids
        else:
            oid2relevant_oids[oid] = relevant_oids
        
    
len(cat2oids)

64

In [204]:
with open('cat2oids.pickle', 'wb') as f:
    pickle.dump(cat2oids, f)
    
with open('cat2oids.json', 'w') as f:
    json.dump(cat2oids, f)
    
with open('wikisource_oid2relevant_oids.pickle', 'wb') as f:
    pickle.dump(oid2relevant_oids, f)
    
with open('wikisource_oid2relevant_oids.json', 'w') as f:
    json.dump(oid2relevant_oids, f)
    