# Save WikiSource gold standard as CSV 

In [1]:
import json
import os
import requests
import pickle
import pandas as pd
from pathlib import Path

In [2]:
# Folder holding the WikiSource input files; the gold-standard CSV is written here too.
data_dir = Path('.') / 'data' / 'wikisource'

In [3]:
# Read the category -> document-id mapping; the raw bytes are handed to the JSON
# parser, which detects the text encoding itself.
cat2oids = json.loads((data_dir / 'cat2oids.json').read_bytes())

print(f'Categories loaded {len(cat2oids)} from WikiSource')

Categories loaded 64 from WikiSource


In [5]:
# Build (seed_id, target_id, category) triples: within each category every id is
# paired with all the other ids listed for that category.
rows = []

for cat, oids in cat2oids.items():
    for seed_id in oids:
        # Drop only the first occurrence of the seed, keeping the remaining order.
        first_pos = oids.index(seed_id)
        others = oids[:first_pos] + oids[first_pos + 1:]
        rows.extend((seed_id, other_id, cat) for other_id in others)

print(f'Found gold triples: {len(rows)}')

Found gold triples: 200210


In [7]:
# One row per gold triple; `label` holds the category shared by seed and target.
gold_columns = ['seed_id', 'target_id', 'label']
df = pd.DataFrame(data=rows, columns=gold_columns)
df.head(n=5)

Unnamed: 0,seed_id,target_id,label
0,86137,88695,United States Supreme Court decisions on treaties
1,86137,98239,United States Supreme Court decisions on treaties
2,86137,112574,United States Supreme Court decisions on treaties
3,86137,112247,United States Supreme Court decisions on treaties
4,86137,93298,United States Supreme Court decisions on treaties


In [8]:
# Persist the gold triples as CSV inside data_dir, without the index column.
gold_csv_path = data_dir / 'gold.csv'
df.to_csv(gold_csv_path, index=False)

# Metadata

- word_count
- edges_count
- decision_year


In [3]:
from docsim.environment import get_env
from docsim.experiment import Experiment
from pathlib import Path

# NOTE: from here on data_dir points at the top-level data folder.
data_dir = Path('./data')
env = get_env()

# Set up the 'wikisource' experiment, then load and filter its documents.
# NOTE(review): what load_data()/filter_docs() populate is defined in docsim,
# not in this notebook; later cells read exp.cits, exp.texts, exp.doc_id2idx
# and exp.idx2doc_id.
experiment_kwargs = dict(name='wikisource', env=env, data_dir=data_dir)
exp = Experiment(**experiment_kwargs)

exp.load_data()
exp.filter_docs()

unable to import 'smart_open.gcs', disabling that module


/home/mostendorff/experiments/legal-docsim/environments
Environment detected: gpu_server2 (in default.yml)


In [4]:
from collections import defaultdict

# Per-document edge tally: every pair in exp.cits adds one to each of its two
# endpoints, but only for endpoints that are known to exp.doc_id2idx.
doc_id2edges_count = defaultdict(int)

for src_id, dst_id in exp.cits:
    for endpoint_id in (src_id, dst_id):
        if endpoint_id in exp.doc_id2idx:
            doc_id2edges_count[endpoint_id] += 1

In [5]:
# Whitespace-token count per document id, for every non-empty text whose
# position maps to a document id.
doc_id2word_count = defaultdict(int)

for position, doc_text in enumerate(exp.texts):
    # Skip positions without a document id as well as empty texts.
    if position not in exp.idx2doc_id or len(doc_text) == 0:
        continue
    doc_id2word_count[exp.idx2doc_id[position]] = len(doc_text.split())

In [6]:
# Read only the metadata columns needed below from the exported dataframe CSV.
meta_columns = [
    'id',
    'case_name',
    'absolute_url',
    'category',
    'text_length',
    'judges',
    'year_filed',
    'citation_count',
    'ingoing_citation_count',
    'outgoing_citation_count',
]

with open(data_dir / 'dataframe.csv', 'r') as csv_file:
    meta_df = pd.read_csv(csv_file, usecols=meta_columns)

meta_df

Unnamed: 0,id,absolute_url,category,text_length,case_name,year_filed,judges,citation_count,ingoing_citation_count,outgoing_citation_count
0,118181,/opinion/118181/oncale-v-sundowner-offshore-se...,United States Supreme Court decisions on sexua...,2399,"Oncale v. Sundowner Offshore Services, Inc.",1998,Scalia,1809,23,11
1,88178,/opinion/88178/the-maggie-hammond/,United States Supreme Court decisions on admir...,9610,The Maggie Hammond,1870,Clifford,36,14,0
2,106936,/opinion/106936/beck-v-ohio/,United States Supreme Court decisions on the F...,4407,Beck v. Ohio,1964,Stewart,3565,37,26
3,88528,/opinion/88528/the-java/,United States Supreme Court decisions on admir...,3387,The Java,1872,Bradley,6,0,0
4,106170,/opinion/106170/monroe-v-pape/,United States Supreme Court decisions on civil...,32933,Monroe v. Pape,1961,Douglas,3097,135,145
...,...,...,...,...,...,...,...,...,...,...
1541,88415,/opinion/88415/edwards-v-tanneret/,United States Supreme Court decisions on civil...,1787,Edwards v. Tanneret,1871,,0,0,0
1542,118395,/opinion/118395/bush-v-gore/,United States Supreme Court decisions on elect...,19413,Bush v. Gore,2000,Per Curiam,284,1,54
1543,112632,/opinion/112632/gregory-v-ashcroft/,United States Supreme Court decisions on the C...,15019,Gregory v. Ashcroft,1991,O'Connor,805,61,68
1544,89592,/opinion/89592/colorado-company-v-commissioners/,United States Supreme Court decisions on secur...,2677,Colorado Company v. Commissioners,1877,,6,5,6


In [21]:
# Normalise the column names used in the exported metadata table.
#
# `year_filed` has to become `decision_year`: that is the column name listed in
# the metadata notes and shown in the recorded output of the following cell.
# With that rename commented out, a fresh "Restart & Run All" would export
# `year_filed` instead.
#
# A single non-inplace rename keeps the cell safe to re-run: keys that are no
# longer present are ignored by DataFrame.rename (errors='ignore' is the default).
meta_df = meta_df.rename(columns={
    'year_filed': 'decision_year',
    'citation_count': 'cites_to_count',
    'case_name': 'name',
    'absolute_url': 'url',
})

In [22]:
# The count dictionaries are looked up by the string form of the id (hence str()).
# NOTE(review): assumes the keys of both dictionaries are strings; confirm.
doc_ids = [str(raw_id) for raw_id in meta_df['id'].values]

# Ids missing from a dictionary get 0; .get() does not insert into the defaultdict.
word_counts = [doc_id2word_count.get(doc_id, 0) for doc_id in doc_ids]
edge_counts = [doc_id2edges_count.get(doc_id, 0) for doc_id in doc_ids]

meta_df['word_count'] = word_counts
meta_df['edges_count'] = edge_counts

meta_df.head()

Unnamed: 0,id,url,category,text_length,name,decision_year,judges,cites_to_count,ingoing_citation_count,outgoing_citation_count,word_count,edges_count
0,118181,/opinion/118181/oncale-v-sundowner-offshore-se...,United States Supreme Court decisions on sexua...,2399,"Oncale v. Sundowner Offshore Services, Inc.",1998,Scalia,1809,23,11,0,0
1,88178,/opinion/88178/the-maggie-hammond/,United States Supreme Court decisions on admir...,9610,The Maggie Hammond,1870,Clifford,36,14,0,9610,14
2,106936,/opinion/106936/beck-v-ohio/,United States Supreme Court decisions on the F...,4407,Beck v. Ohio,1964,Stewart,3565,37,26,4407,63
3,88528,/opinion/88528/the-java/,United States Supreme Court decisions on admir...,3387,The Java,1872,Bradley,6,0,0,0,0
4,106170,/opinion/106170/monroe-v-pape/,United States Supreme Court decisions on civil...,32933,Monroe v. Pape,1961,Douglas,3097,135,145,32933,279


In [23]:
# How often each distinct word_count value occurs.
word_count_freq = meta_df['word_count'].value_counts()
word_count_freq

0         168
32          4
3579        4
3307        3
2625        3
         ... 
4105        1
5423        1
136017      1
2166        1
5109        1
Name: word_count, Length: 1303, dtype: int64

In [24]:
# How often each distinct edges_count value occurs.
edges_count_freq = meta_df['edges_count'].value_counts()
edges_count_freq

0      168
1      103
2       66
3       56
4       56
      ... 
151      1
155      1
157      1
162      1
386      1
Name: edges_count, Length: 168, dtype: int64

In [25]:
# Write the enriched metadata table into the experiment directory, without the index column.
meta_csv_path = exp.exp_dir / 'meta.csv'
meta_df.to_csv(meta_csv_path, index=False)