# Gold-standard statistics for OpenCaseBook (OCB) and WikiSource

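- Documents (seed documents with at least one relevant document)
- Relevant document pairs (total)
- Mean number of relevant documents per seed document
- Median number of relevant documents per seed document
- Maximum number of relevant documents per seed document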


In [1]:

import logging
from pathlib import Path

import pandas as pd
# `display` is imported from its public location; importing it from
# `IPython.core.display` is deprecated.
from IPython.display import display

# Logging is configured once, before the project imports below
# (presumably so that messages emitted while importing docsim are shown — confirm).
# A second basicConfig() call would be a no-op once the root logger has handlers.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

from docsim.environment import get_env
from docsim.experiment import Experiment

unable to import 'smart_open.gcs', disabling that module


In [2]:
# Directories used by the notebook, given relative to the working directory.
data_dir, figures_dir, models_dir = (
    Path(location) for location in ('./data', './figures', './models')
)

# Environment settings resolved by the project helper.
env = get_env()

# Forwarded as `top_k` in the keyword arguments built per experiment.
top_k = 5

# Short metric keys mapped to the label strings used for presentation.
metric_labels = dict(
    support='Support',
    ret='Ret.',
    rel='Rel.',
    p='P',
    r='R',
    avg_p='MAP',
    reciprocal_rank='MRR',
)

# (experiment name, pretty name) pairs, in processing order.
input_exps = list({
    'ocb': 'OpenCaseBook',
    'wikisource': 'WikiSource',
}.items())


/data/experiments/mostendorff/legal-docsim/environments
Environment detected: gpu_server2 (in default.yml)


In [45]:
# Collect descriptive statistics of the gold standard for every experiment
# listed in `input_exps`; the result is one single-column frame per experiment
# appended to `gs_stats_list`.
exps = {}  # NOTE(review): never filled in this cell — confirm whether it is still needed
evaluate = True   # change this
gs_stats_list = []

for name, pretty_name in input_exps:

    exp = Experiment(name=name, env=env, data_dir=data_dir, pretty_name=pretty_name)

    exp.load_data()
    exp.filter_docs()

    # NOTE(review): not used inside this loop; presumably kept for evaluation
    # calls made elsewhere — confirm before removing.
    common_kwargs = dict(
        doc_id2idx=exp.doc_id2idx,
        idx2doc_id=exp.idx2doc_id,
        print_progress=True,
        tqdm_notebook=True,
        top_k=top_k,
    )

    # Number of non-null `target_id` entries per `seed_id` in the gold standard.
    seed_counts_df = exp.gs.df[['seed_id', 'target_id']].groupby(['seed_id']).count().reset_index()

    # Describe only the count column. Selecting it explicitly keeps the result
    # single-column even if `seed_id` is numeric, which the one-element
    # assignment below relies on. The column is labelled with the pretty name.
    gs_stats = seed_counts_df[['target_id']].describe().rename(columns=dict(target_id=pretty_name))

    # Extra row: total number of rows in the gold standard frame.
    gs_stats.loc['relevant_pairs'] = [len(exp.gs.df)]

    gs_stats_list.append(gs_stats)



INFO:docsim.experiment:Documents loaded: 1,632
INFO:docsim.experiment:Unique documents in gold standard: 1,623
INFO:docsim.experiment:Documents after filtering: 1,590 (before 1,632)
INFO:docsim.experiment:Documents after filtering: 1,378 (before 64,003)


In [49]:
# Place the per-experiment statistics side by side, one column per experiment.
stats_df = pd.concat(gs_stats_list, axis='columns')

In [48]:
# Show one row per experiment with two decimal places.
# The fully qualified option name is used because the bare 'precision' pattern
# is ambiguous in pandas versions that also define 'styler.format.precision'.
with pd.option_context('display.precision', 2):
    display(stats_df.transpose())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,relevant_pairs
OpenCaseBook,1623.0,86.42,65.18,2.0,48.0,83.0,111.0,1590.0,140256.0
WikiSource,1540.0,130.01,82.46,1.0,88.0,113.0,194.0,616.0,200210.0


In [51]:
# Print the transposed statistics as a LaTeX table.
# The fully qualified option name is used because the bare 'precision' pattern
# is ambiguous in pandas versions that also define 'styler.format.precision'.
# NOTE(review): newer pandas releases may not apply display.precision inside
# to_latex — confirm the rendered precision and pass `float_format` if needed.
with pd.option_context('display.precision', 2):
    print(stats_df.transpose().to_latex())

\begin{tabular}{lrrrrrrrrr}
\toprule
{} &   count &    mean &    std &  min &   25\% &    50\% &    75\% &     max &  relevant\_pairs \\
\midrule
OpenCaseBook &  1623.0 &   86.42 &  65.18 &  2.0 &  48.0 &   83.0 &  111.0 &  1590.0 &        140256.0 \\
WikiSource   &  1540.0 &  130.01 &  82.46 &  1.0 &  88.0 &  113.0 &  194.0 &   616.0 &        200210.0 \\
\bottomrule
\end{tabular}

