### Benchmark case law extraction against vCite

In [1]:
import json
import pandas as pd

In [3]:
df = pd.read_json ('benchmark.json')
df.head()

Unnamed: 0,filename,rule_id,citation
0,2012-ewhc-90219-costs.xml,ewhc_ch,[2007] EWHC 2733 (Ch)
1,2012-ewhc-90219-costs.xml,ewhc_qb,[2008] EWHC 497 (QB)
2,2012-ewhc-90219-costs.xml,ewca_civ,[2003] EWCA Civ 1766
3,2012-ewhc-90219-costs.xml,ewca_civ,[2005] EWCA Civ 1206
4,2012-ewhc-90219-costs.xml,ewhc_qb,[2003] EWHC 3127 (QB)


In [5]:
df_vlex = pd.read_csv('tna-sample/vcite-sample-caselaw-analysis.csv')
# exclude judgment that wasn't processed
df_vlex = df_vlex[df_vlex.Filename!='2021-ewhc-65-fam.xml']
# prep Filename column for merging
df_vlex.Filename = df_vlex.Filename.str.split('/').apply(lambda x: x[-1])
# exclude columns not of interest
df_vlex = df_vlex[['Filename', 'Content', 'Error', 'ExtractedSeries']]
# exclude non-UK case law
df_vlex = df_vlex[~df_vlex.Content.isin(['HCA', 'NZLR', 'N.Z.L.R', 'N.Y.', 'U.S.', 'US', 'A.D.', 'F. Supp.', 'NSWCA', 'NSWLR', 'NSWSC', 'CLC', 'CLR', 'SASR',
'1F487', 'say', 'Jenkins', 'Cal.'])]

df_vlex.head()

Unnamed: 0,Filename,Content,Error,ExtractedSeries
0,2012-ewhc-90219-costs.xml,[2007] EWHC 2733 (Ch),,EWHC Ch
1,2012-ewhc-90219-costs.xml,[2008] EWHC 497 (QB),,EWHC QB
2,2012-ewhc-90219-costs.xml,[2003] EWHC 3127 (QB),,EWHC QB
3,2012-ewhc-90219-costs.xml,[2011] EWHC 177 (QB),,EWHC QB
4,2012-ewhc-90223-costs.xml,(2007) EWHC 665 (Ch),,EWHC Ch


In [11]:
# how many files did we process vs vLex

our_files = df.filename.unique().tolist()
vlex_files = df_vlex.Filename.unique().tolist()

print("We processed {0} files.".format(len(our_files)))
print("vLex processed {0} files.".format(len(vlex_files)))

We processed 388 files.
vLex processed 390 files.


In [14]:
print("Files we are missing that vLex processed:")
print(list(set(vlex_files)-set(our_files)))
print("---")
print("Files vLex are missing that we processed:")
print(list(set(our_files)-set(vlex_files)))

Files we are missing that vLex processed:
['2006-ewhc-1187-tcc.xml', '2019-ewca-1402-civ.xml', '2021-ewhc-794-tcc.xml', '2009-ewhc-1274-admlty.xml', '2021-ewhc-65-fam.xml']
---
Files vLex are missing that we processed:
['2021-ewhc-5-costs.xml', '2014-ewhc-1195-mercantile.xml', '2021-ukpc-24.xml']


#### Benchmarking Stats

In [15]:
# build comparison dictionary that collects stats on number of citations extracted by both, only by vLex, only by MxT and number of extractions missed by vLex/MxT
comparison_dict = {'filename': [], 'num_both_extracted': [], 'num_mdr_extracted': [], 'num_vlex_extracted': [], 'num_mdr_not_extracted': [], 'num_vlex_not_extracted': []}

for i in vlex_files:
  our_df = df[df.filename==i]
  vlex_df = df_vlex[df_vlex.Filename==i]

  comparison_df = our_df.merge(vlex_df, indicator=True, how='outer', left_on=['filename', 'citation'], right_on=['Filename', 'Content'])
  mdr_extracted = len(comparison_df[comparison_df._merge=='both'])+len(comparison_df[comparison_df._merge=='left_only'])
  vlex_extracted = len(comparison_df[comparison_df._merge=='both'])+len(comparison_df[comparison_df._merge=='right_only'])
  same_extraction = len(comparison_df[comparison_df._merge=='both'])
  mdr_not_extracted = len(comparison_df[comparison_df._merge=='right_only'])
  vlex_not_extracted = len(comparison_df[comparison_df._merge=='left_only'])

  comparison_dict['filename'].append(i)
  comparison_dict['num_both_extracted'].append(same_extraction)
  comparison_dict['num_mdr_extracted'].append(mdr_extracted)
  comparison_dict['num_vlex_extracted'].append(vlex_extracted)
  comparison_dict['num_mdr_not_extracted'].append(mdr_not_extracted)
  comparison_dict['num_vlex_not_extracted'].append(vlex_not_extracted)

In [16]:
# comparison dataframe
benchmark_outcome = pd.DataFrame.from_dict(comparison_dict)
benchmark_outcome['benchmark_percent'] = (benchmark_outcome.num_mdr_extracted)/(benchmark_outcome.num_vlex_extracted)
benchmark_outcome.head(10)

Unnamed: 0,filename,num_both_extracted,num_mdr_extracted,num_vlex_extracted,num_mdr_not_extracted,num_vlex_not_extracted,benchmark_percent
0,2012-ewhc-90219-costs.xml,6,7,6,0,1,1.166667
1,2012-ewhc-90223-costs.xml,6,6,11,5,0,0.545455
2,2008-ewhc-90111-costs.xml,12,14,12,0,2,1.166667
3,2008-ewhc-90105-costs.xml,16,19,18,2,3,1.055556
4,2008-ewhc-90110-costs.xml,29,30,37,8,1,0.810811
5,2008-ewhc-90107-costs.xml,7,7,13,6,0,0.538462
6,2009-ewhc-90154-costs.xml,9,12,9,0,3,1.333333
7,2009-ewhc-90144-costs.xml,5,6,6,1,1,1.0
8,2009-ewhc-90133-costs.xml,5,5,5,0,0,1.0
9,2011-ewhc-90208-costs.xml,13,13,13,0,0,1.0


In [22]:
# How many citations did MxT vs vLex extract?
print("MxT extracted {0} case law citations.".format(benchmark_outcome.num_mdr_extracted.sum()))
print("vLex extracted {0} case law citations.".format(benchmark_outcome.num_vlex_extracted.sum()))
print("---")
print("MxT extracted on average {0} % of citations vCite found.".format(round(benchmark_outcome.benchmark_percent.mean(), 2)*100))
print("---")
print("vLex extracted more case law citations in {0} documents.".format(len(benchmark_outcome[benchmark_outcome.num_mdr_extracted < benchmark_outcome.num_vlex_extracted])))
print("MxT extracted more case law citations in {0} documents.".format(len(benchmark_outcome[benchmark_outcome.num_mdr_extracted > benchmark_outcome.num_vlex_extracted])))

MxT extracted 18352 case law citations.
vLex extracted 18593 case law citations.
---
MxT extracted on average 98.0 % of citations vCite found.
---
vLex extracted more case law citations in 173 documents.
MxT extracted more case law citations in 160 documents.


#### Compare citations extracted judgment by judgment with indicator if vCite and/or MxT extracted

!! This is not very elegant !!

In [76]:
extracted_series = []
citations = []

for i in vlex_files:
  our_df = df[df.filename==i]
  vlex_df = df_vlex[df_vlex.Filename==i]

  placeholder = our_df.merge(vlex_df, indicator=True, how='outer', left_on=['filename', 'citation'], right_on=['Filename', 'Content'])
  #vLex_only.append(placeholder[placeholder._merge=='right_only'].Content.tolist())
  extracted_series.append(placeholder[placeholder._merge=='right_only'].ExtractedSeries.tolist())
  citations.append(placeholder[placeholder._merge=='right_only'].Content.tolist())

In [64]:
flat_series = []
flat_citations = []

# iterating over the data
for item in extracted_series:
    # appending elements to the flat_list
    flat_series += item

# iterating over the data
for item in citations:
    # appending elements to the flat_list
    flat_citations += item

vLex_only = {'ExtractedSeries': flat_series,
            'Citation': flat_citations}

In [68]:
vLex_only_df = pd.DataFrame.from_dict(vLex_only)
#vLex_only_df.head()

vLex_only_df.to_csv("missed_rules.csv")