In [None]:
import os
import json

import qgrid as q
import pandas as pd

In [None]:
# Input files (paths are relative to the notebook's working directory)
NETWORK_FILE = '../data/network.json'  # parsed references (echr_network output)
ECHR_DATA_FILE = '../data/ECHR_metadata_from0till84k.json'  # HUDOC metadata dump

## Load and merge files
Load the metadata of 84k ECHR cases scraped from the HUDOC API and merge it with the references extracted with [`jorgecarleitao/echr_network`](https://github.com/jorgecarleitao/echr_network).

### HUDOC Dataset

query: `see` [`ECHR_metadata_harvester.ipynb`](ECHR_metadata_harvester.ipynb)

file: `ECHR_metadata_from0till84k.json`

scraped: `31 August 2020 at 15:23`

In [None]:
# Read the HUDOC metadata and keep only the columns needed for the citation check
df_citation_check = pd.read_json(ECHR_DATA_FILE)[
    ['itemid', 'languageisocode', 'appno', 'extractedappno', 'scl']
]

df_citation_check.head()

In [None]:
# Row count per language code, largest groups first (top five shown)
(
    df_citation_check
    .groupby('languageisocode')
    .size()
    .sort_values(ascending=False)
    .head()
)

### Network Dataset _incomplete_

query: `n/a`

file: `network.json`

scraped: `27 August 2020 at 13:50`

In [None]:
# Read the network JSON and transpose it, so the former column labels become the row index
df_to_merge = pd.read_json(NETWORK_FILE).T

# Copy the row index into an explicit 'itemid' column and keep the relevant columns only
df_to_merge = df_to_merge.assign(itemid=df_to_merge.index)[['itemid', 'references']]
df_to_merge.head()

## Preprocessing
Remove the case's own application number (`appno`) from `extractedappno`.

_Eg_ `appno=38042/06 extractedappno=38042/06;30979/96;31932/03` 

will become `appno=38042/06 extractedappno=30979/96;31932/03`

In [None]:
def _drop_own_appno(extractedappno, appno):
    """Return `extractedappno` without the application number(s) given in `appno`.

    Both values are treated as ';'-separated lists. The comparison is done per
    token rather than by substring replacement, so that:
    - a number that is merely contained in a longer one is left untouched
      (appno '8042/06' must not damage '38042/06'),
    - no dangling ';' or empty entry is left behind when the own number is the
      last entry,
    - an empty `appno` does not strip every ';' from the value.

    Parameters
    ----------
    extractedappno : str
        ';'-separated application numbers extracted for the document.
    appno : object
        The document's own application number(s); converted with `str()` and
        split on ';'.

    Returns
    -------
    str
        The remaining application numbers, joined with ';' ('' if none remain).
    """
    own_numbers = {token.strip() for token in str(appno).split(';')}
    kept = [
        token
        for token in extractedappno.split(';')
        # Drop blank entries as well as the document's own number(s)
        if token.strip() and token.strip() not in own_numbers
    ]
    return ';'.join(kept)


# Remove each row's own appno from its extractedappno
df_citation_check['extractedappno'] = df_citation_check.apply(
    lambda row: _drop_own_appno(row['extractedappno'], row['appno']), axis=1
)

df_citation_check.head()

## Citations diff _(scl vs extractedappno)_
Explore how frequently citations are missing when comparing the two columns in the HUDOC metadata. Calculate whether the number of citations in the `scl` column exceeds the number of citations in the `extractedappno` column or vice versa.

- If number of citations in extractedappno **>** number of citations in scl: **score=1**
- If number of citations in extractedappno **<** number of citations in scl: **score=-1**
- If number of citations in extractedappno **=** number of citations in scl: **score=0** 

In [None]:
def diff(extractedappno, scl):
    """Compare the number of citations held by two lists of tokens.

    Parameters
    ----------
    extractedappno : list
        Tokens of an `extractedappno` value (the value split on ';').
    scl : list
        Tokens of an `scl` value (the value split on ';').

    Returns
    -------
    int
        1 if `extractedappno` holds more citations than `scl`,
        -1 if it holds fewer, 0 if both hold the same number.
    """
    def count_citations(tokens):
        # Only non-blank tokens are citations. Counting this way also covers
        # an empty list, the [''] produced by ''.split(';'), and blank tokens
        # left by a leading or trailing ';'.
        return sum(1 for token in tokens if str(token).strip())

    n_extractedappno = count_citations(extractedappno)
    n_scl = count_citations(scl)

    # Sign of the difference: bool - bool yields the int 1, 0 or -1
    return (n_extractedappno > n_scl) - (n_extractedappno < n_scl)

def _citation_score(row):
    """Score one row: split both citation columns on ';' and pass them to diff."""
    return diff(row['extractedappno'].split(';'), row['scl'].split(';'))


# Store the per-row score in a new column
df_citation_check['citation_diff_extr_vs_scl_all'] = df_citation_check.apply(_citation_score, axis=1)

# Show how many rows fall under each score
df_citation_check.groupby('citation_diff_extr_vs_scl_all').size()

## Language merge
For each application number (`appno`), merge the application numbers found in the `extractedappno` column across the different language versions.

In [None]:
def merge_extracted(appno):
    """Merge the extracted application numbers of every row that has `appno`.

    Reads the module-level `df_citation_check`: selects the rows whose `appno`
    equals the argument, splits each of their `extractedappno` values on ';'
    and returns the union of the resulting tokens.

    Parameters
    ----------
    appno : str
        Application number to look up in the `appno` column.

    Returns
    -------
    list of str
        The distinct application numbers, sorted so the result is the same on
        every run. Empty list if no row matches or nothing was extracted.
        eg rows '21986/93;37685/10;22768/12', '' and '21986/93'
        give ['21986/93', '22768/12', '37685/10']
    """
    # Select with a boolean mask and read the values directly. This avoids
    # building a query string (which breaks if appno contains a quote) and the
    # to_csv round-trip, whose quoting turned empty values into '""'.
    same_appno = df_citation_check['appno'] == appno

    merged = set()
    for value in df_citation_check.loc[same_appno, 'extractedappno']:
        if not isinstance(value, str):
            # Nothing to split (e.g. a missing value)
            continue
        merged.update(token.strip() for token in value.split(';'))

    # Blank tokens come from empty values or a stray ';' and are not citations
    merged.discard('')
    return sorted(merged)

# merge_extracted depends only on the appno, so rows sharing an appno get the
# same result. Compute it once per distinct appno instead of once per row.
_merged_by_appno = {}


def _merge_extracted_cached(appno):
    """Return merge_extracted(appno), computing it only on the first request."""
    if appno not in _merged_by_appno:
        _merged_by_appno[appno] = merge_extracted(appno)
    # Hand out a copy so that rows do not share one mutable list object
    return list(_merged_by_appno[appno])


df_citation_check['extractedappno_merged'] = df_citation_check.apply(
    lambda row: _merge_extracted_cached(row['appno']), axis=1
)
df_citation_check.head()

In [None]:
# Back up the merged dataset (881s run time)
export_path = '../data/export/df_citation_check_lang_merged.csv'

# to_csv does not create missing folders, so make sure the export directory exists
os.makedirs(os.path.dirname(export_path), exist_ok=True)
df_citation_check.to_csv(export_path)

## Dataset merge
Merge the two datasets in order to get, check and merge the extracted references.

In [None]:
# Keep only the rows whose language code is ENG
df_citation_eng = df_citation_check.loc[df_citation_check['languageisocode'] == 'ENG']
df_citation_eng.head()

In [None]:
# Inner join of the English HUDOC rows with the parsed references on itemid
merged_df = df_citation_eng.merge(df_to_merge, how='inner', on='itemid')
merged_df.head()

In [None]:
#TO DO
"""Goal is to enrich the extracted citations by HUDOC with parsed citations with the method explained in https://github.com/jorgecarleitao/echr_network/. 
The necessity follows from the issue that HUDOC harvests citations, but not all, especially the ones where no application number is mentioned, 
whereas non-HUDOC scripts do not capture many of the HUDOC-extracted citations.

Preprocessing
1. Remove application numbers in extractedappno that are the same as the application number in appno (eg extractedappno=(38042/06;30979/96;31932/03) and appno=38042/06)

Explore how frequently citations are missing when comparing the two columns in the HUDOC-metadata 
2. Create column that calculates whether the number of citations in the scl column exceeds the number of citations in the extractedappno column or v.v.
Do this for the complete df ['citation_diff_extr_vs_scl_all'] and the df with only English documents ['citation_diff_extr_vs_scl_ENG']:
- If number of citations in extractedappno > number of citations in scl: score=1
- If number of citations in extractedappno < number of citations in scl: score=-1
- If number of citations in extractedappno = number of citations in scl: score=0 

Insert missing citations in HUDOC-based dataset
3. For each application number (appno), merge the application numbers from the extractedappno (for the different languages) column.

Explore how frequently citations are missing in the HUDOC-metadata and/or in the parsed data
4. Create column that calculates whether the number of citations in the extractedappno column exceeds the number of citations in the references column based on the parsed references (and vice versa)
- If number of citations in extractedappno > number of citations in references: score=1
- If number of citations in extractedappno < number of citations in references: score=-1
- If number of citations in extractedappno = number of citations in references: score=0 

Update the extractedappno column where the parsed references include additional citations
6. Create column that copies references column. Replace itemids in references with application numbers
7. Create column named extractedappno_enhanced. Copy the values of extractedappno in there
8. If reference in references not in extractedappno_enhanced:
    - Add reference to extractedappno_enhanced.
    - Else: pass
[TO DO: manually check for samples (eg 100 where score=1, 100 where score=-1, 100 where score=0) whether scl contains decisions that are not mentioned in extractedappno and v.v. Report results]
9. Create new column ['extractedecli'], where the application numbers from extractedappno are replaced with the corresponding eclis
10. Export citation network to csv file. The network should look like this (ECLIs are abbreviated below):
    Source,Target
    ECLI:1,ECLI:2
    ECLI:1,ECLI:3
    ECLI:2,ECLI:3
    ECLI:3,ECLI:45
    ...
"""