In [None]:
import requests
import pandas as pd
import time
from datetime import datetime
# Append extra OpenSSL cipher rules to the default cipher string of the urllib3
# copy that requests exposes (presumably to work around TLS handshake problems
# with the HUDOC server -- TODO confirm it is still needed).
# urllib3 >= 2.0 no longer defines DEFAULT_CIPHERS, so only touch it when present;
# otherwise this line raises AttributeError and the whole cell fails.
_ssl_util = requests.packages.urllib3.util.ssl_
if hasattr(_ssl_util, 'DEFAULT_CIPHERS'):
    _ssl_util.DEFAULT_CIPHERS += ':HIGH:!DH:!aNULL'


def read_echr_metadata(start_id=0, end_id=None, fields=None, verbose=True):
    """
    Read ECHR metadata from the HUDOC query endpoint into a Pandas DataFrame.

    The query selects English documents from the JUDGMENTS and COMMUNICATEDCASES
    collections, sorted by ``itemid`` in ascending order. Results are fetched for the
    half-open index range ``[start_id, end_id)`` in pages of at most 500 items.

    :param start_id: result index to start search from
    :param end_id: result index to end search at, exclusive (default None fetches all
        results); values beyond the number of available results are clamped
    :param fields: list of meta attribute names to return (default None fetches all attributes)
    :param verbose: bool whether or not to print fetched URLs
    :return: tuple ``(df, resultcount)`` where ``df`` has one row per fetched document and
        ``resultcount`` is the total number of results reported by HUDOC for the query
    :raises requests.HTTPError: if HUDOC answers a request with an HTTP error status
    """
    if not fields:
        fields = ['itemid',
                  'applicability',
                  'application',
                  'appno',
                  'article',
                  'conclusion',
                  'decisiondate',
                  'docname',
                  'documentcollectionid',
                  'documentcollectionid2',
                  'doctype',
                  'doctypebranch',
                  'ecli',
                  'externalsources',
                  'extractedappno',
                  'importance',
                  'introductiondate',
                  'isplaceholder',
                  'issue',
                  'judgementdate',
                  'kpdate',
                  'kpdateAsText',
                  'kpthesaurus',
                  'languageisocode',
                  'meetingnumber',
                  'originatingbody',
                  'publishedby',
                  'Rank',
                  'referencedate',
                  'reportdate',
                  'representedby',
                  'resolutiondate',
                  'resolutionnumber',
                  'respondent',
                  'respondentOrderEng',
                  'rulesofcourt',
                  'separateopinion',
                  'scl',
                  'sharepointid',
                  'typedescription',
                  'nonviolation',
                  'violation']

    # HUDOC does not allow fetching more than 500 items at the same time
    max_page_size = 500

    # NOTE(review): there is no space between the second "AND" and "(languageisocode";
    # the query text is kept exactly as it was -- confirm HUDOC parses it as intended.
    META_URL = ('http://hudoc.echr.coe.int/app/query/results'
                '?query=(contentsitename=ECHR) AND '
                '(documentcollectionid2:"JUDGMENTS" OR documentcollectionid2:"COMMUNICATEDCASES") AND'
                '(languageisocode:"ENG")'
                '&select={select}'
                '&sort=itemid Ascending'
                '&start={start}&length={length}')
    # Percent-encode the only characters of the template that are not URL-safe
    META_URL = META_URL.replace(' ', '%20')
    META_URL = META_URL.replace('"', '%22')

    select = ','.join(fields)

    def fetch_page(start, length, show_url):
        """Request ``length`` results beginning at index ``start`` and return the decoded JSON."""
        url = META_URL.format(select=select, start=start, length=length)
        if show_url:
            print(url)
        response = requests.get(url)
        # Fail loudly on an HTTP error instead of trying to decode an error page as JSON
        response.raise_for_status()
        return response.json()

    # A one-item request is enough to learn the total number of results
    resultcount = fetch_page(0, 1, show_url=False)['resultcount']

    # Only None means "everything"; an explicit 0 is a valid (empty) end index
    if end_id is None:
        end_id = resultcount
    # Never ask for indices past the last available result
    end_id = min(end_id, resultcount)

    data = []
    for page_start in range(start_id, end_id, max_page_size):
        # The last page is shortened so that no rows beyond end_id are returned
        page_end = min(page_start + max_page_size, end_id)
        print(" - Fetching information from cases {} to {}.".format(page_start, page_end))

        page = fetch_page(page_start, page_end - page_start, show_url=verbose)

        # Each entry of the results list carries the requested attributes under 'columns'
        data.extend(result['columns'] for result in page['results'])

    return pd.DataFrame.from_records(data), resultcount

# Download the metadata and report how long the download took.
n = 10  # size of a quick trial run; only used when the end_id line below is enabled
start = time.time()
df, resultcount = read_echr_metadata(
    start_id=0,
    #end_id=n,
    fields=['itemid', 'documentcollectionid2', 'languageisocode'],
    verbose=False,
)
elapsed = time.time() - start

print(f'\n{len(df)}/{resultcount} items collected. Time taken: {elapsed} s')

# Overview of the column labels that were returned
column_labels = list(df.columns)
print(column_labels)

 - Fetching information from cases 0 to 500.
 - Fetching information from cases 500 to 1000.
 - Fetching information from cases 1000 to 1500.
 - Fetching information from cases 1500 to 2000.
 - Fetching information from cases 2000 to 2500.
 - Fetching information from cases 2500 to 3000.
 - Fetching information from cases 3000 to 3500.
 - Fetching information from cases 3500 to 4000.
 - Fetching information from cases 4000 to 4500.
 - Fetching information from cases 4500 to 5000.
 - Fetching information from cases 5000 to 5500.
 - Fetching information from cases 5500 to 6000.
 - Fetching information from cases 6000 to 6500.
 - Fetching information from cases 6500 to 7000.
 - Fetching information from cases 7000 to 7500.
 - Fetching information from cases 7500 to 8000.
 - Fetching information from cases 8000 to 8500.
 - Fetching information from cases 8500 to 9000.
 - Fetching information from cases 9000 to 9500.
 - Fetching information from cases 9500 to 10000.
 - Fetching information 

In [9]:
# KEEP ENGLISH CHAMBER AND GRAND CHAMBER JUDGMENTS
collection_col = 'documentcollectionid2'

# Distribution of document types before filtering
print("Document types BEFORE filtering:", "\n", df[collection_col].value_counts(), "\n")

# A row is kept when it is a Chamber or Grand Chamber judgment AND is in English
is_judgment = df[collection_col].isin(['CASELAW;JUDGMENTS;CHAMBER;ENG', 'CASELAW;JUDGMENTS;GRANDCHAMBER;ENG'])
is_english = df['languageisocode'] == 'ENG'
df_filtered = df.loc[is_judgment & is_english]
# Select years (if desired)
#df_filtered = df_filtered.loc[df_filtered['year'] >= 2000]

# Distribution of document types after filtering
print("Document types AFTER filtering:", "\n", df_filtered[collection_col].value_counts(), "\n")

# Write the filtered metadata to a CSV file named after the current date and time
timestamp = datetime.now().strftime("%d-%m-%Y_%H-%M")
filename = f'ECHR_metadata_english_{timestamp}.csv'
df_filtered.to_csv('../../data/' + filename, index=False)
print('Metadata saved as ' + filename)

Document types BEFORE filtering: 
 CASELAW;JUDGMENTS;CHAMBER;ENG         19558
CASELAW;COMMUNICATEDCASES;ENG          9460
CASELAW;JUDGMENTS;COMMITTEE;ENG        3966
CASELAW;JUDGMENTS;GRANDCHAMBER;ENG      484
Name: documentcollectionid2, dtype: int64 

Document types AFTER filtering: 
 CASELAW;JUDGMENTS;CHAMBER;ENG         19558
CASELAW;JUDGMENTS;GRANDCHAMBER;ENG      484
Name: documentcollectionid2, dtype: int64 

Metadata saved as ECHR_metadata_english_08-07-2021_13-43.csv


In [2]:
# Show the metadata DataFrame as the cell's output.
# NOTE(review): the saved output below has an extra "Unnamed: 0" column and this cell's
# execution count (2) is lower than the filtering cell's (9) -- presumably `df` was
# re-loaded from a CSV in another session; confirm and re-run the notebook top to bottom.
df

Unnamed: 0,itemid,documentcollectionid2,languageisocode
0,001-100003,CASELAW;JUDGMENTS;CHAMBER;ENG,ENG
1,001-100006,CASELAW;JUDGMENTS;CHAMBER;ENG,ENG
2,001-100009,CASELAW;JUDGMENTS;CHAMBER;ENG,ENG
3,001-100011,CASELAW;JUDGMENTS;CHAMBER;ENG,ENG
4,001-100015,CASELAW;JUDGMENTS;CHAMBER;ENG,ENG
...,...,...,...
34192,001-99953,CASELAW;JUDGMENTS;CHAMBER;ENG,ENG
34193,001-99966,CASELAW;JUDGMENTS;CHAMBER;ENG,ENG
34194,001-99992,CASELAW;JUDGMENTS;CHAMBER;ENG,ENG
34195,001-99994,CASELAW;JUDGMENTS;CHAMBER;ENG,ENG
