# Case Law Citations Extractor for EUR-LEX
Extracts case citations for all cases in EUR-LEX. Citations are extracted at the level of detail of the individual paragraph cited. For example, if we are extracting citations for case 62011CJ0488, then the citation 62010CJ0618: N 31 38 - 43 49 57 58 will be decomposed into the individual citations: 62010CJ0618: N31, 62010CJ0618: N38, 62010CJ0618: N39, 62010CJ0618: N40, 62010CJ0618: N41, 62010CJ0618: N42, 62010CJ0618: N43, 62010CJ0618: N49, 62010CJ0618: N57 and 62010CJ0618: N58.

## Define main functions used in this notebook

### Functions: Part 1
Low-level functions for actually extracting metadata of each type for the given source case

In [479]:
# Urllib library used to query a website
from urllib.request import urlopen
# BeautifulSoup webscraping module for python
from bs4 import BeautifulSoup
# CSV parser
import csv
# Regular expressions
import re

#s = "123123STRINGabcabc"

def find_between(s, first, last):
    """Return the text between the first `first` and the next `last` after it.

    The search for `last` starts right after the end of the first occurrence
    of `first`. An empty string is returned when either marker is missing.
    """
    head = s.find(first)
    if head == -1:
        return ""
    begin = head + len(first)
    stop = s.find(last, begin)
    if stop == -1:
        return ""
    return s[begin:stop]

def find_between_r(s, first, last):
    """Return the text between the last `first` and the last `last` after it.

    Both markers are searched from the right: the final occurrence of `first`
    is located, then the final occurrence of `last` at or after its end. An
    empty string is returned when either marker is missing.
    """
    head = s.rfind(first)
    if head == -1:
        return ""
    begin = head + len(first)
    stop = s.rfind(last, begin)
    if stop == -1:
        return ""
    return s[begin:stop]


#print find_between( s, "123", "abc" ) -> 123STRING
#print find_between_r( s, "123", "abc" ) -> STRINGabc
        
def processProcedure(piece_of_text):
    """Extract the judge and advocate names from the Procedure section text.

    Each line is expected to look like "Label: value". The first line whose
    label contains "JUDGE" or "RAPPORTEUR" (case-insensitive) supplies the
    judge; the first line whose label contains "ADVOCATE" supplies the
    advocate.

    Args:
        piece_of_text: Raw text of the Procedure section, one entry per line.

    Returns:
        A two-item list ``[judge, advocate]``. An item is '' when no line
        with a matching label and a value was found.
    """
    judge = ''
    advocate = ''

    for line in piece_of_text.split('\n'):
        fields = line.split(': ')
        # Blank lines and label-only lines have no value part; reading
        # fields[1] from them would raise IndexError, so they are skipped.
        if len(fields) < 2:
            continue

        label = fields[0].upper()
        if ('JUDGE' in label or 'RAPPORTEUR' in label) and judge == '':
            judge = fields[1]
        elif 'ADVOCATE' in label and advocate == '':
            advocate = fields[1]

    return [judge, advocate]
            
def processTitle(piece_of_text):
    """Extract the title fields from the "Title and reference" section text.

    The first non-blank line is split on '.' into sentences. The result is a
    seven-item list:

        [ruling title, chamber, ruling name, extra 1, extra 2, case label, ecli]

    * ruling title -- first sentence
    * chamber      -- text inside the last parentheses of the first sentence
    * ruling name  -- second sentence
    * extra 1/2    -- third and fourth sentences when there are exactly five
                      sentences, otherwise the 'Check EUR-LEX webpage'
                      placeholder twice
    * case label   -- last sentence
    * ecli         -- last non-blank line of the section

    Raises IndexError when the text has no non-blank line or the first line
    has fewer than two sentences.
    """
    blank = ('', ' ')
    text_lines = [line for line in piece_of_text.split('\n') if line not in blank]
    sentences = [part for part in text_lines[0].split('.') if part not in blank]

    ruling_title = sentences[0]
    chamber = find_between_r(ruling_title, '(', ')')
    ruling_name = sentences[1]
    items = [ruling_title, chamber, ruling_name]

    if len(sentences) == 5:
        # Exactly five sentences: the two middle ones are kept as-is.
        items.extend(sentences[2:4])
    else:
        # Any other shape cannot be mapped reliably; leave a marker instead.
        items.extend(['Check EUR-LEX webpage', 'Check EUR-LEX webpage'])

    case_label = sentences[-1]
    ecli = text_lines[-1]
    items.append(case_label)
    items.append(ecli)
    return items
        
        
def processMisc(piece_of_text):
    """Extract the country from the Miscellaneous information section text.

    Each line is expected to look like "Label: value". Every line whose label
    contains "COUNTRY" (case-insensitive) overwrites the result, so the last
    such line wins.

    Args:
        piece_of_text: Raw text of the section, one entry per line.

    Returns:
        A one-item list ``[country]``; the item is '' when no line with a
        matching label and a value was found.
    """
    country = ''

    for line in piece_of_text.split('\n'):
        fields = line.split(': ')
        # A label without a ": value" part has nothing to read; indexing
        # fields[1] on it would raise IndexError.
        if len(fields) > 1 and 'COUNTRY' in fields[0].upper():
            country = fields[1]

    return [country]
        
        
def processDates(piece_of_text):
    """Extract the lodged date and the document date from the Dates section text.

    Each line is expected to look like "Label: value". A line whose label
    contains "LODGED" (case-insensitive) supplies the lodged date; every other
    "Label: value" line supplies the document date (the last one wins).

    Args:
        piece_of_text: Raw text of the Dates section, one entry per line.

    Returns:
        A two-item list ``[lodge_date, doc_date]``; an item is '' when no
        corresponding line was found.
    """
    lodge_date = ''
    doc_date = ''

    for line in piece_of_text.split('\n'):
        fields = line.split(': ')
        # Blank, whitespace-only or label-only lines have no value part.
        # Without this guard the catch-all branch below would index
        # fields[1] on them and raise IndexError.
        if len(fields) < 2:
            continue

        if 'LODGED' in fields[0].upper():
            lodge_date = fields[1]
        else:
            doc_date = fields[1]

    return [lodge_date, doc_date]

### Functions: Part 2
1) Low-level function for actually extracting the citations for a given source case, 2) function for extracting other subject matters related to a case, and 3) function to write data to file

In [480]:
# Urllib library used to query a website
from urllib.request import urlopen
# BeautifulSoup webscraping module for python
from bs4 import BeautifulSoup
# CSV parser
import csv
# Regular expressions
import re

# Write data (citations, metadata or subjects) to file
def writeToFile(subjectMatterCode, rows, datatype):
    """Append data rows to the CSV file of a subject matter and data type.

    The target file is
    '../data/judgements/<datatype>/<subjectMatterCode>_judgements_<datatype>.csv',
    opened in append mode as UTF-8. Fields are comma-separated and quoted with
    '|' only when necessary.

    Args:
        subjectMatterCode: Subject matter code used in the file name.
        rows: Either a single flat row (list of fields) or a list of rows.
            If any element is itself a list, every element is written as its
            own row; otherwise `rows` is written as one row. An empty list
            writes nothing.
        datatype: Sub-directory and file-name suffix (the callers in this
            file use 'citations', 'metadata' and 'subjects').
    """
    path = '../data/judgements/' + datatype + '/' + subjectMatterCode + '_judgements_' + datatype + '.csv'
    with open(path, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)

        # An empty batch must not reach writerow(): that would append an
        # empty record (a blank line) to the file.
        if not rows:
            return

        if any(isinstance(el, list) for el in rows):
            # A batch of rows.
            writer.writerows(rows)
        else:
            # A single flat row.
            writer.writerow(rows)

# Extract other subject matters for this source case (process the Classification section of the page)
def processClassifications(subjectMatterCode, piece_of_text, celexNumber):
    """Write the subject matters listed in the Classifications section to file.

    The subject matters are the non-blank lines found between the
    "Subject matter:" and "Case law directory code:" markers of the section
    text. Each one becomes a row ``[celexNumber, subject]`` in the 'subjects'
    file, with commas turned into semicolons and '|' characters removed.
    """
    subject_block = find_between_r(piece_of_text, "Subject matter:", "Case law directory code:")

    datarows = [
        [celexNumber, subject.replace(',', ';').replace('|', '')]
        for subject in subject_block.split('\n')
        if subject not in ('', ' ')
    ]

    writeToFile(subjectMatterCode, datarows, 'subjects')
        
# Filter out case citations that we are not interested in
def decomposeAndValidateCitation(citation, list_item):
    """Validate one cited reference and expand it into per-paragraph citations.

    Spaces are removed from `citation` first. The reference is accepted only
    when it starts with '6' and has 'C' at index 5 followed by 'J' or 'O' at
    index 6 (treated as a case rather than legislation, a directive, etc.).
    Anything else produces an empty list.

    For an accepted reference two expansions are performed:

    * Part A expands the paragraph suffix carried by the link text itself,
      i.e. everything after the first 11 characters. For example
      '61980CJ0100-N10-14' gives '61980CJ0100: N10' ... '61980CJ0100: N14'.
    * Part B scans every line of `list_item.text` for '<citation>:' and
      expands the paragraph numbers written after that colon, including
      'a - b' ranges.

    Args:
        citation: Link text of one cited document.
        list_item: Object whose `.text` attribute holds the text of the whole
            citations list item (extractCitations passes the enclosing list
            item).

    Returns:
        A list of strings such as '61980CJ0100: N10'. The list may contain
        duplicates (see the range handling in Part B) and is empty when the
        reference is not accepted.
    """
    # Correct citations array
    correct_citations = []
    citation = citation.replace(" ", "")
    # Check if citation is a case (not a legislation or directive etc.)
    if (citation) and (citation[0] == '6' and citation[5] == 'C') and (citation[6] == 'J' or citation[6] == 'O'):
        # It is a valid citation, now decompose it
        # Part A: clean the last part of the citation e.g. the '-N10-14' part in '61980CJ0100-N10-14' -> '61980CJ0100: N10,
        # 61980CJ0100: N11, 61980CJ0100: N12, 61980CJ0100: N13, 61980CJ0100: N14'
        part_to_clean = citation[11:]
        if (part_to_clean.count(',') > 0):
            # Commas are not expected here; the occurrence is reported on
            # stdout and the commas are turned into spaces.
            print('WTF')
            part_to_clean = part_to_clean.replace(',', ' ')
        dash_count = part_to_clean.count('-')
        n_count = part_to_clean.count('N')
        if ((dash_count > 1) or (n_count > 1)):
            if (dash_count > 1):
                # Range form such as '-N10-14': drop the leading '-' and
                # expand the inclusive range between the first two
                # '-'-separated pieces (any further pieces are ignored).
                new_part_to_clean = part_to_clean[1:]
                # NOTE(review): guess / occurrences / indices are not used
                # after being computed.
                guess = "-"
                occurrences = new_part_to_clean.count(guess)
                indices = [i for i, a in enumerate(new_part_to_clean) if a == guess]
                
                split_part = new_part_to_clean.split('-')
                lhs = split_part[0]
                rhs = split_part[1]
                # Keep only the digits of each side, e.g. 'N10' -> '10'.
                number1 = re.sub("[^0-9]", "", lhs)
                number2 = re.sub("[^0-9]", "", rhs)
                for x in range(int(number1), int(number2)+1):
                    current_correct_citation = str(citation[:11]) + ': N' + str(x)
                    correct_citations.append(current_correct_citation)
            else:
                # It must be the case that there are two N's and a :
                # Drop the leading character and split on ':'
                new_part_to_clean = part_to_clean[1:].split(':')
                # The piece before the ':' is emitted verbatim.
                current_correct_citation = str(citation[:11]) + ': ' + str(new_part_to_clean[0])
                correct_citations.append(current_correct_citation)
                # NOTE(review): raises IndexError when there is no ':' in
                # the suffix.
                rhs = new_part_to_clean[1]
                # Spaces in rhs can only come from the comma replacement
                # above, because all spaces were stripped from the citation.
                rhs_numbers = rhs.split(' ')
                for rhs_number in rhs_numbers:
                    # Only pieces containing at least one digit are kept; the
                    # piece itself (not just its digits) follows the 'N'.
                    tmp = re.sub("[^0-9]", "", rhs_number)
                    if (len(tmp) != 0):
                        current_correct_citation = str(citation[:11]) + ': N' + str(rhs_number)
                        correct_citations.append(current_correct_citation)
        else:
            # At most one '-' and one 'N': a single paragraph such as
            # '61980CJ0100-N10' becomes '61980CJ0100: N10'. A bare case code
            # with no '-' contributes nothing in Part A.
            split_part = citation.split('-')
            if (len(split_part) > 1):
                correct_citation = str(split_part[0]) + ': ' + str(split_part[1]) 
                correct_citations.append(correct_citation)
            
        # Part B: add other paragraphs 
        # E.g. '61980CJ0100-N10-14: N 142' should include Part A decompositions + '61980CJ0100: N142'
        test = list_item.text
        string_result = test.split('\n')
        for tmp in string_result:
            # Only lines containing the space-free citation immediately
            # followed by ':' are considered.
            if (tmp.find(str(citation) + ':') != -1):
                # Only the text between the first and the second ':' of the
                # line is used.
                split_again = tmp.split(':')
                stuff_to_clean = split_again[1]
                
                # Isolate every '-' as its own space-separated token and
                # collapse runs of spaces.
                stuff_to_clean = stuff_to_clean.replace('-', ' - ')
                stuff_to_clean = re.sub(' +',' ',stuff_to_clean)
                if (stuff_to_clean.count(',') > 0):
                    # Unexpected commas: reported on stdout, then treated as
                    # separators.
                    print('WTF2')
                    stuff_to_clean = stuff_to_clean.replace(',', ' ')
                dash_count2 = 0
                dash_count2 = stuff_to_clean.count('-')
                if (dash_count2 > 0):
                    # At least one range is present. Walk the tokens: a '-'
                    # expands the inclusive range between its two neighbours,
                    # every other token containing a digit is added on its
                    # own. Range endpoints are therefore added twice (once
                    # individually, once by the range).
                    split_string_space = stuff_to_clean.split(' ')
                    for idx, val in enumerate(split_string_space):
                        if (val == '-'):
                            # int() raises ValueError when a neighbour is not
                            # a plain number.
                            num1 = int(split_string_space[idx-1])
                            num2 = int(split_string_space[idx+1])
                            for x in range(num1, num2+1):
                                current_correct_citation = str(citation[:11]) + ': N' + str(x)
                                correct_citations.append(current_correct_citation)
                        else:
                            # Rebinding tmp is harmless: the outer loop
                            # assigns it afresh on its next iteration.
                            tmp = re.sub("[^0-9]", "", val)
                            if (len(tmp) != 0):
                                current_correct_citation = str(citation[:11]) + ': N' + str(val)
                                correct_citations.append(current_correct_citation)

                else:
                    # No ranges: every space-separated token containing a
                    # digit is one paragraph.
                    split_string = stuff_to_clean.split(' ')
                    for num in split_string:
                        tmp = re.sub("[^0-9]", "", num)
                        if (len(tmp) != 0):
                            current_correct_citation = str(citation[:11]) + ': N' + str(num)
                            correct_citations.append(current_correct_citation)
   
    return correct_citations

### Functions: Part 3
Main calling functions for extracting the citations and metadata for a given source case 

In [481]:
# Extract citations for the case given the BeautifulSoup format of its HTML page
def extractCitations(subjectMatterCode, soup_judgement_page, celexNumber):
    """Collect the paragraph-level citations of one case and write them to file.

    Every list item of the page whose text starts with 'Instruments' is
    treated as the citations list. The text of each link carrying an href is
    decomposed into individual citations, which are kept in order of first
    appearance without duplicates. Each citation '<case>: <paragraph>' is
    written to the 'citations' file as ``[celexNumber, case, paragraph]``.
    """
    citations = []

    for list_item in soup_judgement_page.find_all('li'):
        # Only the list item that enumerates the cited instruments matters.
        if not list_item.text.startswith('Instruments'):
            continue

        for anchor in list_item.find_all('a'):
            if not anchor.has_attr('href'):
                continue

            for candidate in decomposeAndValidateCitation(anchor.text, list_item):
                # Skip empty entries and anything already collected.
                if candidate and candidate not in citations:
                    citations.append(candidate)

    datarows = []
    for entry in citations:
        # e.g. '62008CJ0040: N44' -> case code '62008CJ0040', paragraph 'N44'
        pieces = entry.split(': ')
        datarows.append([celexNumber, pieces[0], pieces[1]])

    writeToFile(subjectMatterCode, datarows, 'citations')
    
def findSectionType(result):
    """Return the title text of a page section.

    The title is the text of the first nested div whose first class name
    contains 'boxTitle' exactly once. An empty string is returned when no
    such div exists.
    """
    for candidate in result.find_all("div"):
        class_names = candidate.get('class')
        if class_names is None:
            continue
        if class_names[0].count('boxTitle') == 1:
            return candidate.text
    return ''

# Extract Metadata for case given the BeautifulSoup format of its HTML page
def extractMetadata(subjectMatterCode, soup_judgement_page, celexNumber):
    """Collect the metadata row of one case and append it to the metadata file.

    Every div of class 'box' on the page is a section. The sections titled
    "Title and reference", "Dates", "Procedure", "Classifications" and
    "Miscellaneous information" are processed; inside each, every div whose
    first class name contains 'tabContent' exactly once is handed to the
    matching process* function. The extracted fields are appended to the row
    in page order. Classifications are written to their own file and add
    nothing to the row.

    The finished row starts with the CELEX number, ends with the subject
    matter code, and has every comma inside a field replaced by a semicolon.
    """
    wanted_sections = (
        "Titleandreference",
        "Dates",
        "Procedure",
        "Classifications",
        "Miscellaneousinformation",
    )
    datarow = [celexNumber]

    for box in soup_judgement_page.find_all("div", {"class": "box"}):
        inner_divs = box.find_all("div")
        # Section titles are compared with all spaces removed.
        sectionType = findSectionType(box).replace(" ", "")
        if not any(name in sectionType for name in wanted_sections):
            continue

        for inner in inner_divs:
            class_names = inner.get('class')
            if class_names is None or class_names[0].count('tabContent') != 1:
                continue

            if "Titleandreference" in sectionType:
                # 1. Title and reference: chamber, ruling name, ruling content, case label
                datarow.extend(processTitle(inner.text))
            elif "Dates" in sectionType:
                # 2. Dates: date lodged, date of document
                datarow.extend(processDates(inner.text))
            elif "Classifications" in sectionType:
                # 3. Classifications: subject matters go to their own file
                processClassifications(subjectMatterCode, inner.text, celexNumber)
            elif "Miscellaneousinformation" in sectionType:
                # 4. Miscellaneous information: country
                datarow.extend(processMisc(inner.text))
            elif "Procedure" in sectionType:
                # 5. Procedure: Judge-Rapporteur, Advocate General
                datarow.extend(processProcedure(inner.text))

    # Commas inside a field are replaced so they cannot be mistaken for the
    # CSV delimiter.
    cleaned_row = [field.replace(',', ';') for field in datarow]
    cleaned_row.append(subjectMatterCode)

    writeToFile(subjectMatterCode, cleaned_row, 'metadata')


### Functions: Part 4
Functions for processing a single case and processing all cases within a main case subject

In [482]:
from math import ceil
import json

# Process individual case given CELEX number
def processCase(subjectMatterCode, celexNumber):
    """Download one judgement page from EUR-LEX and extract its data.

    Prints the CELEX number being processed, fetches the page, then writes
    the case's metadata and its citations through extractMetadata and
    extractCitations.

    Args:
        subjectMatterCode: Subject matter code the case was found under; it
            selects the output files.
        celexNumber: CELEX number of the case to download.
    """
    print("Source: " + str(celexNumber))
    # URL prefix for a judgement on EUR-LEX
    result_url_prefix = "https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:"
    # URL for this particular judgement
    judgement_url = result_url_prefix + str(celexNumber)
    # The response is parsed inside the with-block so that the connection is
    # closed as soon as the page has been read, even if parsing fails.
    with urlopen(judgement_url) as judgement_page:
        soup_judgement_page = BeautifulSoup(judgement_page, "lxml")
    # Get Metadata
    extractMetadata(subjectMatterCode, soup_judgement_page, celexNumber)
    # Get citations
    extractCitations(subjectMatterCode, soup_judgement_page, celexNumber)

 # Process cases for a particular subject matter code
def processCases(subjectMatterCode):
    """Process every case listed in the EUR-LEX search results for a subject matter.

    The first results page is downloaded to find out how many result pages
    exist, then every page is walked and each listed case is handed to
    processCase. Progress is printed to stdout.

    The page count comes from the second comma-separated argument of the
    paging form's `onsubmit` attribute. When the form or the attribute is
    missing, the count is derived from the 'search' -> 'count' entry of the
    page's JSON script blocks, assuming 10 results per page; failing that, a
    single page is assumed.

    Args:
        subjectMatterCode: EUR-LEX subject matter code to search for.
    """
    # URL prefix and suffix for judgements search results (url = prefix_url + subject matter code + suffix_url)
    prefix_url = "http://eur-lex.europa.eu/search.html?searchEq=true&qid=1524797649507&DB_TYPE_OF_ACT=judgment&CASE_LAW_SUMMARY=false&DTS_DOM=EU_LAW&CT_CODED="
    suffix_url = "&typeOfActStatus=JUDGMENT&type=advanced&lang=en&SUBDOM_INIT=EU_CASE_LAW&DTS_SUBDOM=EU_CASE_LAW"
    # Get the URL for all cases about this subject matter code
    url = prefix_url + subjectMatterCode + suffix_url
    # Parse the first results page; the response is closed once it is read
    with urlopen(url) as url_page:
        soup_url_page = BeautifulSoup(url_page, "lxml")

    # The paging form, or its onsubmit attribute, may be absent. Both cases
    # are normalised to an empty string so that the JSON fallback below is
    # reached instead of failing on None.
    paging_form = soup_url_page.find('form', id='pagingForm')
    j_onsubmit = paging_form.get('onsubmit') if paging_form is not None else None
    j_onsubmit = (j_onsubmit or "").replace(" ", "")

    j_number = 1
    if j_onsubmit:
        j_numberStr = j_onsubmit.split(",")[1]
        j_numberStr = j_numberStr.replace(")", "")
        j_numberStr = j_numberStr.replace(" ", "")
        # Final number of result pages
        j_number = int(j_numberStr)
    else:
        for script in soup_url_page.find_all('script', type='application/json'):
            print("script: " + script.text)
            script_text = json.loads(script.text)
            if 'search' in script_text:
                if script_text['search']:
                    num = script_text['search']['count']
                    print("yay!: " + str(num))
                    # The search results page displays 10 results at a time
                    if (num / 10) > 1:
                        j_number = ceil(num / 10)

    print()
    print("total pages: " + str(j_number))
    print()

    print()
    print("Page 1/" + str(j_number))
    print()
    # Process the cases on the first page of the results
    _processResultsPage(subjectMatterCode, soup_url_page)

    # Process the other cases from Page 2 of results onwards
    for x in range(2, j_number + 1):
        print()
        print("Page " + str(x) + "/" + str(j_number))
        print()
        # Get URL of Page x of results
        current_judgements_result_page_url = url + '&page=' + str(x)
        with urlopen(current_judgements_result_page_url) as current_judgements_result_page:
            soup_current_judgements_page = BeautifulSoup(current_judgements_result_page, "lxml")
        _processResultsPage(subjectMatterCode, soup_current_judgements_page)


def _processResultsPage(subjectMatterCode, soup_results_page):
    """Process every case whose CELEX number is listed on one search results page."""
    # Each result's metadata (including the CELEX number) sits in a
    # 'leftMetadata' table cell.
    for result in soup_results_page.find_all('td', class_='leftMetadata'):
        for ul in result.find_all('ul'):
            for li in ul.find_all('li'):
                if li.text[:13] != "CELEX number:":
                    continue
                celex = li.text[14:]
                # Same acceptance rule as for citations: '6' first, then 'C'
                # at index 5 and 'J' or 'O' at index 6. The length check
                # keeps a truncated value from raising IndexError.
                if len(celex) > 6 and celex[0] == '6' and celex[5] == 'C' and celex[6] in ('J', 'O'):
                    processCase(subjectMatterCode, celex)
    
    
    

### Main Procedure: Get the list of EUR-LEX subject matter codes from file and process the cases for each
Note: the list of EUR-LEX subject matter codes is read from the first column of "../data/subject_matters_sorted.csv".

In [483]:
# CSV parser
import csv

# Array storing EUR-LEX subject matter codes
subjectMatterCodes = []

# Transfer the codes (first column of each row) from the comma-separated
# file to the array. The csv module expects files to be opened with
# newline=''.
with open('../data/subject_matters_sorted.csv', newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter=',')
    for row in reader:
        # csv.reader yields an empty list for a blank line; indexing row[0]
        # on it would raise IndexError.
        if row:
            subjectMatterCodes.append(row[0])


# To process a single case instead of everything, call e.g.:
#processCase('FIN', '62000CJ0011')
# For each subject matter code in EUR-LEX
for subjectMatterCode in subjectMatterCodes:
    print(subjectMatterCode)
    processCases(subjectMatterCode)
    print()


Source: 62000CJ0011
sectionType: Titleandreference





sectionType2: Titleandreference





sectionType: Languagesandformatsavailable





sectionType: Multilingualdisplay





sectionType: Dates





sectionType2: Dates





sectionType: Procedure





sectionType2: Procedure







Applicant: COMM, IC

Defendant: BCE, IC

Judge-Rapporteur: La Pergola
Advocate General: Jacobs


sectionType: Doctrine





sectionType: Relationshipbetweendocuments





sectionType: Text





sectionType: 
