# 1. Get Judgement and Order IDs 
This part gets the judgement and order IDs for the advanced search results described by Marcel: 

In advanced search, I would do the following:
1. In ‘Collection’, select ‘EU Case Law’
2. In the ‘Document reference’ block, under ‘Type’, you need ‘Judgment’ and ‘Order’. 
3. Since you cannot search for both types simultaneously, you need two searches (one per type).
4. In the ‘subject matter’ field under ‘Theme’ (last block), you select ‘(PROT) Consumer Protection’
5. Alternatively, you can select ‘Directory of case-law’ code 4.18 (after 2010). 
6. Make sure to check the box ‘show cases from both schemes’

In [11]:
#import the urllib library used to query a website
from urllib.request import urlopen

#import BeautifulSoup webscraping module for python
from bs4 import BeautifulSoup

#specify the urls of the search results for judgements and orders on EUR-LEX
judgements_url = "http://eur-lex.europa.eu/search.html?searchEq=true&qid=1521964071141&DB_TYPE_OF_ACT=judgment&CASE_LAW_SUMMARY=false&DTS_DOM=EU_LAW&CT_CODED=PROT&typeOfActStatus=JUDGMENT&type=advanced&lang=en&SUBDOM_INIT=EU_CASE_LAW&DTS_SUBDOM=EU_CASE_LAW"
orders_url = "http://eur-lex.europa.eu/search.html?searchEq=true&qid=1521966304302&DB_TYPE_OF_ACT=order&CASE_LAW_SUMMARY=false&DTS_DOM=EU_LAW&CT_CODED=PROT&typeOfActStatus=ORDER&type=advanced&lang=en&SUBDOM_INIT=EU_CASE_LAW&DTS_SUBDOM=EU_CASE_LAW"


def _fetch_soup(url):
    """Download `url` and return its html parsed by BeautifulSoup with the 'lxml' parser."""
    #the 'with' block closes the HTTP response once the page has been parsed
    with urlopen(url) as response:
        return BeautifulSoup(response, "lxml")


def _page_count(soup):
    """Return the number of result pages announced by a search results page.

    The number is read from the second comma-separated argument of the
    'onsubmit' attribute of the form with id 'pagingForm'.
    """
    onsubmit = soup.find('form', id='pagingForm').get('onsubmit')
    number_str = onsubmit.split(",")[1].replace(")", "").replace(" ", "")
    return int(number_str)


def _judgement_ids_on_page(soup):
    """Return the ids listed on one judgements result page.

    Every list item whose text starts with "CELEX number:" contributes the
    text that follows that label.
    """
    ids = []
    for td in soup.find_all('td', class_='leftMetadata'):
        for ul in td.find_all('ul'):
            for li in ul.find_all('li'):
                if li.text.startswith("CELEX number:"):
                    ids.append(li.text[14:])
    return ids


def _order_ids_on_page(soup):
    """Return the ids listed on one orders result page.

    The id is picked by position rather than by label: from a list with
    exactly two items the first item is used, from a longer list the second
    item is used, and shorter lists contribute nothing.
    """
    #NOTE(review): presumably the positional rule exists because the item order differs between results - confirm
    ids = []
    for table in soup.find_all('table', class_='documentTable'):
        for td in table.find_all('td', class_='leftMetadata'):
            for ul in td.find_all('ul'):
                items = ul.find_all('li')
                if len(items) == 2:
                    ids.append(items[0].text[14:])
                elif len(items) > 2:
                    ids.append(items[1].text[14:])
    return ids


#Fetch and parse the first result page for judgements and for orders
soup_judgements_page = _fetch_soup(judgements_url)
soup_orders_page = _fetch_soup(orders_url)

#Get number of pages in judgements and orders search results
j_number = _page_count(soup_judgements_page)
o_number = _page_count(soup_orders_page)

#find the result ids on the first page for judgements and orders
judgement_ids = _judgement_ids_on_page(soup_judgements_page)
order_ids = _order_ids_on_page(soup_orders_page)

print()
print("Judgements:")
print()

#Loop through each result page for judgements from page 2
for x in range(2, j_number + 1):
    current_judgements_result_page_url = judgements_url + '&page=' + str(x)
    judgement_ids.extend(_judgement_ids_on_page(_fetch_soup(current_judgements_result_page_url)))

print()
print(judgement_ids)
print(len(judgement_ids))
print()

print()
print("Orders:")
print()

#Loop through each result page for orders from page 2
for x in range(2, o_number + 1):
    current_orders_result_page_url = orders_url + '&page=' + str(x)
    order_ids.extend(_order_ids_on_page(_fetch_soup(current_orders_result_page_url)))

print()
print(order_ids)
print(len(order_ids))
print()


Judgements:


['62016CJ0016', '62015CJ0434', '62015CJ0598', '62015CJ0596', '62016CJ0289', '62016CJ0186', '62016CJ0111', '62016CJ0247', '62016CJ0357', '62016CJ0133', '62016CJ0392', '62015CJ0621', '62016CJ0075', '62016CJ0296', '62015CJ0339', '62016CJ0146', '62015CJ0536', '62015CJ0568', '62015CJ0503', '62015CJ0562', '62015TJ0474', '62014CJ0421', '62015CJ0375', '62015CJ0282', '62015CJ0154', '62015CJ0119', '62015CJ0327', '62015CJ0667', '62015CJ0127', '62015CJ0177', '62015CJ0042', '62015CJ0149', '62014CJ0611', '62015CJ0113', '62014CJ0484', '62015CJ0310', '62015CJ0191', '62015CJ0168', '62015CJ0019', '62014CJ0476', '62014CJ0377', '62014CJ0381', '62014CJ0397', '62015TJ0100', '62014CJ0049', '62014CJ0157', '62014CJ0326', '62014CJ0008', '62014CJ0508', '62014CJ0032', '62014CJ0085', '62014CJ0416', '62014CJ0110', '62013TJ0337', '62014CJ0348', '62014CJ0207', '62012TJ0334', '62012TJ0296', '62013CJ0497', '62014CJ0195', '62014CJ0096', '62013CJ0388', '62013CJ0503', '62013CJ0143', '62013CJ0567', '62013CJ0

In [3]:
#!pip install bs4

# 2. Get citations for each judgement 
This part gets the citations for each judgement and appends the results to the CSV file `judgements_2.csv` (the file is created if it does not exist yet): 

In [5]:
#import the urllib library used to query a website
from urllib.request import urlopen

#import csv i/o library
import csv

#import BeautifulSoup webscraping module for python
from bs4 import BeautifulSoup

#url prefix for information about a particular judgement or order
result_url_prefix = "https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:"

#judgement ids
#NOTE: only the first two ids are active; the rest of the id list is kept below, commented out.
#Every continuation line must start with '#', otherwise the cell does not parse.
judgement_ids = ['62016CJ0016', '62015CJ0434']#, '62015CJ0598', '62015CJ0596', '62016CJ0289', '62016CJ0186', '62016CJ0111', '62016CJ0247', '62016CJ0357', '62016CJ0133', '62016CJ0392', '62015CJ0621', '62016CJ0075', '62016CJ0296', '62015CJ0339', '62016CJ0146', '62015CJ0536', '62015CJ0568', '62015CJ0503', '62015CJ0562', '62015TJ0474', '62014CJ0421', '62015CJ0375', '62015CJ0282', '62015CJ0154', '62015CJ0119', '62015CJ0327', '62015CJ0667', '62015CJ0127', '62015CJ0177', '62015CJ0042', '62015CJ0149', '62014CJ0611', '62015CJ0113', '62014CJ0484', '62015CJ0310', '62015CJ0191', '62015CJ0168', '62015CJ0019', '62014CJ0476', '62014CJ0377', '62014CJ0381', '62014CJ0397', '62015TJ0100', '62014CJ0049', '62014CJ0157', '62014CJ0326', '62014CJ0008', '62014CJ0508', '62014CJ0032', '62014CJ0085', '62014CJ0416', '62014CJ0110', '62013TJ0337', '62014CJ0348', '62014CJ0207', '62012TJ0334', '62012TJ0296', '62013CJ0497', '62014CJ0195', '62014CJ0096', '62013CJ0388', '62013CJ0503', '62013CJ0143', '62013CJ0567', '62013CJ0482', '62013CJ0537', '62013CJ0449', '62013CJ0310', '62013CJ0137', '62013CJ0222', '62013CJ0034', '62014CJ0169', '62012CJ0421', '62012TJ0198', '62013CJ0026', '62012CJ0609', '62012CJ0515', '62012CJ0565', '62012CJ0470', '62012CJ0226', '62012CJ0281', '62012CJ0413', '62012CJ0391', '62012CJ0059', '62012CJ0032', '62011CJ0435', '62012CJ0265', '62012CJ0299', '62011CJ0313', '62011CJ0657', '62011CJ0397', '62011CJ0636', '62011CJ0092', '62010TJ0301', '62011CJ0472', '62011CJ0415', '62011CJ0206', '62011CJ0428', '62010CJ0544', '62010CJ0602', '62011CJ0049', '62010CJ0618', '62010CJ0472', '62010CJ0453', '62011CJ0166', '62010CJ0292', '62011CJ0134', '62010CJ0360', '62010CJ0495', '62010CJ0070', '62010CJ0327', '62009CJ0323', '62010CJ0058', '62009CJ0442', '62010CJ0015', '62010CJ0014', '62009CJ0324', '62009CJ0065', '62010CJ0122', '62009CJ0543', '62010CJ0134', '62010CJ0016', '62008CJ0120', '62009CJ0108', '62009CJ0047', '62009CJ0159', '62008CJ0137', '62008CJ0540', '62009CJ0154', '62008CJ0222', '62008CJ0389', 
#'62008CJ0393', '62009CJ0099', '62008CJ0484', '62008CJ0545', '62008CJ0511', '62008CJ0215', '62008CJ0317', '62008CJ0522', '62008CJ0304', '62008CJ0227', '62007CJ0424', '62008CJ0358', '62008CJ0040', '62007CJ0446', '62007CJ0489', '62007CJ0343', '62007CJ0487', '62008CJ0285', '62007CJ0261', '62008CJ0321', '62007CJ0509', '62007CJ0458', '62008CJ0282', '62007CJ0539', '62007CJ0205', '62007CJ0336', '62007CJ0230', '62007CJ0493', '62006CJ0533', '62006CJ0412', '62006CJ0244', '62006CJ0262', '62005CJ0429', '62005CJ0327', '62005CJ0381', '62006CJ0141', '62005CJ0343', '62006CJ0127', '62005CJ0315', '62005CJ0168', '62004CJ0356', '62004CJ0438', '62004CJ0177', '62005CJ0059', '62004CJ0127', '62003CJ0402', '62004CJ0033', '62004CJ0229', '62003CJ0350', '62003CJ0135', '62004CJ0107', '62004CJ0203', '62003CJ0211', '62003CJ0132', '62003CJ0336', '62003CJ0210', '62002CJ0434', '62003CJ0070', '62002CJ0239', '62002CJ0237', '62002TJ0177', '62002CJ0264', '62003CJ0312', '62003CJ0310', '62001CJ0216', '62002CJ0040', '62001CJ0316', '62000CJ0469', '62001CJ0108', '62001CJ0044', '62001CJ0229', '62000CJ0221', '62000CJ0421', '62001CJ0491', '62001CJ0392', '62001CJ0414', '62000CJ0473', '62001CJ0099', '62001CJ0386', '62000TJ0179', '62000CJ0066', '61999CJ0478', '62000CJ0400', '62000CJ0154', '62000CJ0052', '62000CJ0183', '62000CJ0168', '61999CJ0481', '61999CJ0372', '61999CJ0541', '61999CJ0112', '61999CJ0144', '61999CJ0203', '61999TJ0144', '61998CJ0312', '61998CJ0366', '61998CJ0240', '61998CJ0465', '61998CJ0208', '61998CJ0401', '61997CJ0140', '61997CJ0423', '61996CJ0289', '61997CJ0087', '61998CJ0319', '61997CJ0383', '61996CJ0410', '61996CJ0385', '61997CJ0129', '61996CJ0364', '61996CJ0045', '61995CJ0259', '61996CJ0083', '61995CJ0300', '61994CJ0321', '61994CJ0178', '61994CJ0192', '61994CJ0051', '61994CJ0085', '61993CJ0144', '61991CJ0146', '61992CJ0359', '61992CJ0091', '61993CJ0017', '61992CJ0285', '61991CJ0222', '61992CJ0011', '61991CJ0293', '61990CJ0373', '61990CJ0032', '61989CJ0369', '61989CJ0361', '61989CJ0241', 
#'61988CJ0362', '61988CJ0329', '61988CJ0360', '61987CJ0382', '61987CJ0298', '61984CJ0176', '61984CJ0178', '61986CJ0098', '61985CJ0179', '61983CJ0207', '61983CJ0016', '61982CJ0227', '61982CJ0042', '61982CJ0094', '61982CJ0155', '61981CJ0124', '61981CJ0286', '61981CJ0261', '61981CJ0006', '61980CJ0193', '61980CJ0113', '61980CJ0056', '61980CJ0130', '61980CJ0053', '61980CJ0027']

#get citations for each judgement
for judgement_id in judgement_ids:
    #unique citations found so far for this judgement, in order of appearance
    citations = []
    #get url for this particular judgement
    judgement_url = result_url_prefix + str(judgement_id)
    print(judgement_url)
    #open the page and convert it to beautifulsoup format;
    #the 'with' block closes the HTTP response once the page has been parsed
    with urlopen(judgement_url) as judgement_page:
        soup_judgement_page = BeautifulSoup(judgement_page, "lxml")
    #loop through all list items in this web page, looking for the citation list item
    for result in soup_judgement_page.find_all('li'):
        #skip list items that do not list the citations for this judgement
        if not result.text.startswith('Instruments'):
            continue
        print(result)
        for link in result.find_all('a'):
            #only links that carry an href attribute are citations
            if not link.has_attr('href'):
                continue
            #the link text is used (not the CELEX id in the href) because it keeps suffixes such as '-L1'
            this_citation = link.text
            if this_citation not in citations:
                citations.append(this_citation)
    #append one row per unique citation, opening the csv file once per judgement
    if citations:
        with open('judgements_2.csv', 'a', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            writer.writerows([judgement_id, this_citation] for this_citation in citations)
    
                

https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:62016CJ0016
<li>Instruments cited in case law: <br/>
<a href="./../../../legal-content/EN/AUTO/?uri=CELEX:11957E173">11957E173</a>: N 42<br/>
<a href="./../../../legal-content/EN/AUTO/?uri=CELEX:12008E049">12008E049</a>: N 12<br/>
<a href="./../../../legal-content/EN/AUTO/?uri=CELEX:12008E056">12008E056</a>: N 12<br/>
<a href="./../../../legal-content/EN/AUTO/?uri=CELEX:12008E263">12008E263-L1</a>: N 2 42 43<br/>
<a href="./../../../legal-content/EN/AUTO/?uri=CELEX:12008E263">12008E263</a>: N 11 14 18 25 27 30 31 33 40 43 44<br/>
<a href="./../../../legal-content/EN/AUTO/?uri=CELEX:12008E267">12008E267</a>: N 44<br/>
<a href="./../../../legal-content/EN/AUTO/?uri=CELEX:12008E288">12008E288</a>: N 3 11 18 26<br/>
<a href="./../../../legal-content/EN/AUTO/?uri=CELEX:12008E288">12008E288-L5</a>: N 30 38<br/>
<a href="./../../../legal-content/EN/AUTO/?uri=CELEX:12008E288">12008E288-L4</a>: N 38<br/>
<a href="./../../../legal-content

# 3. Get citations for each order 
This part gets the citations for each order and appends the results to the CSV file `orders_2.csv` (the file is created if it does not exist yet): 

In [3]:
#import the urllib library used to query a website
from urllib.request import urlopen

#import csv i/o library
import csv

#import BeautifulSoup webscraping module for python
from bs4 import BeautifulSoup

#base url of the EUR-LEX page for a single document; the CELEX id is appended to it
result_url_prefix = "https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:"

#CELEX ids of the orders whose citations are collected
order_ids = [
    '62017CO0126', '62017CO0119', '62017CO0446', '62017CO0344', '62017CO0232',
    '62017CO0259', '62016CO0356', '62016CO0535', '62014CO0381', '62015CO0637',
    '62014CO0568(01)', '62015CO0511', '62015CO0534', '62016CO0091', '62016CO0007',
    '62014CO0122', '62015CO0613', '62015CO0537', '62015TO0474', '62015CO0074',
    '62014TO0578', '62014TO0721', '62014CO0085', '62014TO0619', '62015CO0013',
    '62014CO0539', '62014CO0090', '62013CO0602', '62011TO0112', '62014CO0092',
    '62014CO0169', '62013CO0153', '62013CO0298', '62013CO0430', '62013TO0337',
    '62012CO0537', '62012CO0343', '62011CO0433', '62011CO0252', '62011CO0559',
    '62011CO0126', '62010CO0288', '62010CO0032', '62010CO0352', '62010CO0076',
    '62009CO0085', '62009CO0143', '62007TO0137', '62002TO0381', '62002TO0231',
    '62001TO0223', '62001CO0406', '62000CO0345', '61998CO0447', '61999TO0268',
    '61999TO0144', '61999TO0114', '61998TO0078', '61996TO0114', '61997TO0109',
]

#get citations for each order
for order_id in order_ids:
    #unique citations found so far for this order, in order of appearance
    citations = []
    #get url for this particular order
    order_url = result_url_prefix + str(order_id)
    print(order_url)
    #open the page and convert it to beautifulsoup format;
    #the 'with' block closes the HTTP response once the page has been parsed
    with urlopen(order_url) as order_page:
        soup_order_page = BeautifulSoup(order_page, "lxml")
    #loop through all list items in this web page, looking for the citation list item
    for result in soup_order_page.find_all('li'):
        #skip list items that do not list the citations for this order
        if not result.text.startswith('Instruments'):
            continue
        for link in result.find_all('a'):
            #only links that carry an href attribute are citations
            if not link.has_attr('href'):
                continue
            #the link text is used as the citation, not the CELEX id in the href
            this_citation = link.text
            if this_citation not in citations:
                print(this_citation)
                citations.append(this_citation)
    #append one row per unique citation, opening the csv file once per order
    if citations:
        with open('orders_2.csv', 'a', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            writer.writerows([order_id, this_citation] for this_citation in citations)
    

https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:62017CO0126
31993L0013-A01P1
31993L0013-A04
31993L0013-A03P1
31993L0013-A04P2
31993L0013
31993L0013-A06P1
31993L0013-A05
32012Q0929(01)-A99
62010CJ0453-N30
62013CJ0026-N77
62013CJ0026-N85
62015CJ0154-N50
62016CJ0186-N46
62016CJ0186-N43
62016CJ0186-N22
62016CJ0186-N47
62016CJ0186-N44
62016CJ0186-N45
62016CJ0186-N38
https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:62017CO0119
31993L0013-A03
31993L0013
31993L0013-A04P1
31993L0013-A03P1
31993L0013-A04P2
31993L0013-A05
31993L0013-A04
32012Q0929(01)-A99
62016CJ0186
62016CJ0186-N43
62016CJ0186-N40
62016CJ0186-N38
62016CJ0186-N36
62016CJ0186-N57
62016CJ0186-N53
62016CJ0186-N50
62016CJ0186-N55
62016CJ0186-N54
62016CJ0186-N56
62016CJ0186-N35
https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:62017CO0446
31993L0013-A01P1
31993L0013-A03
31993L0013-A02
31993L0013-A02LC
31993L0013-C13
31993L0013
31993L0013-A01
31993L0013-A01P2
32012Q0929(01)-A99
62011CJ0092-N26
62011CJ0092
620

KeyboardInterrupt: 

In [4]:
import pandas as pd

#load the tab-separated judgements metadata file into a DataFrame
metadata_path = "judgements_metadata.tsv"
dataset = pd.read_csv(metadata_path, sep="\t")



In [5]:
#display the first rows of the metadata table as the cell output
dataset.head()

Unnamed: 0,ELI,CELEX number,Cellar reference,Date created,Latest modification of notice,Title,Form,EUROVOC descriptor,ECLI identifier,Subject matter,...,Judge-Rapporteur,Internal reference,In force indicator,Entry in the directory,OJ series,Class of the OJ,Number,Number of pages,Publication Reference,Unnamed: 67
0,,62016CJ0016,cada0110-161c-11e8-9253-01aa75ed71a1,2018-02-20,03/04/2018 13:13:09,Judgment of the Court (Grand Chamber) of 20 Fe...,Judgment,,ECLI:EU:C:2018:79,"Approximation of laws, Consumer protection",...,Toader,,,,,,,,Digital reports (Court Reports - general - 'In...,
1,,62015CJ0434,753c564f-e567-11e7-9749-01aa75ed71a1,2017-12-20,28/03/2018 11:19:29,Judgment of the Court (Grand Chamber) of 20 De...,Judgment,,ECLI:EU:C:2017:981,"Approximation of laws, Internal market - Princ...",...,Šváby,,,,,,,,Digital reports (Court Reports - general - 'In...,
2,,62015CJ0598,e23ab220-db30-11e7-a506-01aa75ed71a1,2017-12-07,23/03/2018 09:55:56,Judgment of the Court (Fifth Chamber) of 7 Dec...,Judgment,,ECLI:EU:C:2017:945,"Consumer protection, Approximation of laws",...,Levits,,,,,,,,Digital reports (Court Reports - general),
3,,62015CJ0596,2cc017ba-d035-11e7-a7df-01aa75ed71a1,2017-11-23,20/03/2018 06:55:06,Judgment of the Court (Third Chamber) of 23 No...,Judgment,,ECLI:EU:C:2017:886,"Consumer protection, public health",...,Safjan,,,,,,,,Digital reports (Court Reports - general - 'In...,
4,,62016CJ0289,e0734cde-af26-11e7-837e-01aa75ed71a1,2017-10-12,28/03/2018 05:20:48,Judgment of the Court (Ninth Chamber) of 12 Oc...,Judgment,,ECLI:EU:C:2017:758,"Consumer protection, Agriculture and Fisheries...",...,Juhász,,,,,,,,,
