Notebook for searching citations to or from EU legislation and / or EU case law. 

Users can give one or multiple CELEX IDs as input. The functions return a set of nodes (CELEX IDs) and / or a set of edges, with each source - target pair stored as a tuple.

Users have the option to export the edges list to a networkx graph. Basic code is provided.

Users can (but do not have to) compare the retrieved CELEX IDs with a predefined list of CELEX IDs (eg provided by experts). Code for evaluation metrics (F1, precision, recall) is provided.

# Dependencies

In [None]:
# install dependencies

#!pip install SPARQLWrapper

# Functions for Citation Retrieval

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

def get_citations(source_celex, cites_depth=1, cited_depth=1):
    """
    Return the CELEX IDs of documents linked to `source_celex` by citations.

    Two directions are queried and merged:

    * documents reachable by following "cites" links from the source, between
      1 and `cites_depth` hops away;
    * documents that reach the source by "cites" links, between 1 and
      `cited_depth` hops away.

    Intermediate hops are not reported: the result is a flat set of CELEX IDs
    with no information on how they are connected.

    Only CELEX IDs starting with '3' are kept (filter type: '3' = legislation,
    '6' = case law); change the prefix below to retrieve another document type.
    """
    query_template = '''
        prefix cdm: <http://publications.europa.eu/ontology/cdm#>
        prefix xsd: <http://www.w3.org/2001/XMLSchema#>

        SELECT DISTINCT * WHERE
        {
        {
            SELECT ?name2 WHERE {
                ?doc cdm:resource_legal_id_celex "%s"^^xsd:string .
                ?doc cdm:work_cites_work{1,%i} ?cited .
                ?cited cdm:resource_legal_id_celex ?name2 .
            }
        } UNION {
            SELECT ?name2 WHERE {
                ?doc cdm:resource_legal_id_celex "%s"^^xsd:string .
                ?cited cdm:work_cites_work{1,%i} ?doc .
                ?cited cdm:resource_legal_id_celex ?name2 .
            }
        }
        }'''

    endpoint = SPARQLWrapper('https://publications.europa.eu/webapi/rdf/sparql')
    endpoint.setReturnFormat(JSON)
    endpoint.setQuery(query_template % (source_celex, cites_depth, source_celex, cited_depth))
    response = endpoint.queryAndConvert()

    # Every result row binds ?name2 to the CELEX ID of one linked document.
    celex_ids = [row['name2']['value'] for row in response['results']['bindings']]

    # Keep only the requested document type ('3' = legislation, '6' = case law).
    return {celex for celex in celex_ids if celex.startswith('3')}

def get_citations_multiple(sources, cites_depth=1, cited_depth=1, union=True):
    """
    Get the citations of several source documents at once.

    Args:
        sources: iterable of CELEX IDs; a single CELEX ID given as a plain
            string is treated as a one-element list.
        cites_depth: maximum number of hops followed from each source to the
            documents it cites (passed on to `get_citations`).
        cited_depth: maximum number of hops followed from each source to the
            documents citing it (passed on to `get_citations`).
        union: if True (default), return every document found for any source,
            plus the sources themselves. If False, return only the documents
            found for *every* source (the intersection).

    Returns:
        A set of CELEX IDs. An empty `sources` yields an empty set.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(sources, str):
        sources = [sources]
    sources = list(sources)

    per_source = [get_citations(source, cites_depth, cited_depth) for source in sources]

    if union:
        # Seeding with the sources keeps the starting points in the nodes list.
        return set(sources).union(*per_source)

    if not per_source:
        return set()

    # Only the retrieved citation sets are intersected: mixing in the list of
    # sources would reduce the result to the sources themselves.
    return set(per_source[0]).intersection(*per_source[1:])

def get_citations_structure(source, cites_depth=1, cited_depth=1, dont_repeat=None):
    """
    Build the citation network around `source`, keeping the intermediate steps.

    Unlike `get_citations`, which returns a flat set of documents, this walks
    the network one hop at a time so that every citation is recorded as an edge.

    Args:
        source: CELEX ID of the starting document.
        cites_depth: number of hops to follow from the source to the documents
            it cites.
        cited_depth: number of hops to follow from the source to the documents
            citing it.
        dont_repeat: CELEX IDs that were already expanded on the way to
            `source`. They are still linked when encountered, but not expanded
            again, which stops the walk from looping on citation cycles.

    Returns:
        A tuple `(links, nodes)`: `links` is a set of `(citing, cited)` tuples
        and `nodes` is the set of all CELEX IDs encountered, including `source`.
    """
    # Work on a private copy so the caller's set (if any) is never modified.
    visited = set() if dont_repeat is None else set(dont_repeat)

    # Both directions requested: walk each direction separately and merge.
    if cites_depth > 0 and cited_depth > 0:
        cites, nodes1 = get_citations_structure(source, cites_depth, 0, visited)
        cited, nodes2 = get_citations_structure(source, 0, cited_depth, visited)
        return cites.union(cited), nodes1.union(nodes2)

    # No hops left in either direction: nothing to query.
    if cites_depth <= 0 and cited_depth <= 0:
        return set(), {source}

    # From here on exactly one direction is active.
    following_cites = cites_depth > 0
    new_cites_depth = max(cites_depth - 1, 0)
    new_cited_depth = max(cited_depth - 1, 0)

    visited.add(source)

    links = set()
    nodes = {source}
    # Retrieve a single hop in the active direction only.
    targets = get_citations(source, int(following_cites), int(not following_cites))

    for target in targets:
        nodes.add(target)
        if following_cites:
            # The source cites the target.
            links.add((source, target))
        else:
            # The target cites the source.
            links.add((target, source))

        # Parentheses matter: `a or b and c` would ignore the `visited` check
        # whenever cites hops remain.
        if (new_cites_depth or new_cited_depth) and target not in visited:
            new_links, new_nodes = get_citations_structure(
                target, new_cites_depth, new_cited_depth, visited
            )
            links |= new_links
            nodes |= new_nodes

    return links, nodes

def get_citations_structure_multiple(sources, cites_depth=1, cited_depth=1):
    """
    Build one combined citation network for several source documents.

    Every CELEX ID in `sources` ends up in the returned nodes, but only the
    sources whose CELEX ID starts with '3' are expanded with
    `get_citations_structure` (filter type: '3' = legislation, '6' = case law).

    Returns a tuple `(links, nodes)`: the set of citation edges found for the
    expanded sources, and the set of all CELEX IDs involved. The retrieved
    nodes are not filtered again here.
    """
    all_links = set()
    all_nodes = set(sources)

    for celex in sources:
        # Skip sources that are not of the requested document type.
        if not celex.startswith('3'):
            continue
        found_links, found_nodes = get_citations_structure(celex, cites_depth, cited_depth)
        all_links |= found_links
        all_nodes |= found_nodes

    return all_links, all_nodes

# Examples

Indicate the source node(s) and the cited_depth and cites_depth. In the example where C is the source node and A cites B and B cites C (A -> B -> C), cited_depth=2 means that the citations A -> B and B -> C are included. If cited_depth=1, only the citation B -> C is retrieved. Similarly, if A is the source node, cites_depth=2 means that the citations A -> B and B -> C are included, while cites_depth=1 retrieves only A -> B.

An example for retrieving citations for one given source:

    source = '32021R0664'
    links, nodes = get_citations_structure(source, cited_depth=1, cites_depth=2)

In case of multiple source nodes, the CELEX IDs can be put in a list. For example:

    sources = ['32019R0945','32021R0664']
    links, nodes = get_citations_structure_multiple(sources, cited_depth=0, cites_depth=2)

(it is also possible to input one source node in the list)

# CSV Export

In [None]:
# Turn every (source, target) edge into one "source,target" CSV row.
# NOTE(review): the [1:] slices drop the first character of each CELEX ID
# (e.g. '32021R0664' becomes '2021R0664') - confirm this is intended.
edges_list_for_csv = [edge[0][1:] + ',' + edge[1][1:] for edge in links]

# Write one row per line; the output directory is expected to exist already.
with open('./output/edges_extracted/extracted_edges(network).csv', 'w', newline='') as f:
    f.writelines(row + "\n" for row in edges_list_for_csv)

# Build graph with NetworkX

In [None]:
# import
import networkx as nx

# Retrieve the citation edges for all sources; the node set returned alongside
# is not needed because the graph derives its nodes from the edges.
links, _ = get_citations_structure_multiple(
    sources,
    cites_depth=2,  # hops followed from each source to the documents it cites
    cited_depth=1,  # hops followed from each source to the documents citing it
)

# NOTE(review): nx.Graph is undirected, so the (citing, cited) direction of each
# edge is not kept - consider nx.DiGraph if the direction matters.
g = nx.Graph()
g.add_edges_from(links)

In [None]:
# saving graph created above in gexf format
# NOTE(review): assumes the ./networkfiles directory already exists - TODO confirm
nx.write_gexf(g, "./networkfiles/networkxile.gexf")

# Evaluation

Use this section if you want to compare the retrieved nodes with the nodes in a self-made overview (for example a list of CELEX IDs provided by experts).

Example:

docs_list = {
    '32018R1139',
    '32022R0868'
}

In [None]:
# Create overview
# Add the reference CELEX IDs here, e.g. docs_list = {'32018R1139', '32022R0868'}.
# It has to be a set: an empty `{}` literal would create a dict, which does not
# support the set difference (`docs_list - nodes`) used in the evaluation.

docs_list = set()

In [None]:
def do_stats(nodes, print_res=True, reference=None):
    """
    Compare the retrieved nodes with a reference overview of CELEX IDs.

    Args:
        nodes: iterable of retrieved CELEX IDs.
        print_res: if True, print the total number of retrieved nodes, the
            metrics, and the three node groups:
            (1) Common nodes (retrieved and in the reference),
            (2) Missed nodes (in the reference but not retrieved), and
            (3) Extra nodes (retrieved but not in the reference).
        reference: iterable of expected CELEX IDs. Defaults to the
            notebook-level `docs_list`.

    Returns:
        A tuple `(precision, recall, f1)`. When the retrieved nodes and the
        reference have nothing in common, all three values are 0.0.

    Raises:
        ZeroDivisionError: if `nodes` or the reference is empty, because
            precision or recall is then undefined.
    """
    nodes = set(nodes)
    # Coerce to a set so that lists (or other iterables) work as a reference.
    expected = set(docs_list if reference is None else reference)

    common = nodes & expected
    missed = expected - nodes
    extra = nodes - expected

    precision = len(common) / float(len(nodes))
    recall = len(common) / float(len(expected))
    # With no common nodes both precision and recall are 0, which would make
    # the harmonic mean divide by zero; F1 is 0 in that case.
    f1 = 2 * (precision * recall) / (precision + recall) if common else 0.0

    if print_res:
        print(f'Total nodes found in search: {len(nodes)}')
        print(f'Precision: {precision}\nRecall: {recall}\nF1: {f1}')

        print(f'Common nodes ({len(common)}): {common}')
        print(f'Missed nodes ({len(missed)}): {missed}')
        print(f"Extra nodes ({len(extra)}): {extra}")
    return (precision, recall, f1)

# Evaluate the retrieved `nodes` (presumably the node set returned by one of the
# retrieval calls shown in the examples) against docs_list and print the report.
do_stats(nodes)

It is also possible to create a matrix of depths and to calculate stats for each combination of depth.

In [None]:
# import
import csv

# Rows of [cites depth, cited depth, precision, recall, F1], one per evaluated combination
results = []

# Every (cites depth, cited depth) combination to evaluate; adjust the ranges to set the depths
depth_pairs = [(cites, cited) for cites in range(5) for cited in range(5)]

for cites_depth, cited_depth in depth_pairs:
    try:
        precision, recall, f1 = do_stats(
            get_citations_multiple(sources, cites_depth, cited_depth), print_res=False
        )
    except ZeroDivisionError:
        # The metrics are undefined for this combination, so no row is recorded.
        print("Tried dividing by zero, skipping")
        continue
    results.append([cites_depth, cited_depth, precision, recall, f1])
    print(f'Cites depth: {cites_depth:02d} | Cited depth: {cited_depth:02d} | Pr {precision:.2f} Re {recall:.2f} F1 {f1:.2f}')

# Save results to a CSV file: a header row followed by one row per combination
filename = './output/stats_evaluation/stats.csv'
header = ['Cites Depth', 'Cited Depth', 'Precision', 'Recall', 'F1 Score']
with open(filename, 'w', newline='') as csvfile:
    csv.writer(csvfile).writerows([header] + results)

print(f'Results saved to {filename}')