# Extraction of EU corporate bodies classification from the EU vocabularies

This notebook extracts the EU categories of the EU organisations. The data source is the XML file available in the [corporate body classification page](https://op.europa.eu/en/web/eu-vocabularies/dataset/-/resource?uri=http://publications.europa.eu/resource/dataset/corporate-body-classification).

ASSUMPTIONS:
- The XML file **corporate-body-classification.xml** must be available in the same directory as this notebook


In [1]:
import xml.etree.ElementTree as ET
import csv

## 1. Open the XML file: corporate-body-classification.xml

In [2]:
def open_file(file_name):
    """
    Parse the XML document located at file_name.

    Returns a 2-tuple (tree, root): the parsed ElementTree and its root element.
    """
    parsed_tree = ET.parse(file_name)
    return parsed_tree, parsed_tree.getroot()

## 2. Extract records

In [3]:
def check_deprecated(deprecated_tag):
    """
    Convert the value of a record's "deprecated" flag into an integer.

    Returns 0 (not deprecated) only when deprecated_tag equals the string "false";
    any other value yields 1 (deprecated).
    """
    return 0 if deprecated_tag == "false" else 1
        

In [4]:
def EU_classification_extractor(child, target_tag):
    """
    Return the text of child when its tag equals target_tag.

    Returns -1 when the tag of child is different from target_tag.
    """
    # guard clause: anything that is not the requested tag is reported with the -1 sentinel
    if child.tag != target_tag:
        return -1
    return child.text
        

In [5]:
def EU_classification_name_extractor(child, target_tag):
    """
    Return the text of child when its "lg" (language) attribute equals target_tag.

    Parameters:
        child: an XML element whose "lg" attribute, if present, holds a language code.
        target_tag: the language code to look for (e.g. "eng").

    Returns:
        The text of child when its "lg" attribute equals target_tag, otherwise -1.
        An element that has no "lg" attribute at all is treated as a non-match
        and also yields -1 instead of raising KeyError.
    """
    # membership test first: elements without an "lg" attribute must not raise
    if "lg" in child.attrib and child.attrib["lg"] == target_tag:
        return child.text
    return -1
        

In [6]:
def extract_info(root):
    """
    Extract the authority code and English name of every non-deprecated record.

    Parameters:
        root: the root XML element; each of its direct children is treated as a record
            carrying a "deprecated" attribute.

    Returns:
        A list of dicts, one per non-deprecated record that has an English name.
        Each dict holds "name" and, when the record contains an <authority-code>
        child, "authority-code". Records without an English name are skipped.

    Raises:
        KeyError: if a record has no "deprecated" attribute.
    """
    EU_bodies_classification = []

    for record in root:
        # only the exact string "false" marks a record as not deprecated
        if record.attrib["deprecated"] != "false":
            continue

        # a fresh dict per record: nothing can leak from one record into the next
        record_info = {}
        for child in record:
            if child.tag == "authority-code":
                # keep the first authority code found in the record
                record_info.setdefault("authority-code", child.text)
            elif child.tag == "label":
                for label_variant in child:
                    # .get() returns None for variants without a language attribute
                    if label_variant.get("lg") == "eng":
                        # keep the first English name found in the record
                        record_info.setdefault("name", label_variant.text)

        # emit once per record, after all children were seen, so the result does
        # not depend on whether <label> comes before or after <authority-code>
        if "name" in record_info:
            EU_bodies_classification.append(record_info)

    return EU_bodies_classification
    
    

## 3. Save the extracted information into a csv file

In [7]:
def savetoCSV(EU_bodies_classification, filename):
    """
    Save the extracted information into a csv file.

    Parameters:
        EU_bodies_classification: iterable of dicts whose keys are taken from
            "authority-code" and "name"; a missing key is written as an empty cell.
        filename: path of the csv file to create (an existing file is overwritten).

    The file is written as UTF-8 with a header row followed by one row per dict.
    """
    # column order of the csv file
    fields = ["authority-code", "name"]

    # newline="" hands line-ending control to the csv module (otherwise an extra
    # blank line appears after every row on Windows); the explicit encoding keeps
    # non-ASCII names independent of the platform's default locale
    with open(filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        writer.writeheader()
        writer.writerows(EU_bodies_classification)

## 4. Main

In [8]:
def main():
    """
    Run the whole pipeline: parse the XML file, extract the classification
    records and write them to the output csv file.
    """
    source_path = "corporate-body-classification.xml"
    target_path = "EU_corporate_body_classification.csv"

    # only the root element is needed further down; the tree itself is not used
    _tree, document_root = open_file(source_path)
    savetoCSV(extract_info(document_root), target_path)

## 5. Execution

In [9]:
if __name__ == "__main__": 
  
    # run the extraction pipeline defined in main() when this code is executed as the main module
    main() 