<a href="https://colab.research.google.com/github/lucasgneccoh/BDSS_Dauphine/blob/main/notebooks/students/BDSS_TD2_XML_DOM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bases de données semi-structurées - TD 2
## XML validation, XPath and DOM in Python

Welcome to the support Python notebook for this TD. This notebook follows the paper version of the TD. 

The idea is to do the same exercises in a more interactive way, practice some Python, and discover (or get more practice with) Google Colab notebooks.

# Preamble

In [None]:
from lxml import etree
import re
from xml.dom.minidom import parse

# Functions to work with XML files

def validate_xml(xml_path: str, dtd_path: str) -> bool:
    """Validate an XML file against a DTD using the lxml library.

    Parse errors in either file are reported on stdout (one line per entry of
    the parser error log) and make the function return False instead of
    raising. When validation itself fails, the first entry of the DTD error
    log is printed.

    Args:
        xml_path: Path of the XML document to validate.
        dtd_path: Path of the DTD file to validate against.

    Returns:
        True if the document is valid with respect to the DTD; False if the
        DTD or the XML cannot be parsed, or if validation fails.

    Raises:
        OSError: If ``dtd_path`` cannot be opened.
    """
    try:
        # The DTD is parsed inside the constructor, so the handle can be
        # closed as soon as etree.DTD returns (the original leaked it).
        with open(dtd_path) as dtd_file:
            dtd = etree.DTD(dtd_file)
    except etree.DTDParseError as ed:
        print(f"DTDParseError: {ed}")
        for i, er in enumerate(ed.error_log):
            print(f"\t{i}-> {er.message}, at line {er.line}")
        etree.clear_error_log()
        return False

    try:
        xml_doc = etree.parse(xml_path)
    except etree.XMLSyntaxError as e:
        print(f"XMLSyntaxError: {e}")
        for i, er in enumerate(e.error_log):
            print(f"\t{i}-> {er.message}, at line {er.line}")
        etree.clear_error_log()
        return False

    result = dtd.validate(xml_doc)
    if not result:
        # Only the first validation error is shown to keep the output short.
        print(dtd.error_log[0])

    return result

def write_xml_dtd_files_from_strings(xml_strings, dtd_strings, identifiers = None):
    """Write XML and DTD strings to ``<identifier>.xml`` / ``<identifier>.dtd``.

    Files are created in the current working directory, one XML/DTD pair per
    identifier.

    Args:
        xml_strings: One XML document as a string, or a list of them.
        dtd_strings: One DTD as a string, or a list of them (same length as
            ``xml_strings``).
        identifiers: File name stems, one per XML/DTD pair. A single string is
            accepted for a single pair. Defaults to ``file_0``, ``file_1``, ...

    Returns:
        The list of identifiers that were used as file name stems.

    Raises:
        ValueError: If the number of XML strings, DTD strings and identifiers
            do not all match.
    """
    # A bare string is shorthand for a one-element list. Each argument is
    # wrapped on its own so a string is never iterated character by character.
    if isinstance(xml_strings, str):
        xml_strings = [xml_strings]
    if isinstance(dtd_strings, str):
        dtd_strings = [dtd_strings]
    if isinstance(identifiers, str):
        identifiers = [identifiers]

    if len(xml_strings) != len(dtd_strings):
        raise ValueError("Different number of XML and DTD strings!")

    # If no identifiers are given, create default ones. This determines file names
    if identifiers is None:
        identifiers = [f"file_{i}" for i in range(len(xml_strings))]
    elif len(identifiers) != len(xml_strings):
        # zip() below would otherwise silently skip the unmatched entries.
        raise ValueError("Different number of identifiers and XML/DTD strings!")

    try:
        for xml_text, dtd_text, stem in zip(xml_strings, dtd_strings, identifiers):
            with open(f"{stem}.xml", "w") as xml_file:
                xml_file.write(xml_text)
            with open(f"{stem}.dtd", "w") as dtd_file:
                dtd_file.write(dtd_text)
    except Exception:
        print("Problems while writing XML and DTD files")
        raise

    return identifiers



def test_validation(xml_string, dtd_string, validator):
    """Check an XML string against a DTD string with the given validator.

    Both strings are first saved as ``temp.xml`` and ``temp.dtd``; the
    validator is then called with those two paths and its result is returned.
    """
    stem = 'temp'

    # Persist both documents so a path-based validator can read them.
    write_xml_dtd_files_from_strings(xml_string, dtd_string, identifiers = [stem])

    xml_file, dtd_file = f"{stem}.xml", f"{stem}.dtd"
    return validator(xml_file, dtd_file)

def xpath_query_xml_string(xml_string, query_string):
    """Evaluate an XPath query on an XML document given as a string.

    The document is written to ``xml_doc.xml`` in the current working
    directory (after removing whitespace between adjacent tags), parsed with
    lxml, and queried.

    Args:
        xml_string: The XML document.
        query_string: The XPath expression to evaluate.

    Returns:
        Whatever the compiled ``etree.XPath`` object returns for the document.
    """
    xml_path = "xml_doc.xml"
    with open(xml_path, "w") as f:
        # Remove whitespace-only runs between a closing '>' and the next '<'
        # so indentation does not show up as text nodes. The pattern is a raw
        # string, and no longer contains '|', which inside a character class
        # is a literal pipe and made text such as ">|<" disappear.
        f.write(re.sub(r">\s*<", "><", xml_string))
    xml_doc = etree.parse(xml_path)
    query = etree.XPath(query_string)
    return query(xml_doc)

def xpath_query_xml_file(xml_path, query_string):
    """Parse the XML file at ``xml_path`` and return the result of the XPath query."""
    # Parse first, then compile the expression, then apply it to the tree.
    tree = etree.parse(xml_path)
    compiled_query = etree.XPath(query_string)
    matches = compiled_query(tree)
    return matches


def print_xpath_query_results(results):
    """Pretty-print the results of an XPath query.

    For each result that looks like an element (has ``tag``, ``text`` and
    ``items()``), print its tag, text and attributes. Any other result (for
    example a plain string or number) is printed as-is after an ``--Except``
    marker.

    Args:
        results: A sequence of query results, or a single scalar result
            (string, number or boolean), which is reported as one result.
    """
    # A scalar has no meaningful len(); a string would otherwise be iterated
    # character by character. Report such values as a single result.
    if isinstance(results, (str, bytes, bool, int, float)):
        results = [results]

    print(f"Total results: {len(results)}")
    print("*"*20 + "\n")
    for e in results:
        try:
            tag_line = f"node tag: {e.tag}"
            text_line = f"node text: *{e.text}*"
            attrs_line = ', '.join([f"{k} = {v}" for k, v in e.items()])
        except (AttributeError, TypeError):
            # Not an element-like object: fall back to printing it directly.
            # Only these errors are caught so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print("--Except")
            print(e)
        else:
            print(tag_line)
            print(text_line)
            print(attrs_line)
            print("-"*20)

# Examples with DOM

In [None]:
# Sample address book used by the DOM example in this cell.
# Only the first <address> element carries a "name" attribute.
# NOTE(review): the DOCTYPE names "address" while the root element is
# "carnet"; nothing in this cell validates the document, so it has no effect here.
xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
	<!DOCTYPE address SYSTEM "dtddoc.dtd">
	<carnet>
		<address name="Beatrice Napolitano" id="_1">
			<company>Paris-Dauphine</company>
			<phone>06 12345678</phone>
		</address>
		<address id="_2">
			<company>Paris-Dauphine</company>
			<phone>06 99999999</phone>
		</address>
	</carnet>'''

# Save the document as carnet.xml; the DTD string is empty, so carnet.dtd is
# written as an empty file.
write_xml_dtd_files_from_strings(xml_string, "", identifiers = ["carnet"])

def example_getId():
    """Print the ``id`` of every ``<address>`` in carnet.xml that has a ``name``.

    Before the ids, prints whether the parsed document has child nodes.
    """
    document = parse("carnet.xml")
    print(document.hasChildNodes())

    # Keep only the addresses that carry a "name" attribute.
    named_addresses = [
        address
        for address in document.getElementsByTagName("address")
        if address.hasAttribute("name")
    ]
    for address in named_addresses:
        print(address.getAttribute("id"))
   

# Run the example on carnet.xml; the recorded output is "True" followed by "_1".
example_getId()

True
_1


# Ex 1 to 5
We will proceed as we did in Ex 1 of the last TD.
Write your DTD and XML files, and validate them using the given functions.

In [None]:
# dtddoc.dtd: replace the placeholder text with your DTD.
dtd_string = \
'''
DTD file goes here
'''

# xmldoc.xml: replace the placeholder text with your XML document.
# Goal: the validation below should report that the XML document is correct.

xml_string = \
'''
XML file goes here
'''

# Writes both strings to temp.xml / temp.dtd and prints the validator's result.
print(test_validation(xml_string, dtd_string, validate_xml))

# Ex 6
Now the idea is to practice the same queries, this time using the DOM API.

We will use the same sample files from the last TD

As a reminder, the documentation for DOM is here: https://docs.python.org/3/library/xml.dom.html

In [None]:
dtd_link = "https://raw.githubusercontent.com/lucasgneccoh/BDSS_Dauphine/main/data/films.dtd"
xml_link = "https://raw.githubusercontent.com/lucasgneccoh/BDSS_Dauphine/main/data/films.xml"

!rm "./films.dtd"
!rm "./films.xml"

# Download the imdb sample file
!wget {dtd_link}
!wget {xml_link}

# If the download fails, you will have to load the files into the Colab session. 
# Go to the Files section on the left panel

if validate_xml("films.xml", "films.dtd"):
    print("Files were downloaded correctly")

In [None]:
# Parse films.xml with xml.dom.minidom; the resulting document is passed to the query functions.
dom = parse("films.xml")

# ----------------------------------------------------------------------------
# Query 1
# La liste des titres de films.
        

def dom_query_1(dom):
    """Query 1: the list of film titles.

    Exercise stub: it currently returns None. Put your code here, then write
    similar functions for the other queries.

    Args:
        dom: The document obtained by parsing films.xml.
    """
    return None

# Run query 1 and print its answer.
ans = dom_query_1(dom)
print("1.1\t", ans)


# Separator line printed after the answer.
print("----"*20)

# ----------------------------------------------------------------------------
# Query 2
# Les titres des films parus en 1980.


# ----------------------------------------------------------------------------
# Query 3
# Le résumé d'Alien.


# ----------------------------------------------------------------------------
# Query 4
# Les titres des films avec Bruce Willis.


# ----------------------------------------------------------------------------
# Query 5
# Les titres des films qui ont un résumé.


# ----------------------------------------------------------------------------
# Query 6
# Les titres des films qui n'ont pas de résumé.


# ----------------------------------------------------------------------------
# Query 7
# Les titres des films vieux de plus de trente ans.


# ----------------------------------------------------------------------------
# Query 8
# Quel rôle joue Harvey Keitel dans Reservoir dogs ?


# ----------------------------------------------------------------------------
# Query 9
# Quel est le dernier film du document ?


