<a href="https://colab.research.google.com/github/lucasgneccoh/BDSS_Dauphine/blob/main/notebooks/students/BDSS_TD1_XML_DTD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bases de données semi-structurées - TD 1
## XML and DTD

Welcome to the support Python notebook for this TD. This notebook follows the paper version of the TD. 

The idea is to do the same exercises in a more interactive way, practice some Python, and get familiar with Google Colab notebooks.

# Preamble
Import modules, define functions

Run the cells in this section first: every other cell in the notebook depends on them.

In [None]:
# Check if lxml is installed. If it is not, install it using pip
!pip list | grep lxml

In [None]:
from lxml import etree
import re

In [None]:
# Functions to work with XML files

def _print_error_log(label: str, error) -> None:
    ''' Print an lxml parsing error followed by every entry of its error log
    '''
    print(f"{label}: {error}")
    for i, er in enumerate(error.error_log):
        print(f"\t{i}-> {er.message}, at line {er.line}")


def validate_xml(xml_path:str, dtd_path:str) -> bool:
    ''' Validate an XML file against a DTD using the lxml library

    Parameters:
        xml_path: path of the XML document to check
        dtd_path: path of the DTD file to validate against

    Returns:
        The result of the DTD validation. False is also returned, after
        printing the parser messages, when the DTD or the XML document
        cannot be parsed. When the document parses but is not valid, the
        first validation error is printed.

    Raises:
        OSError: if the DTD file cannot be opened.
    '''
    try:
        # The context manager closes the DTD file once lxml has read it
        with open(dtd_path) as dtd_file:
            dtd = etree.DTD(dtd_file)
    except etree.DTDParseError as ed:
        _print_error_log("DTDParseError", ed)
        # Reset lxml's global log so these messages do not show up again later
        etree.clear_error_log()
        return False

    try:
        xml_doc = etree.parse(xml_path)
    except etree.XMLSyntaxError as e:
        _print_error_log("XMLSyntaxError", e)
        etree.clear_error_log()
        return False

    result = dtd.validate(xml_doc)
    if not result:
        # Only the first validation error is shown
        print(dtd.error_log[0])

    return result

def write_xml_dtd_files_from_strings(xml_strings, dtd_strings, identifiers = None):
    ''' Write XML documents and their DTDs, given as strings, into files

    For every identifier `name`, the files `name.xml` and `name.dtd` are
    created in the current working directory, encoded in UTF-8.

    Parameters:
        xml_strings: one XML document (str) or a list of XML documents
        dtd_strings: one DTD (str) or a list of DTDs, one per XML document
        identifiers: one file name stem (str) or a list of them, one per
            document. Defaults to file_0, file_1, ...

    Returns:
        The list of identifiers that were used as file name stems.

    Raises:
        ValueError: if the numbers of XML strings, DTD strings and
            identifiers do not match.
    '''

    # A bare string stands for a single document. Each argument is wrapped on
    # its own so that mixing a string with a one-element list also works.
    if isinstance(xml_strings, str):
        xml_strings = [xml_strings]
    if isinstance(dtd_strings, str):
        dtd_strings = [dtd_strings]

    if len(xml_strings) != len(dtd_strings):
        raise ValueError("Different number of XML and DTD strings!")

    # If no identifiers are given, create default ones. This determines file names
    if identifiers is None:
        identifiers = [f"file_{i}" for i in range(len(xml_strings))]
    elif isinstance(identifiers, str):
        identifiers = [identifiers]

    # zip() would silently drop documents if there were fewer identifiers
    if len(identifiers) != len(xml_strings):
        raise ValueError("Different number of identifiers and XML/DTD strings!")

    try:
        for xml_text, dtd_text, name in zip(xml_strings, dtd_strings, identifiers):
            # Explicit UTF-8: the XML declarations used in this notebook
            # announce UTF-8, whatever the platform default encoding is
            with open(f"{name}.xml", "w", encoding="utf-8") as f:
                f.write(xml_text)
            with open(f"{name}.dtd", "w", encoding="utf-8") as f:
                f.write(dtd_text)
    except Exception:
        print("Problems while writing XML and DTD files")
        raise

    return identifiers



def test_validation(xml_string, dtd_string, validator):
    ''' Check an XML string against a DTD string with the given validator

    Both strings are first saved as temp.xml and temp.dtd; `validator` is
    then called with these two paths and its result is returned.
    '''
    xml_path, dtd_path = "temp.xml", "temp.dtd"

    # Save both documents under the 'temp' identifier
    write_xml_dtd_files_from_strings(xml_string, dtd_string, identifiers = ['temp'])

    # Hand the freshly written files to the validator
    outcome = validator(xml_path, dtd_path)
    return outcome

def xpath_query_xml_string(xml_string, query_string):
    ''' Run an XPath query on an XML document given as a string

    The document is saved (UTF-8) as xml_doc.xml in the current working
    directory, parsed back with lxml and queried.

    Parameters:
        xml_string: the XML document
        query_string: the XPath expression to evaluate

    Returns:
        Whatever the compiled lxml XPath object returns for the document.
    '''
    xml_path = "xml_doc.xml"
    # Remove whitespace-only gaps between tags to keep the 'real' text of each
    # node. Raw string, and no character class: the former "[\s|\n]" also
    # swallowed literal "|" characters standing alone between two tags.
    compact_xml = re.sub(r">\s*<", "><", xml_string)
    with open(xml_path, "w", encoding="utf-8") as f:
        f.write(compact_xml)
    xml_doc = etree.parse(xml_path)
    query = etree.XPath(query_string)
    return query(xml_doc)

def xpath_query_xml_file(xml_path, query_string):
    ''' Run an XPath query on the XML file found at `xml_path`

    The file is parsed with lxml, the query is compiled, and the result of
    evaluating it on the parsed document is returned.
    '''
    tree = etree.parse(xml_path)
    return etree.XPath(query_string)(tree)


def print_xpath_query_results(results):
    ''' Print the results of an XPath query in a readable form

    Parameters:
        results: the value returned by an XPath query. Usually a list, but a
            single number, boolean or string is accepted too and is shown
            as one result.

    For every result exposing `tag`, `text` and `items()`, the tag, the text
    and the attributes are printed. Any other result is printed as is after
    a "--Except" marker.
    '''
    # A query may yield one bare value instead of a list (len() fails on numbers)
    if not isinstance(results, (list, tuple)):
        results = [results]

    print(f"Total results: {len(results)}")
    print("*"*20 + "\n")
    for e in results:
        try:
            # Build all the lines first so that a failure prints nothing partial
            lines = [
                f"node tag: {e.tag}",
                f"node text: *{e.text}*",
                ', '.join(f"{k} = {v}" for k, v in e.items()),
                "-"*20,
            ]
        except AttributeError:
            # Results without tag/text/items(), e.g. plain strings or numbers
            lines = ["--Except", str(e)]
        print("\n".join(lines))

## Save XML and DTD files from strings

In [None]:
# Dummy files to test with

xml_strings, dtd_strings, identifiers = [],[],[]

### ---- FILE 1 ----
#dtddoc.dtd
dtd_string = \
'''<!ELEMENT address (name,company,phone)>
<!ELEMENT name (#PCDATA)>
<!ELEMENT company (#PCDATA)>
<!ELEMENT phone (#PCDATA)>'''

# xmldoc.xml
xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE address SYSTEM "dtddoc.dtd">
<address>
    <name>Beatrice </name>
    <company>Paris-Dauphine</company>
    <phone>06 12345678</phone>
</address>'''

dtd_strings.append(dtd_string)
xml_strings.append(xml_string)
identifiers.append("address_book")

### ---- FILE 2 ----

#dtddoc.dtd
dtd_string = \
'''<!ELEMENT address EMPTY>
<!ATTLIST address name CDATA #REQUIRED>
<!ATTLIST address company CDATA #IMPLIED>
<!ATTLIST address phone CDATA #REQUIRED>'''

# xmldoc.xml
xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE address SYSTEM "dtddoc2.dtd">
<address name="Beatrice" phone="06 12345678"/>'''

dtd_strings.append(dtd_string)
xml_strings.append(xml_string)
identifiers.append("address_book_2")


In [None]:
# Write all the files to the temporary storage of the Colab session
test_files = write_xml_dtd_files_from_strings(xml_strings, dtd_strings, identifiers)

Test XML validation

In [None]:
# Test validation using strings
test_validation(xml_string, dtd_string, validate_xml)

In [None]:
# Test with the files. 
# Make sure you saved the files using the previous section
xml_path, dtd_path = f"{test_files[0]}.xml", f"{test_files[0]}.dtd"
validate_xml(xml_path, dtd_path)

# Ex 1
Decide whether each XML document is correct. To validate a document, you must first define the corresponding DTD.

In [None]:
# ---- Ex 1.1 ----


#dtddoc.dtd
dtd_string = \
'''
DTD goes here
'''

# xmldoc.xml.

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
<html>
    <head>
        <title>Hello, World</title>
    </head>
    <body>
        <p>Hello, World</p>        
    </body>
</html>'''

xml_string_correct = \
'''
Correction goes here
'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.2 ----


#dtddoc.dtd
dtd_string = \
'''
DTD goes here
'''

# xmldoc.xml. 

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>    
    <p> This is a test. This is a test of the <em>
    <strong>Emergency</em> Broadcast System.</strong></p>'''


xml_string_correct = \
'''
Correction goes here
'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.3 ----
#dtddoc.dtd
dtd_string = \
'''
DTD goes here
'''

# xmldoc.xml. 

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
    <note date="12/11/2007">
        <!-- This is a comment -->
        <Message>
            <to>Tove</to>
            <from>Jani</from>
            <heading>Reminder</heading>
            <body>Dont forget me this weekend!</body>
        </message>
    </note>
    <note date="13/11/2007">
        <message>
            <to>Jani</to>
            <from>Tove</from>
            <heading>Re: Reminder</heading>
            <body>Ok!</body>
        </message>
    </note>'''

xml_string_correct = \
'''
Correction goes here
'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.4 ----
#dtddoc.dtd
dtd_string = \
'''
DTD goes here
'''

# xmldoc.xml. 
# XML document is correct ! Make sure to add the namespace in the DTD file
# as an attribute

xml_string = \
'''<?xml version="1.0" encoding="utf-8"?>
<xs:schema attributeFormDefault="unqualified" elementFormDefault="qualified" 
xmlns:xs="http://www.w3.org/2001/XMLSchema">
    <xs:element name="points">
        <xs:complexType>
            <xs:sequence>
                <xs:element maxOccurs="unbounded" name="point">
                    <xs:complexType>
                        <xs:attribute name="x" type="xs:unsignedShort" use="required" />
                        xs:attribute name="y" type="xs:unsignedShort" use="required" />
                    </xs:complexType>
                </xs:element>
            </xs:sequence>
        </xs:complexType>
    </xs:element>
</xs:schema>'''

xml_string_correct = \
'''
Correction goes here
'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.5 ----
#dtddoc.dtd
dtd_string = \
'''
DTD goes here
'''

# xmldoc.xml. 

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
<html>
    <head><title>Paragraphs</title></head>
    <body>
        <p>This is a paragraph.<br/>
        <p>This is another paragraph.<br/>
        <p>Third paragraph.
    </body>
</html>'''


xml_string_correct = \
'''
Correction goes here
'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.6 ----
#dtddoc.dtd
dtd_string = \
'''
DTD goes here
'''

# xmldoc.xml. 

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dc="http://purl.org/dc/elements/1.1/">
    <rdf:Description rdf:about=http://www.AcronymFinder.com/>
        <dc:title>Acronym Finder</dc:title>
        <dc:description>The Acronym Finder is a world wide
            web (WWW) searchable database of more than 169,000
            abbreviations and acronyms about computers,
            technology, telecommunications, and military
            acronyms and abbreviations.</dc:description>
        <dc:subject>
            <rdf:Bag>
                <rdf:li>Astronomy</rdf:li>
                <rdf:li>Literature</rdf:li>
                <rdf:li>Mathematics</rdf:li>
                <rdf:li>Music</rdf:li>
                <rdf:li>Philosophy</rdf:li>
            </rdf:Bag>
        </dc:subject>
    </rdf:Description>
</rdf:RDF>'''

xml_string_correct = \
'''
Correction goes here
'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.7 ----
#dtddoc.dtd
dtd_string = \
'''
DTD goes here
'''

# xmldoc.xml. 

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
    <html>
        <body>
            <p><b><i>This paragraph is bold and italic.</b></i></p><br/>
            <p><i><b>This paragraph is italic and bold.</i></b></p><br/>
        </body>
    </html>'''

xml_string_correct = \
'''
Correction goes here
'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.8 ----
#dtddoc.dtd
dtd_string = \
'''
DTD goes here
'''

# xmldoc.xml. 

xml_string = \
'''<catalog>
    <work type='prose' date='1906'>
        <title>The Gift Of The Magi</title>
        <author>O. Henry</author>
    </work>
    <work type='poem' date='1845'>
        <title>The Raven</title>
        <author>Edgar Allen Poe</author>
    </work>
    <work type='play' date='1601'>
        <title>Hamlet</title>
        <author>William Shakespeare</author>
    </work>
</catalog>'''

xml_string_correct = \
'''
Correction goes here
'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.9----
#dtddoc.dtd
dtd_string = \
'''
DTD goes here
'''

# xmldoc.xml. 
# XML document is correct !

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
<letter>
    <date>December 11, 2002</date>
    <addressee>
        <name>Melvile Dewey</name>
        <address_one>Columbia University</address_one>
        <address_two>New York, NY</address_two>
    </addressee>
    <greeting>Dear Melvile,</greeting>
    <paragraph>I have been reading your ideas concerning nature of
    librarianship, and <italics>I find them very intriguing</italics>.
    I would love the opportunity to discuss with you the role of the
    card catalog in today’s libraries considering the advent to World
    Wide Web. Specifically, how are things like Google and Amazon.com
    changing our patrons’ expectations of library services? Mr. Cutter
    and I will be discussing these ideas at the next Annual Meeting,
    and we are available at the follow dates/times:</paragraph>
    <list>
    <item>Monday, 2-4</item>
    <item>Tuesday, 3-5</item>
    <item>Thursday, 1-3</item>
    </list>
    <paragraph>We hope you can join us.</paragraph>
    <closing>Sincerely, S. R. Ranganathan</closing>
</letter>'''

xml_string_correct = \
'''
Correction goes here
'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.10----
#dtddoc.dtd
dtd_string = \
'''
DTD goes here
'''

# xmldoc.xml. 


xml_string = \
'''<?xml version="1.0"?>
<dictionary>
    <word>
        <update date="2002-12-23"/>
        <name is_acronym="true">XML</Name>
        <description>eXtensible Markup Language</description>
    </word>
    <word>
        <update date="2002-12-23"/>
        <name is_acronym="true">POP</name>
        <definition default>Post Office Protocol</definition>
        <definition>Point Of Purchase</definition>
</dictionary>'''

xml_string_correct = \
'''
Correction goes here
'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.11----
#dtddoc.dtd
dtd_string = \
'''
DTD goes here
'''

# xmldoc.xml. 

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
<domain type='kvm>
    <name>domain</name><
    <memory>524288</memory>
    <vcpu>2</vcpu>
    <features><acpi/><pae/>
    <clock offset='utc'>
    <disk type='block' device='cdrom'>
        <driver name='qemu' type='raw'/>
        <source file='/path/to/image.iso'/>
        <tar get dev='hdc' bus='ide'/>
        <readonly/></name>
    </disk>
</domain>'''

xml_string_correct = \
'''
Correction goes here
'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.12----
#dtddoc.dtd
dtd_string = \
'''
DTD goes here
'''

# xmldoc.xml. 


xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
<name>Oyster Soup</name>
<author>Eric Lease Morgan</author>
<copyright holder=Eric Lease Morgan>&copy; 2003</copyright>
<ingredients>
<list>
<item>1 stalk of celery
<item>1 onion
<item>2 tablespoons of butter
<item>2 cups of oysters and their liquor
<item>2 cups of half & half
</list><cost>total cost < 36 euro </cost>
</ingredients>
<process><P>Begin by sauteing the celery and onions in butter until soft.
Add oysters, oyster liquor, and cream. Heat until the oysters float.
Serve in warm bowls.</p>
<p><i>Yummy!</p></i>
</process>'''

xml_string_correct = \
'''
Correction goes here
'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

# Ex 2
Write a DTD file for the given XML file

In [None]:
xml_string = '''<?xml version="1.0"?>
<shiporder orderid="889923">
    <orderperson>John Smith</orderperson>
    <shipto>
        <name>Ola Nordmann</name>
        <address>Langgt 23</address>
        <city>4000 Stavanger</city>
        <country>Norway</country>
    </shipto>
    <item>
        <title>Empire Burlesque</title>
        <note>Special Edition</note>
        <quantity>1</quantity>
        <price>10.90</price>
    </item>
    <item>
        <title>Hide your heart</title>
        <quantity>1</quantity>
        <price>9.90</price>
    </item>
</shiporder>'''

dtd_string = \
'''
DTD goes here
'''

test_validation(xml_string, dtd_string, validate_xml)

# Ex 3
Write an XML file for the given DTD file

In [None]:
dtd_string = \
''' <!ELEMENT stock (new-car | used-car)*>
    <!ELEMENT new-car (model, price)>
    <!ELEMENT used-car (model, price, mileage, condition?)>
    <!ELEMENT model (#PCDATA)>
    <!ELEMENT price (#PCDATA)>
    <!ELEMENT mileage (#PCDATA)>
    <!ELEMENT condition (#PCDATA)>
'''

xml_string = \
'''
XML goes here
'''
test_validation(xml_string, dtd_string, validate_xml)

# Ex 4
Create an XML file and the corresponding DTD file for the situation described below

Envisager une application dans laquelle les résultats des matchs de football doivent être représentés en XML.
Pour chaque **jeu**, nous voulons être en mesure de représenter les **deux équipes** impliquées, l’équipe qui jouait **chez eux**, quels **joueurs** ont marqué des **buts** (dont certains peuvent avoir été **pénalités**) et le **moment** où chacun a été marqué, et quels joueurs ont reçu des cartons **jaunes** et **rouges**.

In [None]:
dtd_string = \
'''
DTD goes here
'''

xml_string = \
'''
XML goes here
'''


test_validation(xml_string, dtd_string, validate_xml)

# Ex 5
XPath axes: list the nodes returned by each axis

To see this, we will replicate the tree with a dummy XML file and do all the queries

In [None]:
dtd_string = \
''' <!ELEMENT root ANY>
    <!ELEMENT node ANY>
    <!ATTLIST node id CDATA #IMPLIED>
    <!ATTLIST node attribute CDATA #IMPLIED>
    <!ELEMENT text ANY>
'''

xml_string = \
'''<?xml version="1.0"?>
    <root>
        <node id="1" attribute="node 2 - value 7">
            <node id="3">
                <node id="8">
                    <text>17</text>
                </node>
                <node id="9">
                </node>
            </node>
            <node id="4" attribute="node 10 - value None">
                <node id="11" attribute="node 18 - value None">
                    <text>19</text>
                </node>
                <node id="12" attribute="node 20 - value None">
                    <node id="21">
                    </node>
                    <node id="22" attribute="node 25 - value None">
                        <text>26</text>
                    </node>
                    <node id="23">                        
                    </node>
                    <text>24</text>
                </node>
            </node>
            <node id="5">
                <node id="13">
                </node>
                <node id="14">                    
                </node>
            </node>
            <node id="6" attribute="node 15 - value None">                
                <text>16</text>
            </node>
        </node>
    </root>
'''
test_validation(xml_string, dtd_string, validate_xml)

In [None]:
""" Write the query here """

query_string = 'Queries go here'

# -------------------------------------------

results = xpath_query_xml_string(xml_string, query_string)
print_xpath_query_results(results)

# Ex 6
Create an XML file where both queries yield the same result

In [None]:
dtd_string = \
''' <!ELEMENT root ANY>
    <!ELEMENT cours ANY>
    <!ATTLIST cours id CDATA #IMPLIED>
    <!ELEMENT intitule ANY>
    <!ELEMENT XML ANY>
'''

xml_string = \
'''
XML goes here
'''
test_validation(xml_string, dtd_string, validate_xml)

In [None]:
query_string_1 = "//cours[intitule='XML']"
query_string_2 = "//cours[intitule=XML]"

# First query
results = xpath_query_xml_string(xml_string, query_string_1)
print_xpath_query_results(results)

print("\n" + "~"*40 + "\n")

# Second query
results = xpath_query_xml_string(xml_string, query_string_2)
print_xpath_query_results(results)

# Ex 7

Explain the difference between the two queries and show a document where they yield different outputs

In [None]:
dtd_string = \
'''
DTD goes here
'''


xml_string = \
'''
XML goes here
'''
test_validation(xml_string, dtd_string, validate_xml)

In [None]:
query_string_1 = "//B[position()=1]"
query_string_1_extended = "/descendant-or-self::node()/B[position()=1]"
query_string_2 = "/descendant::B[position()=1]"

# First query
results = xpath_query_xml_string(xml_string, query_string_1)
print_xpath_query_results(results)

print("\n" + "~"*40 + "\n")

# First query extended
results = xpath_query_xml_string(xml_string, query_string_1_extended)
print_xpath_query_results(results)

print("\n" + "~"*40 + "\n")

# Second query
results = xpath_query_xml_string(xml_string, query_string_2)
print_xpath_query_results(results)


# Ex 8
Run the XPath queries on the Films database

The cell below downloads the files so that you can work on them later

In [None]:
dtd_link = "https://raw.githubusercontent.com/lucasgneccoh/BDSS_Dauphine/main/data/films.dtd"
xml_link = "https://raw.githubusercontent.com/lucasgneccoh/BDSS_Dauphine/main/data/films.xml"

!rm "./films.dtd"
!rm "./films.xml"

# Download the imdb sample file
!wget {dtd_link}
!wget {xml_link}

# If the download fails, you will have to load the files into the Colab session. 
# Go to the Files section on the left panel

if validate_xml("films.xml", "films.dtd"):
    print("Files were downloaded correctly")

In [None]:
# Preview the downloaded files: print the first 5 lines of the DTD ...
with open("films.dtd") as f:
    for i in range(5):
        # readline() keeps the trailing newline; strip it since print() adds one
        print(f.readline().rstrip("\n"))

# ... and the first 10 lines of the XML document
with open("films.xml") as f:
    for i in range(10):
        print(f.readline().rstrip("\n"))

In [None]:
""" Write the query here """

query_string = 'Query here'


# -------------------------------------------
xml_path = "films.xml"
results = xpath_query_xml_file(xml_path, query_string)
print_xpath_query_results(results)