In [137]:
import lxml.etree as et

# Traitement de données XML en Python

## Parsing simple

In [138]:
# Parse the sports XML file into an element tree (no schema-aware parser is passed here).
doc_sport = et.parse('data-sports/sport.xml')
# Bare last expression so the notebook displays the resulting tree object.
doc_sport

<lxml.etree._ElementTree at 0x7c30f4ba1480>

## Validation avec XSD

In [139]:
# The XSD file is loaded with the same parse() call as a regular XML document.
schema_sport = et.parse('data-sports/sports.xsd')
schema_sport

<lxml.etree._ElementTree at 0x7c30f4c6c300>

In [140]:
# Wrap the parsed XSD tree in an XMLSchema object; it is used below to build a parser.
xsd_sport = et.XMLSchema(schema_sport)
xsd_sport

<lxml.etree.XMLSchema at 0x7c30f4a9d210>

In [141]:
# Parser configured with the schema; it is handed to et.parse() in the next cell.
parser = et.XMLParser(schema=xsd_sport)
parser

<lxml.etree.XMLParser at 0x7c30f4bee170>

In [142]:
# Parse the same file again, this time through the schema-aware parser.
# NOTE: this rebinds doc_sport, replacing the tree that was parsed without a schema.
doc_sport = et.parse('data-sports/sport.xml', parser=parser)
doc_sport

<lxml.etree._ElementTree at 0x7c30f4bf8b00>

## Exploration du DOM
Traitement du document movie.xml via l'API DOM (arbre d'éléments).

In [143]:
# Plain Python string; same value as the movie's title attribute displayed further down.
title = 'Dune: Part Two'
title

'Dune: Part Two'

In [144]:
# Plain Python int, whereas the year attribute displayed further down is the string '2024'.
year = 2024
year

2024

In [145]:
# Load the movie XSD, build a parser that uses it as its schema,
# then parse the movie document through that parser.
doc_xsd_movie = et.parse('data-movies/movie.xsd')
parser_movie = et.XMLParser(
    schema=et.XMLSchema(doc_xsd_movie),
)
doc_movie = et.parse(
    'data-movies/movie.xml',
    parser=parser_movie,
)
# Display the parsed tree object.
doc_movie

<lxml.etree._ElementTree at 0x7c30f4bba000>

In [146]:
# Demonstrate a schema violation: parsing this file through the validating
# parser raises XMLSyntaxError, which is caught so that only the message is shown.
try:
    doc_movie2 = et.parse(
        'data-movies/movie-actor-no-name.xml',
        parser=parser_movie,
    )
except et.XMLSyntaxError as validation_error:
    print(validation_error)

Element 'actor': The attribute 'name' is required but missing. (<string>, line 0)


In [147]:
# Root element of the parsed movie document; the cells below navigate from it.
root = doc_movie.getroot()
root

<Element movie at 0x7c30f4bb8640>

In [148]:
# Element name of the root.
root.tag

'movie'

In [149]:
# Attributes of the root, displayed as a mapping of attribute name to string value.
root.attrib

{'title': 'Dune: Part Two', 'year': '2024'}

In [150]:
# Attribute values are strings: the year comes back as '2024', not as an integer.
root.attrib['year']

'2024'

In [151]:
# Direct children of the root, in document order.
# list(element) replaces the deprecated getchildren() and returns the same list.
list(root)

[<Element director at 0x7c30f4ba3140>,
 <Element actors at 0x7c30fea392c0>,
 <Element synopsis at 0x7c30fea3aac0>]

In [152]:
# Third child of the root by position (index 2), using element indexing
# instead of the deprecated getchildren().
synopsis = root[2]
synopsis

<Element synopsis at 0x7c30fea3aac0>

In [153]:
# Element name of the selected child.
synopsis.tag

'synopsis'

In [154]:
# Text content of the element, including its newline and tab characters.
synopsis.text

"Paul rejoint les Fremen et cherche\n\tà se venger des conspirateurs qui ont détruit sa famille, \n\tfaisant un choix entre l'amour et le sort de l'univers \n\tconnu alors qu'il s'efforce d'empêcher un avenir terrible \n\tque lui seul peut prédire.\n\t"

In [155]:
# Look the child up by tag name instead of by position; this is the same element as above.
root.find('synopsis')

<Element synopsis at 0x7c30fea3aac0>

In [156]:
# Keep a reference to the <actors> container; later cells read and modify it.
actors = root.find('actors')
actors

<Element actors at 0x7c30fea392c0>

In [157]:
# Print the attribute mapping of each <actor> child.
# Iterating the element directly yields its children and replaces the
# deprecated getchildren() call.
for actor in actors:
    print(actor.attrib)

{'name': 'Timothée Chalamet', 'role': 'Paul Atreides'}
{'name': 'Zendaya', 'role': 'Chani'}


In [158]:
# Build a standalone <actor> element (not yet attached to any tree),
# passing its attributes directly to the Element factory.
new_actor = et.Element(
    'actor',
    attrib={'name': 'Rebecca Ferguson', 'role': 'Jessica'},
)
new_actor

<Element actor at 0x7c30f4aacc40>

In [159]:
# Check the attributes that were just set on the new element.
new_actor.attrib

{'name': 'Rebecca Ferguson', 'role': 'Jessica'}

In [160]:
# Attach the new element as the last child of <actors>.
# This modifies the tree held by doc_movie in place.
actors.append(new_actor)

In [161]:
# Print the attribute mapping of each <actor> child again; the appended
# element is now listed last.
# Iterating the element directly yields its children and replaces the
# deprecated getchildren() call.
for actor in actors:
    print(actor.attrib)

{'name': 'Timothée Chalamet', 'role': 'Paul Atreides'}
{'name': 'Zendaya', 'role': 'Chani'}
{'name': 'Rebecca Ferguson', 'role': 'Jessica'}


In [162]:
# Serialize the <actors> subtree to bytes. In the output, non-ASCII characters
# appear as numeric character references (e.g. &#233;), and the appended
# actor sits right before </actors> without indentation of its own.
et.tostring(actors)

b'<actors>\n\t\t<actor name="Timoth&#233;e Chalamet" role="Paul Atreides"/>\n\t\t<actor name="Zendaya" role="Chani"/>\n\t<actor name="Rebecca Ferguson" role="Jessica"/></actors>\n\t'

In [163]:
# Serialize the whole document from the root; the appended actor is part of it.
et.tostring(root)

b'<movie title="Dune: Part Two" year="2024">\n\t<director name="Denis Villeneuve"/>\n\t<actors>\n\t\t<actor name="Timoth&#233;e Chalamet" role="Paul Atreides"/>\n\t\t<actor name="Zendaya" role="Chani"/>\n\t<actor name="Rebecca Ferguson" role="Jessica"/></actors>\n\t<synopsis>Paul rejoint les Fremen et cherche\n\t&#224; se venger des conspirateurs qui ont d&#233;truit sa famille, \n\tfaisant un choix entre l\'amour et le sort de l\'univers \n\tconnu alors qu\'il s\'efforce d\'emp&#234;cher un avenir terrible \n\tque lui seul peut pr&#233;dire.\n\t</synopsis>\n</movie>'

In [164]:
# pretty_print=True does not re-indent elements whose content already holds
# whitespace text nodes, so the <actor> appended above would be written
# without its own newline/indentation. et.indent() (lxml >= 4.5) first
# normalises the whitespace-only text/tail nodes of the in-memory tree;
# element text such as the synopsis is left untouched.
et.indent(doc_movie, space='\t')
# Write the modified tree as UTF-8 with an XML declaration.
doc_movie.write('data-movies/movie_modified.xml', encoding='UTF-8', xml_declaration=True, pretty_print=True)

In [165]:
# Write the same tree a second time, to a separate file, using the CP1252 encoding.
doc_movie.write('data-movies/movie_modified_cp1252.xml', encoding='CP1252', xml_declaration=True, pretty_print=True)

## XPath

In [166]:
# Absolute path from the document root; xpath() returns a list even for a single match.
doc_movie.xpath('/movie/synopsis')

[<Element synopsis at 0x7c30fea3aac0>]

In [167]:
# text() selects the text node of the element; the result is a list of strings.
doc_movie.xpath('/movie/synopsis/text()')

["Paul rejoint les Fremen et cherche\n\tà se venger des conspirateurs qui ont détruit sa famille, \n\tfaisant un choix entre l'amour et le sort de l'univers \n\tconnu alors qu'il s'efforce d'empêcher un avenir terrible \n\tque lui seul peut prédire.\n\t"]

In [168]:
# Absolute path to the <actors> container element.
doc_movie.xpath('/movie/actors')

[<Element actors at 0x7c30fea392c0>]

In [169]:
# All <actor> children of <actors>, including the element appended earlier.
doc_movie.xpath('/movie/actors/actor')

[<Element actor at 0x7c30f4aade80>,
 <Element actor at 0x7c30f4aad500>,
 <Element actor at 0x7c30f4aacc40>]

In [170]:
# Relative path evaluated from the <actors> element: same three elements as the absolute query.
actors.xpath('actor')

[<Element actor at 0x7c30f4aade80>,
 <Element actor at 0x7c30f4aad500>,
 <Element actor at 0x7c30f4aacc40>]

In [171]:
# parent axis from <actors>: yields the movie root element.
actors.xpath('parent::*')

[<Element movie at 0x7c30f4bb8640>]

In [172]:
# After the append, the new element has <actors> as its parent.
new_actor.xpath('parent::*')

[<Element actors at 0x7c30fea392c0>]

In [173]:
# preceding-sibling axis with a wildcard: the two actors located before the appended one.
new_actor.xpath('preceding-sibling::*')

[<Element actor at 0x7c30f4aade80>, <Element actor at 0x7c30f4aad500>]

In [174]:
# Same axis restricted to elements named actor; the result is identical here
# because every preceding sibling is an <actor>.
new_actor.xpath('preceding-sibling::actor')

[<Element actor at 0x7c30f4aade80>, <Element actor at 0x7c30f4aad500>]

In [175]:
# A path starting with / is resolved from the document root: /*/* gives the
# children of the root element, whatever their names.
root.xpath('/*/*')

[<Element director at 0x7c30f4ba3140>,
 <Element actors at 0x7c30fea392c0>,
 <Element synopsis at 0x7c30fea3aac0>]

In [176]:
# // searches at any depth: every <actor> element of the document.
root.xpath('//actor')

[<Element actor at 0x7c30f4aade80>,
 <Element actor at 0x7c30f4aad500>,
 <Element actor at 0x7c30f4aacc40>]

In [177]:
# Positional predicate: only the first <actor> is returned here.
root.xpath('//actor[1]')

[<Element actor at 0x7c30f4aade80>]

In [178]:
# @name selects the attribute; the result is a list of strings.
root.xpath('//actor[1]/@name')

['Timothée Chalamet']

In [179]:
# name attribute of every actor, in document order.
root.xpath('//actor/@name')

['Timothée Chalamet', 'Zendaya', 'Rebecca Ferguson']

In [180]:
# Show each actor name next to its length, computed on the Python side.
for name in root.xpath('//actor/@name'):
    print(f'{name} : {len(name)}')

Timothée Chalamet : 17
Zendaya : 7
Rebecca Ferguson : 16


In [181]:
# Predicate on an attribute value: name of the actor whose role is 'Chani'.
root.xpath("//actor[@role = 'Chani']/@name")

['Zendaya']

In [182]:
# An XPath function call returns a single value rather than a list;
# the number comes back as a Python float (7.0).
root.xpath("string-length(//actor[@role = 'Chani']/@name)")

7.0

In [183]:
# Gotcha: when several nodes match, string-length() only measures the first one.
# The result, 17.0, is the length of 'Timothée Chalamet', not a per-actor list.
root.xpath("string-length(//actor/@name)")

17.0

In [184]:
# Use the function inside a predicate to filter actors on the length of their name.
root.xpath("//actor[string-length(@name) = 17]/@name")

['Timothée Chalamet']

In [185]:
# @* selects every attribute of the matching actor (here its name and its role).
root.xpath("//actor[string-length(@name) = 17]/@*")

['Timothée Chalamet', 'Paul Atreides']

In [186]:
# upper-case() is an XPath 2.0 function, while lxml only implements XPath 1.0:
# the call below raises "XPathEvalError: Unregistered function".
# root.xpath("upper-case(//actor[string-length(@name) = 17]/@name)")