### General Notes

In [None]:
# Most useful (i.e. simplest) tutorial: https://www.datacamp.com/community/tutorials/python-xml-elementtree

### Libraries

In [17]:
import xml.etree.ElementTree as ET

### Files

In [18]:
file_name = 'ser561.xml'

# Reads in the data as an element tree class of data
tree = ET.parse(file_name)
root = tree.getroot()


### View XML Document

In [42]:
# XPath Expressions: Many times elements will not have attributes, they will only have text content. 
# Using the attribute .text, you can print out this content.

for element in root.iter('flag'):
    print(element.attrib, element.text)


{'type': 'unclear'} سیرات
{'type': 'unclear'} انتظام
{'type': 'unclear'} قبول
{'type': 'unclear'} بخیر
{'type': 'unclear'} None
{'type': 'unclear'} None
{'type': 'unclear'} تدمانها رمیه
{'type': 'unclear'} پرور
{'type': 'unclear'} آیین
{'type': 'meaning'} فراشاط ناک
{'type': 'meaning'} لاحظىدر
{'type': 'unclear'} None
{'type': 'unclear'} دعا آب
{'type': 'unclear'} یار
{'type': 'unclear'} نیز
{'type': 'unclear'} None


In [34]:
# There is a helpful way to see the whole document. 
# Any element has a .tostring() method. If you pass the root into the .tostring() method, 
# you can return the whole document. Within ElementTree (remember aliased as ET), .tostring() takes 
# a slightly strange form.
# Since ElementTree is a powerful library that can interpret more than just XML, 
# you must specify both the encoding and decoding of the document you are displaying as the string. 
# For XMLs, use 'utf8' - this is the typical document format type for an XML.

print(ET.tostring(root, encoding='utf8').decode('utf8'))

<?xml version='1.0' encoding='utf8'?>
<document>

    <metadata>
        <unique_id>ser561</unique_id>
    </metadata>
    <body>
        <pg folio="13a" />
        <div type="marginalia">
            
            <ts type="inscriptio" />
            <lb />جنابعالیمولایم سلمه الله متعالی
            <lb />جناب مرحمت و شفقت پناهی وزارتپناه مهربانی پروانچی حفظه الباری
            <lb />دام دولته
            <lb /> معروض بر ضمیر منیر مهر تنویر خجسته مظاهر
            <flag type="unclear">سیرات</flag>
             تخمیر </div>
        <div type="section">
            <ts type="salutatio" />
            <lb /> عالیجاه رفیع جایگاه امیر الامراء
            <flag type="unclear">انتظام</flag>
             محیت و مؤدت ارتسام اعنی
            <lb /> مستغنی عن تعریف و توصیف را بمعرض
            <flag type="unclear">قبول</flag>
             چنین رسانیده
            <lb /> میشود که الحمد و المنه مجاریً احوال بکرم و دولت
            <flag type="unclear">بخیر</flag>
             بوده
            <lb /

In [29]:
# List the different attributes for a given tag; this works over all subtrees below the invoked tag
# You can expand the use of the iter() function to help with finding particular elements of interest. 
# root.iter() will list all subelements under the root that match the element specified. 

for div in root.iter('flag'):
    print (div.tag, div.attrib)

flag {'type': 'unclear'}
flag {'type': 'unclear'}
flag {'type': 'unclear'}
flag {'type': 'unclear'}
flag {'type': 'unclear'}
flag {'type': 'unclear'}
flag {'type': 'unclear'}
flag {'type': 'unclear'}
flag {'type': 'unclear'}
flag {'type': 'meaning'}
flag {'type': 'meaning'}
flag {'type': 'unclear'}
flag {'type': 'unclear'}
flag {'type': 'unclear'}
flag {'type': 'unclear'}
flag {'type': 'unclear'}


### XPath

In [39]:
# Understanding XPath is critically important to scanning and populating XMLs. 
# ElementTree has a .findall() function that will traverse the immediate children of the referenced element. 
# You can use XPath expressions to specify more useful searches.
# The function .findall() always begins at the element specified. 
# This type of function is extremely powerful for a "find and replace". You can even search on attributes!

for element in root.findall("./lang/[type="tajik"]"):
    print(element.attrib)


SyntaxError: invalid syntax (<ipython-input-39-a5516c11668b>, line 5)