# Parsing of XML

In [1]:
import xml.etree.ElementTree as ET
import glob
import pandas as pd

## Exploring xml (https://www.datacamp.com/community/tutorials/python-xml-elementtree)

In [2]:
#loading the XML trees
#tree = ET.parse('../data/EdixiXMLExport_20161.xml')
tree = ET.parse('../data/EdixiXMLExport_20091.xml')

In [3]:
#defining the root
root = tree.getroot()

In [4]:
#checking root
root.tag

'EdixiData'

In [5]:
#defining all emelemts that are following down through the tree, from the root. 
all_elements = [elem.tag for elem in root.iter()]

In [6]:
#the elements are in a list from here
#all_elements

It is obvious that the list of elements follows a structure. All of the values will have to be imputed into a pandas at some point, hence as MVP just impute what might seems to be the easiest. 

In [7]:
#looking at the first 6 elements in the tree (they will quickly become the same, as some of the elements are the same)
#uncomment print-statements to see 
for element in all_elements:
    #print("\n",element)
    for i, obj in enumerate(root.iter(element)):
        #print(obj.text)
        if i==5:
            break

In [8]:
#checking the different names (uncomment to see)
#for meeting in root.iter('Navn'):
#    print(meeting.text)

#### It is obvious how to get the different things out --> this is made into a pandas, where every row holds the different arrtibutes for a given speech

The idea is, that I am taking out all the important things for each speech by the following attributes:
- 'Tale'
- 'Starttid'
- 'Sluttid'
- 'Navn'
- 'Rolle'
- 'Tekst'

The above mentioned are all attributes that are specific for every single speech, and will therefore be the easiest to process. All functions are build-up in order to process all XML files at once. 

## Helper functions

In [9]:
def xml_root(xml_name):
    """
    input
        xml_name: the path of the xml file that I want to read
        
    output
        root: the root of the file
    """
    #defining tree
    tree = ET.parse(xml_name)
    #defining the root
    root = tree.getroot()
    
    return root

In [1]:
def iter_element(root, name_of_element):
    """
    envoked in xml_to_pandas-function
    
    input
        root: the root of the XML file
        name_of_element: the element that is wanted from the root, e.g. "Tekst" from one of the speeches. 
        
    output
        emp_list: a list containing all the individual items from the wanted elements, e.g. all the speeches from the
                  "Tekst" element. 
    """
    
    #defining an empty list
    emp_list = []
    
    #looping through all the elements of "name_of_element"
    for el in root.iter(name_of_element):
        var = str(el.text)
        
        #if the variable is empty, it is defined like that
        if var == "":
            var = 'empty'
        
        #appended to list
        emp_list.append(var)
        
    return emp_list

In [34]:
def xml_to_pandas(xml_root, cols):
    """
    input
        xml_root: the root of the XML-file
        
    output
        lists_: a list of lists, containing all the individual items from the wanted elements in "columns". 
        df: a dataframe where all the values are inserted. 
    """
    
    #defining the different elements that is interesting from the XML-file (which are also the columns in df)
    columns = cols
    
    #defining empty df
    df = pd.DataFrame(columns = columns)
    
    #making an empty list of lists where all the lists from `iter_element` function outputs are appended
    lists_ = []
    for element in columns:
        element_list = iter_element(xml_root, element)
        lists_.append(element_list)
    
    
    #adding all lists to df
    for i in range(len(columns)):
        df[columns[i]] = pd.Series(lists_[i])
    
    return lists_ , df

## Using functions

First, all the file names are defined, so that we can loop through the files and process it. 

In [35]:
#reading all files
xml_files = glob.glob('../data/*.xml')

In [50]:
xml_files

['../data/EdixiXMLExport_20102.xml',
 '../data/EdixiXMLExport_20101.xml',
 '../data/EdixiXMLExport_20111.xml',
 '../data/EdixiXMLExport_20161.xml',
 '../data/EdixiXMLExport_20141.xml',
 '../data/EdixiXMLExport_20142.xml',
 '../data/EdixiXMLExport_20151.xml',
 '../data/EdixiXMLExport_20121.xml',
 '../data/EdixiXMLExport_20131.xml',
 '../data/EdixiXMLExport_20091.xml']

In [48]:
#defining the different elements that is interesting from the XML-file (which are also the columns in df)
columns = ['Tale','Starttid','Sluttid','Navn','Rolle','Tekst']

#defining empty df
df = pd.DataFrame(columns = columns)
    
for i, file in enumerate(xml_files):
    #getting the root
    root = xml_root(file)
    
    #making the root into a pandas and its lists
    lists, df_ = xml_to_pandas(root , columns)
    
    #checking that lists are OK and appending to master df IF OK
    if len(set(map(len, lists))) < 2:
        df = df.append(df_)
    
    #printing info if not ok
    else:
        print(file,'is not OK. The lengths of the lists are:')
        for j in range(len(lists)):
            print(len(lists[j]))

../data/EdixiXMLExport_20161.xml is not OK. The lengths of the lists are:
48687
48687
48687
48687
48687
47420
../data/EdixiXMLExport_20141.xml is not OK. The lengths of the lists are:
36348
36348
36347
36348
36348
36348
../data/EdixiXMLExport_20151.xml is not OK. The lengths of the lists are:
58882
58882
58881
58882
58882
58881


In [53]:
#checking that there are no nan's in the df:
print(len(df))
df = df.dropna()
print(len(df))

236966
236966


So, to conclude, there are now ~235k different speeches from the Danish Parliament Corpus which are ready to be analyzed. The dataset is written to a csv, just to ensure that this process is not needed every time. 

In [54]:
df.to_csv('../data/DPC_speeches.csv')

#### There is a mistake in the XML for 2016 (EdixiXMLExport_20161)... 

To prove this load the file and run until here and check the tail. They are not addressing the right people in the end. 

In [None]:
#read the root for 2016
lists_ , df = xml_to_pandas(root)

In [None]:
#checking whether all the numbers are the same
for i in range(len(lists_)):
    print(len(lists_[i]))

In [None]:
k= 5675

In [None]:
#df.iloc[k:k+50]
df.tail(50)

#### Checks:

In [None]:
#in line 38 Thomas Danielsen is talking, so check if his name is mentioned in the line before
df.iloc[37,5]

In [None]:
#in line 40 Henning Hyllested is talking, so check if his name is mentioned in the line before
df.iloc[39,5]

In [None]:
df.iloc[43,5]

All good until line 50. 

In [None]:
#it fucks up around here...
df.iloc[5688,5]