In [2]:
import os
import re
import xml.etree.ElementTree as et

In [3]:
# a possible solution
def scrape_directory_tree(starting_dir, file_ending):
    if starting_dir:
        try:
            assert os.path.isdir(starting_dir) # do not pass a filename
        except:
            print(starting_dir, 'is not a directory?')
            return None
    matching_files = []
    for current_dir, directories, files in os.walk(starting_dir, topdown=True):
       for filename in files:
           if filename.endswith(file_ending):
               matching_files.append(os.path.join(current_dir, filename))
    print('found', len(matching_files), 'files ending in', file_ending)
    return matching_files

def list_files_by_size(filepathlist, decreasing=True):
    sorted_files = sorted(filepathlist, key=os.path.getsize, reverse=decreasing)
    return sorted_files

def get_root_tag(xml_filepath):
    return et.parse(filepath).getroot()

def get_text_paragraphs(xml_root,namespace={'tei': 'http://www.tei-c.org/ns/1.0'}):
    # given the root tag of a well formed tei xml file
    # assumes there is only a single div tag and returns
    # a list of paragraph tags inside
    text_pars = xml_root\
        .find('tei:text', namespace)\
        .find('tei:body', namespace)\
        .find('tei:div1', namespace)\
        .findall('tei:p', namespace)
    return text_pars

def get_text_divs(xml_root, namespace={'tei': 'http://www.tei-c.org/ns/1.0'}):
    # given the root tag of a well formed tei xml file, 
    # returns a list of the div tags in the text body.
    text_divs = xml_root\
        .find('tei:text', namespace)\
        .find('tei:body', namespace)\
        .findall('tei:div1', namespace)
    return text_divs

def get_div_paragraphs(text_divs, namespace={'tei': 'http://www.tei-c.org/ns/1.0'}):
    """
    given a list of div tags of a well formed tei xml file
    returns a concatenated list of all paragraphs found in 
    all the divs
    """
    div_pars = []
    for div in text_divs:
        div_pars.extend(div.findall('tei:p', namespace))
    return div_pars

def get_paragraph_lemmas(text_paragraphs, namespace={'tei': 'http://www.tei-c.org/ns/1.0'}):
    text_lemmas = []
    for paragraph in text_pars:
        for sentence in paragraph.findall('tei:s', namespace):
            for word in sentence.findall('tei:w',namespace):
                text_lemmas.append(word.attrib['lemma'].lower())
                #print(word.attrib['lemma'].lower())
    return text_lemmas

def get_paragraph_words(text_paragraphs, namespace={'tei': 'http://www.tei-c.org/ns/1.0'}):
    text_words = []
    for paragraph in text_pars:
        for sentence in paragraph.findall('tei:s', namespace):
            for word in sentence.findall('tei:w',namespace):
                text_words.append(word.attrib['lemma'].lower())
                #print(word.attrib['lemma'].lower())
    return text_lemmas

In [5]:
folder = '/Users/oholm/annad/ordtidni/data/10'

# establish the data folder:
# USER_HOME = os.path.expanduser('~')
# path to root directory of the project.
#BASE_DIR = os.path.join(USER_HOME, 'annad/ordtidni/') 
BASE_DIR = os.path.join('.', '..') 
DATA_DIR = os.path.join(BASE_DIR, 'data/')

filenames = [f for f in os.listdir(folder) if re.match(r'.*\.xml', f)]
len(filenames)

342

In [6]:
filelist = scrape_directory_tree(folder, 'xml')
filelist_by_size = list_files_by_size(filelist)

found 342 files ending in xml


In [8]:
filename = filenames[0]
#filepath = os.path.join(DATA_DIR, filename) 
filepath = os.path.join(folder, filename) 
tree = et.parse(filepath)
root = tree.getroot()

In [9]:
namespace = {'tei': 'http://www.tei-c.org/ns/1.0'}
root = get_root_tag(filepath)
divs = get_text_divs(root,namespace)
pars = get_div_paragraphs(divs, namespace)
lemma_list = get_paragraph_lemmas(pars, namespace)
print(lemma_list)

NameError: name 'text_pars' is not defined

In [69]:
div_pars = []
for div in divs:
    div_pars.extend(div.findall('tei:p', namespace))
div_pars

[<Element '{http://www.tei-c.org/ns/1.0}p' at 0x10a5f61d8>]

In [24]:
print(root.tag)
print(root.text)
print(root.attrib)
print(root.get('TEI'))
namespace = {'tei': 'http://www.tei-c.org/ns/1.0'}
#print(root.get('TEI').get('teiHeader'))
print(root.find('tei:text', namespace))

{http://www.tei-c.org/ns/1.0}TEI

  
{}
None
<Element '{http://www.tei-c.org/ns/1.0}text' at 0x10a221458>


In [39]:
# print(root.find('tei:text', namespace))
text_root = root.find('tei:text', namespace)
# print(text_root.find('tei:body', namespace))
text_body = text_root.find('tei:body', namespace)
# print(text_body)
text_div = text_body.find('tei:div1', namespace)
# print(text_div)
text_pars = text_div.findall('tei:p', namespace)

In [38]:
text_lemmas = []
text_words = []
for paragraph in text_pars:
    #print(paragraph, paragraph.tag, paragraph.attrib, paragraph.text)
    for sentence in paragraph.findall('tei:s', namespace):
        for word in sentence.findall('tei:w',namespace):
            text_lemmas.append(word.attrib['lemma'].lower())
            text_words.append(word.text.lower())
            #print(word.attrib['lemma'].lower())
print(len(text_lemmas))
print(text_lemmas[0:5], "...", text_lemmas[-5:-1])

144
['neytendasamtök', 'á', 'norðurlönd', 'hafa', 'senda'] ... ['verða', 'alltaf', 'taka', 'af']


In [20]:

for child in root:
    print('tag=', child.tag, child.attrib, child.text)

tag {http://www.tei-c.org/ns/1.0}teiHeader {} 
    
tag {http://www.tei-c.org/ns/1.0}text {} 
    
