In [1]:
import os
import shutil
import pandas as pd
import xml.etree.ElementTree as ET

import re

In [2]:
def tokenize_text(text):
    # Remove XML elements
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove slashes
    text = text.replace('/', '')
    
    # Remove punctuation except for space and newline
    text = re.sub(r'[^\w\s\n]', '', text)
    text = re.sub(r'\n', ' ', text)
    
    # Tokenize text by space and newline
    tokens = re.split(r'[\s\n]+', text)
    
    return tokens

In [3]:
# Function to extract data from XML files
def extract_data(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    
    # Extract class codes with the specified scheme
    class_codes = root.findall(".//{*}classCode[@scheme='https://www.deutschestextarchiv.de/doku/klassifikation#DTACorpus']")
    corpus = []
    valid_corpora = ['aedit', 'sbb_funeralschriften']
    for class_code in class_codes:
        if class_code.text in valid_corpora:
            corpus.append(class_code.text)
    
    # Ensure at least one valid corpus was found
    if not corpus:
        return None
    
    # Extract other relevant data
    title_elem = root.find(".//{*}sourceDesc/{*}biblFull/{*}titleStmt/{*}title[@type='main']")
    title = title_elem.text if title_elem is not None else None
    
    author_surname_elem = root.find(".//{*}sourceDesc/{*}biblFull/{*}titleStmt/{*}author/{*}persName/{*}surname")
    author_forename_elem = root.find(".//{*}sourceDesc/{*}biblFull/{*}titleStmt/{*}author/{*}persName/{*}forename")
    if author_surname_elem is not None and author_forename_elem is not None:
        author = f"{author_forename_elem.text} {author_surname_elem.text}"
    else:
        author = None
    
    pub_date_elem = root.find(".//{*}sourceDesc/{*}biblFull/{*}publicationStmt/{*}date")
    pub_date = pub_date_elem.text if pub_date_elem is not None else None
    
    pub_place_elem = root.find(".//{*}sourceDesc/{*}biblFull/{*}publicationStmt/{*}pubPlace")
    pub_place = pub_place_elem.text if pub_place_elem is not None else None
    
    tokens_elem = root.find(".//{*}extent/{*}measure[@type='tokens']")
    tokens = tokens_elem.text if tokens_elem is not None else None
    
    types_elem = root.find(".//{*}extent/{*}measure[@type='types']")
    types = types_elem.text if types_elem is not None else None
    
    characters_elem = root.find(".//{*}extent/{*}measure[@type='characters']")
    characters = characters_elem.text if characters_elem is not None else None
    
    text_elem = root.find(".//{*}text")
    text = tokenize_text(''.join(text_elem.itertext()) if text_elem is not None else None)
    
    
    return {
        'filename': os.path.basename(xml_file.replace('.TEI-P5.xml', '')),
        'title': title,
        'author': author,
        'pub_date': pub_date,
        'pub_place': pub_place,
        'tokens': tokens,
        'types': types,
        'characters': characters,
        'text': text,
        'corpus': corpus
    }

In [4]:

# Directory containing XML files
folder_path = 'dta_komplett_gebrauchsliteratur_2021-05-13'
os.makedirs('corpus', exist_ok=True)

# List to store extracted data
data = []

# Iterate through XML files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.xml'):
        xml_file = os.path.join(folder_path, filename)
        extracted_data = extract_data(xml_file)
        if extracted_data:
            data.append(extracted_data)
            shutil.copy(xml_file, os.path.join('corpus', filename))

# Create DataFrame
df = pd.DataFrame(data)
df['corpus'] = df['corpus'].apply(lambda x: [value for value in x if value != 'ready'])
# Display DataFrame
df.head()

Unnamed: 0,filename,title,author,pub_date,pub_place,tokens,types,characters,text,corpus
0,343012,Christliche Leich- vnd Ehren-Predigt bey der S...,Martinus Seidemannus,1625,Görlitz,3719,1521,24792,"[, Chriſtliche, Leich, vnd, EhrenPredigt, bey,...",[aedit]
1,343014,THRENOLOGIA. Christliche Leich vnd EhrenSermon...,Tobias Seiler,1605,Leipzig,10097,3476,73098,"[, THRENOLOGIA, Chriſtliche, Leich, vnd, Ehren...",[aedit]
2,343016,Kurtze Anleitung: Wie die jetzige böse Zeit/ d...,Christoph von Reideburg,1642,Breslau,11515,3440,78795,"[, Kurtze, Anleitung, Wie, die, jetzige, boſe,...",[aedit]
3,343019,"mors beatorvm EXODUS MISERIÆ TERRESTRIS, ET PA...",Christian Adolph,1636,Leipzig,7611,2546,50565,"[, mors, beatorvm, EXODUS, MISERIÆ, TERRESTRIS...",[aedit]
4,345804,Geistlich Paßport Oder Richtige Kundschafft de...,Leonhard Felber,1606,Oels,11482,3186,75172,"[, Geiſtlich, Paſsport, Oder, Richtige, Kundſc...",[aedit]


In [5]:
df.to_csv('collection.csv', index=False, encoding='utf-8-sig)')

In [6]:
df.iloc[2].title

'Kurtze Anleitung: Wie die jetzige böse Zeit/ darinnen zwar für sich selbst/ nichts/ alß eytel Klag/ Ach/ vnd Weh regieret'

In [9]:
df.iloc[-1]

filename                                      dach_seligen_1658
title         Auff seligen wiewol hochbetrawerlichen Hintrit...
author                                               Simon Dach
pub_date                                                   1658
pub_place                                            Königsberg
tokens                                                     1212
types                                                       666
characters                                                 7626
text          [, Auff, ſeligen, wiewol, hochbetrawerlichen, ...
corpus                                   [sbb_funeralschriften]
Name: 446, dtype: object