In [1]:
from bs4 import BeautifulSoup
import re

def extract_content(file_path):
    """Extract the main sections of one transcript HTML page.

    The file is parsed with BeautifulSoup's ``html.parser`` and searched for a
    fixed set of elements. Only sections that are actually present are
    reported, always in the order listed under *Returns*.

    Args:
        file_path: Path to a UTF-8 encoded HTML file.

    Returns:
        A list of ``(heading, value)`` tuples. ``value`` is a string, except
        for ``"Contributors"`` where it is a list of strings. Possible
        headings, in order:

        * ``"Title"`` - stripped text of ``<title>``
        * ``"Contributors"`` - ``content`` of every
          ``<meta name="dc.contributor">``
        * ``"Description (DE)"`` / ``"Description (EN)"`` - ``content`` of
          ``<meta name="description" lang="de|en">``
        * ``"Document Details"`` - text of ``<div class="head">``
        * ``"Manuscript Details"`` - text of ``<p class="msIdentifier">``
        * ``"Transcription"`` - text of ``<div class="transcript">``
        * ``"Critical Apparatus"`` - text of ``<div class="critApp">``

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    content = []

    # Title
    title = soup.find('title')
    if title is not None:
        content.append(("Title", title.text.strip()))

    # Contributors: a <meta> tag without a ``content`` attribute is skipped
    # instead of raising KeyError and aborting the whole extraction.
    contributors = [
        meta['content']
        for meta in soup.find_all('meta', attrs={'name': 'dc.contributor'})
        if meta.has_attr('content')
    ]
    if contributors:
        content.append(("Contributors", contributors))

    # Descriptions, German first and then English; same guard as above.
    for heading, lang in (("Description (DE)", 'de'), ("Description (EN)", 'en')):
        description = soup.find('meta', attrs={'name': 'description', 'lang': lang})
        if description is not None and description.has_attr('content'):
            content.append((heading, description['content']))

    # Document details: each text fragment stripped, one fragment per line.
    head = soup.find('div', class_='head')
    if head is not None:
        content.append(("Document Details", head.get_text(strip=True, separator='\n')))

    # Manuscript details: raw text, only the outer whitespace is removed.
    ms_details = soup.find('p', class_='msIdentifier')
    if ms_details is not None:
        content.append(("Manuscript Details", ms_details.text.strip()))

    # Transcription and critical apparatus use the same one-fragment-per-line
    # rendering as the document details.
    for heading, css_class in (("Transcription", 'transcript'),
                               ("Critical Apparatus", 'critApp')):
        section = soup.find('div', class_=css_class)
        if section is not None:
            content.append((heading, section.get_text(strip=True, separator='\n')))

    return content

# Transcript page (diplomatic view) to inspect. The commented-out line is an
# alternative sample (Brief001-D); swap the two assignments to try it.
#file_path = '/Users/manaswimondol/Downloads/letter_transcripts/content.php-dir=edoc|ed000248&distype=optional&metsID=edoc_ed000248_lg-kl-hs15-01-01r-001-D_tei-transcript&xml=texts|Brief001-D_tei-transcript.xml&xsl=scripts|tei-transcript.xsl&view=diplomatic.html'
file_path = '/Users/manaswimondol/Downloads/letter_transcripts/content.php-dir=edoc|ed000248&distype=optional&metsID=edoc_ed000248_lg-kl-hs15-04-02v-030-L_tei-transcript&xml=texts|Brief030-L_tei-transcript.xml&xsl=scripts|tei-transcript.xsl&view=diplomatic.html'

# Parse the page into (heading, value) pairs.
extracted_content = extract_content(file_path)

# Print each section under its heading, preceded by a blank line.
for heading, text in extracted_content:
    print(f"\n{heading}:")
    if not isinstance(text, list):
        # Plain string section: print it as-is.
        print(text)
        continue
    # List-valued section: one bullet per entry.
    for item in text:
        print(f"- {item}")
  


Title:
Bitte um materielle Unterstützung bei Renovierungsarbeiten am Kloster (Lüneburg, Kloster Lüne, Hs. 15, Lage 4, fol. 2v, Brief 30) - Transkription

Contributors:
- Herausgegeben von Eva Schlotheuber (Heinrich-Heine-Universität Düsseldorf)
- Herausgegeben von Henrike Lähnemann (Universität Oxford)
- Bearbeitet von Simone Schultz-Balluff (Universität Bonn)
- Bearbeitet von Edmund Wareham (Universität Oxford)
- Bearbeitet von Philipp Trettin (Heinrich-Heine-Universität Düsseldorf)
- unter Mitarbeit von Philipp Stenzig (Heinrich-Heine-Universität Düsseldorf)
- unter Mitarbeit von Timo Bülters ( Universität Bonn)
- unter Mitarbeit von Mai-Britt Wiechmann (Universität Oxford)
- digitale Umsetzung Wolfgang Seifert (Herzog August Bibliothek Wolfenbüttel)

Description (DE):
Die Absenderin erinnert ihren Onkel an sein Versprechen, sich in Mangelsituationen an ihn wenden zu können. Sie verweist auf Renovierungsarbeiten am Kloster, für die sie Holz benötigen. Der Empfänger möge die drei Onk

In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup

def extract_content(file_path):
    """Extract the main sections of one transcript HTML page as a flat record.

    The file is parsed with BeautifulSoup's ``html.parser``. Every key of the
    returned dict is always present; a section that is missing from the page
    keeps its ``''`` default, so all records share the same columns when they
    are collected into a DataFrame.

    Args:
        file_path: Path to a UTF-8 encoded HTML file.

    Returns:
        A dict of strings with the keys ``'File'`` (base name of
        ``file_path``), ``'Title'``, ``'Contributors'`` (all contributor
        entries joined with ``'; '``), ``'Description (DE)'``,
        ``'Description (EN)'``, ``'Document Details'``,
        ``'Manuscript Details'``, ``'Transcription'`` and
        ``'Critical Apparatus'``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Key order defines the column order of the resulting CSV.
    content = {
        'File': os.path.basename(file_path),
        'Title': '',
        'Contributors': '',
        'Description (DE)': '',
        'Description (EN)': '',
        'Document Details': '',
        'Manuscript Details': '',
        'Transcription': '',
        'Critical Apparatus': ''
    }

    title = soup.find('title')
    if title is not None:
        content['Title'] = title.text.strip()

    # A <meta> tag without a ``content`` attribute is skipped rather than
    # raising KeyError, which would abort a whole batch run for one bad file.
    content['Contributors'] = '; '.join(
        meta['content']
        for meta in soup.find_all('meta', attrs={'name': 'dc.contributor'})
        if meta.has_attr('content')
    )

    for column, lang in (('Description (DE)', 'de'), ('Description (EN)', 'en')):
        description = soup.find('meta', attrs={'name': 'description', 'lang': lang})
        if description is not None:
            # Same guard: a missing ``content`` attribute yields ''.
            content[column] = description.get('content', '')

    # These three <div> sections are flattened the same way: every text
    # fragment stripped and joined with single spaces (CSV-friendly, one line).
    for column, css_class in (('Document Details', 'head'),
                              ('Transcription', 'transcript'),
                              ('Critical Apparatus', 'critApp')):
        section = soup.find('div', class_=css_class)
        if section is not None:
            content[column] = section.get_text(strip=True, separator=' ')

    # Manuscript details keep their inner whitespace; only the ends are trimmed.
    ms_details = soup.find('p', class_='msIdentifier')
    if ms_details is not None:
        content['Manuscript Details'] = ms_details.text.strip()

    return content

def process_directory(directory_path, output_file):
    """Run ``extract_content`` on every ``.html`` file in a directory and save a CSV.

    Only regular files directly inside ``directory_path`` whose name ends in
    ``.html`` are processed (no recursion). Files are visited in sorted
    filename order so the row order of the CSV is reproducible;
    ``os.listdir`` itself returns entries in arbitrary order.

    Args:
        directory_path: Directory containing the HTML files.
        output_file: Path of the CSV file to write (UTF-8, no index column).

    Returns:
        None. The result is written to ``output_file`` and a confirmation
        line is printed.
    """
    all_content = []

    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.html'):
            continue
        file_path = os.path.join(directory_path, filename)
        # A sub-directory that happens to be named "*.html" cannot be opened
        # as a file; skip it instead of crashing the whole run.
        if not os.path.isfile(file_path):
            continue
        all_content.append(extract_content(file_path))

    # Build the frame once from the list of records.
    df = pd.DataFrame(all_content)
    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Data saved to {output_file}")

# CSV file that will receive one row per processed letter
# (relative to the notebook's working directory).
output_file = 'extracted_content.csv'

# Folder holding the downloaded letter HTML pages.
directory_path = '/Users/manaswimondol/Downloads/Letters_content'

# Extract every HTML file in the folder and write the combined table.
process_directory(directory_path, output_file)