For this script to work, the required folders from the website http://lrec2022.gerparcor.texttechnologylab.org/ have to be placed in the same directory as this notebook, and the .tar archives have to be decompressed.

In [1]:
import os 
import pandas as pd
import gzip
import shutil
import xml.etree.ElementTree as ET

In [9]:
# Get the current directory
current_dir = os.getcwd()

# Print every sub-directory of the working directory together with its
# direct contents.
# os.listdir() returns entries in arbitrary order, so skipping "the first two"
# by position is not reliable. Instead the entries are sorted and the ones
# that are not corpus folders are skipped explicitly: hidden entries (e.g.
# notebook checkpoint folders) and the "clean" output folder used further down.
# NOTE(review): presumably these are what the positional skip was meant to
# drop - confirm.
for item in sorted(os.listdir(current_dir)):
    if item.startswith('.') or item == 'clean':
        continue

    item_path = os.path.join(current_dir, item)
    # Only directories are of interest here; plain files are ignored
    if os.path.isdir(item_path):
        print(f"Directory: {item}")

        # Loop through the entries directly inside this directory
        for file in sorted(os.listdir(item_path)):
            print(f"  File: {file}")

Directory: Reichstag_NG_Zoll
  File: Reichstag
Directory: ThirdReich
  File: ThirdReich
Directory: Weimar_Republic
  File: Weimar_Republic


In [16]:
def decompress_gz_file(input_file, output_file):
    """Decompress the gzip file *input_file* into *output_file*.

    The input is opened through ``gzip.open`` in binary mode and streamed
    into *output_file*, which is opened with mode ``'wb'``. The input file
    itself is left untouched.
    """
    with gzip.open(input_file, 'rb') as compressed, open(output_file, 'wb') as target:
        shutil.copyfileobj(compressed, target)

def process_file(file_path):
    """Extract the sentences of one XMI file and save them as ``clean/<name>.csv``.

    The file is parsed with ElementTree. The document text is read from the
    ``sofaString`` attribute of the first element whose tag contains ``Sofa``.
    Every element whose tag contains ``Sentence`` yields one row with its XMI
    id, its ``begin``/``end`` offsets and the slice of the document text
    between those offsets.

    If an element whose tag contains ``DocumentMetaData``,
    ``DocumentAnnotation`` or ``AnnotatorMetaData`` carries a ``timestamp``
    attribute, its ``dateDay``, ``dateMonth``, ``dateYear`` and ``timestamp``
    values are added as constant columns. When several elements qualify, the
    last one in document order wins.

    Args:
        file_path: Path of the XMI file. The CSV name is the file's base name
            up to the first ``.xmi``.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        ValueError: If no Sofa element exists, or sentences are present but
            the Sofa element has no ``sofaString``.
        KeyError: If a sentence element lacks ``begin``, ``end`` or the XMI id.
    """
    # A parse error is deliberately not caught here: without a parsed tree
    # nothing below can run, and the caller reports the real cause.
    root = ET.parse(file_path).getroot()

    metadata_markers = ('DocumentMetaData', 'DocumentAnnotation', 'AnnotatorMetaData')
    sofa_attribs = None
    sentence_attribs = []
    metadata = None

    # Single pass over the tree collecting everything that is needed
    for elem in root.iter():
        tag = elem.tag
        if 'Sentence' in tag:
            sentence_attribs.append(elem.attrib)
        if sofa_attribs is None and 'Sofa' in tag:
            sofa_attribs = elem.attrib
        if 'timestamp' in elem.attrib and any(marker in tag for marker in metadata_markers):
            # No break: a later qualifying element overrides an earlier one
            metadata = elem.attrib

    if sofa_attribs is None:
        raise ValueError(f"No Sofa element found in {file_path}")

    full_text = sofa_attribs.get('sofaString')
    if full_text is None and sentence_attribs:
        raise ValueError(f"Sofa element in {file_path} has no sofaString attribute")

    # Cut each sentence out of the full text using its positional data
    extracted_sentences = []
    for attribs in sentence_attribs:
        begin = int(attribs['begin'])
        end = int(attribs['end'])
        extracted_sentences.append({
            "id": attribs['{http://www.omg.org/XMI}id'],
            "begin": begin,
            "end": end,
            "sentence": full_text[begin:end],
        })

    sentences_df = pd.DataFrame(extracted_sentences)

    if metadata is not None:
        for column in ("dateDay", "dateMonth", "dateYear", "timestamp"):
            sentences_df[column] = metadata.get(column)

    # Take the last path component regardless of which separator the path
    # uses, so the output name is correct on every platform.
    file_name = os.path.basename(file_path.replace("\\", "/"))
    out_csv = "clean/" + file_name.split(".xmi")[0] + ".csv"

    # The output folder may not exist yet on a fresh run
    os.makedirs("clean", exist_ok=True)

    print(f"Safe csv at {out_csv}")
    sentences_df.to_csv(out_csv, index = False)
  
    
    
def process_directory(base_dir):
    """Process every ``.gz`` and ``.xmi`` file found below *base_dir*.

    The tree is walked with ``os.walk``. A ``.gz`` file is decompressed next
    to itself (same name without ``.gz``), the result is handed to
    ``process_file`` and then deleted again. The ``.gz`` archive is deleted
    only after processing succeeded, so a failed file can be retried on the
    next run. A ``.xmi`` file is passed to ``process_file`` directly and kept.

    Errors for a single file are printed and do not stop the walk.

    Args:
        base_dir: Root directory of the tree to process.
    """
    for root, _dirs, files in os.walk(base_dir):
        for file in files:
            file_path = os.path.join(root, file)

            if file.endswith('.gz'):
                output_file_path = os.path.splitext(file_path)[0]  # Remove the .gz extension
                try:
                    try:
                        decompress_gz_file(file_path, output_file_path)
                        process_file(output_file_path)  # Process the decompressed file
                    finally:
                        # Always drop the (possibly partial) decompressed copy,
                        # so no stray file is left behind after a failure
                        if os.path.exists(output_file_path):
                            os.remove(output_file_path)
                    # Reached only when decompression and processing succeeded
                    os.remove(file_path)
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

            elif file.endswith('.xmi'):
                try:
                    process_file(file_path)  # Process the .xmi file directly
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
          
          

In [17]:
# Example usage: run the pipeline on the current working directory.
# NOTE: process_directory() removes .gz archives (and the files decompressed
# from them) as it goes, so run this on a copy if the originals are needed.
current_dir = os.getcwd()
process_directory(current_dir)

Safe csv at clean/71._Sitzung_28.03.1895.csv


In [None]:
# Define the path to the directory containing the CSV files
directory = 'clean'

# Initialize an empty list to hold the dataframes
dfs = []

# Loop over the files in the directory.
# os.listdir() gives no ordering guarantee, so the names are sorted to make
# the row order of the merged file identical on every run.
for filename in sorted(os.listdir(directory)):
    if filename.endswith('.csv'):
        # Construct the full file path
        filepath = os.path.join(directory, filename)
        try:
            # Read the CSV file into a dataframe
            df = pd.read_csv(filepath)
        except pd.errors.EmptyDataError:
            # A CSV without any columns cannot be parsed; skip it instead of
            # aborting the whole merge
            print(f"Skipping empty CSV file: {filepath}")
            continue
        # Append the dataframe to the list
        dfs.append(df)

# pd.concat() fails with an unhelpful message on an empty list
if not dfs:
    raise FileNotFoundError(f"No non-empty CSV files found in '{directory}'")

# Concatenate all the dataframes in the list into a single dataframe
merged_df = pd.concat(dfs, ignore_index=True)

# Save the merged dataframe to a new CSV file
merged_df.to_csv('merged_parliament_texts.csv', index=False)

print("All CSV files have been successfully merged into 'merged_parliament_texts.csv'")

In [4]:
# Load "speeches_old.csv" from the working directory into a dataframe.
# NOTE(review): this file is not produced by any cell visible here - confirm
# where it comes from.
df = pd.read_csv("speeches_old.csv")
