# Import libraries

In [1]:
# Import libraries
import os
import csv

# Converting txt-files to csv-dataset
This is a very basic script that merges multiple text files into a single CSV dataset. In addition, it automatically extracts the year and guild name from each file name (`[year]_[guild_name].txt`) and the country from the name of the folder that contains the file. Running the code below generates a CSV dataset that includes all guild regulations, along with the corresponding country, guild name, and year of issuance. Please first check that all paths are set up correctly by running the next cell.

In [2]:
# Directory the notebook is executed from; the other paths are resolved relative to it
current_directory = os.getcwd()

# Input root whose sub-folders hold the regulation txt-files
# (each sub-folder name ends up in the 'country' column of the dataset)
parent_directory = os.path.join(current_directory, '../llm_digitization_data/regulations_text_data')
parent_directory = os.path.normpath(parent_directory)

# Location of the merged CSV dataset that will be written
csv_file_path = os.path.join(current_directory, '../../datasets/regulations_dataset.csv')
csv_file_path = os.path.normpath(csv_file_path)

# Show the resolved paths so they can be verified before running the conversion
print(f'Current directory: {current_directory}')
print(f'Parent directory: {parent_directory}')
print(f'CSV output file path: {csv_file_path}')

Current directory: /Users/niclasgriesshaber/Desktop/guilds-llm/01_llm_digitization/llm_digitization_code
Parent directory: /Users/niclasgriesshaber/Desktop/guilds-llm/01_llm_digitization/llm_digitization_data/regulations_text_data
CSV output file path: /Users/niclasgriesshaber/Desktop/guilds-llm/datasets/regulations_dataset.csv


In [3]:
def extract_year_and_guild(filename):
    """Extract the year and guild name from a regulation file name.

    The file name is expected to follow the pattern "[year]_[guild_name].txt".
    Only the final extension is removed and only the first underscore is used
    as the separator, so guild names that themselves contain underscores or
    dots (e.g. "1750_gold_smiths.txt", "1750_st.lucas.txt") are kept intact.

    Args:
        filename: Base name of the text file (not a full path).

    Returns:
        A (year, guild) tuple of strings.

    Raises:
        ValueError: If the file name contains no '_' separating year and guild.
    """
    # Drop only the last extension; splitting on the first '.' would cut off
    # guild names that contain a dot.
    stem, _extension = os.path.splitext(filename)
    # Split once, at the first underscore, so the remainder is the full guild name.
    year, separator, guild = stem.partition('_')
    if not separator:
        raise ValueError(
            f'File name {filename!r} does not match "[year]_[guild_name].txt"'
        )
    return year, guild

# Make sure the folder that will hold the CSV exists; open() in write mode
# does not create missing parent directories.
output_directory = os.path.dirname(csv_file_path)
if output_directory:
    os.makedirs(output_directory, exist_ok=True)

# Merge every txt-file found in the sub-folders of parent_directory into one
# CSV file, writing one row per text file.
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    # Write the header
    writer.writerow(['country', 'year', 'guild', 'text'])

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the dataset reproducible across machines and runs.
    for folder_name in sorted(os.listdir(parent_directory)):
        folder_path = os.path.join(parent_directory, folder_name)
        # Skip stray files that sit next to the sub-folders
        if not os.path.isdir(folder_path):
            continue
        for filename in sorted(os.listdir(folder_path)):
            # Only txt-files are part of the dataset
            if not filename.endswith('.txt'):
                continue
            file_path = os.path.join(folder_path, filename)
            # Extract the year and guild from the file name
            year, guild = extract_year_and_guild(filename)
            # Read the content of the file
            with open(file_path, 'r', encoding='utf-8') as txt_file:
                text_content = txt_file.read()
            # The folder name is stored in the 'country' column, followed by
            # the year, guild, and full text of the regulation
            writer.writerow([folder_name, year, guild, text_content])

print("Data extraction complete. CSV file created at:", csv_file_path)

Data extraction complete. CSV file created at: /Users/niclasgriesshaber/Desktop/guilds-llm/datasets/regulations_dataset.csv
