## Convert the [City] Building Codes --txt folder to CSV file format
This notebook converts the txt files in the city's building codes directory into a single CSV file with the columns filename, chapter title, date, and content, for easier analysis.

<p>The notebook will create a total of two outputs:</p>
    <p>(1) a new directory/folder containing txt files combined if published in the same year and belonging to the same chapter</p>
    <p>(2) the csv file mentioned previously</p>

### Getting started
Make sure you have installed all required libraries before running any `import` statements.
<li>Set the base directory to the folder path of the city's txt files</li>
<li>Set the output directory to the folder path where you want to store the combined txt files</li>
<li>Set the output csv path to the full path of the csv file you want to create; make sure it ends with the file name you choose, including the .csv extension</li>

In [None]:
import os
import pandas as pd

# Folder whose immediate subfolders hold the source .txt files; replace the
# placeholder with a real path before running.
base_directory = "/replace_with_your_file_path/Los Angeles Building Codes (2012-2018)--txt"
# Folder that receives the combined .txt files and is later read back to
# build the CSV.
output_directory = "/file_output_directory_for_combined_building_codes_by_year"
# Full path (including file name and .csv extension) of the CSV to write.
# NOTE: only output_directory is created below; the folder that will contain
# this CSV is not created here.
output_csv_path = "/output_directory_for_csv/building_code_data.csv"

# Create the output directory (and any missing parents) up front;
# exist_ok=True makes re-running this cell harmless.
os.makedirs(output_directory, exist_ok=True)

### Combine the txt files
Run the chunk below to output the combined txt files to the output directory.

In [None]:
def combine_txt_files(folder_path, output_directory):
    """Merge the .txt files under ``folder_path`` into one file per group.

    Every ``.txt`` file found below ``folder_path`` (recursively) is assigned
    a grouping key derived from its name without the ``.txt`` extension:

    * if the name contains a dash, the key is the text between the first and
      second dash (or everything after the first dash when there is only
      one);
    * otherwise the key is the whole name.

    The contents of all files sharing a key are concatenated and written to
    ``<output_directory>/<folder_name>_<key>.txt``, where ``folder_name`` is
    the last component of ``folder_path`` up to (not including) ``"_Codes"``.

    Args:
        folder_path: Directory to scan for ``.txt`` files.
        output_directory: Existing directory that receives the combined files.

    Returns:
        None. Files are written to ``output_directory`` and a summary line is
        printed.
    """
    # normpath drops a trailing separator, which would otherwise make
    # basename() return an empty string.
    folder_name = os.path.basename(os.path.normpath(folder_path))
    folder_name = folder_name.split("_Codes")[0]

    # Maps grouping key -> list of file contents, in the order encountered.
    grouped_parts = {}

    for root, dirs, files in os.walk(folder_path):
        # os.walk yields names in arbitrary order; sort both lists so the
        # concatenation order is the same on every run and filesystem.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".txt"):
                continue

            # Strip the extension before splitting so "2012-Ch1.txt" and
            # "2012-Ch1-a.txt" both yield the key "Ch1". Splitting the raw
            # file name produced "Ch1.txt" and "Ch1", two groups that were
            # then written to the same output file, the later one silently
            # overwriting the earlier.
            stem = file[:-len(".txt")]
            pieces = stem.split("-", 2)
            key = pieces[1] if len(pieces) > 1 else stem

            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as input_file:
                grouped_parts.setdefault(key, []).append(input_file.read())

    # One output file per key; keys never carry the extension, so distinct
    # keys always map to distinct file names.
    for key, parts in grouped_parts.items():
        combined_file_path = os.path.join(output_directory, f"{folder_name}_{key}.txt")
        with open(combined_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write("".join(parts))

    print(f"All the text files from {folder_name} have been combined based on the key.")

# Run the combiner once for every immediate subdirectory of the base
# directory; plain files sitting directly in the base directory are skipped.
candidate_paths = (
    os.path.join(base_directory, entry) for entry in os.listdir(base_directory)
)
for subfolder_path in filter(os.path.isdir, candidate_paths):
    combine_txt_files(subfolder_path, output_directory)

### Create the CSV file
Run the chunk below after the previous one to create the csv file, which is saved to the output_csv_path.

In [None]:
# Build the CSV from the combined files: one row per combined .txt file with
# its name (without extension), the two halves of that name, and its full text.
filelocation = output_directory

# os.listdir() returns entries in arbitrary order; sort so the CSV rows come
# out in the same order on every run and machine.
files = sorted(f for f in os.listdir(filelocation) if f.endswith(".txt"))

filenames = []
content = []
dates = []
chapters = []

for file in files:
    file_name, _ = os.path.splitext(file)  # Remove the ".txt" extension

    # Text before the first underscore goes to the Date column and everything
    # after it to the Chapter column; a name with no underscore has no chapter.
    date, separator, chapter = file_name.partition("_")
    if not separator:
        chapter = None

    file_path = os.path.join(filelocation, file)
    # newline='' disables newline translation so the text is stored exactly
    # as it appears in the file.
    with open(file_path, 'r', newline='', encoding='utf-8') as source_file:
        lines = source_file.read()

    # Append only after a successful read so the four lists stay aligned.
    filenames.append(file_name)
    dates.append(date)
    chapters.append(chapter)
    content.append(lines)

data = {'Filename': filenames, 'Date': dates, 'Chapter': chapters, 'Content': content}

df = pd.DataFrame(data)
print(df)

# index=False keeps the DataFrame's row index out of the CSV.
df.to_csv(output_csv_path, index=False)