<a href="https://colab.research.google.com/github/michaelwnau/consequential-products/blob/main/supernormal_data_cleaners.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import os
from datetime import datetime

# Install necessary libraries if not already installed.
# Try the import first so that pip only runs when the package is missing
# from the current runtime.
# NOTE(review): `markdown` is not referenced in the cells shown here --
# confirm it is still needed before keeping this install step.
try:
    import markdown
except ImportError:
    # The leading "!" is a notebook shell escape: this line only works when
    # run inside IPython/Jupyter/Colab, not as a plain Python script.
    !pip install markdown
    import markdown

In [None]:
# Transcript files to process (absolute paths on the mounted Drive).
file_paths = [
    '/content/drive/MyDrive/Supernormal-Transcripts/G1-Portal-Transition-1f82401e0cd64c51bd500aa790db728c.md',
]

# Load every transcript fully into memory as UTF-8 text; file_contents[i]
# holds the text of file_paths[i].
file_contents = []
for transcript_path in file_paths:
    with open(transcript_path, mode='r', encoding='utf-8') as transcript_file:
        transcript_text = transcript_file.read()
    file_contents.append(transcript_text)


In [None]:
# Function to clean special characters and lowercase text
def clean_text(text):
    """Strip punctuation and symbols from *text* and lowercase the result.

    Letters and digits from any script are kept (so accented names such as
    "José" survive), as is all whitespace, including newlines. Everything
    else -- punctuation, symbols and underscores -- is deleted outright,
    not replaced with a space.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # ``\w`` is Unicode-aware for ``str`` patterns, unlike the ASCII-only
    # class ``[A-Za-z0-9]``, which silently dropped every non-English letter.
    # ``\w`` also matches "_", so the underscore is removed explicitly.
    cleaned_text = re.sub(r'[^\w\s]|_', '', text)
    # Convert to lowercase
    return cleaned_text.lower()

# Function to standardize date and timestamp formats
def standardize_dates(text):
    """Normalize ``YYYY-MM-DD`` dates and ``HH:MM`` times found in *text*.

    Each token matching one of the patterns is parsed and re-emitted in the
    same canonical format. Tokens that only look like a date or time but are
    not valid (e.g. ``2023-13-45``, or an elapsed-time stamp such as
    ``75:30``) are left exactly as they appear instead of raising.

    Args:
        text: The string to scan.

    Returns:
        The text with every valid date/time token re-formatted.
    """
    # Example pattern for dates (e.g., 2023-12-31) and times (e.g., 14:30)
    date_pattern = r'\b(\d{4}-\d{2}-\d{2})\b'
    time_pattern = r'\b(\d{2}:\d{2})\b'

    def _reformatter(fmt):
        """Build a ``re.sub`` callback that round-trips a match through *fmt*."""
        def _replace(match):
            token = match.group()
            try:
                return datetime.strptime(token, fmt).strftime(fmt)
            except ValueError:
                # The regex only checks digit counts, so out-of-range values
                # reach strptime; keep them rather than abort the whole run.
                return token
        return _replace

    # Standardize date format to YYYY-MM-DD
    standardized_text = re.sub(date_pattern, _reformatter('%Y-%m-%d'), text)
    # Standardize time format to HH:MM
    standardized_text = re.sub(time_pattern, _reformatter('%H:%M'), standardized_text)

    return standardized_text


In [None]:
# Run each loaded transcript through the pipeline -- clean_text first, then
# standardize_dates -- and pair the result with the path it came from.
# NOTE(review): clean_text deletes "-" and ":" before standardize_dates runs,
# so the date/time patterns cannot match at that point -- confirm this
# ordering is intended.
processed_files = [
    (standardize_dates(clean_text(raw_text)), source_path)
    for raw_text, source_path in zip(file_contents, file_paths)
]


In [None]:
# Convert filename to kebab-case
def convert_to_kebab_case(filename):
    """Return *filename* lowercased, hyphen-separated, with ``.md`` -> ``.txt``.

    Spaces and underscores become hyphens. Only a trailing ``.md`` extension
    is swapped for ``.txt``; a ".md" sequence elsewhere in the name is left
    alone, and names with any other extension keep it.

    Args:
        filename: The base file name (no directory part).

    Returns:
        The converted file name.
    """
    kebab_name = filename.lower().replace(' ', '-').replace('_', '-')
    # Replace the extension only -- a blanket str.replace would also rewrite
    # ".md" occurring in the middle of the name (e.g. "a.mdb.md").
    if kebab_name.endswith('.md'):
        kebab_name = kebab_name[:-len('.md')] + '.txt'
    return kebab_name

# Save each processed file
# The destination does not change between files, so create it once up front.
# exist_ok=True avoids the check-then-create race of os.path.exists() +
# os.makedirs() and makes re-running the cell harmless.
output_dir = '/content/drive/MyDrive/Supernormal-Transcripts/processed-files'  # Or any other desired directory
os.makedirs(output_dir, exist_ok=True)

for content, path in processed_files:
    # Get the kebab-case filename
    original_filename = os.path.basename(path)
    new_filename = convert_to_kebab_case(original_filename)

    output_path = os.path.join(output_dir, new_filename)

    # Write the processed content to a new .txt file (overwrites any
    # existing file of the same name)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(content)

    print(f'Saved cleaned file as {output_path}')
