# Kaisha Ningen - Company Man
## This is a project to create a Contextual Retrieval Agent 会社人間 that can answer questions about the company based on meeting notes captured by Supernormal and Otter.AI.

This notebook batch-processes the Markdown meeting-note files, including data cleaning. The intent is to prepare these files for vectorization into a database like [*Qdrant*](https://qdrant.tech/).



In [None]:
import re
import os
from datetime import datetime

# Install necessary libraries if not already installed
try:
    import markdown
except ImportError:
    !pip install markdown
    import markdown

In [None]:
# Folder locations used by the rest of the notebook.
# Input: root folder that is searched for Markdown transcripts.
directory_path = '/content/drive/MyDrive/Supernormal-Transcripts'
# Output: folder that receives the cleaned .txt files.
output_directory = '/content/drive/MyDrive/Supernormal-Transcripts/processed-files'
# The output folder sits inside the input folder, so it is also the folder
# to skip when collecting input files. Edit if your layout differs.
exclude_subdirectory = output_directory

In [None]:
# Get all .md files in the directory, excluding those in the processed-files sub-directory
# The excluded folder is matched by exact (normalised) path rather than by
# substring, so look-alike folders such as 'processed-files-old' are still scanned.
excluded_root = os.path.normpath(exclude_subdirectory)

file_paths = []
for root, dirs, files in os.walk(directory_path):
    if os.path.normpath(root) == excluded_root:
        # Exclude files from the 'processed-files' directory - if you use Google Drive, edit as needed.
        # Emptying `dirs` in place stops os.walk from descending any further into it.
        dirs[:] = []
        continue

    # Sort in place so the traversal (and the resulting list) is in a stable order.
    dirs.sort()
    file_paths.extend(
        os.path.join(root, name)
        for name in sorted(files)
        if name.endswith('.md')
    )

In [None]:
def clean_text(text):
    """Strip punctuation/symbols from *text* and lower-case it.

    Every character other than ASCII letters, ASCII digits and whitespace is
    removed, with one exception: well-formed ``YYYY-MM-DD`` dates and
    ``HH:MM`` times keep their ``-`` / ``:`` separators. Without that
    exception the separators are deleted and a later date/time
    standardisation step has nothing left to match.

    Tokens that merely look like a date or time but are not valid
    (e.g. ``2024-13-45`` or ``99:99``) are flattened to bare digits, exactly
    like any other punctuation.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    # Format used to validate each kind of protected token.
    token_formats = {'date': '%Y-%m-%d', 'time': '%H:%M'}

    # Alternation order matters: date/time tokens are tried first so they are
    # consumed whole; anything else that is disallowed matches one char at a time.
    pattern = (
        r'(?P<date>\b[0-9]{4}-[0-9]{2}-[0-9]{2}\b)'
        r'|(?P<time>\b[0-9]{2}:[0-9]{2}\b)'
        r'|[^A-Za-z0-9\s]'
    )

    def _keep_or_drop(match):
        kind = match.lastgroup
        if kind is None:
            # Ordinary disallowed character: remove it.
            return ''
        token = match.group()
        try:
            datetime.strptime(token, token_formats[kind])
        except ValueError:
            # Looks like a date/time but is not one: strip its separators.
            return token.replace('-', '').replace(':', '')
        return token

    return re.sub(pattern, _keep_or_drop, text).lower()

def standardize_dates(text):
    """Re-emit ``YYYY-MM-DD`` dates and ``HH:MM`` times found in *text*.

    Each token matching one of the two patterns is parsed with
    ``datetime.strptime`` and written back with ``strftime`` using the same
    format. Tokens that match the pattern but are not a real date or time
    (e.g. ``2024-13-45`` or ``45:67``) are left untouched instead of raising
    ``ValueError``, so one odd token cannot abort a batch run.

    Args:
        text: Text that may contain date and time tokens.

    Returns:
        The text with every valid date/time token re-formatted.
    """
    date_pattern = r'\b(\d{4}-\d{2}-\d{2})\b'
    time_pattern = r'\b(\d{2}:\d{2})\b'

    def _reformatter(fmt):
        """Build a re.sub callback that round-trips a token through *fmt*."""
        def _reformat(match):
            token = match.group()
            try:
                return datetime.strptime(token, fmt).strftime(fmt)
            except ValueError:
                # Right shape, impossible value: keep the original text.
                return token
        return _reformat

    standardized_text = re.sub(date_pattern, _reformatter('%Y-%m-%d'), text)
    standardized_text = re.sub(time_pattern, _reformatter('%H:%M'), standardized_text)

    return standardized_text


In [None]:
processed_files = []
# Loop through the files and process them
for file_path in file_paths:
    # Read explicitly as UTF-8 instead of the platform default, so decoding
    # matches the UTF-8 encoding used when the cleaned files are written.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Read the content from the file
        content = file.read()

    # The file handle is closed at this point; the steps below only need the text.
    # Step 1: Clean text
    content = clean_text(content)
    # Step 2: Standardize dates and times
    content = standardize_dates(content)
    processed_files.append((content, file_path))  # store processed files in a list of (content, filepath) pairs.


In [None]:
# Function to convert filename to kebab-case
def convert_to_kebab_case(filename):
    """Return *filename* lower-cased, hyphen-separated, with '.md' -> '.txt'.

    Spaces and underscores become hyphens. Only a trailing ``.md`` extension
    is swapped for ``.txt``; a ``.md`` occurring elsewhere in the name is
    left alone. Names without a ``.md`` extension keep whatever ending they
    have.

    Args:
        filename: Base name of the file (no directory part expected).

    Returns:
        The kebab-case file name.
    """
    kebab_name = filename.lower().replace(' ', '-').replace('_', '-')
    # A blanket str.replace('.md', '.txt') would also rewrite '.md' appearing
    # mid-name, so only the final extension is touched.
    if kebab_name.endswith('.md'):
        kebab_name = kebab_name[:-len('.md')] + '.txt'
    return kebab_name

# Save each processed file with kebab-case filenames in the 'processed-files' subfolder
# Create the output folder up front; on a first run it may not exist yet.
os.makedirs(output_directory, exist_ok=True)

# Output names written during this run. Inputs are identified by base name
# only, so two inputs can map to the same output name.
used_filenames = set()

for content, path in processed_files:
    original_filename = os.path.basename(path)
    new_filename = convert_to_kebab_case(original_filename)

    # On a name clash, append -2, -3, ... instead of silently overwriting
    # the file saved earlier in this run.
    stem, extension = os.path.splitext(new_filename)
    duplicate_index = 1
    while new_filename in used_filenames:
        duplicate_index += 1
        new_filename = f'{stem}-{duplicate_index}{extension}'
    used_filenames.add(new_filename)

    output_path = os.path.join(output_directory, new_filename)  # Use output_directory

    # Write the processed content to a new .txt file
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(content)

    print(f'Saved cleaned file as {output_path}')
