# Clause Assembler
Post-processing pipeline for GA output. Splits AI text into clauses via Gemini, applies humanizing transforms (comma insertion, parallel structure breaking, punctuation randomization via monte carlo), writes to output.txt.

Imports

In [1]:
import os
import json
import math
import time
import numpy as np
from dotenv import load_dotenv
from google import genai
from google.api_core import exceptions
import warnings
import sys

warnings.filterwarnings("ignore")

try:
    sys.stdout.reconfigure(encoding='utf-8')
    sys.stderr.reconfigure(encoding='utf-8')
except Exception:
    pass



Config -- paths, API key, model name

In [2]:
current_dir = os.getcwd()
notebook_dir = os.path.dirname(os.path.abspath("Clause_Assembler.ipynb")) if os.path.exists("Clause_Assembler.ipynb") else current_dir

INPUT_FILE = os.path.join(notebook_dir, "input.txt")
OUTPUT_FILE = os.path.join(notebook_dir, "output.txt")

project_root = os.path.dirname(os.path.dirname(os.path.dirname(notebook_dir)))
ENV_PATH = os.path.join(project_root, ".env")

load_dotenv(ENV_PATH)
API_KEY = os.getenv("GEMINI_API_KEY")

if not API_KEY:
    raise ValueError("GEMINI_API_KEY not found in .env file!")

client = genai.Client(api_key=API_KEY)
MODEL_NAME = "gemini-2.5-flash"

Gemini API call w/ retry + rate limit handling

In [3]:
def call_gemini_api(prompt, max_retries=5, return_json=False):
    attempt = 0
    
    while True:
        try:
            config = {}
            if return_json:
                config["response_mime_type"] = "application/json"
            
            response = client.models.generate_content(
                model=MODEL_NAME, 
                contents=prompt,
                config=config
            )
            
            if hasattr(response, "text") and response.text:
                result = response.text
            else:
                result = str(response)
            
            if return_json:
                try:
                    return json.loads(result)
                except json.JSONDecodeError as e:
                    print(f"JSON parse error: {e}")
                    print(f"Raw response: {result[:200]}...")
                    return None
            
            return result

        except Exception as e:
            err_str = str(e)
            
            if "429" in err_str or "ResourceExhausted" in str(type(e)) or "Quota exceeded" in err_str:
                wait_time = 30.0
                print(f"Rate limit hit. Waiting {wait_time}s...")
                time.sleep(wait_time)
                continue
            
            attempt += 1
            if attempt >= max_retries:
                print(f"Failed after {max_retries} attempts: {e}")
                return None
            
            backoff_time = (2 ** attempt) + np.random.uniform(0, 1)
            print(f"API error: {e}. Retrying in {backoff_time:.1f}s...")
            time.sleep(backoff_time)

Thought unit extraction -- Gemini parses text into weighted clauses w/ prosodic resolution flags

In [4]:
def get_thought_units(paragraph):
    prompt = f"""Analyze this paragraph and break it into atomic "Thought Units" (clauses or phrases that hold a single piece of information).

For each unit, provide:
1. "text": The exact substring (preserve original wording)
2. "weight": Integer 1-10 representing Mental Load or information density
   - High (8-10): Complex concepts, dense information, technical terms
   - Medium (4-7): Standard clauses with moderate complexity
   - Low (1-3): Simple connectors, transitions, basic statements
3. "resolves_thought": Boolean - does this clause sound like the end of an idea?
   - True: Falling intonation, feels complete, prosodic resolution
   - False: Rising or level intonation, feels unresolved, needs continuation

PARAGRAPH:
{paragraph}

Return ONLY valid JSON in this exact format:
[
  {{"text": "clause text here", "weight": 5, "resolves_thought": false}},
  {{"text": "next clause", "weight": 7, "resolves_thought": true}}
]

Do not include any explanation, just the JSON array."""

    result = call_gemini_api(prompt, return_json=True)
    
    if result is None or not isinstance(result, list):
        print("Failed to parse thought units, using fallback")
        sentences = paragraph.replace('?', '.').replace('!', '.').split('.')
        return [
            {"text": s.strip(), "weight": 5, "resolves_thought": True} 
            for s in sentences if s.strip()
        ]
    
    return result

Transforms: thinking commas, parallel structure breaking, log-normal monte carlo punctuation

In [5]:
def add_thinking_commas(text):
    words = text.split()
    if len(words) <= 3:
        return text
    
    if np.random.random() > 0.6:
        return text
    
    num_commas = np.random.choice([1, 2, 3], p=[0.5, 0.3, 0.2])
    
    for _ in range(num_commas):
        if len(words) <= 3:
            break
        insert_pos = np.random.randint(1, len(words) - 1)
        words[insert_pos] = ', ' + words[insert_pos]
    
    return ' '.join(words).replace(' , ', ', ')


def break_parallel_structure(text):
    if ', and' in text or ', or' in text:
        if np.random.random() < 0.5:
            text = text.replace(', and', '. and', 1)
        if ', or' in text and np.random.random() < 0.5:
            text = text.replace(', or', '. or', 1)
    
    return text


def monte_carlo_punctuation(thought_units, casual_mode=True):
    if casual_mode:
        MU = 1.8
        SIGMA = 1.2
    else:
        MU = 2.8
        SIGMA = 0.8
    
    reconstructed_text = ""
    current_sentence_load = 0
    current_word_count = 0
    
    target_load = np.random.lognormal(MU, SIGMA)
    
    sentence_mode = np.random.choice(['fragment', 'runon', 'normal'], p=[0.25, 0.25, 0.5])
    
    for i, unit in enumerate(thought_units):
        text = unit['text'].strip()
        weight = unit['weight']
        resolves = unit['resolves_thought']
        
        if not text:
            continue
        
        text = text.rstrip('.,;:!?')
        
        if casual_mode and len(text.split()) > 4:
            if np.random.random() < 0.6:
                text = add_thinking_commas(text)
        
        text = break_parallel_structure(text)
        
        reconstructed_text += text
        current_sentence_load += weight
        current_word_count += len(text.split())
        
        if i == len(thought_units) - 1:
            if casual_mode and np.random.random() > 0.5:
                pass
            else:
                reconstructed_text += "."
            break
        
        if sentence_mode == 'fragment':
            if current_word_count >= 2 and np.random.random() < 0.6:
                reconstructed_text += ". "
                current_sentence_load = 0
                current_word_count = 0
                target_load = np.random.lognormal(MU, SIGMA)
                sentence_mode = np.random.choice(['fragment', 'runon', 'normal'], p=[0.25, 0.25, 0.5])
                continue
        
        elif sentence_mode == 'runon':
            if current_word_count >= 15 and current_word_count <= 30:
                if np.random.random() < 0.2:
                    reconstructed_text += ". "
                    current_sentence_load = 0
                    current_word_count = 0
                    target_load = np.random.lognormal(MU, SIGMA)
                    sentence_mode = np.random.choice(['fragment', 'runon', 'normal'], p=[0.25, 0.25, 0.5])
                    continue
            
            if current_word_count < 40:
                reconstructed_text += ", "
                continue
            elif current_word_count >= 40 and current_word_count < 65:
                if np.random.random() < 0.25:
                    reconstructed_text += ". "
                    current_sentence_load = 0
                    current_word_count = 0
                    target_load = np.random.lognormal(MU, SIGMA)
                    sentence_mode = np.random.choice(['fragment', 'runon', 'normal'], p=[0.25, 0.25, 0.5])
                else:
                    reconstructed_text += ", "
                continue
            else:
                reconstructed_text += ". "
                current_sentence_load = 0
                current_word_count = 0
                target_load = np.random.lognormal(MU, SIGMA)
                sentence_mode = np.random.choice(['fragment', 'runon', 'normal'], p=[0.25, 0.25, 0.5])
                continue
        
        load_ratio = current_sentence_load / target_load
        
        p_stop = 1 / (1 + math.exp(-6 * (load_ratio - 0.7)))
        
        if resolves:
            p_stop += 0.1
        else:
            p_stop -= 0.05
        
        if np.random.random() < 0.15:
            p_stop += 0.6
        
        p_stop = max(0.0, min(1.0, p_stop))
        
        monte_carlo_roll = np.random.random()
        
        if monte_carlo_roll < p_stop:
            reconstructed_text += ". "
            
            current_sentence_load = 0
            current_word_count = 0
            target_load = np.random.lognormal(MU, SIGMA)
            sentence_mode = np.random.choice(['fragment', 'runon', 'normal'], p=[0.25, 0.25, 0.5])
            
        else:
            if current_sentence_load > (target_load * 0.25):
                reconstructed_text += ", "
            else:
                if np.random.random() < 0.25:
                    reconstructed_text += ", "
                else:
                    reconstructed_text += " "
    
    return reconstructed_text.strip()

Main workflow -- deconstruct, reassemble

In [6]:
def humanize_workflow(input_text, casual_mode=True, verbose=True):
    if verbose:
        print(f"Input length: {len(input_text)} chars")
        print(f"Mode: {'casual' if casual_mode else 'formal'}")
    
    thought_units = get_thought_units(input_text)
    
    if verbose:
        print(f"Detected {len(thought_units)} thought units")
        for i, unit in enumerate(thought_units[:3]):
            print(f"  [{i+1}] \"{unit['text'][:50]}...\"")
            print(f"      Weight: {unit['weight']}/10, Resolves: {unit['resolves_thought']}")
        if len(thought_units) > 3:
            print(f"  ... and {len(thought_units) - 3} more")
    
    output_text = monte_carlo_punctuation(thought_units, casual_mode=casual_mode)
    
    if verbose:
        print(f"Reassembled {len(output_text)} chars")
    
    return output_text

Load input text

In [7]:
try:
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        input_text = f.read().strip()
    print(f"Loaded {len(input_text)} chars from {INPUT_FILE}")
except FileNotFoundError:
    input_text = """Furthermore, the existential dimension of truth and reality cannot be ignored. For thinkers like Søren Kierkegaard, truth is not merely a matter of factual accuracy but of lived authenticity. He famously asserted that "subjectivity is truth," implying that the most important truths are those that a person is willing to live and die for."""
    with open(INPUT_FILE, 'w', encoding='utf-8') as f:
        f.write(input_text)
    print("Created sample input file")

print(input_text)

Loaded 1006 chars from w:\Programming\PKOG\preprecogclean\imposter\clauses_assemble\input.txt
Furthermore, the existential dimension of truth and reality cannot be ignored. For thinkers like Søren Kierkegaard, truth is not merely a matter of factual accuracy but of lived authenticity. He famously asserted that "subjectivity is truth," implying that the most important truths are those that a person is willing to live and die for. This existential truth is found in the alignment of one’s actions with their deepest convictions, creating a sense of internal reality that provides meaning in an otherwise indifferent universe. In this sense, the quest for truth is not just an academic exercise but a moral imperative. To seek the truth is to honor the reality of our existence and to refuse the comfort of convenient delusions. Ultimately, the journey toward understanding truth and reality is the defining narrative of the human species. We are the only creatures known to ask "why" and "is it tru

Run pipeline

In [8]:
output_text = humanize_workflow(input_text, casual_mode=True, verbose=True)

print("\nOUTPUT:")
print("-" * 60)
print(output_text)

Input length: 1006 chars
Mode: casual
Detected 23 thought units
  [1] "Furthermore,..."
      Weight: 2/10, Resolves: False
  [2] "the existential dimension of truth and reality can..."
      Weight: 8/10, Resolves: True
  [3] "For thinkers like Søren Kierkegaard,..."
      Weight: 4/10, Resolves: False
  ... and 20 more
Reassembled 1035 chars

OUTPUT:
------------------------------------------------------------
Furthermore, the existential dimension of truth and reality cannot be ignored, For thinkers like Søren Kierkegaard. truth is not merely a matter of factual accuracy. but of lived authenticity, He, famously asserted that "subjectivity is truth,". implying that the most important truths are those, that a person is willing to, live and die for. This existential truth is found in the alignment of one’s actions with their deepest convictions, creating a sense of internal reality. that provides, meaning in an, , otherwise indifferent universe. In this sense. the quest for truth is no

Write output

In [9]:
try:
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write(output_text)
    print(f"Output saved to: {OUTPUT_FILE}")
    print(f"  Length: {len(output_text)} characters")
except Exception as e:
    print(f"Error writing output: {e}")

Output saved to: w:\Programming\PKOG\preprecogclean\imposter\clauses_assemble\output.txt
  Length: 1035 characters


Punctuation stats -- compare original vs humanized

In [10]:
def analyze_punctuation(text, label="Text"):
    sentences = [s.strip() for s in text.split('.') if s.strip()]
    words_per_sentence = [len(s.split()) for s in sentences]
    
    print(f"\n{label}:")
    print(f"  Total characters: {len(text)}")
    print(f"  Sentences: {len(sentences)}")
    print(f"  Periods: {text.count('.')}")
    print(f"  Commas: {text.count(',')}")
    
    if words_per_sentence:
        print(f"  Avg words/sentence: {np.mean(words_per_sentence):.1f}")
        print(f"  Sentence length std dev: {np.std(words_per_sentence):.1f}")
        print(f"  Min/Max sentence length: {min(words_per_sentence)}/{max(words_per_sentence)} words")

analyze_punctuation(input_text, "ORIGINAL")
analyze_punctuation(output_text, "HUMANIZED")


ORIGINAL:
  Total characters: 1006
  Sentences: 8
  Periods: 8
  Commas: 8
  Avg words/sentence: 20.2
  Sentence length std dev: 5.6
  Min/Max sentence length: 11/29 words

HUMANIZED:
  Total characters: 1035
  Sentences: 10
  Periods: 10
  Commas: 32
  Avg words/sentence: 16.5
  Sentence length std dev: 10.8
  Min/Max sentence length: 3/43 words


Sentence length distribution -- fragment/normal/run-on breakdown

In [11]:
def analyze_sentence_distribution(text, label="Text"):
    sentences = [s.strip() for s in text.split('.') if s.strip()]
    lengths = [len(s.split()) for s in sentences]
    
    if not lengths:
        print(f"{label}: No sentences found")
        return
    
    fragments = [l for l in lengths if l <= 8]
    runons = [l for l in lengths if l >= 35]
    normal = [l for l in lengths if 8 < l < 35]
    
    total = len(lengths)
    pct_fragments = (len(fragments) / total) * 100
    pct_runons = (len(runons) / total) * 100
    pct_extremes = pct_fragments + pct_runons
    
    print(f"\n{label} - SENTENCE LENGTH DISTRIBUTION")
    print(f"{'='*60}")
    print(f"Total sentences: {total}")
    print(f"  Fragments (<=8 words):  {len(fragments):2d} ({pct_fragments:5.1f}%) - {fragments}")
    print(f"  Normal (9-34 words):   {len(normal):2d} ({len(normal)/total*100:5.1f}%)")
    print(f"  Run-ons (>=35 words):   {len(runons):2d} ({pct_runons:5.1f}%) - {runons}")
    print(f"  EXTREMES TOTAL:        {len(fragments)+len(runons):2d} ({pct_extremes:5.1f}%)")
    print(f"\n  Min: {min(lengths)} | Max: {max(lengths)} | Mean: {np.mean(lengths):.1f} | Std: {np.std(lengths):.1f}")
    
    if pct_extremes >= 40:
        print(f"Target met: {pct_extremes:.1f}% extremes (goal: 40%+)")
    else:
        print(f"Below target: {pct_extremes:.1f}% extremes (goal: 40%+)")
    
    return pct_extremes

analyze_sentence_distribution(input_text, "ORIGINAL")
analyze_sentence_distribution(output_text, "HUMANIZED")


ORIGINAL - SENTENCE LENGTH DISTRIBUTION
Total sentences: 8
  Fragments (<=8 words):   0 (  0.0%) - []
  Normal (9-34 words):    8 (100.0%)
  Run-ons (>=35 words):    0 (  0.0%) - []
  EXTREMES TOTAL:         0 (  0.0%)

  Min: 11 | Max: 29 | Mean: 20.2 | Std: 5.6
Below target: 0.0% extremes (goal: 40%+)

HUMANIZED - SENTENCE LENGTH DISTRIBUTION
Total sentences: 10
  Fragments (<=8 words):   1 ( 10.0%) - [3]
  Normal (9-34 words):    8 ( 80.0%)
  Run-ons (>=35 words):    1 ( 10.0%) - [43]
  EXTREMES TOTAL:         2 ( 20.0%)

  Min: 3 | Max: 43 | Mean: 16.5 | Std: 10.8
Below target: 20.0% extremes (goal: 40%+)


20.0

Generate 3 variants to show monte carlo variance

In [12]:
thought_units = get_thought_units(input_text)

for i in range(3):
    variant = monte_carlo_punctuation(thought_units, casual_mode=True)
    
    print(f"VARIANT {i+1}:")
    print('-'*60)
    print(variant)
    print(f"Periods: {variant.count('.')} | Commas: {variant.count(',')}\n")

VARIANT 1:
------------------------------------------------------------
Furthermore, the existential dimension, of truth and reality cannot be ignored, For, thinkers, like Søren Kierkegaard, truth is not merely a matter of factual accuracy. but of lived authenticity. He famously asserted that "subjectivity, is truth,", implying that the most important truths are those, that a person is willing to live and die for. This existential truth is found in the alignment of one’s actions with their deepest convictions. creating a sense of internal reality. that provides meaning in an otherwise indifferent universe. In this sense, the quest for truth is not just an academic exercise, but a moral imperative, To seek the truth is to honor the reality of our existence, and to refuse the comfort of convenient delusions, Ultimately, the journey toward understanding truth. and reality, is the defining narrative of the human species. We are the only creatures known to ask "why" and "is it true?", This 