# Task 0: Human Data Pipeline (Class 1)

Processes raw Project Gutenberg essays into a balanced parquet dataset.

**Authors:** Francis Bacon (1597), Ralph Waldo Emerson (1841), William James (1907), Bertrand Russell (1912)

**Pipeline:** Remove metadata/artifacts → Split into chapter blocks → Assign topics → Sentence-tokenize and chunk to 100–200 words → Balance to 125 samples per author → Export as `human_class1.parquet`

In [None]:
import os
import uuid
import re
import pandas as pd
import nltk

def ensure_nltk_resources():
    """Make sure the NLTK sentence-tokenizer data is available locally.

    Each resource is probed on its own and downloaded only when missing, so a
    machine that already has ``punkt`` but lacks ``punkt_tab`` (the data
    newer NLTK releases load for ``sent_tokenize``) still gets fixed up.

    The check runs at most once per interpreter session: the function is
    called again from the chunker on every invocation, and repeating the
    lookups (or failed download attempts) there would be wasted work.
    """
    if getattr(ensure_nltk_resources, "_checked", False):
        return

    for resource in ('punkt', 'punkt_tab'):
        try:
            nltk.data.find(f'tokenizers/{resource}')
        except LookupError:
            # quiet=True keeps the notebook output free of download logs.
            nltk.download(resource, quiet=True)

    ensure_nltk_resources._checked = True

ensure_nltk_resources()

In [None]:
# All paths are resolved relative to the parent of the current working directory.
BASE_DIR = os.path.abspath("..")
RAW_DIR = os.path.join(BASE_DIR, "raw")              # input: raw Gutenberg .txt files
PROCESSED_DIR = os.path.join(BASE_DIR, "processed")  # output: generated datasets

# exist_ok=True creates the directory only when needed and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# One entry per source book.
#   filename        - file name inside RAW_DIR
#   author          - full author name (selects the chapter-splitting strategy)
#   title           - book title, used in progress output and origin references
#   source_identity - short identifier; the part before "_" is the author key
#   allowed_topics  - not read anywhere in this notebook (always None here)
BOOKS = [
    {"filename": "FrancisBacon.txt", "author": "Francis Bacon", "title": "The Essays", "source_identity": "Bacon", "allowed_topics": None},
    {"filename": "RalphWaldo.txt", "author": "Ralph Waldo Emerson", "title": "Essays: First Series", "source_identity": "Emerson", "allowed_topics": None},
    {"filename": "WilliamJames.txt", "author": "William James", "title": "Pragmatism", "source_identity": "James_1907", "allowed_topics": None},
    {"filename": "BertrandRussell.txt", "author": "Bertrand Russell", "title": "The Problems of Philosophy", "source_identity": "Russell_1912", "allowed_topics": None},
]

## Text Cleaning

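Strips the Project Gutenberg header and footer (located via the `*** START OF …` / `*** END OF …` marker lines), removes all bracketed spans such as `[Footnote …]`, drops the underscores used for `_italics_`, and collapses runs of whitespace into single spaces.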

In [None]:
def clean_gutenberg_header_footer(text):
    """Return the body of a Project Gutenberg e-text without its boilerplate.

    Everything up to and including the ``*** START OF ... ***`` marker line
    and everything from the ``*** END OF ... ***`` marker onwards is dropped.
    If a marker is absent, the corresponding end of the text is kept as is.
    Matching is case-insensitive; the result is stripped of outer whitespace.
    """
    def first_hit(patterns):
        # Patterns are tried in priority order; the first one that matches wins.
        for candidate in patterns:
            hit = re.search(candidate, text, re.IGNORECASE)
            if hit:
                return hit
        return None

    opening = first_hit((
        r"\*\*\* START OF THE PROJECT GUTENBERG EBOOK .* \*\*\*",
        r"\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*",
    ))
    closing = first_hit((
        r"\*\*\* END OF THE PROJECT GUTENBERG EBOOK .* \*\*\*",
        r"\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*",
    ))

    body_from = opening.end() if opening else 0
    body_to = closing.start() if closing else len(text)
    return text[body_from:body_to].strip()

def deep_clean_text(text):
    """Strip editorial artifacts from a chapter body and normalise whitespace.

    Removes every ``[...]`` span (non-greedy, may cross line breaks), deletes
    all underscore characters, and collapses any run of whitespace into a
    single space.
    """
    without_brackets = re.sub(r'\[.*?\]', '', text, flags=re.DOTALL)
    without_underscores = without_brackets.replace('_', '')
    tokens = without_underscores.split()
    return " ".join(tokens)

## Chapter Splitting

Split each book into logical chapters using author-specific formatting patterns, preserving semantic context before chunking.

In [None]:
def split_into_chapters(text, author):
    """Split a book into chapters using an author-specific heading strategy.

    Args:
        text: Book text (boilerplate already removed by the caller).
        author: Author name; the substrings "Emerson", "Bacon", "Russell"
            and "James" (checked in that order) select the strategy.

    Returns:
        A list of ``{'topic': <chapter title>, 'text': <chapter body>}``
        dicts. For an unrecognised author the whole text is returned as one
        chapter with topic ``'Unknown'``.
    """
    chapters = []

    if "Emerson" in author:
        # Essay titles are expected alone on their own line (any casing).
        titles = ["HISTORY", "SELF-RELIANCE", "COMPENSATION", "SPIRITUAL LAWS", "LOVE", "FRIENDSHIP", "PRUDENCE", "HEROISM", "THE OVER-SOUL", "CIRCLES", "INTELLECT", "ART"]
        pattern = re.compile(r'(?:\n\r?|\r\n)\s*(' + '|'.join([re.escape(t) for t in titles]) + r')\s*(?:\n\r?|\r\n)', re.IGNORECASE)
        # With one capture group, split() yields [lead, title, body, title, body, ...].
        parts = pattern.split(text)
        if parts[0].strip():
            chapters.append({'topic': 'Preface', 'text': parts[0].strip()})
        for heading, body in zip(parts[1::2], parts[2::2]):
            chapters.append({'topic': heading.strip().title(), 'text': body.strip()})

    elif "Bacon" in author:
        # Every essay heading is a full line of the form "Of <Subject>".
        pattern = re.compile(r'(^Of [A-Za-z \-]+$)', re.MULTILINE)
        parts = pattern.split(text)
        if parts[0].strip():
            chapters.append({'topic': 'Preface', 'text': parts[0].strip()})
        for heading, body in zip(parts[1::2], parts[2::2]):
            content = body.strip()
            # Bodies of 50 words or fewer are skipped (e.g. headings that are
            # immediately followed by another heading).
            if len(content.split()) > 50:
                # NOTE(review): replace() removes every "Of " in the heading,
                # not only the leading one; the topic-map keys appear to rely
                # on that, so it is left unchanged here.
                chapters.append({'topic': heading.strip().replace("Of ", "").title(), 'text': content})

    elif "Russell" in author:
        russell_titles = ["APPEARANCE AND REALITY", "THE EXISTENCE OF MATTER", "THE NATURE OF MATTER", "IDEALISM", "KNOWLEDGE BY ACQUAINTANCE", "ON INDUCTION", "ON OUR KNOWLEDGE OF GENERAL PRINCIPLES", "HOW A PRIORI KNOWLEDGE IS POSSIBLE", "THE WORLD OF UNIVERSALS", "ON OUR KNOWLEDGE OF UNIVERSALS", "ON INTUITIVE KNOWLEDGE", "TRUTH AND FALSEHOOD", "KNOWLEDGE ERROR AND PROBABLE OPINION", "THE LIMITS OF PHILOSOPHICAL KNOWLEDGE", "THE VALUE OF PHILOSOPHY"]
        # Locate the first occurrence of every title up front, keeping only
        # those that exist in the text.
        # NOTE(review): if the file contains a table of contents, the first
        # occurrence may be the contents entry rather than the chapter
        # heading - verify against the raw file.
        located = [(text.find(title), title) for title in russell_titles]
        located = [(pos, title) for pos, title in located if pos != -1]
        starts = [pos for pos, _ in located]

        for start_idx, title in located:
            # A chapter ends at the nearest later heading that was actually
            # found. Using "the next title in the list" instead would make a
            # chapter run to end-of-text whenever that title is missing,
            # duplicating all following chapters.
            end_idx = min((pos for pos in starts if pos > start_idx), default=len(text))
            content = text[start_idx + len(title):end_idx].strip()
            chapters.append({'topic': title.title(), 'text': content})

    elif "James" in author:
        james_titles = ["The Present Dilemma in Philosophy", "What Pragmatism Means", "Some Metaphysical Problems Pragmatically Considered", "The One and the Many", "Pragmatism and Common Sense", "Pragmatism's Conception of Truth", "Pragmatism and Humanism", "Pragmatism and Religion"]
        pattern_str = '(' + '|'.join([re.escape(t) for t in james_titles]) + ')'
        pattern = re.compile(pattern_str, re.IGNORECASE)
        # Each title occurrence (anywhere, any casing) opens a chapter that
        # runs until the next occurrence or the end of the text.
        matches = list(pattern.finditer(text))
        for i, match in enumerate(matches):
            start_pos = match.end()
            end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            chapters.append({'topic': match.group(1), 'text': text[start_pos:end_pos].strip()})

    else:
        chapters.append({'topic': 'Unknown', 'text': text})

    return chapters

## Chunking (100–200 Words)

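Sentence-tokenize each chapter and accumulate whole sentences into chunks of 100–200 words, emitting a chunk as soon as it reaches the 150-word target. Sentences longer than 200 words and leftover fragments shorter than 100 words are discarded. This strict window removes paragraph length as a discriminating feature.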

In [None]:
def get_chunks(text, min_w=100, max_w=200, target_w=None):
    """Yield chunks of whole sentences whose word counts fall in a window.

    Sentences are accumulated in order. A chunk is emitted as soon as it
    reaches ``target_w`` words, or when the next sentence would push it past
    ``max_w`` (in which case it is emitted only if it has at least ``min_w``
    words; shorter accumulations are discarded).

    Args:
        text: Text to split; words are whitespace-separated tokens.
        min_w: Minimum number of words a chunk must have to be emitted.
        max_w: Maximum number of words a chunk may have.
        target_w: Word count at which a growing chunk is emitted. Defaults to
            the midpoint of ``min_w`` and ``max_w`` (150 for the default
            window) and is clamped into ``[min_w, max_w]`` so a custom window
            can never produce chunks outside its own bounds.

    Yields:
        Chunk strings, sentences joined by single spaces.
    """
    ensure_nltk_resources()

    if target_w is None:
        target_w = (min_w + max_w) // 2
    target_w = max(min_w, min(target_w, max_w))

    current_chunk = []
    current_count = 0

    for sentence in nltk.sent_tokenize(text):
        w_count = len(sentence.split())

        if w_count > max_w:
            # A single sentence longer than the window can never be part of a
            # valid chunk: emit what has been gathered (if long enough), drop
            # the sentence, and start over.
            if current_count >= min_w:
                yield " ".join(current_chunk)
            current_chunk, current_count = [], 0
            continue

        if current_count + w_count > max_w:
            # Adding this sentence would overflow: close the current chunk
            # (discarding it if too short) and open a new one with the sentence.
            if current_count >= min_w:
                yield " ".join(current_chunk)
            current_chunk, current_count = [sentence], w_count
            continue

        current_chunk.append(sentence)
        current_count += w_count
        if current_count >= target_w:
            yield " ".join(current_chunk)
            current_chunk, current_count = [], 0

    # Trailing sentences form a final chunk only if they meet the minimum.
    if current_count >= min_w:
        yield " ".join(current_chunk)

def analyze_chunk(text):
    """Compute simple length statistics for one chunk.

    Returns a dict with ``word_count`` (number of whitespace-separated
    tokens) and ``avg_sent_len`` (words per NLTK-tokenized sentence, or 0
    when no sentence is detected).
    """
    n_words = len(text.split())
    n_sentences = len(nltk.sent_tokenize(text))

    if n_sentences:
        mean_length = n_words / n_sentences
    else:
        mean_length = 0

    return {"word_count": n_words, "avg_sent_len": mean_length}

## Topic Mapping

Map chapter titles to six cohesive topics: Ethics & Conduct, General Philosophy, Mind & Knowledge, Religion & Spirit, Society & Politics, Truth & Reality. Titles without an entry in `TOPIC_MAP` fall back to General Philosophy.

In [None]:
# Chapter titles (already title-cased) grouped under the topic they belong to.
_TITLES_BY_TOPIC = {
    "Truth & Reality": [
        "Truth",
        "Truth And Falsehood",
        "Pragmatism'S Conception Of Truth",
        "The Conception Of Truth",
        "Appearance And Reality",
        "The Existence Of Matter",
        "The Nature Of Matter",
        "What Pragmatism Means",
    ],
    "Mind & Knowledge": [
        "Intellect",
        "Knowledge By Acquaintance",
        "On Our Knowledge Of General Principles",
        "How A Priori Knowledge Is Possible",
        "On Intuitive Knowledge",
        "The Limits Of Philosophical Knowledge",
        "The World Of Universals",
        "Circles",
        "The Present Dilemma In Philosophy",
    ],
    "Ethics & Conduct": [
        "Goodness And Goodness Of Nature",
        "Adversity",
        "Boldness",
        "Envy",
        "Revenge",
        "Simulation And Dissimulation",
        "Prudence",
        "Heroism",
        "Compensation",
        "Self-Reliance",
    ],
    "Society & Politics": [
        "Judicature",
        "Empire",
        "Great Place",
        "Innovations",
        "Marriage And Single Life",
        "Parents And Children",
        "Friendship",
        "Manners",
        "Seditions And Troubles",
        "The True Greatness Kingdoms And Estates",
    ],
    "Religion & Spirit": [
        "Unity In Religion",
        "Atheism",
        "Superstition",
        "The Over-Soul",
        "Spiritual Laws",
        "Pragmatism And Religion",
        "Some Metaphysical Problems Pragmatically Considered",
    ],
}

# Flat lookup: title-cased chapter title -> topic label.
TOPIC_MAP = {
    chapter_title: topic_label
    for topic_label, chapter_titles in _TITLES_BY_TOPIC.items()
    for chapter_title in chapter_titles
}

def map_topic(row):
    """Return the topic label for a row's chapter title.

    The row's ``'topic'`` value is converted to a string, stripped and
    title-cased before being looked up in ``TOPIC_MAP``; titles without an
    entry are labelled "General Philosophy".
    """
    normalized_title = str(row['topic']).strip().title()
    if normalized_title in TOPIC_MAP:
        return TOPIC_MAP[normalized_title]
    return "General Philosophy"

## Process and Export

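Read → clean → split → chunk → balance (at most 125 randomly sampled chunks per author, fixed seed; authors with fewer chunks keep all of them) → map topics → save as `human_class1.parquet`.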

In [None]:
# Maximum number of chunks kept per author when balancing the dataset.
SAMPLES_PER_AUTHOR = 125

all_chunks = []

for book in BOOKS:
    filepath = os.path.join(RAW_DIR, book["filename"])
    print(f"Processing {book['title']}...")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        # A missing source file is reported and skipped; the other books are
        # still processed.
        print(f"File not found: {filepath}")
        continue

    clean_content = clean_gutenberg_header_footer(content)
    # The author key is the part of source_identity before the first "_"
    # (e.g. "James_1907" -> "James").
    author_key = book["source_identity"].split('_')[0]

    for chapter in split_into_chapters(clean_content, book["author"]):
        clean_body = deep_clean_text(chapter['text'])
        for chunk_text in get_chunks(clean_body):
            features = analyze_chunk(chunk_text)
            all_chunks.append({
                "id": str(uuid.uuid4()),
                "text": chunk_text,
                "class": 1,
                # Raw chapter title for now; replaced by the mapped topic
                # label after balancing.
                "topic": chapter['topic'],
                "origin_ref": f"{book['title']}/{chapter['topic']}",
                "feature_cache": {
                    "author": author_key,
                    "book_title": book["title"],
                    "word_count": features["word_count"],
                    "avg_sent_length": features["avg_sent_len"]
                }
            })

df = pd.DataFrame(all_chunks)

if not df.empty:
    final_dfs = []
    df['author_key'] = df['feature_cache'].apply(lambda x: x['author'])

    # sort=False keeps authors in order of first appearance.
    for author, sub_df in df.groupby('author_key', sort=False):
        if len(sub_df) > SAMPLES_PER_AUTHOR:
            # Fixed seed so the same rows are selected on every run.
            final_dfs.append(sub_df.sample(n=SAMPLES_PER_AUTHOR, random_state=42))
        else:
            if len(sub_df) < SAMPLES_PER_AUTHOR:
                # Surface the shortfall instead of silently producing an
                # imbalanced dataset.
                print(f"Warning: only {len(sub_df)} chunks for {author} "
                      f"(target {SAMPLES_PER_AUTHOR}); dataset will be imbalanced.")
            final_dfs.append(sub_df)

    final_df = pd.concat(final_dfs, ignore_index=True)
    final_df['topic'] = final_df.apply(map_topic, axis=1)
    # Drop the helper column and fix the output column order.
    final_df = final_df[['id', 'class', 'topic', 'text', 'feature_cache', 'origin_ref']]

    out_path = os.path.join(PROCESSED_DIR, "human_class1.parquet")
    final_df.to_parquet(out_path)
    print(f"Saved {len(final_df)} clean rows to {out_path}")
    print(final_df.head())
else:
    print("No data generated.")