# 📝 Text Expander - Document-Based Sentence Expansion

**Expand sentences into paragraphs based on your novel/document vocabulary.**

This notebook uses **Markov Chain** and **TF-IDF Similarity** techniques (no AI/LLM required).

---

## How It Works

1. **Upload** your markdown document (novel chapters)
2. **Process** - The system learns word patterns and relationships
3. **Input** a sentence
4. **Output** - Get a coherent paragraph based on your document's style

---

## Step 1: Install & Import Libraries

Run this cell first to set up the environment.

In [None]:
# Standard libraries (no installation needed)
# re, random, math, defaultdict and Counter are used by the classes defined
# in Step 2.
# NOTE(review): string, Path and io are not referenced in the visible cells -
# confirm whether they are still needed before removing them.
import re
import random
import string
import math
from collections import defaultdict, Counter
from pathlib import Path
# `files` provides files.upload(), which the upload cell in Step 3 calls.
from google.colab import files
import io

print("‚úÖ Libraries imported successfully!")

## Step 2: Define the Text Expander Classes

This cell contains all the core logic for text processing and expansion.

In [None]:
class DocumentProcessor:
    """Turn a markdown document into sentences, words and paragraphs.

    The document can be passed directly as ``text`` or read from
    ``filepath``.  Call :meth:`process` to strip the markdown syntax and
    populate ``sentences``, ``words`` and ``paragraphs``.
    """

    # Honorifics whose trailing period must not be treated as a sentence end.
    # The leading \b keeps the pattern from firing inside longer words.
    _ABBREVIATION_RE = re.compile(r'\b(Mr|Mrs|Ms|Dr|Prof|St)\.')
    # An English word, optionally with one apostrophe contraction ("don't").
    _WORD_RE = re.compile(r"\b[a-zA-Z]+(?:'[a-zA-Z]+)?\b")

    def __init__(self, text: str = None, filepath: str = None):
        """Store the raw text and/or the path it should be loaded from.

        Args:
            text: Document content; ``None`` or empty means "not provided".
            filepath: Path of a UTF-8 markdown file, used by
                :meth:`load_document` / :meth:`process`.
        """
        self.filepath = filepath
        self.raw_text = text or ""
        self.sentences = []
        self.words = []
        self.paragraphs = []

    def load_document(self) -> str:
        """Read ``filepath`` (if set) into ``raw_text`` and return ``raw_text``."""
        if self.filepath:
            with open(self.filepath, 'r', encoding='utf-8') as f:
                self.raw_text = f.read()
        return self.raw_text

    def clean_markdown(self, text: str) -> str:
        """Return ``text`` with common markdown syntax removed.

        The order of the substitutions matters:

        * images are removed before links, because the link pattern also
          matches the ``[alt](url)`` part of ``![alt](url)`` and would leave
          a stray ``!alt`` behind;
        * line-level markers (headers, rules, quotes, lists) are removed
          before emphasis, so a ``* `` bullet is not mistaken for the opening
          of an ``*italic*`` span.
        """
        # Code: drop fenced blocks entirely, keep the content of inline code.
        text = re.sub(r'```[\s\S]*?```', '', text)
        text = re.sub(r'`([^`]+)`', r'\1', text)
        # Images are dropped; links keep their visible text.
        text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', '', text)
        text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
        # Headers
        text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
        # Horizontal rules
        text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
        # Blockquotes
        text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
        # List markers.  Only spaces/tabs are allowed around the marker:
        # \s would also match newlines and swallow the blank line that
        # separates the list from the preceding paragraph.
        text = re.sub(r'^[ \t]*[-*+][ \t]+', '', text, flags=re.MULTILINE)
        text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)
        # Bold / italic
        text = re.sub(r'\*{1,3}(.*?)\*{1,3}', r'\1', text)
        text = re.sub(r'_{1,3}(.*?)_{1,3}', r'\1', text)

        return text

    def extract_sentences(self, text: str) -> list:
        """Split ``text`` into sentences longer than 15 characters.

        Periods after common honorifics (Mr., Dr., ...) are removed first so
        they do not cause false splits; the returned sentences therefore
        contain e.g. ``Mr Smith`` instead of ``Mr. Smith``.
        """
        text = self._ABBREVIATION_RE.sub(r'\1', text)

        # Split on whitespace that follows sentence-ending punctuation.
        pieces = re.split(r'(?<=[.!?])\s+', text)
        stripped = (piece.strip() for piece in pieces)
        return [sentence for sentence in stripped if len(sentence) > 15]

    def extract_words(self, text: str) -> list:
        """Return the lower-cased English words (contractions kept) in ``text``."""
        return self._WORD_RE.findall(text.lower())

    def extract_paragraphs(self, text: str) -> list:
        """Return blank-line separated paragraphs longer than 50 characters."""
        blocks = (block.strip() for block in re.split(r'\n\s*\n', text))
        return [block for block in blocks if len(block) > 50]

    def process(self) -> dict:
        """Clean the document and extract its sentences, words and paragraphs.

        The file is loaded from ``filepath`` only when no text was supplied.

        Returns:
            A dict with the keys ``sentences``, ``words``, ``paragraphs``,
            ``word_count``, ``sentence_count`` and ``unique_words``.
        """
        if self.filepath and not self.raw_text:
            self.load_document()

        cleaned = self.clean_markdown(self.raw_text)

        self.sentences = self.extract_sentences(cleaned)
        self.words = self.extract_words(cleaned)
        self.paragraphs = self.extract_paragraphs(cleaned)

        return {
            'sentences': self.sentences,
            'words': self.words,
            'paragraphs': self.paragraphs,
            'word_count': len(self.words),
            'sentence_count': len(self.sentences),
            'unique_words': len(set(self.words)),
        }


class MarkovChain:
    """Word-level Markov chain for text generation.

    Keys of ``chain`` are tuples of ``order`` consecutive words exactly as
    they appear in the training sentences (original case and punctuation);
    each key maps to the list of words observed right after it.
    """

    def __init__(self, order: int = 2):
        """Create an empty chain.

        Args:
            order: Number of words that make up one state (N-gram order).
        """
        self.order = order  # N-gram order
        self.chain = defaultdict(list)
        self.starters = []  # First `order` words of every training sentence

    def train(self, sentences: list):
        """Add the word transitions of ``sentences`` to the chain.

        Sentences with fewer than ``order + 1`` words contain no transition
        and are skipped.
        """
        order = self.order
        for sentence in sentences:
            words = sentence.split()
            if len(words) <= order:
                continue

            # Remember how this sentence begins so generation can start there.
            self.starters.append(tuple(words[:order]))

            for i in range(len(words) - order):
                state = tuple(words[i:i + order])
                self.chain[state].append(words[i + order])

    def generate(self, seed_words: list = None, max_words: int = 50) -> str:
        """Generate text, starting from a state related to ``seed_words``.

        Args:
            seed_words: Words used to pick the starting state.  When missing
                or shorter than ``order`` a random sentence starter is used.
            max_words: Upper bound on the number of generated words.

        Returns:
            The generated words joined by spaces, or ``""`` when the chain
            has not been trained.  Generation stops early at a word ending in
            ``.``, ``!`` or ``?``.
        """
        if seed_words and len(seed_words) >= self.order:
            current = self._find_matching_key(seed_words)
        elif self.starters:
            current = random.choice(self.starters)
        else:
            return ""

        if not current:
            return ""

        result = list(current)

        for _ in range(max_words - self.order):
            if current not in self.chain:
                # Dead end: jump to some state that shares a word with it.
                current = self._find_similar_key(current)
                if not current:
                    break

            next_words = self.chain.get(current, [])
            if not next_words:
                break

            next_word = random.choice(next_words)
            result.append(next_word)
            current = tuple(result[-self.order:])

            # Stop at sentence-ending punctuation
            if next_word.endswith(('.', '!', '?')):
                break

        return ' '.join(result)

    def _random_starter(self) -> tuple:
        """Return a random sentence starter, or ``None`` if there is none."""
        return random.choice(self.starters) if self.starters else None

    def _find_matching_key(self, words: list) -> tuple:
        """Find the chain state that best matches ``words``.

        Preference order:

        1. a window of ``words`` that is a chain key as written;
        2. a chain key equal to a window of ``words`` ignoring case (chain
           keys keep their original capitalisation, so comparing the
           lower-cased seed against them directly would miss e.g.
           ``("The", "sun")``);
        3. the first chain key sharing at least one word (ignoring case);
        4. a random sentence starter, or ``None`` for an untrained chain.
        """
        order = self.order
        window_count = len(words) - order + 1

        # 1. Exact match, original spelling.
        for i in range(window_count):
            key = tuple(words[i:i + order])
            if key in self.chain:
                return key

        words_lower = [w.lower() for w in words]
        lowered_windows = {
            tuple(words_lower[i:i + order]) for i in range(window_count)
        }
        seed_vocabulary = set(words_lower)

        # 2. + 3. One pass over the keys: return a case-insensitive exact
        # match as soon as it is seen, otherwise remember the first partial.
        first_partial = None
        for key in self.chain:
            key_lower = tuple(w.lower() for w in key)
            if key_lower in lowered_windows:
                return key
            if first_partial is None and not seed_vocabulary.isdisjoint(key_lower):
                first_partial = key

        if first_partial is not None:
            return first_partial

        return self._random_starter()

    def _find_similar_key(self, current: tuple) -> tuple:
        """Return the first chain key sharing a word (ignoring case) with
        ``current``; fall back to a random starter, or ``None``."""
        current_vocabulary = {w.lower() for w in current}

        for key in self.chain:
            if not current_vocabulary.isdisjoint(w.lower() for w in key):
                return key

        return self._random_starter()


class SimilarityFinder:
    """Rank sentences/paragraphs by TF-IDF cosine similarity to a query.

    IDF weights are learned from ``sentences`` when the object is created;
    the same weights are used to score both sentences and paragraphs.
    """

    # Common English stop words to reduce noise
    STOP_WORDS = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
        'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
        'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need',
        'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it',
        'we', 'they', 'what', 'which', 'who', 'whom', 'whose', 'where',
        'when', 'why', 'how', 'all', 'each', 'every', 'both', 'few', 'more',
        'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
        'same', 'so', 'than', 'too', 'very', 'just', 'also', 'now', 'here',
        'there', 'then', 'once', 'her', 'his', 'him', 'my', 'your', 'our',
        'their', 'its', 'me', 'us', 'them', 'into', 'through', 'during',
        'before', 'after', 'above', 'below', 'between', 'under', 'again',
        'further', 'while', 'about', 'against', 'being', 'having', 'doing'
    }

    # An English word, optionally with one apostrophe contraction.
    _WORD_RE = re.compile(r"\b[a-zA-Z]+(?:'[a-zA-Z]+)?\b")

    def __init__(self, sentences: list, paragraphs: list):
        """Store the corpus and compute the IDF table from ``sentences``."""
        self.sentences = sentences
        self.paragraphs = paragraphs
        self.word_idf = {}
        self._calculate_idf()

    def _tokenize(self, text: str) -> list:
        """Return the lower-cased words of ``text`` without stop words."""
        return [
            word for word in self._WORD_RE.findall(text.lower())
            if word not in self.STOP_WORDS
        ]

    def _calculate_idf(self):
        """Compute a smoothed IDF weight for every non-stop word.

        Uses ``log((1 + N) / (1 + df)) + 1`` where ``N`` is the number of
        sentences and ``df`` the number of sentences containing the word.
        Unlike ``log(N / (1 + df))`` this is always positive, so frequent
        words - and every word of a very small document - still contribute
        to the similarity instead of being weighted zero or negative.
        """
        doc_count = len(self.sentences)
        word_doc_count = Counter()

        for sentence in self.sentences:
            # set(): a word counts once per sentence, however often it occurs.
            word_doc_count.update(set(self._tokenize(sentence)))

        self.word_idf = {
            word: math.log((1 + doc_count) / (1 + count)) + 1
            for word, count in word_doc_count.items()
        }

    def _get_tfidf_vector(self, text: str) -> dict:
        """Return ``{word: tf * idf}`` for the non-stop words of ``text``.

        Words that never occurred in the training sentences get weight 0.
        """
        words = self._tokenize(text)
        total_words = len(words)
        if not total_words:
            return {}

        return {
            word: (count / total_words) * self.word_idf.get(word, 0)
            for word, count in Counter(words).items()
        }

    def _cosine_similarity(self, vec1: dict, vec2: dict) -> float:
        """Return the cosine similarity of two sparse vectors (0.0 when they
        share no word or either has zero length)."""
        common_words = vec1.keys() & vec2.keys()
        if not common_words:
            return 0.0

        norm1 = math.sqrt(sum(v ** 2 for v in vec1.values()))
        norm2 = math.sqrt(sum(v ** 2 for v in vec2.values()))
        if norm1 == 0 or norm2 == 0:
            return 0.0

        dot_product = sum(vec1[w] * vec2[w] for w in common_words)
        return dot_product / (norm1 * norm2)

    def _rank(self, query: str, documents: list, top_n: int) -> list:
        """Score every document against ``query`` and return the ``top_n``
        best ``(document, score)`` pairs, highest score first.  Documents
        with equal scores keep their original relative order."""
        query_vec = self._get_tfidf_vector(query)
        scored = [
            (doc, self._cosine_similarity(query_vec, self._get_tfidf_vector(doc)))
            for doc in documents
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:top_n]

    def find_similar_sentences(self, query: str, top_n: int = 5) -> list:
        """Return up to ``top_n`` ``(sentence, score)`` pairs most similar
        to ``query``."""
        return self._rank(query, self.sentences, top_n)

    def find_similar_paragraphs(self, query: str, top_n: int = 3) -> list:
        """Return up to ``top_n`` ``(paragraph, score)`` pairs most similar
        to ``query``."""
        return self._rank(query, self.paragraphs, top_n)


class TextExpander:
    """Main class for expanding sentences into paragraphs.

    Typical use: construct with the document text (or a file path), call
    :meth:`initialize` once, then call :meth:`expand` / :meth:`analyze_input`.
    """

    def __init__(self, text: str = None, filepath: str = None):
        """Remember the document source; nothing is processed until
        :meth:`initialize` is called."""
        self.text = text
        self.filepath = filepath
        self.processor = None
        self.markov = None
        self.similarity = None
        self.data = None

    def initialize(self):
        """Process the document and build the Markov and similarity models.

        Progress is reported with ``print``.
        """
        print("üìñ Loading document...")

        # Process document
        self.processor = DocumentProcessor(text=self.text, filepath=self.filepath)
        self.data = self.processor.process()

        print(f"   ‚úì {self.data['sentence_count']} sentences found")
        print(f"   ‚úì {self.data['word_count']} total words")
        print(f"   ‚úì {self.data['unique_words']} unique words")
        print(f"   ‚úì {len(self.data['paragraphs'])} paragraphs")

        # Initialize Markov Chain
        print("\nüîó Building Markov Chain...")
        self.markov = MarkovChain(order=2)
        self.markov.train(self.data['sentences'])
        print(f"   ‚úì Model trained with {len(self.markov.chain)} transitions")

        # Initialize Similarity Finder
        print("\nüîç Building similarity index...")
        self.similarity = SimilarityFinder(
            self.data['sentences'],
            self.data['paragraphs']
        )
        print(f"   ‚úì IDF calculated for {len(self.similarity.word_idf)} words")

        print("\n‚úÖ System ready!\n")

    def _require_initialized(self):
        """Raise a clear error when the models have not been built yet."""
        if self.markov is None or self.similarity is None or self.data is None:
            raise RuntimeError(
                "TextExpander is not initialized - call initialize() first."
            )

    @staticmethod
    def _is_near_duplicate(normalized: str, used_content: set,
                           threshold: float) -> bool:
        """Tell whether ``normalized`` repeats something already used.

        Two texts count as duplicates when the share of common words,
        relative to the shorter of the two, is greater than ``threshold``.
        """
        candidate_words = set(normalized.split())
        if not candidate_words:
            return False

        for used in used_content:
            used_words = set(used.split())
            if not used_words:
                continue
            shared = len(candidate_words & used_words)
            if shared / min(len(candidate_words), len(used_words)) > threshold:
                return True
        return False

    def expand(self, input_sentence: str, method: str = 'hybrid',
               num_sentences: int = 4) -> str:
        """
        Expand input sentence into a paragraph

        Args:
            input_sentence: Input sentence
            method: Method to use ('markov', 'similarity', 'hybrid');
                any other value is treated as 'hybrid'
            num_sentences: Number of sentences in output

        Returns:
            Expanded paragraph; it always starts with ``input_sentence`` and
            may contain fewer than ``num_sentences`` sentences when not
            enough suitable material is found.

        Raises:
            RuntimeError: If :meth:`initialize` has not been called.
        """
        self._require_initialized()

        if method == 'markov':
            return self._expand_markov(input_sentence, num_sentences)
        if method == 'similarity':
            return self._expand_similarity(input_sentence, num_sentences)
        return self._expand_hybrid(input_sentence, num_sentences)

    def _expand_markov(self, input_sentence: str, num_sentences: int) -> str:
        """Expand using pure Markov Chain generation.

        Each new sentence is seeded from the tail of a previous one; results
        that are too short or too similar to earlier content are discarded.
        """
        sentences = [input_sentence]
        used_content = {input_sentence.lower().strip()}

        attempts = 0
        max_attempts = num_sentences * 8  # bounds the loop when output keeps being rejected

        while len(sentences) < num_sentences and attempts < max_attempts:
            attempts += 1

            # Every second attempt seeds from a random earlier sentence
            # instead of the latest one, to vary the output.
            if attempts % 2 == 0 and len(sentences) > 1:
                seed = random.choice(sentences).split()[-3:]
            else:
                seed = sentences[-1].split()[-2:]

            generated = self.markov.generate(seed, max_words=35)
            if not generated:
                continue

            normalized = generated.lower().strip()
            if len(generated.split()) <= 4:
                continue
            if self._is_near_duplicate(normalized, used_content, 0.6):
                continue

            sentences.append(generated)
            used_content.add(normalized)

        return ' '.join(sentences)

    def _expand_similarity(self, input_sentence: str, num_sentences: int) -> str:
        """Expand by appending the most similar document sentences.

        A candidate is accepted when its score is above 0.1 and less than
        70% of its words have already been used.
        """
        similar = self.similarity.find_similar_sentences(input_sentence, num_sentences * 2)

        result = [input_sentence]
        used_words = set(input_sentence.lower().split())

        for sentence, score in similar:
            if len(result) >= num_sentences:
                break

            sent_words = set(sentence.lower().split())
            overlap = len(used_words & sent_words) / len(sent_words) if sent_words else 1

            if overlap < 0.7 and score > 0.1:
                result.append(sentence)
                used_words.update(sent_words)

        return ' '.join(result)

    def _expand_hybrid(self, input_sentence: str, num_sentences: int) -> str:
        """Expand using a combination of Markov generation and similarity.

        Two out of three attempts generate a Markov sentence seeded with
        words from the most similar paragraphs; the third takes the next
        best similar sentence from the document.  Remaining similar
        sentences are appended at the end if the paragraph is still short.
        """
        result = [input_sentence]
        used_content = {input_sentence.lower().strip()}

        # Find relevant paragraphs and sentences for context
        similar_paras = self.similarity.find_similar_paragraphs(input_sentence, 3)
        similar_sents = self.similarity.find_similar_sentences(input_sentence, num_sentences * 3)

        # Candidate sentences from similarity, best first; consumed one by
        # one through an iterator so each candidate is offered only once.
        candidates = iter([
            sent for sent, score in similar_sents
            if score > 0.05 and sent.lower().strip() not in used_content
        ])

        # Use words from relevant paragraphs as additional seeds
        context_words = []
        for para, _ in similar_paras:
            words = re.findall(r"\b[a-zA-Z]+(?:'[a-zA-Z]+)?\b", para.lower())
            context_words.extend(words[:30])

        attempts = 0
        max_attempts = num_sentences * 5  # bounds the loop when output keeps being rejected

        while len(result) < num_sentences and attempts < max_attempts:
            attempts += 1

            if attempts % 3 != 0:
                if context_words:
                    idx = random.randint(0, max(0, len(context_words) - 3))
                    temp_seed = context_words[idx:idx + 2]
                else:
                    temp_seed = result[-1].split()[-3:]
                generated = self.markov.generate(temp_seed, max_words=30)
            else:
                generated = next(candidates, None)

            if not generated:
                continue

            normalized = generated.lower().strip()
            if len(generated.split()) <= 3:
                continue
            if self._is_near_duplicate(normalized, used_content, 0.7):
                continue

            result.append(generated)
            used_content.add(normalized)

        # If still not enough, force add from similarity
        for sent in candidates:
            if len(result) >= num_sentences:
                break
            normalized = sent.lower().strip()
            if normalized not in used_content:
                result.append(sent)
                used_content.add(normalized)

        return ' '.join(result)

    def analyze_input(self, input_sentence: str) -> dict:
        """Report how well ``input_sentence`` matches the document.

        Returns:
            A dict with ``input_words`` (number of words in the input),
            ``matching_words`` (input words found in the document),
            ``match_ratio`` (matching / total, 0 for empty input) and
            ``similar_sentences`` (top 3 ``(sentence, score)`` pairs).

        Raises:
            RuntimeError: If :meth:`initialize` has not been called.
        """
        self._require_initialized()

        words = re.findall(r"\b[a-zA-Z]+(?:'[a-zA-Z]+)?\b", input_sentence.lower())

        doc_words = set(self.data['words'])
        matching_words = [w for w in words if w in doc_words]

        similar = self.similarity.find_similar_sentences(input_sentence, 3)

        return {
            'input_words': len(words),
            'matching_words': matching_words,
            'match_ratio': len(matching_words) / len(words) if words else 0,
            'similar_sentences': similar
        }


# Status message printed once the class definitions above have been executed.
print("‚úÖ Text Expander classes defined successfully!")

## Step 3: Upload Your Markdown Document

Upload your novel/document in `.md` format.

In [None]:
# Upload your markdown file
print("üì§ Please upload your markdown (.md) document:")
uploaded = files.upload()

# Get the uploaded file content
document_text = ""
filename = ""

if not uploaded:
    # Without a file the later steps would silently build an empty model.
    print("\nNo file was uploaded - re-run this cell and choose a .md file before continuing.")

# If several files are uploaded, the last one wins.
for fn, content in uploaded.items():
    filename = fn
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark;
    # a BOM left in the text would stop the '^#' header pattern from matching
    # on the first line when the markdown is cleaned.
    document_text = content.decode('utf-8-sig')
    print(f"\n‚úÖ File '{fn}' uploaded successfully!")
    print(f"   Size: {len(content):,} bytes")
    print(f"   Characters: {len(document_text):,}")

## Step 4: Initialize the Text Expander

Process your document and build the language model.

In [None]:
# Initialize the Text Expander with your document
# `document_text` is the string produced by the upload cell; initialize()
# processes it and builds the Markov chain and the similarity index, printing
# a short summary as it goes.
expander = TextExpander(text=document_text)
expander.initialize()

## Step 5: Expand Sentences! 🚀

Now you can expand sentences into paragraphs.

**Available Methods:**
- `'hybrid'` (default) - Best results, combines both techniques
- `'markov'` - Uses word transition patterns
- `'similarity'` - Finds similar sentences from document

In [None]:
#@title üñäÔ∏è Enter Your Sentence { run: "auto", display-mode: "form" }

input_sentence = "The sun rose over the mountains"  #@param {type:"string"}
method = "hybrid"  #@param ["hybrid", "markov", "similarity"]
num_sentences = 4  #@param {type:"slider", min:2, max:8, step:1}

print(f"üìù Input: {input_sentence}")
print(f"‚öôÔ∏è  Method: {method}")
print(f"üìä Output sentences: {num_sentences}")
print("\n" + "="*60)

result = expander.expand(input_sentence, method=method, num_sentences=num_sentences)

print("\nüìÑ OUTPUT:")
print("-"*60)
# Pretty print the paragraph
import textwrap
wrapped = textwrap.fill(result, width=70)
print(wrapped)
print("-"*60)

## Step 6: Analyze Your Input (Optional)

See how well your input matches the document vocabulary.

In [None]:
#@title üîç Analyze Input Sentence { run: "auto", display-mode: "form" }

analyze_sentence = "The hero walked through the forest"  #@param {type:"string"}

analysis = expander.analyze_input(analyze_sentence)

print(f"üìä Analysis for: '{analyze_sentence}'")
print("="*60)
print(f"\nüìå Total words: {analysis['input_words']}")
print(f"üìå Words found in document: {len(analysis['matching_words'])}")
print(f"üìå Match ratio: {analysis['match_ratio']:.1%}")
print(f"\nüìå Matching words: {', '.join(analysis['matching_words'][:15])}")

print("\nüìå Similar sentences from document:")
print("-"*60)
for i, (sent, score) in enumerate(analysis['similar_sentences'], 1):
    print(f"\n{i}. [Score: {score:.3f}]")
    print(f"   {sent[:100]}{'...' if len(sent) > 100 else ''}")

## 🔄 Quick Expansion Function

Use this for quick expansions without the form interface.

In [None]:
def expand_sentence(sentence, method='hybrid', num_sentences=4):
    """
    Quick function to expand a sentence.

    Expands ``sentence`` with the notebook-level ``expander`` object, prints
    the input and the wrapped result, and returns the result.

    Args:
        sentence: Your input sentence
        method: 'hybrid', 'markov', or 'similarity'
        num_sentences: Number of sentences in output (2-8)

    Returns:
        The expanded paragraph as a single (unwrapped) string.
    """
    # Imported locally so this helper does not depend on the form cell (the
    # only other place that imports textwrap) having been run first.
    import textwrap

    result = expander.expand(sentence, method=method, num_sentences=num_sentences)
    print(f"\nüìù Input: {sentence}")
    print(f"\nüìÑ Output ({num_sentences} sentences, {method} method):")
    print("-"*60)
    print(textwrap.fill(result, width=70))
    print("-"*60)
    return result

# Example usage:
# expand_sentence("The night was dark and cold")
# expand_sentence("She opened the ancient book", method='similarity', num_sentences=5)

## 📝 Try It Out!

Modify the sentences below and run the cell.

In [None]:
# Try your own sentences here!
# Each expand_sentence() call prints the input and the wrapped output and
# returns the expanded paragraph as a string.

# Example 1 - Hybrid method (default, best results)
expand_sentence("The journey had just begun")

# Visual separator between examples.
print("\n" + "="*70 + "\n")

# Example 2 - Using Markov Chain only
expand_sentence("He walked through the darkness", method='markov')

# Visual separator between examples.
print("\n" + "="*70 + "\n")

# Example 3 - Using Similarity only
expand_sentence("The secret was finally revealed", method='similarity', num_sentences=5)

---

## 📚 Tips for Best Results

1. **Use vocabulary from your document** - The system works best when your input contains words that appear in the source document.

2. **Try different methods:**
   - `hybrid` - Best overall results
   - `markov` - More creative/random output
   - `similarity` - Closest to original document style

3. **Longer documents = better results** - More text provides more patterns to learn.

4. **Analyze first** - Use the analysis tool to check if your input words exist in the document.

---

**No AI/LLM used!** This system uses statistical methods (Markov Chain + TF-IDF) only.