In [None]:
from google.colab import files
import os
import nltk
from nltk.tokenize import sent_tokenize
import math

# Download the Punkt sentence-tokenizer data that sent_tokenize relies on.
# Older NLTK releases load the 'punkt' resource; newer releases (3.8.2 and
# later) look up 'punkt_tab' instead and raise LookupError without it, so
# both are fetched to keep the script working on either.
nltk.download('punkt')
nltk.download('punkt_tab')

def upload_file():
    """Prompt for a file upload through Google Colab and return its name.

    Returns:
        The first key of the mapping returned by ``files.upload()``, which
        the rest of the script uses as the uploaded file's name. If several
        files are uploaded at once, only that first entry is used.

    Raises:
        ValueError: If the upload produced no files (for example, the
            dialog was cancelled).
    """
    print("Please upload your text file...")
    uploaded = files.upload()
    # An empty result would otherwise surface as a bare StopIteration from
    # next(), which is confusing to the user; fail with a clear message.
    if not uploaded:
        raise ValueError("No file was uploaded.")
    return next(iter(uploaded))

def read_file(filename):
    """Return the entire contents of ``filename`` decoded as UTF-8 text."""
    with open(filename, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

def simple_split(text):
    """Split text into 4 roughly equal parts by character count.

    Each part holds ``ceil(len(text) / 4)`` characters, except that the
    trailing parts may be shorter or empty when the text runs out. The
    parts concatenate back to the original text.

    Args:
        text: The string to split. May be empty.

    Returns:
        A list of exactly 4 strings.
    """
    num_parts = 4
    part_size = math.ceil(len(text) / num_parts)

    # Slicing by part index (instead of stepping a range by part_size) always
    # yields exactly num_parts entries and stays valid when part_size is 0,
    # i.e. for empty text, where a zero range step would raise ValueError.
    return [
        text[index * part_size:(index + 1) * part_size]
        for index in range(num_parts)
    ]

def semantic_split(text):
    """Split text into 4 chunks made of whole sentences.

    The text is tokenized into sentences, which are grouped in order into
    runs of ``ceil(sentence_count / 4)`` sentences. The sentences of each
    run are joined with a single space. If fewer than 4 runs result, the
    list is padded with empty strings.

    Returns:
        A list of exactly 4 strings.
    """
    sentence_list = sent_tokenize(text)
    per_chunk = math.ceil(len(sentence_list) / 4)

    grouped = []
    # With no sentences per_chunk is 0, which is not a valid range step,
    # so only build the groups when there is something to group.
    if sentence_list:
        grouped = [
            ' '.join(sentence_list[start:start + per_chunk])
            for start in range(0, len(sentence_list), per_chunk)
        ]

    # Pad out to four entries when there were too few sentences to fill them
    grouped.extend([""] * (4 - len(grouped)))

    return grouped[:4]

def save_chunks(chunks, original_filename):
    """Write each chunk to its own UTF-8 text file, then download them all.

    Output files are named ``<base>_part<N>.txt``, where ``<base>`` is
    ``original_filename`` without its extension and ``N`` counts from 1.
    Every file is written before any download is triggered.
    """
    stem, _ = os.path.splitext(original_filename)
    written = []

    for part_number, text in enumerate(chunks, start=1):
        target = f"{stem}_part{part_number}.txt"
        with open(target, 'w', encoding='utf-8') as out:
            out.write(text)
        written.append(target)

    # Hand every written file to Colab for download
    for path in written:
        files.download(path)

def main():
    """Run the interactive workflow: upload, split, save and download.

    Uploads a text file, reads it, asks which splitting method to use,
    splits the content into 4 chunks and saves/downloads one file per chunk.
    Any answer other than 'semantic' falls back to the simple split; an
    answer that is neither 'semantic' nor 'simple' is reported before
    falling back.
    """
    # Upload file
    filename = upload_file()

    # Read content
    content = read_file(filename)

    # Ask user for splitting preference. Surrounding whitespace is stripped
    # so that e.g. "semantic " is not silently treated as the simple method.
    split_method = input("Enter 'semantic' for semantic splitting or 'simple' for equal parts splitting: ").strip().lower()

    # Split content (the progress message is printed before the work starts)
    if split_method == 'semantic':
        print("\nSplitting text semantically (maintaining sentence boundaries)...")
        chunks = semantic_split(content)
    else:
        if split_method != 'simple':
            print(f"\nUnrecognized option {split_method!r}; using 'simple' splitting instead.")
        print("\nSplitting text into equal parts...")
        chunks = simple_split(content)

    # Save and download chunks
    save_chunks(chunks, filename)
    print("\nAll parts have been saved and downloaded!")

# Run the interactive upload/split/download workflow only when this code is
# executed directly (as a script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Please upload your text file...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Saving chapter14.txt to chapter14.txt
Enter 'semantic' for semantic splitting or 'simple' for equal parts splitting: semantic

Splitting text semantically (maintaining sentence boundaries)...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


All parts have been saved and downloaded!
