In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab session.
from google.colab import drive
drive.mount('/content/drive')
# IPython magic (not plain Python): make the project folder on Drive the
# current working directory for the cells that follow.
%cd /content/drive/MyDrive/focus/

Mounted at /content/drive
/content/drive/MyDrive/focus


In [None]:
import re
import torch
import torch.nn.functional as F
from transformers import AutoModel  # using the same model you were using
from scipy.signal import find_peaks
from sklearn.cluster import DBSCAN
import numpy as np

##########################
# 1. Text Preprocessing  #
##########################
def preprocess_text(text: str) -> list:
    """
    Flatten the text onto a single line and cut it into sentences.

    Newlines are turned into spaces and the ends are trimmed. The text is
    then split at every run of whitespace that directly follows '.', '!'
    or '?', so the terminating punctuation stays attached to its sentence.
    Empty pieces are discarded.
    """
    flattened = text.replace('\n', ' ').strip()
    # The lookbehind keeps the punctuation mark with the preceding sentence.
    pieces = re.split(r'(?<=[.!?])\s+', flattened)
    # Blank input yields a single empty piece; filter it out.
    return list(filter(None, pieces))

##########################
# 2. Model Loading & Embedding #
##########################
def load_model():
    """
    Fetch the pretrained embedding model and hand it back.

    The checkpoint name is fixed; `trust_remote_code=True` is passed to
    `AutoModel.from_pretrained`.
    """
    checkpoint = "jinaai/jina-embeddings-v2-small-en"
    return AutoModel.from_pretrained(checkpoint, trust_remote_code=True)

def get_embeddings(text_list: list, model, max_length: int = 512):
    """
    Embed every text in `text_list` with `model.encode`.

    `max_length` is forwarded to the encoder unchanged and
    `convert_to_tensor=True` is always requested. The encoder's return
    value is passed straight back to the caller.
    """
    encode_options = {"max_length": max_length, "convert_to_tensor": True}
    return model.encode(text_list, **encode_options)

##########################
# 3. Boundary Scoring    #
##########################
def compute_boundary_scores(embeddings: torch.Tensor, method: str = 'cosine') -> list:
    """
    Compute a dissimilarity score at every candidate boundary.

    For each boundary i (i = 1 ... N-1) the left segment is sentences [0:i]
    and the right segment is sentences [i:]. Each segment is summarised by
    the mean of its embeddings and the two means are compared.

    Args:
        embeddings: 2-D tensor with one row per sentence.
        method: 'cosine' (score = 1 - cosine similarity of the two means) or
            'manhattan' (score = L1 distance between the two means).

    Returns:
        A list of N-1 Python floats; element k is the score for the boundary
        in front of sentence k+1. Empty when there are fewer than two rows.

    Raises:
        ValueError: if `method` is not one of the supported names. This is
            checked up front, so a bad name is reported even when there are
            too few sentences to score.
    """
    if method not in ('cosine', 'manhattan'):
        raise ValueError("Unsupported method. Choose 'cosine' or 'manhattan'.")

    num_sentences = embeddings.size(0)
    if num_sentences < 2:
        # No boundary exists between fewer than two sentences.
        return []

    # Running sums from the front and from the back give every left/right
    # segment sum in one pass instead of re-averaging per boundary.
    prefix_sums = torch.cumsum(embeddings, dim=0)
    suffix_sums = torch.flip(
        torch.cumsum(torch.flip(embeddings, dims=[0]), dim=0), dims=[0]
    )

    # left_sizes[k] is the number of sentences left of boundary k+1.
    left_sizes = torch.arange(
        1, num_sentences, device=embeddings.device, dtype=prefix_sums.dtype
    ).unsqueeze(1)
    left_means = prefix_sums[:-1] / left_sizes
    right_means = suffix_sums[1:] / (num_sentences - left_sizes)

    if method == 'cosine':
        # High score means high dissimilarity.
        scores = 1 - F.cosine_similarity(left_means, right_means, dim=1)
    else:
        scores = torch.sum(torch.abs(left_means - right_means), dim=1)
    return scores.tolist()

##########################
# 4. Boundary Detection  #
##########################
def detect_boundaries(scores: list, threshold: float = None, distance: int = 1) -> list:
    """
    Pick boundary positions from the dissimilarity scores.

    scores[k] belongs to the boundary numbered k + 1, so every returned
    index is one larger than the position of its score.

    With a threshold, every boundary whose score is at least `threshold` is
    returned. Without one, boundaries are the local maxima reported by
    `find_peaks`, with `distance` forwarded to it.
    """
    if threshold is None:
        # Peak mode: shift the peak positions by one to get boundary indices.
        peak_positions, _properties = find_peaks(scores, distance=distance)
        return (peak_positions + 1).tolist()

    # Threshold mode: keep every boundary that reaches the cut-off.
    return [
        position + 1
        for position, value in enumerate(scores)
        if value >= threshold
    ]

##########################
# 5. Segment Printing    #
##########################
def print_segments(sentences: list, boundaries: list):
    """
    Write the segmented text to stdout.

    Each boundary closes the segment running from the previous boundary (or
    from the start) up to, but not including, that sentence index. A closed
    segment is followed by a 50-character '=' rule. Sentences after the
    final boundary are printed as a last segment without a trailing rule.
    """
    rule = "=" * 50
    cursor = 0
    for cut in boundaries:
        joined = " ".join(sentences[cursor:cut])
        print("Segment:", joined, rule, sep="\n")
        cursor = cut
    # Whatever is left after the final boundary forms the last segment.
    if cursor < len(sentences):
        print("Segment:", " ".join(sentences[cursor:]), sep="\n")

##########################
# 7. Main Pipeline       #
##########################
def main(text: str):
    """
    Run the whole segmentation pipeline on `text` and print the result.

    Steps: load the embedding model, split the text into sentences, embed
    them, score every candidate boundary with the Manhattan distance, pick
    boundaries by peak detection, and print the resulting segments.
    """
    # Embedding model
    embedder = load_model()

    # Sentence splitting
    sentence_list = preprocess_text(text)
    print(f"Total sentences: {len(sentence_list)}")

    # One embedding per sentence
    sentence_vectors = get_embeddings(sentence_list, embedder, max_length=512)

    # Dissimilarity at each candidate boundary (Manhattan distance here,
    # not the function's cosine default)
    dissimilarities = compute_boundary_scores(sentence_vectors, method='manhattan')

    # No threshold is given, so boundaries come from peak detection
    cut_points = detect_boundaries(dissimilarities, distance=1)
    print("Detected boundaries at sentence indices:", cut_points)

    # Show the segments
    print("\n--- Segmented Text ---")
    print_segments(sentence_list, cut_points)


##########################
# 8. Example Usage       #
##########################
# Script entry point: run the segmentation pipeline on a built-in sample text.
if __name__ == '__main__':

    # Sample input: six paragraphs about Artificial General Intelligence,
    # separated by blank lines.
    test_text = """
    Artificial General Intelligence (AGI) represents a pivotal milestone in the field of artificial intelligence, aiming to develop machines capable of performing any intellectual task that a human can. Unlike narrow AI, which excels in specific domains, AGI seeks to emulate the versatile and adaptive nature of human cognition. This ambition encompasses a broad spectrum of cognitive functions, including reasoning, problem-solving, learning, and understanding complex concepts across diverse fields. The pursuit of AGI is not merely a technological endeavor but also a profound exploration into the essence of intelligence itself, challenging our understanding of consciousness, ethics, and the future trajectory of human-machine collaboration.

    The journey toward AGI is deeply intertwined with the evolution of artificial intelligence. In the mid-20th century, pioneers like Alan Turing laid the groundwork for computational intelligence with the development of the Turing Test, a criterion for machine intelligence. The term "Artificial Intelligence" was coined in 1956 during the Dartmouth Conference, marking the formal inception of AI as a field of study. Early AI research focused on symbolic reasoning and problem-solving, leading to the creation of expert systems in the 1970s and 1980s. However, these systems were limited by their reliance on predefined rules and lacked the ability to learn from experience.

    The resurgence of interest in neural networks in the 1980s and 1990s, coupled with advancements in computational power, spurred significant progress in machine learning. Despite these developments, the aspiration for AGI remained elusive, with many AI systems still confined to narrow tasks. In the early 2000s, a renewed focus on AGI emerged, driven by researchers advocating for machines that could generalize across various domains. This period saw the establishment of dedicated AGI conferences and journals, reflecting a concerted effort to achieve human-like intelligence in machines.

    Advancements in machine learning, particularly deep learning, have brought us closer to realizing AGI. Deep neural networks, especially those utilizing transformer architectures, have demonstrated remarkable capabilities in natural language processing, image recognition, and game playing. Models like OpenAI's GPT series and DeepMind's AlphaGo have showcased the potential of AI systems to perform tasks that were once considered exclusive to human intelligence. These developments have sparked debates about whether current AI systems exhibit aspects of AGI or if they still fall short of true general intelligence.

    Recent research has explored various pathways to AGI, including the development of AI-generating algorithms (AI-GAs) that autonomously create more advanced AI systems. This approach leverages meta-learning and self-improvement mechanisms, allowing AI to enhance its own capabilities without human intervention. Additionally, interdisciplinary studies have examined the biological foundations of intelligence, seeking inspiration from neural processes to inform the design of AGI systems. For instance, research on the design of artificial systems based on biological intelligence has provided insights into hierarchical architectures and contextual processing, which are crucial for developing general intelligence in machines.

    The pursuit of Artificial General Intelligence stands at the confluence of technological innovation and philosophical inquiry. While significant strides have been made, the realization of AGI presents profound challenges, including ethical considerations, safety concerns, and the need for robust regulatory frameworks. As AI systems become increasingly capable, it is imperative to address these challenges proactively to ensure that the development of AGI aligns with human values and societal well-being. The journey toward AGI is not merely a technical endeavor but a collective exploration of what it means to be intelligent, conscious, and human in an age of intelligent machines.
    """

    # Segment the sample text and print the detected segments.
    main(test_text)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

configuration_bert.py:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_bert.py:   0%|          | 0.00/97.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- modeling_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/65.4M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Total sentences: 25
Detected boundaries at sentence indices: [12, 18, 22]

--- Segmented Text ---
Segment:
Artificial General Intelligence (AGI) represents a pivotal milestone in the field of artificial intelligence, aiming to develop machines capable of performing any intellectual task that a human can. Unlike narrow AI, which excels in specific domains, AGI seeks to emulate the versatile and adaptive nature of human cognition. This ambition encompasses a broad spectrum of cognitive functions, including reasoning, problem-solving, learning, and understanding complex concepts across diverse fields. The pursuit of AGI is not merely a technological endeavor but also a profound exploration into the essence of intelligence itself, challenging our understanding of consciousness, ethics, and the future trajectory of human-machine collaboration. The journey toward AGI is deeply intertwined with the evolution of artificial intelligence. In the mid-20th century, pioneers like Alan Turing laid t