In [1]:
from __future__ import annotations

from core.models import Message, MessageId, ThreadId, SpaceId
from core.stores import EmbeddingStore, MembershipStore, MessageStore, ThreadStore

import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Defining the components for the processor

In [2]:
from huggingface_hub import hf_hub_download

# Fetch the quantised Llama 3.2 3B Instruct GGUF file from the Hugging Face Hub
# and keep the local path that hf_hub_download returns.
model_path = hf_hub_download(
    repo_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
    filename="Llama-3.2-3B-Instruct-Q4_K_M.gguf",
    local_dir="./models"  # Downloads to a 'models' folder in your current directory
)

# Echo the resolved path so the reader can see where the file ended up.
print(f"Model downloaded to: {model_path}")

Model downloaded to: models\Llama-3.2-3B-Instruct-Q4_K_M.gguf


In [3]:
from core.strategies import ContextWindowFormatter, MiniLMEmbedder, UMAPReducer, HDBSCANClusterer, LlamaThreadLabeler, CentroidThreadRepComputer, BufferedUpdateStrategy

In [4]:
# Stores: created once here; the same instances are passed to thread_rep and
# update_strategy below (and to the ChatProcessor in a later cell).
messages = MessageStore()
threads = ThreadStore()
memberships = MembershipStore()
embeddings = EmbeddingStore()

# Strategy components; the hyper-parameters are handed to each class verbatim.
formatter = ContextWindowFormatter(window_back=2, window_fwd=1, time_threshold_minutes=10)
embedder = MiniLMEmbedder("all-MiniLM-L6-v2")
reducer = UMAPReducer(n_neighbors=15, n_components=10, min_dist=0.0, metric="cosine", random_state=42)
clusterer = HDBSCANClusterer(min_cluster_size=15, min_samples=5, metric="euclidean", cluster_selection_method="eom")
labeler = LlamaThreadLabeler(
    # Use the path returned by hf_hub_download in the download cell rather than
    # a second hard-coded copy of it, so the labeler always loads the file that
    # was actually fetched even if the repo, filename or local_dir changes.
    model_path=model_path,
    n_ctx=4096,
    max_msg_chars=300
)

# "msg:full" is the space name under which run_batch logs storing the message
# embeddings. NOTE(review): confirm the two names are meant to stay in sync.
thread_rep = CentroidThreadRepComputer(memberships=memberships, embeddings=embeddings, msg_space="msg:full")

# The update strategy shares the stores and the clusterer defined above.
update_strategy = BufferedUpdateStrategy(
    embeddings=embeddings,
    threads=threads,
    memberships=memberships,
    clusterer=clusterer,
    global_min_threshold=0.65,
    percentile_threshold=25,
    buffer_size_limit=10,
    pending_size_limit=20,
    min_delta=0.08,
    anchor_size=50,
)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


## Formatting raw data

In [5]:
from utils import raw2df

In [6]:
# Parse the exported chat log into a DataFrame.
# NOTE(review): 'auto' presumably asks raw2df to detect the timestamp format
# itself (the log line reports "Auto-detected format") — confirm in utils.
clean_df = raw2df('data/chats/#1 Nexus Chat Example.txt', 'auto')

INFO:utils:Auto-detected format: iso_sec


Successfully parsed 107 messages.


In [7]:
# Display the parsed frame (columns: date_time, user, message).
clean_df

Unnamed: 0,date_time,user,message
0,2026-01-14 21:00:35,group_notification,Vsevolod created chat
1,2026-01-14 21:00:36,group_notification,40329826824351 was added to chat
2,2026-01-14 21:00:45,Vsevolod,Hello everybody!
3,2026-01-14 21:00:51,Maya Rozenshtein,Hello my guy!
4,2026-01-14 21:01:01,Vsevolod,How are you?
...,...,...,...
102,2026-01-15 14:31:32,Maya Rozenshtein,Wait weren't you at a lecture while you texted?
103,2026-01-15 14:32:26,Vsevolod,He let us out early since he had a meeting
104,2026-01-15 14:32:32,Maya Rozenshtein,bruh
105,2026-01-15 14:32:39,Maya Rozenshtein,nice for you


## Populating message store

In [8]:
def load_from_df(df_clean) -> None:
    """Convert every row of ``df_clean`` into a ``Message`` and store them.

    Each row must provide ``date_time``, ``user`` and ``message``. The message
    id is ``"m"`` followed by the row's index label. All messages are handed
    to the module-level ``messages`` store in a single ``add`` call.

    Args:
        df_clean: DataFrame of parsed chat rows.

    Returns:
        None. The only effect is the call to ``messages.add``.
    """

    def _to_message(idx, row):
        # Values exposing to_pydatetime() (e.g. pandas Timestamps) are
        # converted to plain datetimes; anything else is passed through as-is.
        stamp = row["date_time"]
        if hasattr(stamp, "to_pydatetime"):
            stamp = stamp.to_pydatetime()
        return Message(
            id=f"m{idx}",
            timestamp=stamp,
            user=str(row["user"]),
            text=str(row["message"]),
        )

    messages.add([_to_message(idx, row) for idx, row in df_clean.iterrows()])

# Fill the shared `messages` store from the parsed chat frame.
load_from_df(clean_df)

## Creating and running the main processor

In [9]:
from core.processor import ChatProcessor
# Wire the stores and strategy components built in the earlier cells into a
# single ChatProcessor; every argument is an object defined above.
processor = ChatProcessor(
    messages=messages,
    threads=threads,
    memberships=memberships,
    embeddings=embeddings,
    embedder=embedder,
    reducer=reducer,
    clusterer=clusterer,
    thread_rep_computer=thread_rep,
    update_strategy=update_strategy,
    formatter=formatter,
    labeler=labeler,
)

In [10]:
# Run the batch pipeline over the stored messages. Per the log output the
# stages are: format -> embed -> reduce -> cluster -> label threads.
processor.run_batch()

INFO:core.processor:Pipeline: Starting pipeline (5%)
INFO:core.processor:run_batch: start
INFO:core.processor:run_batch: messages=107
INFO:core.processor:run_batch: formatting messages
INFO:core.processor:Pipeline: Formatting messages (10%)
INFO:core.processor:run_batch: formatted texts=107
INFO:core.processor:run_batch: embedding texts
INFO:core.processor:Pipeline: Embedding messages (20%)


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

INFO:core.processor:run_batch: embeddings shape=(107, 384) dtype=float32
INFO:core.processor:run_batch: stored msg embeddings space=msg:full count=107
INFO:core.processor:run_batch: reducing embeddings
INFO:core.processor:Pipeline: Reducing dimensions (60%)
  warn(
INFO:core.processor:run_batch: reduced shape=(107, 10) dtype=float32
INFO:core.processor:run_batch: stored cluster embeddings space=msg:cluster count=107
INFO:core.processor:run_batch: clustering
INFO:core.processor:Pipeline: Clustering topics (70%)
INFO:core.processor:run_batch: clustering done clusters=2 noise_msgs=0
INFO:core.processor:_labels_to_threads: start messages=107
INFO:core.processor:_labels_to_threads: created_threads=2
INFO:core.processor:_labels_to_threads: done total_memberships=107
INFO:core.processor:_labels_to_threads: generating labels for 2 threads
INFO:root:Labeler: 🤒 Conversation Highlights | Sum: Maya and Vsevolod discuss a co...
INFO:root:Labeler: 🎵 Song Connection Found | Sum: The conversatio

In [11]:
# List the Thread objects created by the batch run (title, summary, timestamps).
processor.threads.all()

[Thread(id='thread_d7686a7cb3', title='🤒 Conversation Highlights', summary='Maya and Vsevolod discuss a conversation, with Maya getting sick and Vsevolod joking about it, then Maya suggests watching Dexter and Vsevolod agrees, but gets pulled away by a lecture.', created_at=datetime.datetime(2026, 1, 28, 22, 46, 17, 939440), updated_at=datetime.datetime(2026, 1, 28, 22, 46, 35, 255237), metadata={}),
 Thread(id='thread_53e40c723c', title='🎵 Song Connection Found', summary='The conversation revolves around a song that Maya Rozenshtein is trying to identify, which she realizes is the same song that starts with "three little birds sat on a window". Vsevolod is initially unaware of the song but eventually identifies it as a band, although he doesn\'t remember the specific details. The conversation explores the song\'s connection to Japan, as Maya Rozenshtein recalls a Japanese artist or band, despite not being sure.', created_at=datetime.datetime(2026, 1, 28, 22, 46, 17, 939440), upd

In [12]:
# Show the membership store; its repr reports how many memberships it holds.
processor.memberships

<MembershipStore with 107 memberships>

In [13]:
# Show the message store; its repr reports how many messages it holds.
processor.messages

<MessageStore with 107 messages>

In [14]:
# Spot-check one stored Message (the second element of the list from all()).
processor.messages.all()[1]

Message(id='m1', timestamp=datetime.datetime(2026, 1, 14, 21, 0, 36), user='group_notification', text='40329826824351 was added to chat ', metadata={})

### Visualisations

In [15]:
import pandas as pd
from sklearn.decomposition import PCA
import plotly.express as px

# Retrieve Data
all_messages = processor.messages.all()
# Take the ids from the same list that feeds the other columns, so every
# DataFrame column is row-aligned by construction instead of relying on
# messages.ids() and messages.all() returning the same order.
mids = [msg.id for msg in all_messages]

# Get Embeddings
# NOTE(review): the first return value is discarded; the PCA cell assumes the
# rows of X_cluster are in the same order as `mids` — confirm against
# EmbeddingStore.get_matrix.
_, X_cluster = processor.embeddings.get_matrix("msg:cluster")

# Map Message IDs to Thread Info.
# Every message starts out as noise and is overwritten below if it has an
# active membership.
mid_to_tid = dict.fromkeys(mids, "noise")
mid_to_title = dict.fromkeys(mids, "Noise / Unassigned")

# Populate with active thread data
# NOTE(review): this reads the private `_all` attribute of the membership
# store; switch to a public accessor if the store offers one.
for m in processor.memberships._all:
    if m.status != 'active':
        continue
    thread = processor.threads.get(m.thread_id)

    mid_to_tid[m.message_id] = thread.id
    mid_to_title[m.message_id] = thread.title

# Construct DataFrame: one row per message, in the order of `all_messages`
df_viz = pd.DataFrame({
    'message_id': mids,
    'date_time': [m.timestamp for m in all_messages],
    'user': [m.user for m in all_messages],
    'message': [m.text for m in all_messages],
    'thread_id': [mid_to_tid[mid] for mid in mids],
    'thread_title': [mid_to_title[mid] for mid in mids]
})

# Create a unique display label
def make_unique_label(row):
    """Return the legend text for one row of the visualisation frame.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``thread_id``
            key and, for non-noise rows, a ``thread_title`` key.

    Returns:
        ``'Noise'`` when ``thread_id`` is ``'noise'``; otherwise the thread
        title followed by, in parentheses, the first four characters of the
        part of ``thread_id`` after its last underscore.
    """
    thread_id = row['thread_id']
    if thread_id != 'noise':
        # The short id fragment keeps threads with identical titles apart
        # in the legend.
        suffix = thread_id.rpartition('_')[2][:4]
        return f"{row['thread_title']} ({suffix})"
    return 'Noise'

# Row-wise apply (axis=1): each row is passed to make_unique_label and the
# returned text is stored in a new 'legend_label' column.
df_viz['legend_label'] = df_viz.apply(make_unique_label, axis=1)

# Report the row count of the prepared frame.
print(f"Dataframe prepared with {len(df_viz)} rows.")

Dataframe prepared with 107 rows.


In [16]:
# PCA & Plotting
# Project the clustering-space vectors (X_cluster) onto 2 principal components,
# purely so they can be drawn on a flat scatter plot.
pca = PCA(n_components=2)
viz_pca = pca.fit_transform(X_cluster)

# Attach the 2-D coordinates to the frame.
# NOTE(review): assumes the rows of X_cluster are in the same order as the rows
# of df_viz — confirm against EmbeddingStore.get_matrix.
df_viz['x'] = viz_pca[:, 0]
df_viz['y'] = viz_pca[:, 1]

# One point per message, coloured by its unique legend label.
fig = px.scatter(
    df_viz,
    x='x',
    y='y',
    color='legend_label',

    # Show the clean title in the hover box
    hover_data={
        'legend_label': False,
        'thread_title': True,
        'user': True,
        'message': True,
        'x': False,
        'y': False
    },
    title="Chat Topics: Distinct Threads (Colored by Unique ID)",
    template="plotly_dark"
)

# Visual polish
fig.update_traces(marker=dict(size=6, opacity=0.8))
fig.show()

In [17]:
import plotly.express as px

# Optional (disabled): drop noise rows before plotting by using this line
# instead of the sort below.
# df_timeline = df_viz[df_viz['thread_id'] != 'noise'].copy()

# Sort rows by thread title, intended to give an alphabetically ordered Y-axis
# (sorting by time is the alternative).
df_timeline = df_viz.sort_values('thread_title')

# Plot: time on the X-axis, one Y-axis row per thread title.
# NOTE: this rebinds `fig`, replacing the PCA figure from the previous cell.
fig = px.scatter(
    df_timeline,
    x='date_time',
    y='thread_title',
    color='legend_label',
    hover_data=['user', 'message'],
    title="Timeline of Topics: When did conversations happen?",
    template="plotly_dark",
    height=800
)

fig.update_traces(marker=dict(size=6))
fig.update_layout(showlegend=False) # Legend is always hidden here; the Y-axis already names each thread
fig.show()

In [18]:
# Print each thread's id, title and summary as plain text, with a blank line
# between threads.
for t in processor.threads.all():
    print(f"Thread {t.id}: \nTitle: {t.title}\nSummary: {t.summary}\n")

Thread thread_d7686a7cb3: 
Title: 🤒 Conversation Highlights
Summary: Maya and Vsevolod discuss a conversation, with Maya getting sick and Vsevolod joking about it, then Maya suggests watching Dexter and Vsevolod agrees, but gets pulled away by a lecture.

Thread thread_53e40c723c: 
Title: 🎵 Song Connection Found
Summary: The conversation revolves around a song that Maya Rozenshtein is trying to identify, which she realizes is the same song that starts with "three little birds sat on a window". Vsevolod is initially unaware of the song but eventually identifies it as a band, although he doesn't remember the specific details. The conversation explores the song's connection to Japan, as Maya Rozenshtein recalls a Japanese artist or band, despite not being sure.



## Semantic Search

In [19]:
# Free-text query to search for across the clustered threads.
query = "Japan"

# Ask for at most 5 threads and 5 messages per thread, with the given minimum
# similarity thresholds for threads and for messages.
result = processor.semantic_search(
    query,
    top_threads=5,
    top_messages_per_thread=5,
    min_thread_sim=0.1,
    min_msg_sim=0.25,
)

INFO:core.processor:semantic_search: query_len=5 top_threads=5 top_msgs=5 min_thread_sim=0.100 min_msg_sim=0.250


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:core.processor:semantic_search: scanned_threads=2 missing_centroids=0 candidates=1
INFO:core.processor:semantic_search: thread=thread_53e40c723c tscore=0.194 members=30 missing_msg_vecs=0 hits=5
INFO:core.processor:semantic_search: returning_threads=1


In [20]:
# Inspect the hits: a list with one dict per matching thread, each carrying
# its thread score and a list of scored messages.
result

[{'thread_id': 'thread_53e40c723c',
  'thread_title': '🎵 Song Connection Found',
  'thread_score': 0.19406844675540924,
  'messages': [{'message_id': 'm96',
    'score': 0.4254532754421234,
    'user': 'Maya Rozenshtein',
    'text': 'oh ',
    'timestamp': datetime.datetime(2026, 1, 15, 14, 13, 48)},
   {'message_id': 'm98',
    'score': 0.4002930521965027,
    'user': 'Maya Rozenshtein',
    'text': 'hm ',
    'timestamp': datetime.datetime(2026, 1, 15, 14, 13, 54)},
   {'message_id': 'm97',
    'score': 0.3958336412906647,
    'user': 'Maya Rozenshtein',
    'text': 'then why did i think about japan ',
    'timestamp': datetime.datetime(2026, 1, 15, 14, 13, 53)},
   {'message_id': 'm94',
    'score': 0.3917009234428406,
    'user': 'Maya Rozenshtein',
    'text': 'was it a japanese band? ',
    'timestamp': datetime.datetime(2026, 1, 15, 14, 12, 43)},
   {'message_id': 'm95',
    'score': 0.34800535440444946,
    'user': 'Vsevolod',
    'text': "No, they are from UK  The lead sin