In [1]:
from __future__ import annotations

from dataclasses import dataclass
from typing import List, Tuple
import numpy as np

from core.models import Message, MessageId, ThreadId, SpaceId
from core.stores import EmbeddingStore, MembershipStore, MessageStore, ThreadStore
from core.interfaces import Formatter, Embedder, Reducer, Clusterer, ThreadRepComputer, Assigner, UpdateStrategy, ThreadLabeler

import re
import datetime
import numpy as np
import pandas as pd

import logging
# Configure root logging at INFO so INFO-level records from imported modules appear in cell output.
logging.basicConfig(level=logging.INFO)
# Notebook-level logger, named after the current module.
logger = logging.getLogger(__name__)

## Defining the components for the processor

In [24]:
from huggingface_hub import hf_hub_download

# Fetch the GGUF model file from the Hugging Face Hub; the value returned by
# hf_hub_download is kept in `model_path` and printed below.
model_path = hf_hub_download(
    repo_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
    filename="Llama-3.2-3B-Instruct-Q4_K_M.gguf",
    local_dir="./models"  # Downloads to a 'models' folder in your current directory
)

# Echo the resolved location so the reader can see where the file ended up.
print(f"Model downloaded to: {model_path}")

Model downloaded to: models\Llama-3.2-3B-Instruct-Q4_K_M.gguf


In [None]:
from core.strategies import ContextWindowFormatter, MiniLMEmbedder, UMAPReducer, HDBSCANClusterer, LlamaThreadLabeler, CentroidThreadRepComputer

In [25]:
# Stores shared by the processor and by the strategy components created below.
messages = MessageStore()
threads = ThreadStore()
memberships = MembershipStore()
embeddings = EmbeddingStore()

# Pipeline stages: format -> embed -> reduce -> cluster -> label.
# NOTE(review): the meaning of each hyper-parameter is defined in core.strategies,
# which is not visible here — confirm there before tuning.
formatter = ContextWindowFormatter(window_back=2, window_fwd=1, time_threshold_minutes=10, repeat_center=2)
embedder = MiniLMEmbedder("all-MiniLM-L6-v2")
# random_state is fixed so the reduction is repeatable between runs.
reducer = UMAPReducer(n_neighbors=30, n_components=5, min_dist=0.0, metric="cosine", random_state=42)
clusterer = HDBSCANClusterer(min_cluster_size=30, min_samples=3, metric="euclidean", cluster_selection_method="eom")
labeler = LlamaThreadLabeler(
    # Reuse the path returned by hf_hub_download in the download cell instead of
    # repeating the file name, so the two cannot drift apart.
    model_path=str(model_path),
    n_ctx=2048,
    max_msg_chars=300
)

# Thread representations are computed from the membership and embedding stores,
# reading message vectors from the "msg:full" space.
thread_rep = CentroidThreadRepComputer(memberships=memberships, embeddings=embeddings, msg_space="msg:full")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


## Formatting raw data

In [None]:
from utils import raw2df

In [27]:
# Parse the exported chat text file into a DataFrame.
# NOTE(review): '12hr' is presumably the clock format of the export's timestamps — confirm in utils.raw2df.
clean_df = raw2df('data/chats/whatsapp_chat_data_test.txt', '12hr')

## Populating the message store

In [28]:
def load_from_df(df_clean) -> None:
    """Convert every row of ``df_clean`` into a ``Message`` and add them to ``messages``.

    Reads the ``date_time``, ``user`` and ``message`` columns. Each message id is
    ``"m"`` followed by the row's index label. All messages are built first and
    then handed to the notebook-level ``messages`` store in a single ``add`` call.
    """

    def _to_native(stamp):
        # Values exposing ``to_pydatetime`` are converted; anything else passes through unchanged.
        return stamp.to_pydatetime() if hasattr(stamp, "to_pydatetime") else stamp

    batch = [
        Message(
            id=f"m{label}",
            timestamp=_to_native(record["date_time"]),
            user=str(record["user"]),
            text=str(record["message"]),
        )
        for label, record in df_clean.iterrows()
    ]
    messages.add(batch)

# Fill the shared `messages` store from the parsed chat frame.
load_from_df(clean_df)

## Creating and running the main processor

In [29]:
from core.processor import ChatProcessor

# Wire the stores and strategy components built in the earlier cells into one processor.
# NOTE(review): NoOpAssigner and NoOpUpdateStrategy are not imported in any cell visible
# in this notebook, so on a fresh kernel this cell would raise NameError unless they are
# imported first — confirm which module defines them and add the import.
processor = ChatProcessor(
    messages=messages,
    threads=threads,
    memberships=memberships,
    embeddings=embeddings,
    embedder=embedder,
    reducer=reducer,
    clusterer=clusterer,
    thread_rep_computer=thread_rep,
    assigner=NoOpAssigner(),
    update_strategy=NoOpUpdateStrategy(),
    formatter=formatter,
    labeler=labeler,
)

In [30]:
# Run the whole batch pipeline; progress is reported through the INFO log lines below.
processor.run_batch()

INFO:core.processor:run_batch: start
INFO:core.processor:run_batch: messages=13655
INFO:core.processor:run_batch: formatting messages
INFO:core.processor:run_batch: formatted texts=13655
INFO:core.processor:run_batch: embedding texts
Batches: 100%|██████████| 427/427 [00:56<00:00,  7.54it/s]
INFO:core.processor:run_batch: embeddings shape=(13655, 384) dtype=float32
INFO:core.processor:run_batch: stored msg embeddings space=msg:full count=13655
INFO:core.processor:run_batch: reducing embeddings

n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

INFO:core.processor:run_batch: reduced shape=(13655, 5) dtype=float32
INFO:core.processor:run_batch: stored cluster embeddings space=msg:cluster count=13655
INFO:core.processor:run_batch: clustering
INFO:core.processor:run_batch: clustering done clusters=96 noise_msgs=4553
INFO:core.processor:_labels_to_threads: start messages=13655
INFO:core.processor:_labels_to_threads: created_threads=96
INFO:core.processor:

DEBUG: Raw LLM Output: Title: New Members Joined
Summary: Ten new users j... -> Parsed: New Members Joined


Labeling Threads:   2%|▏         | 2/96 [00:03<02:25,  1.55s/thread]

DEBUG: Raw LLM Output: Title: New Members Joined
Summary: Multiple indivi... -> Parsed: New Members Joined


Labeling Threads:   3%|▎         | 3/96 [00:03<01:55,  1.24s/thread]

DEBUG: Raw LLM Output: Title: Help Desk Chat
Summary: Technical support c... -> Parsed: Help Desk Chat


Labeling Threads:   4%|▍         | 4/96 [00:04<01:46,  1.15s/thread]

DEBUG: Raw LLM Output: Title: Unidentified Voice
Summary: Unidentified vo... -> Parsed: Unidentified Voice


Labeling Threads:   5%|▌         | 5/96 [00:05<01:35,  1.05s/thread]

DEBUG: Raw LLM Output: Title: Deleted Conversation
Summary: No meaningful... -> Parsed: Deleted Conversation


Labeling Threads:   6%|▋         | 6/96 [00:06<01:29,  1.00thread/s]

DEBUG: Raw LLM Output: Title: Deleted Conversation
Summary: A conversatio... -> Parsed: Deleted Conversation


Labeling Threads:   7%|▋         | 7/96 [00:07<01:23,  1.06thread/s]

DEBUG: Raw LLM Output: Title: Laughing Good Times
Summary: Two people exc... -> Parsed: Laughing Good Times


Labeling Threads:   8%|▊         | 8/96 [00:09<01:40,  1.15s/thread]

DEBUG: Raw LLM Output: Title: Women Empowerment Discussion
Summary: A con... -> Parsed: Women Empowerment Discussion


Labeling Threads:   9%|▉         | 9/96 [00:10<01:52,  1.29s/thread]

DEBUG: Raw LLM Output: Title: TSEC CS Group Join
Summary: Various TSEC CS... -> Parsed: TSEC CS Group Join


Labeling Threads:  10%|█         | 10/96 [00:12<01:58,  1.37s/thread]

DEBUG: Raw LLM Output: Title: TSEC Team Members
Summary: TSEC team member... -> Parsed: TSEC Team Members


Labeling Threads:  11%|█▏        | 11/96 [00:13<01:47,  1.26s/thread]

DEBUG: Raw LLM Output: Title: Django Project Showcase
Summary: Django ent... -> Parsed: Django Project Showcase


Labeling Threads:  12%|█▎        | 12/96 [00:14<01:50,  1.31s/thread]

DEBUG: Raw LLM Output: Title: Funny Video Comments
Summary: Viewers react... -> Parsed: Funny Video Comments


Labeling Threads:  14%|█▎        | 13/96 [00:16<01:54,  1.38s/thread]

DEBUG: Raw LLM Output: Title: Amazon's App Development Platform
Summary: ... -> Parsed: Amazon's App Development Platform


Labeling Threads:  15%|█▍        | 14/96 [00:17<01:44,  1.28s/thread]

DEBUG: Raw LLM Output: Title: Troubleshooting Git Bash
Summary: Troublesh... -> Parsed: Troubleshooting Git Bash


Labeling Threads:  16%|█▌        | 15/96 [00:18<01:45,  1.30s/thread]

DEBUG: Raw LLM Output: Title: Homebrew Git Installation
Summary: Homebrew... -> Parsed: Homebrew Git Installation


Labeling Threads:  17%|█▋        | 16/96 [00:19<01:36,  1.21s/thread]

DEBUG: Raw LLM Output: Title: Unidentified Voice
Summary: A mysterious vo... -> Parsed: Unidentified Voice


Labeling Threads:  18%|█▊        | 17/96 [00:20<01:38,  1.24s/thread]

DEBUG: Raw LLM Output: Title: Getting Started with GitHub
Summary: Partic... -> Parsed: Getting Started with GitHub


Labeling Threads:  19%|█▉        | 18/96 [00:22<01:32,  1.19s/thread]

DEBUG: Raw LLM Output: Title: Firebase Project Setup
Summary: Troubleshoo... -> Parsed: Firebase Project Setup


Labeling Threads:  20%|█▉        | 19/96 [00:24<02:11,  1.70s/thread]

DEBUG: Raw LLM Output: Title: Prime Factorization
Summary: Solve prime fa... -> Parsed: Prime Factorization


Labeling Threads:  21%|██        | 20/96 [00:26<01:56,  1.54s/thread]

DEBUG: Raw LLM Output: Title: Learning to Code
Summary: Discussion on the... -> Parsed: Learning to Code


Labeling Threads:  22%|██▏       | 21/96 [00:27<01:41,  1.36s/thread]

DEBUG: Raw LLM Output: Title: Emotional Support Conversation
Summary: A c... -> Parsed: Emotional Support Conversation


Labeling Threads:  23%|██▎       | 22/96 [00:28<01:39,  1.34s/thread]

DEBUG: Raw LLM Output: Title: Scanner Input Error
Summary: Scanner input ... -> Parsed: Scanner Input Error


Labeling Threads:  24%|██▍       | 23/96 [00:29<01:26,  1.19s/thread]

DEBUG: Raw LLM Output: Title: Emoji Conversation
Summary: A conversation ... -> Parsed: Emoji Conversation


Labeling Threads:  25%|██▌       | 24/96 [00:30<01:23,  1.15s/thread]

DEBUG: Raw LLM Output: Title: Appreciation Conversation
Summary: People e... -> Parsed: Appreciation Conversation


Labeling Threads:  26%|██▌       | 25/96 [00:31<01:27,  1.24s/thread]

DEBUG: Raw LLM Output: Title: Factorial Calculation
Summary: Converting l... -> Parsed: Factorial Calculation


Labeling Threads:  27%|██▋       | 26/96 [00:33<01:30,  1.30s/thread]

DEBUG: Raw LLM Output: Title: Programming Language Debate
Summary: Progra... -> Parsed: Programming Language Debate


Labeling Threads:  28%|██▊       | 27/96 [00:34<01:23,  1.21s/thread]

DEBUG: Raw LLM Output: Title: Learning Python Basics
Summary: Two program... -> Parsed: Learning Python Basics


Labeling Threads:  29%|██▉       | 28/96 [00:35<01:20,  1.19s/thread]

DEBUG: Raw LLM Output: Title: CSS Conundrum
Summary: A developer grapples... -> Parsed: CSS Conundrum


Labeling Threads:  30%|███       | 29/96 [00:36<01:24,  1.26s/thread]

DEBUG: Raw LLM Output: Title: Simplifying Big O Notation
Summary: Student... -> Parsed: Simplifying Big O Notation


Labeling Threads:  31%|███▏      | 30/96 [00:37<01:18,  1.18s/thread]

DEBUG: Raw LLM Output: Title: Sorting Algorithms Discussion
Summary: Deve... -> Parsed: Sorting Algorithms Discussion


Labeling Threads:  32%|███▏      | 31/96 [00:38<01:14,  1.15s/thread]

DEBUG: Raw LLM Output: Title: C++ Math Library
Summary: Discussion about ... -> Parsed: C++ Math Library


Labeling Threads:  33%|███▎      | 32/96 [00:39<01:12,  1.13s/thread]

DEBUG: Raw LLM Output: Title: C Programming Basics
Summary: Discussion on... -> Parsed: C Programming Basics


Labeling Threads:  34%|███▍      | 33/96 [00:40<01:06,  1.06s/thread]

DEBUG: Raw LLM Output: Title: Funny Conversation
Summary: Two people have... -> Parsed: Funny Conversation


Labeling Threads:  35%|███▌      | 34/96 [00:41<01:02,  1.01s/thread]

DEBUG: Raw LLM Output: Title: Funny Cat Videos
Summary: Two friends share... -> Parsed: Funny Cat Videos


Labeling Threads:  36%|███▋      | 35/96 [00:42<01:00,  1.00thread/s]

DEBUG: Raw LLM Output: Title: Unrelated Conversation
Summary: A conversat... -> Parsed: Unrelated Conversation


Labeling Threads:  38%|███▊      | 36/96 [00:43<01:05,  1.10s/thread]

DEBUG: Raw LLM Output: Title: Loop Issue in Code
Summary: Troubleshooting... -> Parsed: Loop Issue in Code


Labeling Threads:  39%|███▊      | 37/96 [00:45<01:10,  1.19s/thread]

DEBUG: Raw LLM Output: Title: Unidentified Voice
Summary: A person respon... -> Parsed: Unidentified Voice


Labeling Threads:  40%|███▉      | 38/96 [00:46<01:04,  1.11s/thread]

DEBUG: Raw LLM Output: Title: Funny Gaming Chat
Summary: Friends discuss ... -> Parsed: Funny Gaming Chat


Labeling Threads:  41%|████      | 39/96 [00:47<01:09,  1.23s/thread]

DEBUG: Raw LLM Output: Title: Android Development Struggles
Summary: Andr... -> Parsed: Android Development Struggles


Labeling Threads:  42%|████▏     | 40/96 [00:48<01:04,  1.15s/thread]

DEBUG: Raw LLM Output: Title: Funny Conversation
Summary: A humorous conv... -> Parsed: Funny Conversation


Labeling Threads:  43%|████▎     | 41/96 [00:49<01:02,  1.14s/thread]

DEBUG: Raw LLM Output: Title: Funny Sibling Jokes
Summary: Two siblings e... -> Parsed: Funny Sibling Jokes


Labeling Threads:  44%|████▍     | 42/96 [00:50<01:01,  1.14s/thread]

DEBUG: Raw LLM Output: Title: Confused but Excited
Summary: Two friends d... -> Parsed: Confused but Excited


Labeling Threads:  45%|████▍     | 43/96 [00:51<00:57,  1.08s/thread]

DEBUG: Raw LLM Output: Title: Ty and Thnx
Summary: A brief conversation b... -> Parsed: Ty and Thnx


Labeling Threads:  46%|████▌     | 44/96 [00:53<01:03,  1.23s/thread]

DEBUG: Raw LLM Output: Title: Mac vs Windows
Summary: Discussion about Ma... -> Parsed: Mac vs Windows


Labeling Threads:  47%|████▋     | 45/96 [00:54<01:00,  1.19s/thread]

DEBUG: Raw LLM Output: Title: Windows Users Unite
Summary: Windows users ... -> Parsed: Windows Users Unite


Labeling Threads:  48%|████▊     | 46/96 [00:55<00:59,  1.20s/thread]

DEBUG: Raw LLM Output: Title: Tech Support Chat
Summary: Tech enthusiasts... -> Parsed: Tech Support Chat


Labeling Threads:  49%|████▉     | 47/96 [00:57<01:02,  1.28s/thread]

DEBUG: Raw LLM Output: Title: Overwhelmingly Positive Response
Summary: T... -> Parsed: Overwhelmingly Positive Response


Labeling Threads:  50%|█████     | 48/96 [00:58<00:57,  1.20s/thread]

DEBUG: Raw LLM Output: Title: Appreciation for Event
Summary: Organiser r... -> Parsed: Appreciation for Event


Labeling Threads:  51%|█████     | 49/96 [00:59<00:56,  1.19s/thread]

DEBUG: Raw LLM Output: Title: Laughter and Appreciation
Summary: A conver... -> Parsed: Laughter and Appreciation


Labeling Threads:  52%|█████▏    | 50/96 [01:01<01:05,  1.42s/thread]

DEBUG: Raw LLM Output: Title: Samsung vs Android
Summary: Users discuss t... -> Parsed: Samsung vs Android


Labeling Threads:  53%|█████▎    | 51/96 [01:02<00:54,  1.20s/thread]

DEBUG: Raw LLM Output: Title: Agreement
Summary: Two people agree on some... -> Parsed: Agreement


Labeling Threads:  54%|█████▍    | 52/96 [01:03<00:49,  1.12s/thread]

DEBUG: Raw LLM Output: Title: Laughing and Appreciation
Summary: Two peop... -> Parsed: Laughing and Appreciation


Labeling Threads:  55%|█████▌    | 53/96 [01:04<00:52,  1.21s/thread]

DEBUG: Raw LLM Output: Title: Unlocking Android Phones
Summary: Unlocking... -> Parsed: Unlocking Android Phones


Labeling Threads:  56%|█████▋    | 54/96 [01:05<00:54,  1.30s/thread]

DEBUG: Raw LLM Output: Title: Android ROM Discussion
Summary: Android ent... -> Parsed: Android ROM Discussion


Labeling Threads:  57%|█████▋    | 55/96 [01:07<00:49,  1.22s/thread]

DEBUG: Raw LLM Output: Title: Google Code Jam
Summary: Contest participan... -> Parsed: Google Code Jam


Labeling Threads:  58%|█████▊    | 56/96 [01:08<00:47,  1.18s/thread]

DEBUG: Raw LLM Output: Title: Tech Discussion
Summary: Tech enthusiasts d... -> Parsed: Tech Discussion


Labeling Threads:  59%|█████▉    | 57/96 [01:09<00:46,  1.20s/thread]

DEBUG: Raw LLM Output: Title: Graphics Programming Basics
Summary: Discus... -> Parsed: Graphics Programming Basics


Labeling Threads:  60%|██████    | 58/96 [01:10<00:47,  1.24s/thread]

DEBUG: Raw LLM Output: Title: Java Constructor Discussion
Summary: Implic... -> Parsed: Java Constructor Discussion


Labeling Threads:  61%|██████▏   | 59/96 [01:11<00:44,  1.21s/thread]

DEBUG: Raw LLM Output: Title: Java String Pool
Summary: Discussion on Jav... -> Parsed: Java String Pool


Labeling Threads:  62%|██████▎   | 60/96 [01:13<00:48,  1.35s/thread]

DEBUG: Raw LLM Output: Title: Runtime Error Discussion
Summary: Troublesh... -> Parsed: Runtime Error Discussion


Labeling Threads:  64%|██████▎   | 61/96 [01:15<00:49,  1.42s/thread]

DEBUG: Raw LLM Output: Title: Student Teachers' Opinions
Summary: Student... -> Parsed: Student Teachers' Opinions


Labeling Threads:  65%|██████▍   | 62/96 [01:16<00:45,  1.35s/thread]

DEBUG: Raw LLM Output: Title: Google Search
Summary: Participants discuss... -> Parsed: Google Search


Labeling Threads:  66%|██████▌   | 63/96 [01:17<00:40,  1.24s/thread]

DEBUG: Raw LLM Output: Title: Registration Inquiry
Summary: Person inquir... -> Parsed: Registration Inquiry


Labeling Threads:  67%|██████▋   | 64/96 [01:18<00:38,  1.20s/thread]

DEBUG: Raw LLM Output: Title: Coding Conversation
Summary: Two friends di... -> Parsed: Coding Conversation


Labeling Threads:  68%|██████▊   | 65/96 [01:20<00:43,  1.39s/thread]

DEBUG: Raw LLM Output: Title: Comparison of Audio Filters
Summary: Resear... -> Parsed: Comparison of Audio Filters


Labeling Threads:  69%|██████▉   | 66/96 [01:21<00:39,  1.31s/thread]

DEBUG: Raw LLM Output: Title: No Problem Found
Summary: A series of confi... -> Parsed: No Problem Found


Labeling Threads:  70%|██████▉   | 67/96 [01:22<00:39,  1.35s/thread]

DEBUG: Raw LLM Output: Title: Byte Data Type Confusion
Summary: C++ devel... -> Parsed: Byte Data Type Confusion


Labeling Threads:  71%|███████   | 68/96 [01:23<00:31,  1.11s/thread]

DEBUG: Raw LLM Output: Title: Response
Summary: [Summary]... -> Parsed: Response


Labeling Threads:  72%|███████▏  | 69/96 [01:24<00:30,  1.14s/thread]

DEBUG: Raw LLM Output: Title: Coding Conundrum
Summary: Developers discus... -> Parsed: Coding Conundrum


Labeling Threads:  73%|███████▎  | 70/96 [01:25<00:30,  1.16s/thread]

DEBUG: Raw LLM Output: Title: Funny Emoji Conversation
Summary: Two peopl... -> Parsed: Funny Emoji Conversation


Labeling Threads:  74%|███████▍  | 71/96 [01:26<00:25,  1.03s/thread]

DEBUG: Raw LLM Output: Title: Try Again
Summary: User requests retry or r... -> Parsed: Try Again


Labeling Threads:  75%|███████▌  | 72/96 [01:27<00:26,  1.11s/thread]

DEBUG: Raw LLM Output: Title: Google Cloud Credits
Summary: Participants ... -> Parsed: Google Cloud Credits


Labeling Threads:  76%|███████▌  | 73/96 [01:28<00:25,  1.09s/thread]

DEBUG: Raw LLM Output: Title: Coding Discussion
Summary: Group conversati... -> Parsed: Coding Discussion


Labeling Threads:  77%|███████▋  | 74/96 [01:29<00:24,  1.10s/thread]

DEBUG: Raw LLM Output: Title: Friendship Chat
Summary: Friends discussing... -> Parsed: Friendship Chat


Labeling Threads:  78%|███████▊  | 75/96 [01:31<00:23,  1.14s/thread]

DEBUG: Raw LLM Output: Title: Runtime Error Discussion
Summary: Troublesh... -> Parsed: Runtime Error Discussion


Labeling Threads:  79%|███████▉  | 76/96 [01:32<00:22,  1.10s/thread]

DEBUG: Raw LLM Output: Title: Java Local Variables
Summary: Discussion on... -> Parsed: Java Local Variables


Labeling Threads:  80%|████████  | 77/96 [01:32<00:19,  1.01s/thread]

DEBUG: Raw LLM Output: Title: Quick Hello Exchange
Summary: Brief exchang... -> Parsed: Quick Hello Exchange


Labeling Threads:  81%|████████▏ | 78/96 [01:34<00:21,  1.17s/thread]

DEBUG: Raw LLM Output: Title: Laughing and Agreeing
Summary: A conversati... -> Parsed: Laughing and Agreeing


Labeling Threads:  82%|████████▏ | 79/96 [01:35<00:18,  1.10s/thread]

DEBUG: Raw LLM Output: Title: Competitive Coding
Summary: Indian coders d... -> Parsed: Competitive Coding


Labeling Threads:  83%|████████▎ | 80/96 [01:36<00:17,  1.10s/thread]

DEBUG: Raw LLM Output: Title: Good Morning Conversation
Summary: A casual... -> Parsed: Good Morning Conversation


Labeling Threads:  84%|████████▍ | 81/96 [01:37<00:17,  1.16s/thread]

DEBUG: Raw LLM Output: Title: Java Printing Pattern
Summary: Java develop... -> Parsed: Java Printing Pattern


Labeling Threads:  85%|████████▌ | 82/96 [01:38<00:15,  1.09s/thread]

DEBUG: Raw LLM Output: Title: Python Coding Discussion
Summary: Participa... -> Parsed: Python Coding Discussion


Labeling Threads:  86%|████████▋ | 83/96 [01:39<00:14,  1.12s/thread]

DEBUG: Raw LLM Output: Title: Relatable Morning Motivation
Summary: A con... -> Parsed: Relatable Morning Motivation


Labeling Threads:  88%|████████▊ | 84/96 [01:41<00:15,  1.31s/thread]

DEBUG: Raw LLM Output: Title: Unintentional Humor Exchange
Summary: Two i... -> Parsed: Unintentional Humor Exchange


Labeling Threads:  89%|████████▊ | 85/96 [01:42<00:13,  1.23s/thread]

DEBUG: Raw LLM Output: Title: Practice for Competitive Coding
Summary: FE... -> Parsed: Practice for Competitive Coding


Labeling Threads:  90%|████████▉ | 86/96 [01:43<00:11,  1.11s/thread]

DEBUG: Raw LLM Output: Title: Practice Session
Summary: Students discuss ... -> Parsed: Practice Session


Labeling Threads:  91%|█████████ | 87/96 [01:44<00:10,  1.19s/thread]

DEBUG: Raw LLM Output: Title: Friendship and Laughter
Summary: Two friend... -> Parsed: Friendship and Laughter


Labeling Threads:  92%|█████████▏| 88/96 [01:46<00:09,  1.23s/thread]

DEBUG: Raw LLM Output: Title: IT Support Chat
Summary: A humorous convers... -> Parsed: IT Support Chat


Labeling Threads:  93%|█████████▎| 89/96 [01:47<00:09,  1.34s/thread]

DEBUG: Raw LLM Output: Title: IT Project Discussion
Summary: IT professio... -> Parsed: IT Project Discussion


Labeling Threads:  94%|█████████▍| 90/96 [01:49<00:08,  1.39s/thread]

DEBUG: Raw LLM Output: Title: Twitter Conversation
Summary: A humorous Tw... -> Parsed: Twitter Conversation


Labeling Threads:  95%|█████████▍| 91/96 [01:50<00:07,  1.42s/thread]

DEBUG: Raw LLM Output: Title: Web Development Conversation
Summary: A con... -> Parsed: Web Development Conversation


Labeling Threads:  96%|█████████▌| 92/96 [01:51<00:05,  1.28s/thread]

DEBUG: Raw LLM Output: Title: Social Media Chat
Summary: Friends discussi... -> Parsed: Social Media Chat


Labeling Threads:  97%|█████████▋| 93/96 [01:52<00:03,  1.22s/thread]

DEBUG: Raw LLM Output: Title: Funny Conversation
Summary: A humorous conv... -> Parsed: Funny Conversation


Labeling Threads:  98%|█████████▊| 94/96 [01:54<00:02,  1.46s/thread]

DEBUG: Raw LLM Output: Title: Group Welcome and Appreciation
Summary: A g... -> Parsed: Group Welcome and Appreciation


Labeling Threads:  99%|█████████▉| 95/96 [01:56<00:01,  1.39s/thread]

DEBUG: Raw LLM Output: Title: Same Same Same
Summary: Friends joking abou... -> Parsed: Same Same Same


Labeling Threads: 100%|██████████| 96/96 [01:57<00:00,  1.22s/thread]
INFO:core.processor:_labels_to_threads: labeling complete
INFO:core.processor:run_batch: threads after labels=96 memberships_total=9102
INFO:core.processor:run_batch: computing thread reps for tids=96
INFO:core.processor:run_batch: thread reps shape=(96, 384) dtype=float32
INFO:core.processor:run_batch: stored thread reps space=thread:centroid count=96
INFO:core.processor:run_batch: done


DEBUG: Raw LLM Output: Title: No Issues Found
Summary: A conversation wit... -> Parsed: No Issues Found


In [33]:
# Inspect the threads produced by the batch run (displays every Thread, so the output is long).
processor.threads.all()

[Thread(id='thread_9c2a5232ad', title='New Members Joined', summary='Ten new users joined the group.', created_at=datetime.datetime(2026, 1, 9, 12, 5, 20, 28910), updated_at=datetime.datetime(2020, 7, 18, 17, 35), metadata={}),
 Thread(id='thread_8f5859431f', title='New Members Joined', summary='Multiple individuals have joined the group using the invite link.', created_at=datetime.datetime(2026, 1, 9, 12, 5, 20, 28922), updated_at=datetime.datetime(2020, 4, 7, 20, 0), metadata={}),
 Thread(id='thread_ea775b95ce', title='Help Desk Chat', summary='Technical support conversation.', created_at=datetime.datetime(2026, 1, 9, 12, 5, 20, 28928), updated_at=datetime.datetime(2020, 9, 30, 10, 58), metadata={}),
 Thread(id='thread_80918f9564', title='Unidentified Voice', summary='Unidentified voice calls a stranger.', created_at=datetime.datetime(2026, 1, 9, 12, 5, 20, 28933), updated_at=datetime.datetime(2020, 10, 1, 10, 42), metadata={}),
 Thread(id='thread_58e2b692c2', title='Deleted Conversa

In [34]:
# Display the membership store's summary repr.
processor.memberships

<MembershipStore with 9102 memberships>

In [35]:
# Display the message store's summary repr.
processor.messages

<MessageStore with 13655 messages>

In [36]:
import psutil, os
# Report the resident set size (RSS) of this kernel process, converted from bytes with a 1e9 divisor.
p = psutil.Process(os.getpid())
print("RSS GB:", p.memory_info().rss / 1e9)

RSS GB: 5.181054976


In [37]:
# Spot-check a single stored message (index 1 of the list returned by all()).
processor.messages.all()[1]

Message(id='m1', timestamp=datetime.datetime(2020, 1, 24, 20, 25), user='group_notification', text='Tanay Kamath (TSEC, CS) created group "CODERS👨\u200d💻👩\u200d💻🖥💻" ', metadata={})

### Visualisations

In [38]:
import pandas as pd
from sklearn.decomposition import PCA
import plotly.express as px

# --- Source data ---
# Messages and their ids are fetched separately and combined column-wise further down.
# NOTE(review): this assumes all() and ids() return the same order, and that the rows of
# X_cluster follow that order too — confirm against the store implementations.
all_messages = processor.messages.all()
mids = processor.messages.ids()

# Only the matrix of the "msg:cluster" space is kept; the first returned value is discarded.
_, X_cluster = processor.embeddings.get_matrix("msg:cluster")

# --- Message id -> thread lookup tables ---
# Every message starts out labelled as noise ...
mid_to_tid = dict.fromkeys(mids, "noise")
mid_to_title = dict.fromkeys(mids, "Noise / Unassigned")

# ... and each active membership overwrites the default with its thread's id and title.
for membership in processor.memberships._all:
    if membership.status != 'active':
        continue
    owner = processor.threads.get(membership.thread_id)
    mid_to_tid[membership.message_id] = owner.id
    mid_to_title[membership.message_id] = owner.title

# --- One row per message ---
df_viz = pd.DataFrame({
    'message_id': mids,
    'date_time': [msg.timestamp for msg in all_messages],
    'user': [msg.user for msg in all_messages],
    'message': [msg.text for msg in all_messages],
    'thread_id': [mid_to_tid[key] for key in mids],
    'thread_title': [mid_to_title[key] for key in mids],
})

# Create a "Unique Display Label"
# This ensures "Discussion (#123)" and "Discussion (#456)" get different colors
def make_unique_label(row):
    """Build the legend label for one row of the visualisation frame.

    Rows whose ``thread_id`` is ``'noise'`` map to ``'Noise'``. Any other row maps
    to its ``thread_title`` followed, in parentheses, by the first four characters
    of whatever comes after the last underscore in ``thread_id``.
    """
    thread_id = row['thread_id']
    if thread_id == 'noise':
        return 'Noise'
    # rpartition yields the whole string as the tail when there is no underscore.
    tail = thread_id.rpartition('_')[2]
    short_id = tail[:4]
    return f"{row['thread_title']} ({short_id})"

# Row-wise apply: compute one legend label per message row and store it as a new column on df_viz.
df_viz['legend_label'] = df_viz.apply(make_unique_label, axis=1)

# Quick sanity check on the frame size.
print(f"Dataframe prepared with {len(df_viz)} rows.")

Dataframe prepared with 13655 rows.


In [39]:
# PCA & Plotting
# Project the "msg:cluster" embeddings (X_cluster) onto two principal components for a 2-D scatter.
pca = PCA(n_components=2)
viz_pca = pca.fit_transform(X_cluster)

# Attach the 2-D coordinates as new columns on df_viz (modifies the frame built in the previous cell).
# NOTE(review): assumes row i of X_cluster belongs to row i of df_viz — confirm the ordering.
df_viz['x'] = viz_pca[:, 0]
df_viz['y'] = viz_pca[:, 1]

# One colour per legend_label value.
fig = px.scatter(
    df_viz,
    x='x',
    y='y',
    color='legend_label',

    # Show the clean title in the hover box
    # (True/False toggles whether each column appears in the hover tooltip)
    hover_data={
        'legend_label': False,
        'thread_title': True,
        'user': True,
        'message': True,
        'x': False,
        'y': False
    },
    title="Chat Topics: Distinct Threads (Colored by Unique ID)",
    template="plotly_dark"
)

# Visual polish
fig.update_traces(marker=dict(size=6, opacity=0.8))
fig.show()

In [41]:
import plotly.express as px

# # Filter out Noise
# df_timeline = df_viz[df_viz['thread_id'] != 'noise'].copy()

# Sort by Thread Title so the Y-axis is organized alphabetically (or you can sort by time)
# sort_values returns a new frame, so df_viz itself is left unsorted.
df_timeline = df_viz.sort_values('thread_title')

# Plot
# One point per message: time on the x-axis, thread title on the y-axis.
# Threads that share a title land on the same y row; colour (legend_label) still tells them apart.
# This rebinds `fig`, replacing the figure object created by the PCA cell.
fig = px.scatter(
    df_timeline,
    x='date_time',
    y='thread_title',
    color='legend_label',
    hover_data=['user', 'message'],
    title="Timeline of Topics: When did conversations happen?",
    template="plotly_dark",
    height=800
)

fig.update_traces(marker=dict(size=6))
fig.update_layout(showlegend=False) # Hide legend if there are too many topics
fig.show()