In [2]:
import pandas as pd

df = pd.read_csv("emails.csv")




In [3]:

df_sample = df.sample(n=100000, random_state=42)

In [4]:
import re

def clean_first_message(raw_text):
    if not isinstance(raw_text, str):
        return ""

    # Normalize newlines
    text = raw_text.replace("\r", "")

    # 1) Strip any initial header block (Message-ID, Date, etc.)
    parts = text.split("\n\n", 1)
    body = parts[1] if len(parts) == 2 else text

    # 2) Cut at first internal header-like marker
    lower = body.lower()
    markers = ["\nfrom:", "\nto:", "\ncc:", "\nsubject:", "to:", "phone:"]

    cut_pos = len(body)
    for m in markers:
        pos = lower.find(m)
        if pos != -1:
            cut_pos = min(cut_pos, pos)

    body = body[:cut_pos]

    # 3) Remove tabs
    body = body.replace("\t", " ")

    # 4) Normalize whitespace
    body = body.replace("\n", " ")
    body = re.sub(r"\s+", " ", body)

    # 5) Strip
    body = body.strip()

    # 6) Limit to 250 words
    words = body.split()
    body = " ".join(words[:250])

    return body

In [5]:

df_sample["clean_message"] = df_sample["message"].astype(str).apply(clean_first_message)


In [6]:
df_sample.head(50)

Unnamed: 0,file,message,clean_message
427616,shackleton-s/sent/1912.,Message-ID: <21013688.1075844564560.JavaMail.e...,Bill: Thanks for the info. I also spoke with J...
108773,farmer-d/logistics/1066.,Message-ID: <22688499.1075854130303.JavaMail.e...,"Aimee, Please check meter #1591 Lamay gas lift..."
355471,parks-j/deleted_items/202.,Message-ID: <27817771.1075841359502.JavaMail.e...,GCCA Crawfish and rip-off raffle & over-priced...
457837,stokley-c/chris_stokley/iso/client_rep/41.,Message-ID: <10695160.1075858510449.JavaMail.e...,"<<Keoni.zip>> Chris, per your request here are..."
124910,germany-c/all_documents/1174.,Message-ID: <27819143.1075853689038.JavaMail.e...,I'm trying to change the Receipt Meter on deal...
403283,scott-s/_sent_mail/244.,Message-ID: <10142547.1075846737160.JavaMail.e...,What if we replace Section 2 with something li...
293966,love-p/discussion_threads/113.,Message-ID: <18212904.1075858229814.JavaMail.e...,---------------------- Forwarded by Phillip M ...
478830,taylor-m/australia_trading/8.,Message-ID: <14840674.1075860237113.JavaMail.e...,"Dear Mark, As per our discussion at the law co..."
295428,love-p/sent_items/765.,Message-ID: <22170097.1075862178026.JavaMail.e...,got your message last night. What is up? Bet y...
137822,giron-d/deleted_items/170.,Message-ID: <23520008.1075852220995.JavaMail.e...,"Hello Darron, Just wanted to let you know that..."


In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("all-MiniLM-L6-v2")

texts = df_sample["clean_message"].astype(str).tolist()

embeddings = model.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True  
)


print(embeddings.shape)

Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

(100000, 384)


In [24]:
from sklearn.decomposition import PCA

# embeddings: (n_samples, d)
pca = PCA(n_components=50, random_state=42)
X = pca.fit_transform(embeddings)





In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

best_score = -1
best_k = None
scores = {}

for k in range(2, 15):
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X)
    score = silhouette_score(X, labels)
    scores[k] = score
    print(f"k={k}, silhouette={score:.4f}")
    if score > best_score:
        best_score = score
        best_k = k

print("Best k:", best_k, "with silhouette:", best_score)

In [27]:
import hdbscan
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=250,
    metric='euclidean'
)
labels = clusterer.fit_predict(X)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



In [28]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

df_sample["cluster"] = labels


mask = labels != -1
if mask.sum() > 1 and len(set(labels[mask])) > 1:
    sil = silhouette_score(X[mask], labels[mask])
    print("Silhouette (no noise):", sil)
else:
    print("Not enough clustered points for silhouette.")

Silhouette (no noise): 0.38841336965560913


In [29]:
import numpy as np

# Count clusters (excluding noise = -1)
unique, counts = np.unique(labels[labels != -1], return_counts=True)

# Combine into list of (cluster_id, size)
cluster_sizes = list(zip(unique, counts))

# Sort by size descending
cluster_sizes_sorted = sorted(cluster_sizes, key=lambda x: -x[1])

# Print
print("Cluster sizes (largest → smallest):")
for cid, size in cluster_sizes_sorted:
    print(f"Cluster {cid}: {size} points")

Cluster sizes (largest → smallest):
Cluster 3: 9299 points
Cluster 6: 1890 points
Cluster 1: 667 points
Cluster 2: 342 points
Cluster 0: 327 points
Cluster 5: 324 points
Cluster 4: 298 points
