In [1]:
from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
import pandas as pd
import ast
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm
import numpy as np
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import re
import sys
sys.path.append('../../')
from utility.utility_functions import *

In [2]:
# Load `avocado_train.parquet` into a DataFrame. The path is relative, so it
# resolves against the kernel's current working directory.
# Later cells read and overwrite the `text` column of this frame.
df = pd.read_parquet('../../data/avocado_train.parquet')

In [3]:
# Read the signature file and remove every signature from the text column.
# Each non-blank line is parsed as a Python literal (expected to be a string),
# optionally followed by a trailing comma.
# The encoding is given explicitly so parsing does not depend on the platform's
# locale default (assumes the file was written as UTF-8 -- TODO confirm).
with open("signatures_cleaned.txt", "r", encoding="utf-8") as file:
    stripped_lines = (line.strip() for line in file)
    signature_list = [
        ast.literal_eval(stripped.rstrip(','))
        for stripped in stripped_lines
        if stripped
    ]

# Every signature costs one full pass over the column, so skip the passes that
# cannot change anything: empty strings (replacing "" is a no-op) and repeats
# of an already-processed signature. First-occurrence order is kept, and
# `signature_list` itself is left untouched.
unique_signatures = list(dict.fromkeys(sig for sig in signature_list if sig))

for signature in tqdm(unique_signatures, desc="Cleaning Signatures"):
    # Literal (non-regex) removal, so special characters in a signature are safe.
    df['text'] = df['text'].str.replace(signature, "", regex=False)

Cleaning Signatures: 100%|██████████| 2240/2240 [05:24<00:00,  6.90it/s]


In [4]:
# Personal and company names to scrub from the e-mail text.
# The list is kept exactly as collected (original spelling, order and repeats);
# duplicates are collapsed below when the regex is built.
names = [
    "barry", "david", "george", "donivan", "shailesh", "nilesh", "ritesh",
    "andersen", "chris", "vinayak", "sreeni", "silverton", "lisa", "rajeev",
    "amitabh", "jeff", "jivaro", "rajeev", "lisa", "rajeev", "nihar",
    "rajdeep", "amitabh", "sinha", "tom", "jeff", "jim",
    "joe", "tom", "bithi", "jivaro", "jivaroinc", "marilisa", "debbie",
    "michael", "mike", "jivaro", "jivaroinc", "emma", "brett", "darcysalzmann",
    "andy", "toshi", "pachipala", "raghavan", "palanisamy", "Ravikumar",
    "Wilhan", "helen", "sri", "jaime", "darshan", "mehrak", "rishi",
    "divakar", "Venk", "Fortunata", "elba", "Carlos", "venktesh", "prakash",
    "shukla", "amy", "kimberlie", "wagoner", "andrew", "garcia", "Glenn",
    "Martinez", "Simmons", "Susan", "ruyben", "Mitch", "sharyn", "roopak",
    "krishna", "Prasad", "srik", "Ravi", "Mary", "Glenn", "Pereira", "Doug",
    "douga", "Ray", "rayhan", "Tony", "Arun", "ricardo", "Dan", "Dave", "Jon",
    "Ryan", "Steve", "john", "Richard", "Matt", "Peter", "Ron", "germana",
    "Ann", "Ruth", "Trish", "Kelsey", "Amit", "Nadir", "fernand", "Hung",
    "Abinov", "steve", "meshele", "Ko", "Stuart", "kadanoff", "Marcia",
    "Chen", "Diana", "craig", "Chapman", "Sanjay", "salzmann", "D'arcy",
    "Darcy",
]

# Collapse case-insensitive duplicates (e.g. "rajeev" x3, "Steve"/"steve") so
# the regex does not carry redundant branches. The first spelling seen wins and
# first-occurrence order is preserved.
unique_names = {}
for name in names:
    unique_names.setdefault(name.lower(), name)

# One whole-word alternation: the word boundaries are written once around a
# non-capturing group, and each name is escaped so it is always matched
# literally. This matches the same text as OR-ing `\bname\b` for every entry.
pattern = r'\b(?:' + '|'.join(re.escape(name) for name in unique_names.values()) + r')\b'

# Blank out every whole-word name match in the text column, ignoring case,
# then trim the leading/trailing whitespace of each document.
df['text'] = (
    df['text']
    .str.replace(pattern, '', case=False, regex=True)
    .str.strip()
)

In [5]:
# De-duplicate the documents while keeping their first-occurrence order.
# list(set(...)) returns the unique strings in an order that changes between
# Python processes (string hashing is randomized), so the row order handed to
# the topic model would differ on every kernel restart despite the fixed seed.
# dict.fromkeys keeps exactly the same unique elements in a stable order.
docs = list(dict.fromkeys(df['text'].tolist()))
len(docs)

350583

In [6]:
# Fixed seed shared by NumPy and UMAP.
# Other seeds sometimes caused negative values in the distance matrix.
random_state = 42

# Seed NumPy's global random number generator.
np.random.seed(random_state)

# Pipeline components: sentence embeddings, UMAP reduction to 5 components,
# HDBSCAN clustering with a minimum cluster size of 200.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    random_state=random_state,
)
hdbscan_model = HDBSCAN(
    min_cluster_size=200,
    prediction_data=True,
)

# Assemble the topic model from the components above and fit it on the
# de-duplicated documents.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
)
topics, probs = topic_model.fit_transform(docs)

2024-11-29 15:44:18,901 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/10956 [00:00<?, ?it/s]

2024-11-29 15:58:20,896 - BERTopic - Embedding - Completed ✓
2024-11-29 15:58:20,896 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-29 16:03:01,843 - BERTopic - Dimensionality - Completed ✓
2024-11-29 16:03:01,845 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

In [7]:
# Compute the topic hierarchy of the fitted model from the same documents it
# was trained on; the result is passed to visualize_hierarchy in the next cell.
hierarchical_topics = topic_model.hierarchical_topics(docs)

100%|██████████| 115/115 [00:00<00:00, 133.71it/s]


In [8]:
# Display the topic hierarchy as the cell's output, using the precomputed
# hierarchical_topics table.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
# Starts as an empty list; presumably filled with documents to drop once the
# topics have been inspected -- nothing in the visible cells appends to it yet.
docs_to_remove = []

In [None]:
# 2 - jsp
# 4 - bugzilla
# 5 - extremeprogramming
# 6 - oca
# 8 - meeting
# 9 - meeting_planning?
# 9 - testing
# 17 - conference call
# 21 - presentation slides
# 25 - flight, hotel, reservation
# 26 - schedule training
# 27 - travel, confirmed
# 28 - bugs
# 29 - siebel
# 32 - epservlet, java
# 33 - emds, emas, bug, planned

In [51]:
# Topic currently under inspection; the next cell reuses this id.
topic_id = 20
# inspect_topic is not defined in this notebook (presumably it comes from the
# utility.utility_functions star import). Judging by the output below, it
# prints the topic's top words and representative documents.
inspect_topic(topic_model,topic_id)

Topic: 20

Top Words:
die, und, der, mit, ich, das, bei, von, für, sie, 

Representative Target Sentences: 
Golem Newsletter vom 06.09.2001
... connected by ISION Internet AG
_____


Schlagzeilen:
Microsoft verkündet WindowsCE-Nachfolger
Zwei HP Jornadas mit neuer WindowsCE-Version
Freies PowerVR SDK für Kyro und Kyro II erhältlich
E-Government für Berliner Bürgerämter
PC-Spiele werden teurer durch Euro-Umstellung
Adobe setzt auf PixelNet bei digitaler Foto-Entwicklung
Microsoft veröffentlicht "Linux Migration Guide"
ConnexModem - Kabelloses 56K-Modem von AeroComm
Infogrames und Codemasters schließen Distributionsabkommen
Handelsblatt: Kartellwächter überprüfen heute.t-online.de
SanDisk und Sony entwicklen nächste Memory-Stick-Generation
Erste Netzwerkprodukte von Dell erhältlich
Noch ein Schädling: APost verbreitet sich per E-Mail
QuoVadis: WindowsCE im Auto-Navigationssystem
Neue Version des Magistr-Wurm unterwegs
Studie: Mobiltelefone drängen Telefonkarten ins Abseits
Lara Croft Wur

In [52]:
# Collect the documents for the topic selected above and print each one in its
# repr form, so embedded newlines and quotes stay visible on a single line.
# NOTE(review): the meaning of the trailing 0 is defined by the helper -- confirm.
topic_docs = inspect_topic_docs(topic_model, topic_id, docs, 0)
for doc in topic_docs:
    print(f"{doc!r}")

Number of Documents: 1340

'Werner,\ndeckt das Deine Erwartungen?\n\n\n\n-----Ursprüngliche Nachricht-----'
'> Hi ,\n>\n> Wenn ich nach FRA komme (das wird sehr oft sein mit diesem Job), dann lass\n> uns ein Abend freinehmen, und auf ein Glas Wein die Welt (ver)plaudern.\n> Sabine ist herzlich eingeladen wenn Es schon mit Babysitter klappt.\n>\n> Best to la grande famille Kempf\n>\n> René\n>\n>\n>\n>'
'Hi Rene!\n\nMarc wollte dich gestern anrufen - hast du eine neue Handy-Nr? Haettest du\nLust auf einen Umtrunk heute abend?\n\nServus,\n\nUli.\n\nGet Your Private, Free E-mail from MSN Hotmail at http://www.hotmail.com.\n\nShare information about yourself, create your own public profile at\nhttp://profiles.msn.com.'
'Werner,\nPalm: müssen wir am Montag drüber reden.\n\n-----Ursprüngliche Nachricht-----'
'Klingt doch wirklich gut .... oder ?\n\nWerner'
"Hallo Norbert,\n\nhey, das sind ja Neuigkeiten !\nDa moechte ich Dir ganz herzlich gratulieren ! Und was den Stress\nbetrifft - so jung w