In [1]:
from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
import pandas as pd
import logging
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm
import numpy as np
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
import re
import sys
sys.path.append('../')
from utility.utility_functions import *

In [2]:
df = pd.read_parquet('../data/avocado_preprocessing_wip/avocado_parser_25_le_1000.parquet', engine='pyarrow')

In [3]:
#df['extracted_text'] = df['extracted_text'].str.replace(r'\r\n', ' ', regex=True)
docs = list(set(df['text'].tolist()))#[:10000]

In [4]:
len(docs)

505150

In [5]:
# Initialize UMAP with a fixed random state
random_state = 42 # Other seeds sometimes caused negative values in the distanace matrix

# Set global random seeds
np.random.seed(random_state)

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, random_state=random_state)
hdbscan_model = HDBSCAN(min_cluster_size=100, prediction_data=True,)

topic_model = BERTopic(verbose=True, umap_model=umap_model, hdbscan_model=hdbscan_model, embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs)

2024-11-30 12:38:20,754 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/15786 [00:00<?, ?it/s]

2024-11-30 12:54:41,622 - BERTopic - Embedding - Completed ✓
2024-11-30 12:54:41,623 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-30 13:02:00,142 - BERTopic - Dimensionality - Completed ✓
2024-11-30 13:02:00,149 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

In [6]:
hierarchical_topics = topic_model.hierarchical_topics(docs)

100%|██████████| 458/458 [00:03<00:00, 150.79it/s]


In [7]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

### Remove bad entries

In [None]:
docs_to_remove = []

In [8]:
topic_nr = 161
inspect_topic(topic_model,topic_nr)

Topic: 161

Top Words:
virus, norton, antivirus, worm, viruses, mcafee, liveupdate, infected, scan, attachment, 

Representative Target Sentences: 
The attachment has a virus.
Yes, it was a virus!
That's a virus!!!!


In [9]:
topic_docs = inspect_topic_docs(topic_model, topic_nr, docs,)

Number of Documents: 320

Freier WAP Browser für Win 9x: http://www.numeric.de/m3gate/
Freier Virenscanner für Win 9x NT/WS: http://antivirus.cai.com/

Immer als Privatperson anmelden!
If you have Norton antivirus running, it will detect the virus and will not
let you open the attachment.

Joe Hong
AvocadoIT Canada Corp.
http://www.avocadoit.com

Email: joe.hong@avocadoit.com
Phone: 416-643-4864
Mobile: 416-346-2729
_________________________________________________________________
Click to add my contact info to your organizer:
http://my.infotriever.com/jhong
_____

Protect your PC - Click here for McAfee.com VirusScan Online
Proclaimed Dossy from the mountaintops:

> Most anti-viral filters in mailservers don't actually look for
> dangerous payloads.  They simply do string-compares against the
> subject lines used.  (Which is why, of course, good email-borne
> viruses should be slightly polymorphic, and randomly change the
> subject ...)

This has eerie parallels with how biological v

In [None]:
for doc in topic_docs:
    print(repr(doc))

In [None]:
docs_to_remove += topic_docs
docs_to_remove = list(set(docs_to_remove))

In [None]:
len(docs_to_remove)

In [None]:
docs_to_remove += topic_docs
docs_to_remove = list(set(docs_to_remove))
df_docs_to_remove = pd.Series(docs_to_remove, name='text').to_frame()
df_docs_to_remove.to_parquet('./docs_to_remove/docs_to_remove_7.parquet', engine='pyarrow')


### Save Progress

In [None]:
topic_docs = inspect_topic_docs(topic_model, 402,docs)

In [None]:
data = []

# Assuming topic_model.get_topics() returns a list or similar iterable of topics
for i in range(len(topic_model.get_topics())):
    # Get the documents for the current topic
    topic_docs = inspect_topic_docs(topic_model, i, docs)
    
    # Add each document to the list as a dictionary with the key "text"
    for doc in topic_docs:
        data.append({"text": doc})

# Convert the list of dictionaries to a DataFrame
df_new = pd.DataFrame(data)
df_new.to_parquet('../data/processed/05_avocado_cleaned_clustered_wip.parquet', engine='pyarrow')

In [None]:
topic_name = "_".join([word[0] for word in topic_model.get_topic(len(topic_model.get_topics())-2)])
topic_name

In [None]:
#msdirect