In [1]:
import pandas as pd
import numpy as np
# Let pandas show up to 100 rows of a displayed frame before truncating it.
pd.options.display.max_rows = 100

In [2]:
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.io as pio
# Use plotly's 'iframe' renderer for every figure shown in this notebook.
pio.renderers.default = "iframe"

In [3]:
from bertopic import BERTopic

In [4]:
from bertopic.vectorizers import ClassTfidfTransformer

In [6]:
import os

# Working directory that holds the chunked text and receives every artefact
# this notebook writes. It defaults to the original location under ~/Downloads,
# but can be redirected by exporting BERTOPIC_DATA_DIR before the kernel starts,
# so the notebook is not tied to one machine's folder layout.
target_directory = os.path.expanduser(
    os.environ.get(
        "BERTOPIC_DATA_DIR",
        "~/Downloads/bertopic_research/bertopic_test_data_models/Dickens",
    )
)
# The relative paths used by later cells resolve against this directory.
os.chdir(target_directory)

In [7]:
import hdbscan
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import networkx as nx
from networkx.algorithms import bipartite
from networkx.readwrite import graphml


In [8]:
# Load the chunked novel from the working directory and preview the first rows.
expectations_df = pd.read_csv(filepath_or_buffer='./expectations_chunked.csv')
expectations_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,text
0,expectations_00001.txt,great expectations 1867 edition by charles dic...
1,expectations_00002.txt,explicit than pip so i called myself pip and c...
2,expectations_00003.txt,likeness of either of them for their days were...
3,expectations_00004.txt,with curly black hair from the character and t...
4,expectations_00005.txt,beside their grave and were sacred to the memo...


In [10]:
# The 'text' column is the corpus handed to the encoder and to BERTopic below.
docs = expectations_df.loc[:, 'text']
docs.head(n=5)

0    great expectations 1867 edition by charles dic...
1    explicit than pip so i called myself pip and c...
2    likeness of either of them for their days were...
3    with curly black hair from the character and t...
4    beside their grave and were sacred to the memo...
Name: text, dtype: object

In [11]:
# c-TF-IDF transformer passed to BERTopic below, with reduce_frequent_words
# switched on (see the ClassTfidfTransformer documentation for its effect).
ctfidf_model = ClassTfidfTransformer(
    reduce_frequent_words=True,
)

In [13]:
from umap import UMAP
from sentence_transformers import SentenceTransformer

# Fixed seed so that Restart & Run All yields the same topics every time.
# UMAP is stochastic; without a random_state each run can produce a different
# reduction, hence different clusters and different topic ids.
RANDOM_STATE = 42

# Load the sentence encoder once and embed every chunk up front. The embeddings
# are reused for fitting here and for the document visualisations further down.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=True)

# Same UMAP settings BERTopic falls back to when no umap_model is given,
# plus the seed.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    low_memory=False,
    random_state=RANDOM_STATE,
)

expectations_topic_model = BERTopic(
    language='english',
    calculate_probabilities=True,
    # Hand over the already-loaded encoder instead of its name, so the model
    # is not loaded a second time.
    embedding_model=sentence_model,
    umap_model=umap_model,
    top_n_words=10,
    min_topic_size=5,
    ctfidf_model=ctfidf_model,
    verbose=True,
)
# Fit on the precomputed embeddings: `topics` holds one topic id per document,
# `probs` the topic probabilities requested via calculate_probabilities=True.
topics, probs = expectations_topic_model.fit_transform(docs, embeddings)

Batches:   0%|          | 0/116 [00:00<?, ?it/s]

2024-08-26 12:27:57,257 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-08-26 12:28:07,523 - BERTopic - Dimensionality - Completed ✓
2024-08-26 12:28:07,524 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-26 12:28:08,180 - BERTopic - Cluster - Completed ✓
2024-08-26 12:28:08,184 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-08-26 12:28:08,272 - BERTopic - Representation - Completed ✓


In [14]:
# Overview table: one row per topic with its count, generated name and keywords.
topic_overview = expectations_topic_model.get_topic_info()
topic_overview

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1318,-1_man_could_up_myself,"[man, could, up, myself, one, over, which, you...",[i do not mean what i say you would never marr...
1,0,271,0_joe_chap_pip_sister,"[joe, chap, pip, sister, joes, poker, old, tic...",[id foller and joe i am very glad you did so t...
2,1,208,1_herbert_herberts_clara_handel,"[herbert, herberts, clara, handel, provis, fat...",[and clean mrs whimple said herbert when i tol...
3,2,143,2_estella_estellas_havisham_beautiful,"[estella, estellas, havisham, beautiful, richm...",[much said estella looking at me less coarse a...
4,3,110,3_mouth_slouching_dressed_pipe,"[mouth, slouching, dressed, pipe, tobacco, cas...",[see and there were some odd objects about tha...
5,4,100,4_biddy_sewing_prices_manage,"[biddy, sewing, prices, manage, catalogue, tea...",[said snappishly biddy what do you mean biddy ...
6,5,87,5_convict_prison_convicts_murder,"[convict, prison, convicts, murder, prisoner, ...",[once out of this court ill smash that face of...
7,6,73,6_havisham_miss_play_havishams,"[havisham, miss, play, havishams, susceptibili...",[home now you shall go soon said miss havisham...
8,7,72,7_jaggers_jaggerth_south_mithter,"[jaggers, jaggerth, south, mithter, mr, jagger...",[brother to habraham latharuth whos he said mr...
9,8,64,8_pocket_flopson_baby_mrs,"[pocket, flopson, baby, mrs, jane, millers, ch...",[was handing it to mrs pocket when she too wen...


In [15]:
# Write the fitted model to ./expectations_model2 inside the working directory.
# NOTE(review): no `serialization` argument is given, so BERTopic's default
# format applies — confirm that is the intended on-disk format.
expectations_topic_model.save(path='./expectations_model2')



In [16]:
# Per-document view: the assigned topic, its keywords and the probability.
document_info = expectations_topic_model.get_document_info(docs)
document_info

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,great expectations 1867 edition by charles dic...,9,9_pip_lookee_due_noble,"[pip, lookee, due, noble, ive, tremendous, gen...",[in the same tone as before that dont go first...,pip - lookee - due - noble - ive - tremendous ...,0.128511,True
1,explicit than pip so i called myself pip and c...,9,9_pip_lookee_due_noble,"[pip, lookee, due, noble, ive, tremendous, gen...",[in the same tone as before that dont go first...,pip - lookee - due - noble - ive - tremendous ...,0.049019,False
2,likeness of either of them for their days were...,3,3_mouth_slouching_dressed_pipe,"[mouth, slouching, dressed, pipe, tobacco, cas...",[see and there were some odd objects about tha...,mouth - slouching - dressed - pipe - tobacco -...,0.288334,False
3,with curly black hair from the character and t...,13,13_dress_bridal_white_withered,"[dress, bridal, white, withered, yellow, figur...",[which ought to be white had been white long a...,dress - bridal - white - withered - yellow - f...,0.024128,False
4,beside their grave and were sacred to the memo...,-1,-1_man_could_up_myself,"[man, could, up, myself, one, over, which, you...",[i do not mean what i say you would never marr...,man - could - up - myself - one - over - which...,0.065399,False
...,...,...,...,...,...,...,...,...
3683,of you said estella have you of late very ofte...,2,2_estella_estellas_havisham_beautiful,"[estella, estellas, havisham, beautiful, richm...",[much said estella looking at me less coarse a...,estella - estellas - havisham - beautiful - ri...,0.176325,False
3684,that remembrance i have given it a place in my...,2,2_estella_estellas_havisham_beautiful,"[estella, estellas, havisham, beautiful, richm...",[much said estella looking at me less coarse a...,estella - estellas - havisham - beautiful - ri...,0.149995,False
3685,very glad to do so glad to part again estella ...,2,2_estella_estellas_havisham_beautiful,"[estella, estellas, havisham, beautiful, richm...",[much said estella looking at me less coarse a...,estella - estellas - havisham - beautiful - ri...,0.147932,False
3686,say that to me then you will not hesitate to s...,-1,-1_man_could_up_myself,"[man, could, up, myself, one, over, which, you...",[i do not mean what i say you would never marr...,man - could - up - myself - one - over - which...,0.546560,False


In [17]:
# get_document_info already returns a pandas DataFrame, so the result is kept
# as-is instead of being wrapped in a second DataFrame constructor.
expectations_doc_info_df = expectations_topic_model.get_document_info(docs)

In [18]:
# Export the per-document table; index=True writes the row index as the first
# CSV column.
# NOTE(review): that column is written without a header label, so it is read
# back as 'Unnamed: 0' — consider passing index_label if the file is re-read.
expectations_doc_info_df.to_csv(
    path_or_buf='expectations_docinfo.csv',
    index=True,
)

In [21]:
# Inspect the (word, score) pairs that make up topic 8.
expectations_topic_model.get_topic(topic=8)

[('pocket', np.float64(0.4049248084219291)),
 ('flopson', np.float64(0.35830616746495136)),
 ('baby', np.float64(0.3301816373886102)),
 ('mrs', np.float64(0.32361762185258214)),
 ('jane', np.float64(0.2994044172852345)),
 ('millers', np.float64(0.27995745887530554)),
 ('children', np.float64(0.26308392721862506)),
 ('babys', np.float64(0.2228768267346996)),
 ('tumbling', np.float64(0.22112481677662688)),
 ('pockets', np.float64(0.22099396457721204))]

In [25]:

# Build the topic overview figure, write a standalone HTML copy into the
# working directory, then show the figure inline.
output_file_path = "topics_visualization.html"
fig = expectations_topic_model.visualize_topics()
fig.write_html(output_file_path)

print(f"Visualization saved to {output_file_path}")
fig.show()

Visualization saved to topics_visualization.html


In [26]:
# Project the sentence embeddings down to 2-D for plotting. random_state pins
# the otherwise stochastic UMAP layout, so the saved HTML looks the same after
# every Restart & Run All.
reduced_embeddings = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
).fit_transform(embeddings)

# Scatter of all documents, drawn from the projection computed above.
fig = expectations_topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
output_file_path = "UMAP_topics_visualization.html"
pio.write_html(fig, file=output_file_path)

print(f"Visualization saved to {output_file_path}")
fig.show()

Visualization saved to UMAP_topics_visualization.html


In [28]:
# Topic hierarchy figure with BERTopic's default settings.
default_hierarchy_fig = expectations_topic_model.visualize_hierarchy()
default_hierarchy_fig

In [29]:
# Compute the topic hierarchy once; the cells below reuse it for the
# hierarchy plot, the text tree and the hierarchical document plot.
hierarchical_topics = expectations_topic_model.hierarchical_topics(docs=docs)

100%|██████████| 88/88 [00:00<00:00, 822.12it/s]


In [30]:
# Same hierarchy plot, this time driven by the precomputed hierarchical_topics.
hierarchy_fig = expectations_topic_model.visualize_hierarchy(
    hierarchical_topics=hierarchical_topics,
)
hierarchy_fig

In [31]:
# Text rendering of the hierarchy; printed so its line breaks and box-drawing
# characters are laid out as a tree.
tree = expectations_topic_model.get_topic_tree(hier_topics=hierarchical_topics)
print(tree)

.
├─herbert_biddy_estella_pip_joe
│    ├─wemmick_trabbs_walworth_trabb_boar
│    │    ├─im_fault_forge_gentleman_blacksmith
│    │    │    ├─■──irrelevant_himor_rubbish_civilly_screened ── Topic: 81
│    │    │    └─fault_forge_gentleman_blacksmith_inquire
│    │    │         ├─gentleman_beautiful_genteel_publichouse_dreadfully
│    │    │         │    ├─■──gentleman_genteel_publichouse_worthy_asseverates ── Topic: 75
│    │    │         │    └─■──dreadfully_untoward_shouldered_miscreant_prize ── Topic: 43
│    │    │         └─■──fault_forge_blacksmith_inquire_anvil ── Topic: 38
│    │    └─wemmick_trabbs_walworth_trabb_boar
│    │         ├─wemmick_portable_nod_property_cringes
│    │         │    ├─■──portable_property_deliberate_theyre_owner ── Topic: 66
│    │         │    └─wemmick_nod_cringes_beats_clerks
│    │         │         ├─■──cringes_beats_cringe_bolder_wemmick ── Topic: 58
│    │         │         └─■──wemmick_nod_aged_london_australia ── Topic: 17
│    │         └─tra

In [33]:
# Reuse the 2-D projection already computed for visualize_documents instead of
# passing the full embeddings. With full embeddings BERTopic fits another
# (unseeded) UMAP internally, which is slower and gives this plot a different
# layout from the document scatter above.
expectations_topic_model.visualize_hierarchical_documents(
    docs,
    hierarchical_topics,
    reduced_embeddings=reduced_embeddings,
)