In [None]:
!pip install bertopic
!pip install datasets
!pip install sentence_transformers

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from bertopic import BERTopic
import nltk

In [None]:
# Fetch the 'posts' configuration of the SocialGrep Reddit dataset from the
# Hugging Face Hub (downloaded on first use).
data = load_dataset(
    path="SocialGrep/the-reddit-dataset-dataset",
    name="posts",
)

Downloading data:   0%|          | 0.00/5.09M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20292 [00:00<?, ? examples/s]

In [None]:
# Keep only the 'train' split of the loaded dataset.
# NOTE(review): this rebinds `data`, so the cell is not safe to run twice in
# the same kernel without re-running the load cell first.
data = data['train']

In [None]:
# Drop posts whose body text ('selftext') is 20 characters or shorter; they
# carry too little content to be useful for topic modelling.
# The predicate returns the boolean comparison directly instead of mixing
# `True` with the integer `0`.
data = data.filter(
    lambda post: len(post['selftext']) > 20
)

Filter:   0%|          | 0/20292 [00:00<?, ? examples/s]

In [None]:
# Sentence encoder that turns each post into a dense embedding.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Dimensionality reduction applied to the embeddings before clustering.
# UMAP is stochastic: without a fixed `random_state` every run yields a
# different projection and therefore different topics. Pinning the seed makes
# the notebook reproducible under "Restart & Run All".
umap_model = UMAP(n_neighbors=3, n_components=3, min_dist=0.05,
                  random_state=42)

# Density-based clustering of the reduced embeddings.
# `prediction_data=True` is kept because the BERTopic model built later in
# this notebook is created with `calculate_probabilities=True`.
hdbscan_model = HDBSCAN(min_cluster_size=80, min_samples=40,
                        gen_min_span_tree=True,
                        prediction_data=True)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
nltk.download('stopwords')

# Read the corpus through `nltk.corpus.stopwords` rather than the imported
# `stopwords` name: this cell rebinds `stopwords` to a plain list, so calling
# `stopwords.words(...)` here would raise AttributeError the second time the
# cell is run in the same kernel.
stopwords = list(nltk.corpus.stopwords.words('english')) + ['http', 'https', 'amp', 'com']
# NOTE(review): the topic table further down still shows tokens such as
# 'lt', 'gt' and 'www'; consider adding them to the extra stop words above.

# we add this to remove stopwords that can pollute topics
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Assemble the topic model from the components configured in the cells above,
# listed in pipeline order: embed -> reduce -> cluster -> represent.
model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    language='english',
    top_n_words=5,
    calculate_probabilities=True,
    verbose=True,
)

# Fit on the post bodies; keeps the two values returned by fit_transform
# as `topics` and `probs`.
topics, probs = model.fit_transform(data['selftext'])

2024-01-11 20:30:06,666 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/342 [00:00<?, ?it/s]

2024-01-11 20:44:06,226 - BERTopic - Embedding - Completed ✓
2024-01-11 20:44:06,229 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-11 20:44:38,535 - BERTopic - Dimensionality - Completed ✓
2024-01-11 20:44:38,542 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-11 20:44:40,682 - BERTopic - Cluster - Completed ✓
2024-01-11 20:44:40,693 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-11 20:44:44,906 - BERTopic - Representation - Completed ✓


In [None]:
# Per-topic summary table from the fitted model; preview its first ten rows.
freq = model.get_topic_info()
freq.head(n=10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3130,-1_data_dataset_would_www,"[data, dataset, would, www, looking]","[Show off, complain, and generally have a chat..."
1,0,813,0_data_census_looking_population,"[data, census, looking, population, find]","[New to data here, so I'm quite confused somet..."
2,1,802,1_data_covid_dataset_19,"[data, covid, dataset, 19, find]",[###The problem?\nEuropean countries do not pu...
3,2,792,2_data_companies_looking_would,"[data, companies, looking, would, find]",[I am looking for a dataset that would include...
4,3,547,3_data_datasets_dataset_would,"[data, datasets, dataset, would, find]","[Hi all,\n\nThis is my first Reddit post ever!..."
5,4,533,4_dataset_data_find_anyone,"[dataset, data, find, anyone, www]",[I have found the following datasets:\n\n1. tr...
6,5,387,5_player_data_game_football,"[player, data, game, football, team]","[Hey guys,\n\nI'm building a panel dataset for..."
7,6,290,6_traffic_dataset_car_data,"[traffic, dataset, car, data, vehicles]",[I'm looking for a dataset which contains vide...
8,7,290,7_lt_music_gt_songs,"[lt, music, gt, songs, gt lt]",[I'm searching for some uncompressed audio. Th...
9,8,280,8_tweets_twitter_data_sentiment,"[tweets, twitter, data, sentiment, dataset]","[Hello,\n\nI found a lot of dataset with label..."


In [None]:
# Render BERTopic's bar-chart visualisation for the fitted model.
model.visualize_barchart()