# **Installing BERTopic**

We start by installing BERTopic from PyPi:

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
cd /content/gdrive/MyDrive/KeyBert

/content/gdrive/MyDrive/KeyBert


In [4]:
%%capture
!pip install bertopic

# Data
For this example, we use the popular 20 Newsgroups dataset which contains roughly 18000 newsgroups posts

In [71]:
import time
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

import re
from typing import List, Optional, Union, Callable
import string

from bertopic import BERTopic

import numpy as np
import pandas as pd
from numpy import array

from wordcloud import WordCloud

from PIL import Image, ImageFont, ImageDraw
import matplotlib.pyplot as plt

import json

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [85]:
def remove_stopwords(text):

  stop_words = set(stopwords.words('english'))
  word_tokens = word_tokenize(text)
  filtered_sentence = [w for w in word_tokens if  (w.lower() not in stop_words) and (len(w)>1)]
  
  filtered_text = ' '.join(filtered_sentence)
  return filtered_text

def remove_emoji(text): 
    emoji_pattern = re.compile("["
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                u"\U0001f926-\U0001f937"
                u'\U00010000-\U0010ffff'
                u"\u200d"
                u"\u200c"
                u'\u200f'
                u'\u200e'
                u"\u2640-\u2642"
                u"\u2600-\u2B55"
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\u3030"
                u"\ufe0f"
                u"\u202b"

                u"\U000E006E|" \
                u"\U000E007F|" \
                u"\U000E0073|" \
                u"\U000E0063|" \
                u"\U000E0074|" \
                u"\U000E0077|" \
                u"\U000E006C"

    "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r' ', text)

def remove_number(text):
    """ Remove number in the input text """
    processed_text = re.sub('\d+', '', text)
    return processed_text

def remove_url(text):
    """ Remove url in the input text """
    return re.sub('(www|http)\S+', '', text)

def remove_punctuation(input_text: str, punctuations: Optional[str] = None) -> str:
    """
    Removes all punctuations from a string, as defined by string.punctuation or a custom list.
    For reference, Python's string.punctuation is equivalent to '!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~'
    """
    if punctuations is None:
        punctuations = string.punctuation
    processed_text = input_text.translate(str.maketrans('', '', punctuations))
    return processed_text


def remove_link(text): 
    return re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', str(text))

def remove_tabs(text): 
    return re.sub(r'[\n\r\t]', '', str(text))

def remove_email(text): 
    return re.sub(r'\S+@\S+', '', str(text))

# def remove_englishword(text): 
#     return re.sub(r'[A-Za-z0-9]+', '', str(text))

def remove_chars(text): 
    # return re.sub(r'\.(?!\d)', '', str(text))
    return  re.sub(r'[$+&+;+]|[><!+،:,\(\).+،+٬+,+٬]|[-+]|[…]|[\[\]»«//]|[\\]|[#+]|[_+]|[٪+]|[%]|[*+]|[؟+]|[?+]|[""]|@|' '', '', str(text))

def remove_extraspaces(text):
    return re.sub(r' +', ' ', text)

def remove_extranewlines(text):
    return re.sub(r'\n\n+', '\n\n', text)

def handle_clear_more_triple_chars(text):
    # remove any char that appear more than 2 times Continuously except whitespaces, tabs and newlines
    doc_string=re.sub("(.)\\1{2,}", "\\1", text)
    return doc_string



def remove_unusful_topics(topics):
    return {x: y for x, y in topics.items() if not x.endswith("های") and not x.endswith("هایی")}

def remove_nested_keywords(topics):

    keys = list(topics.keys())
    values = list(topics.values())
    for i, sub_item in enumerate(keys):
        for j, item in enumerate(keys):
            if i==j:
                continue
            elif sub_item in item:
                keys[i] = "-1-1+1"
                values[i] = "-1-1+1"

    keys = list(filter(lambda a: a != "-1-1+1", keys))
    values = list(filter(lambda a: a != "-1-1+1", values))

    new = {}
    for i in range(len(keys)):
        try:
            new_key = keys[i]
            if len(new_key.split())>1 and new_key.split().count("_") > 1 and not new_key.split()[-1]!="_" and not new_key.split()[0] != "_":
                new_key = '\u200c'.join(new_key.split())
            if len(new_key) > 3:
                new[new_key] = values[i]
        except: continue
    return new



def preprocess(text):
    text = remove_emoji(text)

    text = remove_link(text)
    text = remove_tabs(text) 
    text = remove_email(text) 
    # text = remove_englishword(text) 
    text = remove_chars(text)
    text = remove_extraspaces(text) 
    text = remove_extranewlines(text) 
    text = remove_number(text)
    text = remove_url(text)
    text = remove_punctuation(text)

    text = remove_stopwords(text)
    text = handle_clear_more_triple_chars(text)
    return text

In [48]:
def wordcloud(topics):
        image_address = "/content/gdrive/MyDrive/TopicModeling/English_BERTopic.png"
        mask = np.array(Image.open("/content/gdrive/MyDrive/TopicModeling/mask-instagram.png"))
        wordcloud = WordCloud(max_font_size=80, background_color="white", font_path="/content/gdrive/MyDrive/TopicModeling/Vazir-Bold.ttf",
                              max_words=80, mask=mask, 
                              margin=10, height=800, width=800, colormap="Dark2", prefer_horizontal=1)
        # new_topics = {}
        # # box_size = draw.textsize(word, font=transposed_font)
        # for x, y in topics.items():
        #     try:
        #         new_topics[get_display(arabic_reshaper.reshape(x))] = y
        #     except: continue
        # print(new_topics)
        # topics = new_topics
        wordcloud.generate_from_frequencies(topics)
        wordcloud.to_file(image_address)
        image = Image.open(image_address)
        image.thumbnail((800, 800), Image.ANTIALIAS)
        image = image.save(image_address, 'png', quality=100)
        # plt.imshow(wordcloud)
        plt.show()
        
        return topics

##Finetune Bertopic model

In [78]:
df = pd.read_csv("english_data.csv")
df = df[['text']]
df = df.dropna()
df = df.drop_duplicates()
# df = df.reset_index(drop=True)
# print(f'We have #{len(df)} news!')

df['text'] = df['text'].apply(preprocess)
# df['text'] = df['text'].dropna()
df['text']= pd.DataFrame(df[df['text'].map(len) > 2]["text"])
df = pd.DataFrame(df['text'].dropna()).reset_index()

docs = df['text'].tolist()

model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = model.fit_transform(docs)

model.save("/content/gdrive/MyDrive/TopicModeling/English_BERTopic")
my_model = BERTopic.load("/content/gdrive/MyDrive/TopicModeling/English_BERTopic")
topics, probs = my_model.transform(docs)

Batches:   0%|          | 0/570 [00:00<?, ?it/s]

2022-11-15 21:20:57,539 - BERTopic - Transformed documents to Embeddings
2022-11-15 21:21:22,265 - BERTopic - Reduced dimensionality
2022-11-15 21:21:50,896 - BERTopic - Clustered reduced embeddings


Batches:   0%|          | 0/570 [00:00<?, ?it/s]

2022-11-15 21:22:33,066 - BERTopic - Reduced dimensionality
2022-11-15 21:23:04,592 - BERTopic - Calculated probabilities with HDBSCAN
2022-11-15 21:23:04,594 - BERTopic - Predicted clusters


##Evaluate model

In [86]:
def my_run(file_name,bertmodel):

  en_stop_words = set(stopwords.words('english'))

  print("strat ...")
  start_time = time.time()

  df = pd.read_csv(file_name)
  df = df[['text']]
  df = df.dropna()
  df = df.drop_duplicates()
  # df = df.reset_index(drop=True)
  # print(f'We have #{len(df)} news!')

  df['text'] = df['text'].apply(preprocess)
  # df['text'] = df['text'].dropna()
  df['text']= pd.DataFrame(df[df['text'].map(len) > 2]["text"])
  df = pd.DataFrame(df['text'].dropna()).reset_index()

  docs = df['text'].tolist()

  # topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
  # topics, probs = topic_model.fit_transform(docs)


  my_model = BERTopic.load(bertmodel)
  new_topics, new_probs = my_model.transform(docs)
  # topics = topic_model.get_topic(0)


  result = {}
  for d in (my_model.get_topics()).values():
    result.update(d)


  # inv_map = dict(new_topics)
  # inv_map = {v: k for k, v in rs.items()}


  result= remove_unusful_topics(result)
  result= remove_nested_keywords(result)

  wordcloud(result)
  return result

result = my_run("./english_data.csv","/content/gdrive/MyDrive/TopicModeling/English_BERTopic")
result

strat ...


Batches:   0%|          | 0/570 [00:00<?, ?it/s]

2022-11-15 21:38:38,075 - BERTopic - Reduced dimensionality
2022-11-15 21:39:10,605 - BERTopic - Calculated probabilities with HDBSCAN
2022-11-15 21:39:10,606 - BERTopic - Predicted clusters


{'also': 0.003308635143318614,
 'dont': 0.008877120470033611,
 'games': 0.07900722732022492,
 'hockey': 0.009453673665428751,
 'players': 0.009410298529480436,
 'season': 0.008885084199227967,
 'league': 0.007359465626146389,
 'launch': 0.013800109774547206,
 'orbit': 0.010934759907212759,
 'solar': 0.009677229173702812,
 'shuttle': 0.04762636286949632,
 'spacecraft': 0.00926589582025552,
 'satellite': 0.008949565234376799,
 'moon': 0.008602088895789764,
 'encryption': 0.0204733801827315,
 'clipper': 0.015514623885915065,
 'chip': 0.015363379290033605,
 'keys': 0.03625138494807538,
 'algorithm': 0.017297489674483506,
 'security': 0.010610285451824398,
 'escrow': 0.010467364437271818,
 'israeli': 0.021343895933644504,
 'jews': 0.016265722872693296,
 'jewish': 0.012027202329459325,
 'arabs': 0.0117056877776665,
 'palestinians': 0.008373672746657053,
 'gaza': 0.007761798230963539,
 'entry': 0.021100313196882072,
 'anonymous': 0.016071175865027636,
 'internet': 0.01472656884883614,
 'email

## Training




**NOTE**: Use `language="multilingual"` to select a model that support 50+ languages.

## Extracting Topics


In [27]:
freq = topic_model.get_topic_info(); freq.head(5)

Unnamed: 0,Topic,Count,Name
0,-1,6861,-1_one_would_image_like
1,0,1821,0_game_team_games_hockey
2,1,627,1_key_encryption_clipper_chip
3,2,603,2_space_launch_orbit_nasa
4,3,475,3_window_widget_xr_xterm


-1 refers to all outliers and should typically be ignored. Next, let's take a look at a frequent topic that were generated:

In [28]:
topic_model.get_topic(0)  # Select the most frequent topic

[('game', 0.015584233311880476),
 ('team', 0.013704586125662542),
 ('games', 0.01088149049956661),
 ('hockey', 0.00958978694027773),
 ('players', 0.009457722693097349),
 ('play', 0.009073529355169328),
 ('season', 0.008972285628380948),
 ('year', 0.007932609722290934),
 ('win', 0.007725499629583828),
 ('league', 0.007480745490791773)]

In [31]:
topic_model.get_topic(12)  # Select the most frequent topic

[('db', 0.05643315208252955),
 ('windows', 0.024411521660304338),
 ('dos', 0.015860334706841707),
 ('os', 0.01380788196599447),
 ('system', 0.01195847305835855),
 ('software', 0.011488423862941575),
 ('ms', 0.01079955937595632),
 ('nt', 0.01055648460645777),
 ('keyboard', 0.01028469094942337),
 ('files', 0.009822894659207061)]

In [32]:
topic_model.get_topic(13)  # Select the most frequent topic

[('format', 0.046085971668760925),
 ('files', 0.04376244675770305),
 ('gif', 0.042405123387024686),
 ('bmp', 0.035856115343815616),
 ('convert', 0.03133361109595587),
 ('file', 0.02153535384367118),
 ('iff', 0.018269106067759545),
 ('formats', 0.01641178834974322),
 ('pcx', 0.016313880722151217),
 ('image', 0.015118805814111073)]

In [33]:
topic_model.get_topic(177)  # Select the most frequent topic

[('cview', 0.1169377910735705),
 ('cluster', 0.062242381280943476),
 ('directory', 0.0528722667270269),
 ('temp', 0.050139992249596976),
 ('files', 0.049510315689674095),
 ('fat', 0.047612513774989144),
 ('disk', 0.04493317064555448),
 ('file', 0.04064940467581848),
 ('data', 0.039837768824179096),
 ('dir', 0.02314856774258216)]

**NOTE**: BERTopic is stocastich which mmeans that the topics might differ across runs. This is mostly due to the stocastisch nature of UMAP.

In [None]:
### Attributes

## Attributes

There are a number of attributes that you can access after having trained your BERTopic model:


| Attribute | Description |
|------------------------|---------------------------------------------------------------------------------------------|
| topics_               | The topics that are generated for each document after training or updating the topic model. |
| probabilities_ | The probabilities that are generated for each document if HDBSCAN is used. |
| topic_sizes_           | The size of each topic                                                                      |
| topic_mapper_          | A class for tracking topics and their mappings anytime they are merged/reduced.             |
| topic_representations_ | The top *n* terms per topic and their respective c-TF-IDF values.                             |
| c_tf_idf_              | The topic-term matrix as calculated through c-TF-IDF.                                       |
| topic_labels_          | The default labels for each topic.                                                          |
| custom_labels_         | Custom labels for each topic as generated through `.set_topic_labels`.                                                               |
| topic_embeddings_      | The embeddings for each topic if `embedding_model` was used.                                                              |
| representative_docs_   | The representative documents for each topic if HDBSCAN is used.                                                |

For example, to access the predicted topics for the first 10 documents, we simply run the following:

In [None]:
topic_model.topics_[:10]

[0, 133, 45, 4, 107, -1, -1, 0, 0, -1]

## Visualize Topics


In [29]:
topic_model.visualize_topics()

## Visualize Topic Probabilities


In [30]:
topic_model.visualize_distribution(probs[200], min_probability=0.015)

## Visualize Topic Hierarchy



In [34]:
topic_model.visualize_hierarchy(top_n_topics=50)

## Visualize Terms



In [35]:
topic_model.visualize_barchart(top_n_topics=5)

## Visualize Topic Similarity


In [36]:
topic_model.visualize_heatmap(n_clusters=20, width=1000, height=1000)

## Visualize Term Score Decline



In [None]:
topic_model.visualize_term_rank()

## Update Topics


In [39]:
topic_model.update_topics(docs, n_gram_range=(2, 2))

In [54]:
topic_model.get_topic(0)   # We select topic that we viewed before

[('power play', 0.005339318939373138),
 ('last year', 0.004717166250942099),
 ('st louis', 0.003976820621201657),
 ('scorer pts', 0.003322787531397074),
 ('pts pt', 0.0030761880199522656),
 ('pt la', 0.0028970940024435343),
 ('play scorer', 0.0028247207005680642),
 ('los angeles', 0.0027191067201109364),
 ('first period', 0.002634728643304001),
 ('red sox', 0.002567856671986171)]

## Topic Reduction





In [55]:
import json
rs = json.dumps(dict(topic_model.get_topic(0)))
rs

'{"power play": 0.005339318939373138, "last year": 0.004717166250942099, "st louis": 0.003976820621201657, "scorer pts": 0.003322787531397074, "pts pt": 0.0030761880199522656, "pt la": 0.0028970940024435343, "play scorer": 0.0028247207005680642, "los angeles": 0.0027191067201109364, "first period": 0.002634728643304001, "red sox": 0.002567856671986171}'

In [41]:
topic_model.reduce_topics(docs, nr_topics=60)

2022-11-15 19:40:53,558 - BERTopic - Reduced number of topics from 179 to 61


<bertopic._bertopic.BERTopic at 0x7f790cd6e490>

In [None]:
# Access the newly updated topics with:
print(topic_model.topics_)

[0, 30, 49, 3, 3, -1, -1, 0, 0, -1, -1, -1, -1, -1, 16, 20, 3, 8, -1, 6, -1, -1, 44, 6, 0, 16, 8, -1, -1, 14, 5, 51, -1, 0, 23, 19, -1, -1, 6, -1, -1, 42, 57, 7, 0, -1, -1, 9, 1, -1, -1, 5, 51, 1, -1, 6, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, 0, -1, -1, -1, 32, -1, 41, -1, -1, -1, 0, 36, 7, 0, 53, 5, -1, 57, 31, -1, -1, 15, -1, -1, 0, 2, -1, 30, -1, 32, -1, -1, 8, 7, -1, -1, -1, 4, 2, 0, -1, -1, 9, -1, 19, -1, 11, 10, -1, 49, -1, -1, 0, -1, -1, -1, -1, 34, -1, -1, -1, 2, -1, -1, -1, -1, -1, 0, 3, 2, -1, 1, 3, -1, -1, 19, -1, -1, -1, -1, 19, 37, 0, -1, 6, 6, 3, -1, -1, -1, -1, 4, 11, 17, -1, 2, -1, -1, -1, -1, -1, -1, -1, 23, 25, -1, 26, -1, 7, -1, 0, -1, 6, 0, -1, 0, -1, -1, -1, 2, -1, -1, 49, -1, 56, -1, 2, 12, -1, -1, 59, -1, -1, -1, -1, -1, -1, -1, -1, -1, 7, -1, 55, 12, -1, 10, -1, 3, 6, 8, -1, -1, -1, 2, 0, 24, 3, 5, 1, -1, 1, 6, -1, 0, 6, -1, -1, 7, 0, -1, -1, 0, 0, -1, -1, -1, 23, -1, -1, 9, 2, 5, -1, 27, -1, -1, 7, -1, -1, -1, 34, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 

# **Search Topics**


In [42]:
similar_topics, similarity = topic_model.find_topics("vehicle", top_n=5); similar_topics

[8, 23, 42, 53, 43]

In [46]:
topic_model.get_topic(23)

[('first bike', 0.014747261774893477),
 ('many miles', 0.010673780082384034),
 ('bike know', 0.00940129380872954),
 ('newused motorcycles', 0.00940129380872954),
 ('guide newused', 0.00940129380872954),
 ('bike miles', 0.00916400869244655),
 ('street bike', 0.008970139681676304),
 ('new bike', 0.008970139681676304),
 ('stock seat', 0.007280414002181825),
 ('honda shadow', 0.007280414002181825)]

# **Model serialization**


In [44]:
# Save model
topic_model.save("my_model")	

In [45]:
# Load model
my_model = BERTopic.load("my_model")	

# **Embedding Models**


## Sentence-Transformers
You can select any model from sentence-transformers here and pass it through BERTopic with embedding_model:



In [None]:
topic_model = BERTopic(embedding_model="xlm-r-bert-base-nli-stsb-mean-tokens")

Or select a SentenceTransformer model with your own parameters:


In [None]:
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens", device="cpu")
topic_model = BERTopic(embedding_model=sentence_model, verbose=True)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Click [here](https://www.sbert.net/docs/pretrained_models.html) for a list of supported sentence transformers models.  
