In [None]:
# Mount Google Drive so later cells can read from and write to /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
!pip install BERTopic
!pip install sentence_transformers
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import os

import nltk
from nltk.corpus import stopwords

from sentence_transformers import SentenceTransformer, util
from transformers.pipelines import pipeline

from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from umap import UMAP

In [None]:
#Read data.

# Load the corpus from Drive; the first CSV column is used as the row index.
data = pd.read_csv(
    '/content/drive/MyDrive/Topic Modeling-GBM/data.csv',
    index_col = 0,
    encoding = 'latin1',
    low_memory = False,
)

# One entry per row: the combined title/abstract text fed to the embedding
# model and to BERTopic in the cells below.
tiab = data.loc[:, 'Title + Abstract'].tolist()

# Quick sanity check of (rows, columns).
print(data.shape)

In [None]:
#Define stop words.

# Generic English stop words from NLTK.
nltk.download('stopwords')
stop_words = list(set(stopwords.words('english')))

# Domain-specific and boilerplate terms that would otherwise dominate every topic.
# CountVectorizer's default token pattern splits on punctuation and whitespace,
# so multi-word or hyphenated entries ("no abstract", "neuro-oncology") can never
# match a single token. They are kept for backward compatibility, and their
# constituent tokens ("neuro", "oncology") are listed explicitly so the term is
# actually filtered. "neurooncology" is added next to the original misspelling.
stop_words += [
    # disease / anatomy terms shared by the whole corpus
    "gbm", "glioblastoma", "multiforme", "brain", "tumor", "glioma", "gliomas",
    "glial", "neuro-oncology", "neuorooncology", "neurooncology", "neuro",
    "oncology", "tumors", "neoplasia", "cancer",
    # generic clinical vocabulary
    "patients", "patient", "years",
    # placeholders for missing abstracts
    "abstract", "nabstract", "no abstract", "available", "no",
    # publication-type boilerplate
    "reply", "editor", "editorial", "letter", "comment", "commentary",
    "comments", "authors", "study", "title",
    # structured-abstract section headings
    "background", "objective", "objectives", "introduction", "method",
    "methods", "material", "materials", "result", "results", "discussion",
    "conclusion", "conclusions",
    # case reports and conference material
    "case", "report", "meeting", "annual", "proceedings",
]

# The n-gram range has to be set here: BERTopic does not use its own
# `n_gram_range` argument when a custom vectorizer is supplied, so without this
# the topic representations would silently contain unigrams only.
vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=(1, 3))

In [None]:
#Embeddings.

# Sentence-embedding model loaded by its Hugging Face identifier.
embedding_model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')

# Embed every title/abstract once up front; the result is handed to
# BERTopic.fit_transform in the topic-modeling cell.
embeddings = embedding_model.encode(
    tiab,
    show_progress_bar = True,
)

In [None]:
from bertopic.representation import PartOfSpeech

# Create your representation model
representation_model = PartOfSpeech("en_core_web_sm")

# No BERTopic instance is created here: the "Topic modeling" cell builds
# `topic_model` itself, so a model constructed in this cell would be discarded
# immediately. To use the part-of-speech representation, pass
# `representation_model=representation_model` to the BERTopic constructor.

In [None]:
#Topic modeling.

# UMAP is stochastic; without a fixed seed the topics (and their ids) change on
# every run, which makes later steps that refer to topic ids unreproducible.
# The remaining parameters mirror BERTopic's default UMAP configuration, and
# low_memory matches the value passed to BERTopic below.
# NOTE: fixing random_state makes UMAP run single-threaded, so fitting is slower.
umap_model = UMAP(
    n_neighbors = 15,
    n_components = 5,
    min_dist = 0.0,
    metric = 'cosine',
    low_memory = False,
    random_state = 42,
)

# NOTE: BERTopic does not use `n_gram_range` when a custom `vectorizer_model` is
# supplied; the effective n-gram range is whatever `vectorizer` was built with.
topic_model = BERTopic(
    umap_model = umap_model,
    vectorizer_model = vectorizer,
    language = 'english',
    min_topic_size = 300,
    top_n_words = 50,
    n_gram_range = (1,3),
    calculate_probabilities = True,
    verbose = False,
    low_memory = False,
)

# Fit on the documents using the precomputed embeddings.
topics, probs = topic_model.fit_transform(tiab, embeddings)

topic_model.get_topic_info()

In [None]:
#Reduce outliers.

# Reassign outlier documents using the per-topic probabilities from fitting;
# `threshold` is the cut-off applied by the "probabilities" strategy.
new_topics = topic_model.reduce_outliers(
    tiab,
    topics,
    probabilities=probs,
    threshold=0.05,
    strategy="probabilities",
)

# Recompute the topic representations for the new assignments, reusing the
# same vectorizer and number of words per topic as in the initial fit.
topic_model.update_topics(
    tiab,
    topics = new_topics,
    vectorizer_model = vectorizer,
    top_n_words = 50,
)

# Per-topic summary table and the document count of each topic.
topic_info = topic_model.get_topic_info()
count_list = topic_info.loc[:, 'Count'].tolist()

topic_info

In [None]:
#Load the model.

model_path = "/content/drive/MyDrive/Topic Modeling-GBM/topic_model"

# The model is only written by the final "Save the topic model" cell, so on a
# first run (or a fresh Drive) there is nothing to load yet. Loading
# unconditionally would abort a Restart & Run All; in that case keep the model
# fitted in the cells above.
# NOTE(review): depending on how the model was saved, loading may unpickle
# arbitrary objects - only load model files you created yourself.
if os.path.exists(model_path):
    topic_model = BERTopic.load(model_path)
else:
    print(f"No saved topic model at {model_path}; continuing with the model currently in memory.")

In [None]:
# Merge the listed topics into a single topic.
# NOTE(review): these ids refer to the model currently in `topic_model`;
# presumably topic ids change after a merge, so re-running this cell would merge
# a different pair of topics - confirm before re-executing.
topics_to_merge = [0, 7]
topic_model.merge_topics(tiab, topics_to_merge)

# Refresh the per-topic summary and document counts after the merge.
topic_info = topic_model.get_topic_info()
count_list = topic_info.loc[:, 'Count'].tolist()

topic_info

In [None]:
#Generate keywords.

# One label string per topic: up to 50 topic words joined by ", ",
# without the topic number as a prefix.
keywords = topic_model.generate_topic_labels(
    nr_words = 50,
    topic_prefix = False,
    separator = ", ",
)

In [None]:
#Get representative documents for each topic and save the spreadsheet.

# Number of representative-document columns written to the spreadsheet.
n_representative_docs = 3
doc_columns = [f'Representative Document {i}' for i in range(1, n_representative_docs + 1)]

# Document count per topic id, in ascending topic-id order. Read from the model
# here rather than from an earlier cell's variable so it cannot be stale.
topic_sizes = topic_model.get_topic_info().set_index('Topic')['Count'].sort_index()
topic_ids = topic_sizes.index.tolist()

# `keywords` holds one label per topic; fail loudly if it does not line up with
# the current model instead of silently mislabeling rows.
# NOTE(review): assumes generate_topic_labels returns labels in ascending
# topic-id order - confirm against the installed BERTopic version.
if len(keywords) != len(topic_ids):
    raise ValueError(
        f"Expected one keyword label per topic ({len(topic_ids)}), got {len(keywords)}; "
        "re-run the 'Generate keywords' cell for the current model."
    )

# Representative documents keyed by topic id. Rows are built by id instead of by
# position, so a topic without representative documents (or with fewer than
# expected) yields blank cells rather than a shape error or shifted labels.
docs_by_topic = topic_model.get_representative_docs()

rows = []
for topic_id, key_words in zip(topic_ids, keywords):
    docs = list(docs_by_topic.get(topic_id, []))[:n_representative_docs]
    docs += [''] * (n_representative_docs - len(docs))
    rows.append([key_words, int(topic_sizes.loc[topic_id]), *docs])

representative_docs = pd.DataFrame(
    rows,
    columns = ['Key Words', 'Number of Documents', *doc_columns],
)

# Blank 'Topic Label' index column, left empty in the CSV so labels can be
# filled in by hand.
representative_docs.insert(0, 'Topic Label', '')
representative_docs = representative_docs.set_index('Topic Label')

representative_docs.to_csv('/content/drive/MyDrive/Topic Modeling-GBM/representative_docs.csv')

In [None]:
#Save the topic model.

# Persist the current model to Drive, at the same location the
# "Load the model" cell reads from.
model_path = "/content/drive/MyDrive/Topic Modeling-GBM/topic_model"

topic_model.save(path = model_path)