In [None]:
import sys
import os
# Resolve the sibling 'scrapping' directory (one level above the current
# working directory) so that local modules located there can be imported
# (text_cleaner is imported right after this).
current_dir = os.getcwd()
scrapping_dir = os.path.abspath(os.path.join(current_dir, '..', 'scrapping'))
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if scrapping_dir not in sys.path:
    sys.path.append(scrapping_dir)
from text_cleaner import read_and_clean_adrs

from nltk.corpus import stopwords
from markdown2 import markdown
import openai
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance, OpenAI
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired
import datamapplot
import pandas as pd
import seaborn

# Relative path to the ADR directory
adr_directory = "../../data/ADRs-Updated"

# Silence all warnings for the remainder of the session
import warnings
warnings.filterwarnings(action='ignore')

# Load environment variables via python-dotenv
from dotenv import load_dotenv
load_dotenv()

In [None]:
# Prompt template for the topic-labelling model. [DOCUMENTS] and [KEYWORDS] are
# placeholders left in the template for the representation model to fill in;
# the "topic: <topic label>" line fixes the reply format requested from the LLM.
prompt = """
I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short topic label in the following format:
topic: <topic label>
the topic label must be at most 5 words long strictly, it must not contain specific names of technologies or programming languages. The documents are related to software development. They are architectural
decision records (ADRs) that describe the decisions made in the development of software systems. The keywords are extracted from the documents.
The topic labels should be general enough but will need to differentiate between
decision categories such as component decisions, programming language or framework decisions,
security, performance, scalability, infrastructure, deployment, testing, formatting, standards, etc.
"""

# GPT as representation model ~0.10$ per run
# The key is read from the OPEN_AI_API_KEY environment variable; os.getenv
# yields None when that variable is not set.
# NOTE(review): confirm how the client behaves when api_key is None.
client = openai.OpenAI(api_key=os.getenv("OPEN_AI_API_KEY"))
# NOTE(review): per the BERTopic docs, `tokenizer` is only used to measure
# document length when `doc_length` is also given — confirm whether truncating
# the documents sent to the model was intended here.
open_ai_repr_model = OpenAI(
    client,
    model="gpt-3.5-turbo",
    chat=True,
    prompt=prompt,
    tokenizer="vectorizer",
)

In [None]:
# Two representations are registered so that both are computed in a single `fit`:
#   "Main"     -> the OpenAI-based representation model (open_ai_repr_model)
#   "Keywords" -> a KeyBERT-inspired model that fine-tunes the topic keywords
keybert_repr = KeyBERTInspired()
representation_model = dict(Main=open_ai_repr_model, Keywords=keybert_repr)

In [None]:
# Sentence-transformer model used to embed the documents
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# Encode every outlier document up front, showing a progress bar while encoding.
# NOTE(review): `outlier_documents` is not defined in the cells visible here —
# confirm it is created earlier in the notebook.
outlier_embeddings = embedding_model.encode(
    outlier_documents,
    show_progress_bar=True,
)

In [None]:
from umap import UMAP
from hdbscan import HDBSCAN
# Dimensionality reduction: seeded with random_state so the projection is repeatable
umap_model = UMAP(
    n_neighbors=10,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Clustering: raising min_cluster_size yields fewer topics, lowering it yields more
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

In [None]:
# Assemble the topic model from the components configured above and fit it on
# the outlier documents.
bert_topic_1 = BERTopic(
    # hyperparameters
    language="english",  # language of the documents
    # nr_topics=10,  # number of topics to output (this reduces topics AFTER they have been discovered)
    top_n_words=10,  # number of top words per topic
    n_gram_range=(1, 2),  # number of words per n-gram (n-grams are phrases of n words)
    # NOTE(review): per the BERTopic docs, min_topic_size is not used when a
    # custom hdbscan_model is passed, so the effective minimum cluster size is
    # the min_cluster_size=15 set on hdbscan_model, not 12. Kept for reference.
    min_topic_size=12,
    # models and embeddings
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    embedding_model=embedding_model,
    # other
    calculate_probabilities=True,  # calculate the probs of a document belonging to a topic (slows down training, use when good results are found)
)
# Reuse the embeddings already computed with the same embedding_model instead
# of letting fit_transform encode every document a second time.
topics, probs = bert_topic_1.fit_transform(outlier_documents, embeddings=outlier_embeddings)

In [None]:
# Plot the documents, passing the precomputed embeddings to the visualisation
bert_topic_1.visualize_documents(
    outlier_documents,
    embeddings=outlier_embeddings,
)