In [3]:
import sys
import os

# Get the current working directory
current_dir = os.getcwd()

# Make the sibling 'scrapping' directory (one level up from the working
# directory) importable, so that `text_cleaner` can be imported below.
# The membership check keeps the cell idempotent: re-running it must not
# append the same directory to sys.path over and over.
scrapping_dir = os.path.abspath(os.path.join(current_dir, '..', 'scrapping'))
if scrapping_dir not in sys.path:
    sys.path.append(scrapping_dir)

# Now you can import the function
from text_cleaner import read_and_clean_adrs

from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from markdown2 import markdown
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import pyLDAvis
import pyLDAvis.lda_model
import matplotlib.pyplot as plt

# Debug aid: print the module search path so the entry appended to sys.path
# earlier in this cell can be checked.
# NOTE(review): the printed list contains absolute local paths; consider
# clearing this output before sharing the notebook.
print(sys.path)

# Install a global "ignore" filter, silencing all warnings from here on.
# NOTE(review): this also hides deprecation and convergence warnings from the
# libraries used below — confirm that this is intended.
import warnings
warnings.filterwarnings('ignore')


# Path to the ADR directory
# (relative path; presumably resolved against the notebook's working
# directory by the loader — TODO confirm)
adr_directory = "../../data/ADRs-Updated"

['/Library/Frameworks/Python.framework/Versions/3.12/lib/python312.zip', '/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12', '/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/lib-dynload', '', '/Users/nikolakis/Library/Python/3.12/lib/python/site-packages', '/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages', '/Users/nikolakis/Projects/ADR-thesis/src/scrapping', '/Users/nikolakis/Projects/ADR-thesis/src/scrapping', '/Users/nikolakis/Projects/ADR-thesis/src/scrapping']


In [4]:
# Load the ADR corpus from `adr_directory` through the project helper
# `read_and_clean_adrs`. The result is later passed to
# TfidfVectorizer.fit_transform, so it is presumably an iterable with one
# cleaned text string per ADR — TODO confirm in text_cleaner.
# NOTE(review): `save=False` presumably skips writing the cleaned texts to
# disk — confirm against the helper's signature.
cleaned_texts = read_and_clean_adrs(adr_directory, save=False)

Topic models aim to uncover the latent topics, or themes, that characterize a set of documents. In this way, they are a machine-learning-based form of text analysis used to thematically annotate large text corpora.

## TF-IDF + LDA (hyperparameters obtained from grid search)

In [5]:
from sklearn.decomposition import LatentDirichletAllocation

# Configurations that gave good results:
# 1) n_components=4, learning_decay=0.5, learning_offset=10, max_df=0.9, min_df=5, max_features=2000
# 2) n_components=5, learning_decay=0.5, learning_offset=10, max_df=0.9, min_df=5, max_features=2000
# 3) n_components=4, learning_decay=0.5, learning_offset=10, max_df=0.9, min_df=5, max_features=1000

# Notes: from hyperparameter tuning
# n_components=4, learning_decay=0.5, learning_offset=10, max_df=0.9, min_df=5, max_features=1000

# Define the vectorizer and LDA model.
# The vectorizer drops English stop words, terms that occur in more than 90%
# of the documents or in fewer than 5 documents, and keeps at most 1000 terms.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=5, max_features=1000)

# random_state pins the model's random initialisation so that the topics (and
# every output derived from them below) are reproducible across kernel
# restarts; 42 is the same seed the grid-search cell uses.
# NOTE(review): per the scikit-learn docs, learning_decay and learning_offset
# only affect learning_method='online'; with the default method they are
# presumably ignored — confirm against the installed version.
lda = LatentDirichletAllocation(
    n_components=4,
    learning_decay=0.5,
    learning_offset=10,
    n_jobs=-1,
    max_iter=50,
    random_state=42,
)

# Document-term TF-IDF matrix: one row per entry of `cleaned_texts`.
X = tfidf_vectorizer.fit_transform(cleaned_texts)

lda.fit(X)

# Prominent Topics
- Topic 1: Cloud and Infrastructure --> prevalent in all tries
- Topic: Data and data storage --> prevalent in almost all tries
- Topic 2: Programming language and frameworks --> have to dig in a bit with different hyperparameters
- Topic 3: Authentication and Security --> have to dig in a bit with different hyperparameters
- Topic 4: General architecture and design (classes, APIs) --> prevalent in all tries --> maybe focus on this and split it up into different topics
- Topic 5: Linting, formatting and conventions --> have to dig in a bit with different hyperparameters
- Topic 6: Building and releasing (CI/CD, Testing) --> have to dig in a bit with different hyperparameters
- Topic 7: ADRs about architectural decisions --> seen with >4 topics

## Evaluate TF-IDF + LDA

In [6]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable with one weight
            vector per topic (each vector must support ``argsort()``).
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of terms to list per topic.

    For each topic, a ``Topic <index>:`` header line is printed, followed by
    one line with the terms joined by single spaces, strongest term first.
    """
    for position, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        strongest_first = weights.argsort()[::-1][:n_top_words]
        terms = (feature_names[column] for column in strongest_first)
        print(f"Topic {position}:")
        print(" ".join(terms))

n_top_words = 20
print_top_words(lda, tfidf_vectorizer.get_feature_names_out(), n_top_words)

print("\n\nDistributions")
# See how much of the corpus falls into each topic.
# Each column of `topic_distribution` holds one topic's weight per document,
# so a column sum is a "soft" document count for that topic rather than a
# hard assignment count.
topic_distribution = lda.transform(X)
# Take the number of topics from the result itself instead of hard-coding it,
# so the report stays correct when n_components is changed.
for i in range(topic_distribution.shape[1]):
    topic_mass = np.sum(topic_distribution[:, i])
    print(f"Topic {i}: {topic_mass}" + f" ({topic_mass / len(cleaned_texts) * 100:.2f}%)")

Topic 0:
service environment aws docker user application container deployment cluster image version access cloud kubernetes platform release branch build authentication run
Topic 1:
component test project code file library framework consequence architecture good language tool react testing change adr support style record javascript
Topic 2:
database log search email lambda govuk elasticsearch metric spring python logging postgres service postgresql prometheus mongodb rail django redis rds
Topic 3:
data api event user message type request object value model field service new client change error key function state query


Distributions
Topic 0: 1369.1647612727443 (25.51%)
Topic 1: 1751.2003755539972 (32.62%)
Topic 2: 369.64948496757 (6.89%)
Topic 3: 1877.9853782056884 (34.98%)


In [7]:
# Prepare to visualize
# enable_notebook() presumably sets pyLDAvis up for inline rendering in the
# notebook — see the pyLDAvis docs.
pyLDAvis.enable_notebook()
# Build the visualisation data from the fitted LDA model, the TF-IDF matrix X
# and the vectorizer that produced it.
# NOTE(review): mds='tsne' presumably selects t-SNE for the 2-D topic layout,
# which may vary between runs unless seeded — confirm in the pyLDAvis docs.
panel = pyLDAvis.lda_model.prepare(lda, X, tfidf_vectorizer, mds='tsne')
# Render the interactive panel as this cell's output.
pyLDAvis.display(panel)

## Hyperparameter tuning for LDA and TF-IDF

In [None]:
# Untuned estimators for the search; only the LDA seed is fixed.
# NOTE(review): these assignments rebind `tfidf_vectorizer` and `lda`, which
# earlier cells use for the fitted vectorizer and model; re-running those
# cells' evaluation code after this one would pick up these new objects.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
lda = LatentDirichletAllocation(random_state=42)

# Chain vectorizer -> topic model so both are tuned together
pipeline = Pipeline(steps=[('tfidf', tfidf_vectorizer), ('lda', lda)])

# Candidate values, keyed as "<pipeline step>__<parameter>"
# (4 * 7 * 3 * 4 = 336 combinations, each fitted on 5 folds)
param_grid = dict(
    tfidf__max_features=[1000, 2000, 3000, 4000],
    lda__n_components=list(range(4, 11)),
    lda__learning_decay=[0.5, 0.7, 0.9],
    lda__learning_offset=[10, 15, 20, 30],
)

# Exhaustive 5-fold search over the grid, using all cores, with progress output.
# No `scoring` is given, so candidates are ranked by the pipeline's own score.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(cleaned_texts)

# Evaluate the best model
# `best_params_` is the winning combination from `param_grid`.
print("Best Parameters: ", grid_search.best_params_)
# Despite the name, this is the whole best pipeline (TF-IDF + LDA), not the
# bare LDA estimator: the search was run over `pipeline`.
best_lda_model = grid_search.best_estimator_

# Print top words for each topic
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic.

    This definition replaces the earlier ``print_top_words`` in the notebook
    namespace, so it accepts both call styles used in this notebook: a fitted
    pipeline containing an ``'lda'`` step, or a bare fitted topic model.

    Args:
        model: Either an object with ``named_steps['lda']`` (e.g. a fitted
            Pipeline) or an object that exposes ``components_`` directly.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of terms to list per topic.

    For each topic, a ``Topic <index>:`` header line is printed, followed by
    one line with the terms joined by single spaces, strongest term first.
    """
    # Unwrap the LDA step when given a pipeline; otherwise use the model as is.
    named_steps = getattr(model, "named_steps", None)
    lda_model = named_steps['lda'] if named_steps is not None else model

    for topic_idx, topic in enumerate(lda_model.components_):
        print(f"Topic {topic_idx}:")
        # argsort is ascending; the reversed slice takes the last
        # n_top_words indices, highest weight first.
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

# Show the 10 strongest terms per topic for the best pipeline found by the
# grid search. The vocabulary is taken from that pipeline's own fitted TF-IDF
# step, so the column indices match its LDA components.
n_top_words = 10
tfidf_feature_names = grid_search.best_estimator_.named_steps['tfidf'].get_feature_names_out()
print_top_words(best_lda_model, tfidf_feature_names, n_top_words)