In [4]:
import sys
import os

current_dir = os.getcwd()

# Add the parent directory of 'scrapping' to the system path
sys.path.append(os.path.abspath(os.path.join(current_dir, '..', 'scrapping')))

from text_cleaner import read_and_clean_adrs

from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from markdown2 import markdown
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import pyLDAvis
import pyLDAvis.lda_model
import matplotlib.pyplot as plt
import json

print(sys.path)

import warnings
warnings.filterwarnings('ignore')


# Path to the ADR directory
adr_directory = "../../data/ADRs-Updated"

['/Library/Frameworks/Python.framework/Versions/3.12/lib/python312.zip', '/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12', '/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/lib-dynload', '', '/Users/nikolakis/Library/Python/3.12/lib/python/site-packages', '/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages', '/Users/nikolakis/Projects/ADR-thesis/src/scrapping', '/Users/nikolakis/Projects/ADR-thesis/src/scrapping']


In [7]:
# Iterate over all files in the directory
metadata_path = "../../data/ADR-Study-Dataset-Metadata/repositories"
total_adr_files = 0
for filename in os.listdir(metadata_path):
    if filename.endswith('.json'):
        file_path = os.path.join(metadata_path, filename)
        # Open and read the JSON file
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)
            # Add the value of numAdrFiles to the total
            total_adr_files += data.get('numAdrFiles', 0)

# Print the total number of ADR files
print(f'Total number of ADR files in metadata: {total_adr_files}')

Total number of ADR files in metadata: 6362


In [9]:
# count total number of files in the ADR directory
total_adr_files = 0
for filename in os.listdir(adr_directory):
    if filename.endswith('.md'):
        total_adr_files += 1
print(f'Total number of ADR files in directory: {total_adr_files}')

Total number of ADR files in directory: 5368


In [5]:
def clean_text(markdown_content):
    # Convert Markdown to HTML
    html_content = markdown(markdown_content)
    # Parse HTML to text
    soup = BeautifulSoup(html_content, "html.parser")
    text = soup.get_text()
    # Remove non-alphabetic characters and lower the case
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    # Tokenize, remove stop words, and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    # remove words with less than 3 characters
    tokens = [word for word in tokens if len(word) > 2]

    # remove common terms
    common_terms = ["context", "decision", "status", "consequences",
                     "motivation", "options", "option", "alternatives", "alternative"]
    
    tokens = [word for word in tokens if word not in common_terms]

    return ' '.join(tokens), tokens

# Most common terms in adr names

In [3]:
file_name_word_count = {}
for file_name in os.listdir(adr_directory):
    words = file_name.split('-')
    words = [word.replace('.md', '') for word in words]
    # convert to lowercase
    words = [word.lower() for word in words]
    # remove the numbers
    words = [word for word in words if not word.isdigit()]
    # remove the word ADR
    words = [word for word in words if word.lower() != 'adr']
    # keep words longer than 2 characters
    words = [word for word in words if len(word) > 2]
    # use limatizer
    words = [lemmatizer.lemmatize(word) for word in words]
    # remove stop words
    words = [word for word in words if word not in stop_words]
    for word in words:
        if word in file_name_word_count:
            file_name_word_count[word] += 1
        else:
            file_name_word_count[word] = 1

# Top 10 words in ADR file names
print("Top 10 words in ADR file names:")
for word, count in Counter(file_name_word_count).most_common(50):
    print(f"{word}: {count}")


Top 10 words in ADR file names:
use: 900
api: 163
data: 116
test: 110
service: 105
architecture: 99
file: 88
component: 80
code: 74
testing: 73
application: 70
framework: 67
module: 66
create: 62
decision: 61
user: 60
support: 59
store: 58
version: 58
frontend: 57
record: 57
add: 56
database: 56
react: 54
dependency: 54
structure: 54
management: 54
client: 53
docker: 52
aws: 52
event: 51
package: 50
app: 50
remove: 50
library: 49
implement: 49
server: 49
model: 47
storage: 47
using: 46
configuration: 45
language: 44
integration: 44
build: 44
config: 42
project: 42
authentication: 41
container: 40
source: 40
type: 40


## Average clean tokens (words) per ADR: 212.4236214605067

## Most frequent words inside adrs

In [6]:
all_words = []
for file_name in os.listdir(adr_directory):
    file_path = os.path.join(adr_directory, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        preprocessed_text, words = clean_text(text)
        all_words.extend(words)

word_freq = Counter(all_words)
print(word_freq.most_common(50))

[('use', 9933), ('need', 7712), ('data', 6911), ('user', 6667), ('service', 6332), ('new', 5297), ('code', 4958), ('would', 4925), ('consequence', 4836), ('using', 4815), ('api', 4752), ('change', 4633), ('application', 4588), ('file', 4503), ('date', 4360), ('type', 3842), ('component', 3827), ('used', 3823), ('one', 3769), ('also', 3718), ('accepted', 3625), ('time', 3579), ('version', 3528), ('test', 3408), ('support', 3372), ('make', 3157), ('project', 2904), ('may', 2853), ('message', 2797), ('example', 2785), ('adr', 2772), ('client', 2759), ('module', 2737), ('request', 2724), ('system', 2687), ('work', 2684), ('configuration', 2562), ('like', 2549), ('feature', 2520), ('set', 2485), ('could', 2475), ('way', 2458), ('value', 2435), ('create', 2421), ('key', 2387), ('implementation', 2375), ('case', 2353), ('error', 2336), ('different', 2322), ('name', 2269)]
