In [None]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load a pre-trained model from sentence-transformers.
# This single `model` object is what the embedding loop later calls
# `encode` on for every ADR file.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Path to the ADR directory (relative, so it is resolved against the
# notebook's current working directory).
adr_directory = "../data/ADRs-Updated"


# Considerations

- **Max sequence length:** if a record is longer than 512 tokens, split it into chunks and aggregate the chunk embeddings? (Confirm the model's actual limit first.)
- **Text clean-up:** remove common words and phrases and keep only the text that carries meaning, for better clustering; use libraries to strip Markdown elements, common words, etc.
- **Cluster interpretation:** look into how to interpret the resulting clusters, and read papers on that.

In [None]:
# Preprocessing hook applied to each document before it is encoded
def preprocess_text(text):
    """Return ``text`` prepared for embedding.

    At present this is a pass-through: no cleaning is performed and the
    input is handed back unchanged. Custom preprocessing can be added here
    if it turns out to be needed.

    Args:
        text: The raw document contents.

    Returns:
        The same object that was passed in.
    """
    prepared = text
    return prepared

In [None]:
# List to store filenames and their corresponding embeddings
data = []

# Only '.md' files are embedded, so only those are counted: this keeps the
# "n/total" progress message accurate when the directory holds other entries.
# os.listdir returns names in arbitrary order, so they are sorted to make the
# row order of `df` (and everything derived from it) reproducible.
md_files = sorted(
    file_name
    for file_name in os.listdir(adr_directory)
    if file_name.endswith('.md')
)
total_files = len(md_files)

# Read, preprocess, and encode each ADR file
count = 0  # stays 0 when there are no '.md' files; otherwise ends at total_files
for count, file_name in enumerate(md_files, start=1):
    file_path = os.path.join(adr_directory, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    # The file is already closed here; encoding does not need the handle.
    preprocessed_text = preprocess_text(text)
    embedding = model.encode(preprocessed_text)
    print(f"Embedding {count}/{total_files} for {file_name} has been generated.")
    data.append((file_name, embedding))

# Create a DataFrame with filenames and their embeddings
df = pd.DataFrame(data, columns=['filename', 'embedding'])

print(df.head())

# Nothing is written to disk in this cell, so report generation, not saving.
print("Embeddings have been successfully generated.")

In [None]:
# Perform k-means clustering on the embeddings
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score  # imported for cluster evaluation; not used in this cell

# Number of clusters
k = 5

# Extract embeddings: one entry per row of `df`, in row order
X = df['embedding'].to_list()

# Perform k-means clustering.
# n_init is given explicitly because its default differs between
# scikit-learn releases (10 in older versions, 'auto' from 1.4); pinning it
# keeps the clustering independent of the installed version and avoids the
# deprecation warning. random_state fixes the centroid initialisation.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
df['cluster'] = kmeans.labels_

# Save the clusters
# NOTE(review): this writes every column of `df`, including 'embedding'.
# Presumably those cells hold array objects, which a CSV can only store as
# text — confirm whether downstream readers need that column at all.
df.to_csv('clusters.csv', index=False)

print("Clusters have been successfully saved.")


In [None]:
# Visualize the clusters in two dimensions
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the embeddings onto their first two principal components.
# random_state is fixed so the projection (and the saved figure) is
# reproducible even if PCA selects a randomized solver for this input size.
pca = PCA(n_components=2, random_state=0)
X_pca = pca.fit_transform(X)

# Create a DataFrame with the PCA components and the k-means label of each row
df_pca = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'])
df_pca['cluster'] = kmeans.labels_

# Plot the clusters: one scatter series per cluster id so that each gets its
# own colour and legend entry. An explicit figure/axes pair is used instead
# of pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(10, 10))
for cluster in range(k):
    cluster_df = df_pca[df_pca['cluster'] == cluster]
    ax.scatter(cluster_df['PCA1'], cluster_df['PCA2'], label=f'Cluster {cluster}')
ax.set_xlabel('PCA1')
ax.set_ylabel('PCA2')
ax.set_title('Clusters of ADRs')
ax.legend()
fig.savefig('clusters.png')