In [None]:
from dotenv import load_dotenv
from phmlondon.snow_utils import SnowflakeConnection
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
import umap
from sklearn.cluster import AgglomerativeClustering, DBSCAN, OPTICS
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

In [None]:
# Load environment variables from a local .env file before opening the connection.
# NOTE(review): presumably SnowflakeConnection picks its credentials up from these
# variables — confirm against phmlondon.snow_utils.
load_dotenv()
conn = SnowflakeConnection()
# Point the session at the development database and schema used by this notebook.
conn.use_database("INTELLIGENCE_DEV")
conn.use_schema("AI_CENTRE_DEV")

In [None]:
# Tunable parameters for the pipeline.
# Frequency threshold applied when filtering out rarely recorded codes.
code_frequency_cutoff = 100
# Size of the random sample taken before clustering.
sample_size_for_clustering = 5_000

In [None]:
# Pull per-patient code counts into pandas.
# The query returns one row per (patient_id, core_concept_id, name) with the number
# of matching observations, restricted to concepts whose name contains '(disorder)'
# and to observations dated within the last 10 years.
counts_by_patient = conn.session.sql("""
select o.patient_id, o.core_concept_id, count(o.core_concept_id) as code_count, c.name
from prod_dwh.analyst_primary_care.observation as o
join prod_dwh.analyst_primary_care.concept as c
on o.core_concept_id = c.dbid
where c.name like '%(disorder)%' 
and o.clinical_effective_date >= DATEADD(YEAR, -10, CURRENT_DATE)
group by o.patient_id, o.core_concept_id, c.name;
""").to_pandas()

In [None]:
# Inspect the raw query result.
counts_by_patient

In [None]:
# The 30 patient/code rows with the highest counts.
counts_by_patient.sort_values(by="CODE_COUNT", ascending=False).head(30)

In [None]:
# Total number of times each code was recorded across all patients, most frequent first.
total_code_counts = (
    counts_by_patient
    .groupby(["CORE_CONCEPT_ID", "NAME"])
    .agg({"CODE_COUNT": "sum"})
    .sort_values(by="CODE_COUNT", ascending=False)
    .reset_index()
)


In [None]:
# Most frequently recorded codes overall.
total_code_counts.head(30)

In [None]:
# Least frequently recorded codes overall.
total_code_counts.tail(30)

In [None]:
# Remove any codes that haven't occurred in at least `code_frequency_cutoff` patients
# (should later experiment with removing this step).
# counts_by_patient holds one row per patient/code/name combination, so — assuming
# each code has a single name — the row count per code is its number of patients.
code_occurences_by_patient = counts_by_patient['CORE_CONCEPT_ID'].value_counts()
# ">=" so that a code seen in exactly `code_frequency_cutoff` patients is kept ("at least").
codes_to_include = code_occurences_by_patient[code_occurences_by_patient >= code_frequency_cutoff].index
counts_by_patient_filtered = counts_by_patient[counts_by_patient['CORE_CONCEPT_ID'].isin(codes_to_include)]
counts_by_patient_filtered.head()

In [None]:
# Log-normalise the per-patient code counts.
# log1p (log(1 + x)) is used rather than log: every count here is >= 1, and
# log(1) == 0 would make a code recorded exactly once indistinguishable from a code
# that was never recorded once the values are placed in the sparse patient x code matrix.
counts_by_patient_filtered = counts_by_patient_filtered.copy()
# Vectorised call on the whole column instead of a per-element apply.
counts_by_patient_filtered['LOG_CODE_COUNT'] = np.log1p(counts_by_patient_filtered['CODE_COUNT'])

In [None]:
# Check the new LOG_CODE_COUNT column.
counts_by_patient_filtered.head()

In [None]:
# Size of the filtered data set: distinct patients and distinct codes.
num_patients, num_concepts = (
    counts_by_patient_filtered[column].nunique()
    for column in ('PATIENT_ID', 'CORE_CONCEPT_ID')
)
print(f"Unique PATIENT_IDs: {num_patients}")
print(f"Unique CORE_CONCEPT_IDs: {num_concepts}")


In [None]:
# Reduce the number of data points by randomly selecting patients.
# Sampling is done on patient IDs rather than on individual patient/code rows, so
# every selected patient keeps their complete set of codes for clustering.
all_patient_ids = pd.Series(counts_by_patient_filtered['PATIENT_ID'].unique())
sampled_patient_ids = all_patient_ids.sample(
    # Never ask for more patients than exist (sample() raises otherwise).
    n=min(sample_size_for_clustering, len(all_patient_ids)),
    # Fixed seed so the same patients are selected on every run.
    random_state=42,
)
counts_by_patient_filtered = counts_by_patient_filtered[
    counts_by_patient_filtered['PATIENT_ID'].isin(sampled_patient_ids)
]
len(counts_by_patient_filtered)


In [None]:
# Encode patients and codes as categoricals; their integer codes become the row and
# column positions of the sparse patient x code matrix of log counts.
patients = pd.Categorical(counts_by_patient_filtered['PATIENT_ID'])
concepts = pd.Categorical(counts_by_patient_filtered['CORE_CONCEPT_ID'])
log_counts = counts_by_patient_filtered['LOG_CODE_COUNT']
row_positions, column_positions = patients.codes, concepts.codes
reshaped_data = csr_matrix((log_counts, (row_positions, column_positions)))

In [None]:
# Print the stored entries of the sparse matrix as a sanity check.
print(reshaped_data)

In [None]:
# Dimensionality reduction with truncated SVD (better than PCA for sparse matrices).
n_components = 50
svd = TruncatedSVD(
    n_components=n_components,
    random_state=42,
)

In [None]:
# Fit the SVD and project every patient onto the reduced components.
X_reduced = svd.fit_transform(reshaped_data)
explained_variance_ratio = svd.explained_variance_ratio_
print(f"Explained variance ratio: {explained_variance_ratio}")

In [None]:
# Density-based clustering of the reduced patient vectors with OPTICS.
# NOTE(review): max_eps=50 looks large for log-scaled, SVD-reduced features —
# confirm it is intended.
cluster_labels_by_optics = OPTICS(min_samples=10, max_eps=50).fit_predict(X_reduced)

In [None]:
# Cluster the reduced patient vectors with DBSCAN; points labelled -1 are noise.
dbscan_eps = 50
dbscan_min_samples = 10
dbscan_model = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples)
cluster_labels_by_dbscan = dbscan_model.fit_predict(X_reduced)

# Alternative (tighter) setting kept for reference: DBSCAN(eps=0.5, min_samples=5).

In [None]:
# NOTE(review): this binds the KNeighborsClassifier class itself, not cluster labels —
# nothing is instantiated or fitted here. KNeighborsClassifier is a supervised
# classifier, so it would need training labels before it could predict any.
# Looks like an unfinished placeholder — TODO confirm, then complete or remove.
cluster_labels_by_knn = KNeighborsClassifier

In [None]:
# One row per patient. The categories of `patients` are in the same order as the rows
# of the matrix that was clustered, so IDs and labels line up positionally.
patient_ids = patients.categories                        # Original PATIENT_IDs
labels_per_patient = np.array(cluster_labels_by_dbscan)  # Corresponding cluster labels
patient_clusters = pd.DataFrame({'PATIENT_ID': patient_ids, 'CLUSTER_LABEL': labels_per_patient})
patient_clusters.head(30)

In [None]:
# Attach each patient's cluster label to their per-code rows.
# validate='many_to_one' makes pandas raise if patient_clusters ever holds more than
# one row per patient, which would otherwise silently duplicate rows in the result.
clustered_data = counts_by_patient_filtered.merge(
    patient_clusters, on='PATIENT_ID', how='left', validate='many_to_one'
)
# Bare expression rather than print() so the notebook renders the frame as a table.
clustered_data.head()

In [None]:
# Reduce the dimensions to 2 using UMAP, on a random subset of the patients.
# Cap the subset at the number of available rows: choice(..., replace=False) raises
# when asked for more samples than there are rows.
num_samples = min(200, X_reduced.shape[0])
# Seeded generator so the same patients are selected (and plotted) on every run.
plot_sample_rng = np.random.default_rng(42)
random_indices = plot_sample_rng.choice(X_reduced.shape[0], num_samples, replace=False)
sample_of_X_reduced = X_reduced[random_indices]
reducer = umap.UMAP(n_components=2, random_state=42)
X_vis = reducer.fit_transform(sample_of_X_reduced)


In [None]:

# Drop noise points (label -1) from the sampled patients before plotting.
sampled_dbscan_labels = cluster_labels_by_dbscan[random_indices]
mask = sampled_dbscan_labels != -1
X_filtered = X_vis[mask]
dbscan_labels_filtered = sampled_dbscan_labels[mask]

In [None]:
# DBSCAN memory issues
# https://stackoverflow.com/questions/44131411/dbscan-handling-big-data-crashes-and-memory-error

In [None]:

# Count the distinct DBSCAN labels, ignoring the noise label (-1).
distinct_cluster_labels = {label for label in cluster_labels_by_dbscan if label != -1}
n_clusters = len(distinct_cluster_labels)
print(f"Number of clusters found: {n_clusters}")

In [None]:
# 2-D UMAP projection of the sampled patients, coloured by DBSCAN cluster label
# (noise points have already been removed from X_filtered).
fig, ax = plt.subplots()
points = ax.scatter(X_filtered[:, 0], X_filtered[:, 1], c=dbscan_labels_filtered, cmap='viridis', s=5)
# Title, axis labels and a colour key so the figure can be read on its own.
ax.set_title("DBSCAN clusters on a UMAP projection of sampled patients")
ax.set_xlabel("UMAP dimension 1")
ax.set_ylabel("UMAP dimension 2")
fig.colorbar(points, ax=ax, label="DBSCAN cluster label")
plt.show()

**To do**
- Code in thresholds
- Seed the random number generator for the plot
- Check it works at full size
- Add details to the plot?
- Explainability
- Modelling

In [None]:



# For every patient, find the code they have recorded most often.
# reshaped_data is a scipy sparse matrix, which has no .loc / .index, so the lookup
# is done on the long-format frame instead: sort by count (stable sort, so ties keep
# their existing order) and keep the first, i.e. highest-count, row per patient.
top_code_rows = (
    counts_by_patient_filtered
    .sort_values('CODE_COUNT', ascending=False, kind='mergesort')
    .drop_duplicates(subset='PATIENT_ID')
)
# Plain dict of PATIENT_ID -> CORE_CONCEPT_ID.
most_popular_code = dict(zip(top_code_rows['PATIENT_ID'], top_code_rows['CORE_CONCEPT_ID']))
clustered_data['most_popular_code'] = clustered_data['PATIENT_ID'].map(most_popular_code)

In [None]:
# Comma-separated list of the distinct most-popular codes, e.g. for a SQL IN (...) list.
# clustered_data has one row per patient/code pair, so each patient's most popular
# code would otherwise be repeated many times; missing values are dropped so that no
# literal "nan" ends up in the list.
distinct_popular_codes = clustered_data['most_popular_code'].dropna().unique()
most_popular_code_for_sql = ",".join(map(str, distinct_popular_codes))
print(most_popular_code_for_sql)

In [None]:
# Name of the most popular code for each row of clustered_data.
# The concept names were already loaded with counts_by_patient (NAME alongside
# CORE_CONCEPT_ID, which the query joins to the concept table's DBID), so the
# id -> name lookup is built from that frame rather than from a
# `most_popular_code_names` table that is never created in this notebook.
concept_name_by_id = (
    counts_by_patient
    .drop_duplicates(subset='CORE_CONCEPT_ID')
    .set_index('CORE_CONCEPT_ID')['NAME']
)
# A single vectorised map gives a flat list of name strings (NaN where a code is missing).
names = clustered_data['most_popular_code'].map(concept_name_by_id).tolist()

In [None]:
# For each cluster, list the name of every member patient's most frequently recorded code.
# clustered_data already carries CLUSTER_LABEL, CODE_COUNT and NAME for every
# patient/code row, so the per-patient top code is derived from it directly instead of
# from a `counts_by_patient_labeled` frame that is never created in this notebook.
for cluster in range(n_clusters):
    print(f"Cluster {cluster}")
    cluster_rows = clustered_data.loc[clustered_data['CLUSTER_LABEL'] == cluster]
    # Stable sort by count, then keep the first (highest-count) row per patient.
    top_code_per_patient = (
        cluster_rows
        .sort_values('CODE_COUNT', ascending=False, kind='mergesort')
        .drop_duplicates(subset='PATIENT_ID')
    )
    print(top_code_per_patient['NAME'])