# Training Unsupervised Models
## 04_unsupervised_model_learning

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 28/09/2025   | Adrienne | Update | Creating models |
| 05/10/2025 | Adrienne | Update | Created baseline model with KMeans |
| 07.10.2025 | Adrienne | Update | Added code for wordcloud visualization |

# Content

* [Introduction](#introduction)

In [148]:


import pandas as pd
import numpy as np

# train test split
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

#unsupervised learning methods
# Feature agglomeration uses agglomerative(or hierarchical) clustering to group similar features, so it has its own dimensionality reduction technique
from sklearn.cluster import KMeans, AgglomerativeClustering, FeatureAgglomeration, DBSCAN
from sklearn.mixture import GaussianMixture

# dimensionality reduction methods
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE

# model selection
from sklearn.model_selection import GridSearchCV

# model scores
from sklearn.metrics import silhouette_score, calinski_harabasz_score

import matplotlib.pyplot as plt

from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image

## Load Data

In [143]:
path = "../data/clean"

df_lab_enc = pd.read_pickle(f"{path}/patient_level_lab_enc.pkl")

In [149]:
df_patient = pd.read_pickle(f"{path}/patient_level.pkl")

In [153]:
mapper_path = "../data/mappers"
icd10_mapper = pd.read_pickle(f"{mapper_path}/icd10.pkl")

## Model Development

We will create a baseline model using the label encoded patient level file.  Variations on the model will be added and then performance compared

### Baseline Model - KMeans

Todo: 
- find optimal clusters
- examine most important feature by principal component
 - create graph of clusters with principal components
 - optimize any model inputs

In [144]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_lab_enc)

Dimensionality Reduction

In [145]:
# Perform PCA
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)

# Calculate the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_

In [None]:
components = pca.components_
feature_names = df_lab_enc.columns
feature_contributions = pd.DataFrame(components, columns=feature_names)

print(feature_contributions)


In [None]:
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

# Plot the explained variance ratio in the first subplot
ax1.bar(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio)
ax1.set_xlabel("Principal Component")
ax1.set_ylabel("Explained Variance Ratio")
ax1.set_title("Explained Variance Ratio by Principal Component")

cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Plot the cumulative explained variance in the second subplot
ax2.plot(
    range(1, len(cumulative_explained_variance) + 1),
    cumulative_explained_variance,
    marker="o",
)
ax2.set_xlabel("Number of Principal Components")
ax2.set_ylabel("Cumulative Explained Variance")
ax2.set_title("Cumulative Explained Variance by Principal Components")

# Display the figure
plt.tight_layout()
plt.show()

In [None]:
# number of components
n_pcs= pca.components_.shape[0]

# get the index of the most important feature on EACH component i.e. largest absolute value
# using LIST COMPREHENSION HERE
most_important = [np.abs(pca.components_[i]).argmax() for i in range(n_pcs)]

initial_feature_names = df_lab_enc.columns

# get the names
most_important_names = [initial_feature_names[most_important[i]] for i in range(n_pcs)]

# using LIST COMPREHENSION HERE AGAIN
dic = {'PC{}'.format(i+1): most_important_names[i] for i in range(n_pcs)}

In [None]:
# build the dataframe - investigate 
df = pd.DataFrame(sorted(dic.items()))
df

In [None]:
principal_df = pd.DataFrame(data= X_pca)

In [None]:

plt.figure(figsize=(8, 6))
plt.scatter(principal_df[0], principal_df[1])
plt.title('PCA Result')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid()
plt.show()


In [146]:
# kmeans code
kmeans = KMeans(init = 'random', n_clusters=12, n_init=10, random_state = 0) 
model = kmeans.fit(X_pca) 
model.n_iter_
labels = model.labels_
#print(labels)


In [147]:
df_lab_enc['cluster'] = labels

  df_lab_enc['cluster'] = labels


In [None]:
# quick look at cluster values
vals = df_lab_enc['cluster'].value_counts(normalize=True) * 100
pd.DataFrame({
  'cluster': vals
}).head(12)

In [None]:
principal_df = pd.DataFrame(data= X_pca, columns = ['principal_comp_1', 'principal_comp_2', 'principal_comp_3', 'principal_comp_4', 'principal_comp_5'])

In [None]:
principal_df.head()

In [None]:
plt.figure()
plt.figure(figsize=(10,10))
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.xlabel('Principal Component - 1',fontsize=20)
plt.ylabel('Principal Component - 2',fontsize=20)
plt.title("XXX",fontsize=20)
targets = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
#targets = [ 0, 1, 2, 3]
colors = ['r', 'g', 'b', 'y', 'r', 'g', 'b', 'y', 'r', 'g', 'b', 'y']
for target, color in zip(targets,colors):
    indicesToKeep = df_lab_enc['cluster'] == target
    print(indicesToKeep)
    plt.scatter(principal_df.loc[indicesToKeep, 'principal_comp_1']
               , principal_df.loc[indicesToKeep, 'principal_comp_2'], c = color, s = 50)

plt.legend(targets,prop={'size': 15})


In [None]:
plt.figure(figsize=(10,5))
plt.scatter(x= df_lab_enc.iloc[:, 2], y=df_lab_enc.iloc[:, 3], c= model.labels_)

In [None]:
plt.figure(figsize=(10,5))
plt.scatter(x= df_lab_enc.loc[:, 'hcpcs_0_enc'], y=df_lab_enc.loc[:, 'number_of_claims'], c= model.labels_)
# plt.xlabel('Annual Income (k$)')
# plt.ylabel('Spending Score (1-100)')

In [None]:
# Assign the cluster centers: centroids
centroids = model.cluster_centers_# Assign the columns of centroids: centroids_x, centroids_y
centroids_x = centroids[:,0]
centroids_y = centroids[:,1]# Make a scatter plot of centroids_x and centroids_y
plt.scatter(centroids_x,centroids_y,marker='D',s=50)
plt.show()

In [None]:
for i, cluster in enumerate(clusters):
    plt.scatter(data[:, 0], data[:, 1], c=cluster, label=f'Cluster {i+1}')
plt.scatter(*centroids, c='red', marker='x', s=200, label='Centroids')
plt.legend()
plt.show()


### Hierarchical Clustering

In [None]:
# works for AgglomerativeClustering, FeatureAgglomerization and DBScan
from sklearn.cluster import AgglomerativeClustering
agg_clustering = AgglomerativeClustering(n_clusters=3)
labels = agg_clustering.fit_predict(X_train)


### Guassian Mixture

In [None]:
# GMM code
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3, random_state=0)
gmm.fit(X)
labels = gmm.predict(X_train)

Model Evaluation

In [None]:
# elbow method

# works for all sklearn unsupervised model evaluation metrics with DBSCAN
score = silhouette_score(X_train, labels)

Model Selection

In [None]:
# example grid search

param_grid = {n_components: [2, 5, 8, 10]} # easily add another parameter to this structure

grid_search = GridSearchCV(
    estimator=KMeans(random_state=42, n_init='auto'),
    param_grid=param_grid,
    scoring=silhouette_scorer,
    n_jobs=-1
)

grid_search.fit(X)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Visualizations:

word cloud 


In [151]:
# merge primary diagnosis column back onto data now that it has been through learning
df_patient_test = df_patient[['patient_medicare_number', 'combined_principal_diagnosis_ls']]

df_patient_test['principal_unq'] = df_patient_test['combined_principal_diagnosis_ls'].apply(lambda x: list(set(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_patient_test['principal_unq'] = df_patient_test['combined_principal_diagnosis_ls'].apply(lambda x: list(set(x)))


In [155]:
icd10_mapper.head()

Unnamed: 0,CODE,SHORT DESCRIPTION,LONG DESCRIPTION,NF EXCL
0,A000,"Cholera due to Vibrio cholerae 01, biovar chol...","Cholera due to Vibrio cholerae 01, biovar chol...",
1,A001,"Cholera due to Vibrio cholerae 01, biovar eltor","Cholera due to Vibrio cholerae 01, biovar eltor",
2,A009,"Cholera, unspecified","Cholera, unspecified",
3,A0100,"Typhoid fever, unspecified","Typhoid fever, unspecified",
4,A0101,Typhoid meningitis,Typhoid meningitis,


In [184]:
# create work cloud column of diagnosis descriptions

for index, row in df_patient_test.head(10).tail(2).iterrows():
    principal_text_ls = []
    print(row['principal_unq'])
    if len(row['principal_unq']) > 1:
        row_ls = []
        for code in row['principal_unq']:
            print(code)
            if any(icd10_mapper[icd10_mapper['CODE'].str.contains(code)]):
                ind = icd10_mapper[icd10_mapper['CODE'].str.contains(code)].index
                print(ind)
                text = icd10_mapper.loc[ind, 'SHORT DESCRIPTION']
                print(f"text {text}")
            else:
                text = 'NaN'
            row_ls.append(text)
        principal_text_ls.append(row)
        print(principal_text_ls)
    else:
        row_ls = []
        print(code)
        ind = icd10_mapper[icd10_mapper['CODE'].str.contains(code)].index[0]
        print(ind)
        text = icd10_mapper.loc[ind, 'SHORT DESCRIPTION']
        print(f"text {text}")
        row_ls.append(text)
    principal_text_ls.append(row)
    print(principal_text_ls)
    
    # principal_df = pd.DataFrame.from_records(principal_text_ls)
    # test = pd.concat([df_patient_test, principal_df], axis=1)


['C50929', 'J45909', 'J209', 'C50919']
C50929
Index([1526], dtype='int64')
text 1526    Malignant neoplasm of unsp site of unspecified...
Name: SHORT DESCRIPTION, dtype: object
J45909
Index([10724], dtype='int64')
text 10724    Unspecified asthma, uncomplicated
Name: SHORT DESCRIPTION, dtype: object
J209
Index([10625], dtype='int64')
text 10625    Acute bronchitis, unspecified
Name: SHORT DESCRIPTION, dtype: object
C50919
Index([1523], dtype='int64')
text 1523    Malignant neoplasm of unsp site of unspecified...
Name: SHORT DESCRIPTION, dtype: object
[patient_medicare_number                                                  1S00E00AA53
combined_principal_diagnosis_ls    [C50929, J45909, C50929, C50929, C50929, J4590...
principal_unq                                         [C50929, J45909, J209, C50919]
principal_text                                                                   nan
Name: 168, dtype: object]
[patient_medicare_number                                                  1S

In [185]:
df_patient_test.head()

Unnamed: 0,patient_medicare_number,combined_principal_diagnosis_ls,principal_unq,principal_text
1,1S00E00AA10,"[O039, O039, O039, B002, B002, B085, S8290X, J...","[J0190, B085, S8290X, O039, B002]",
18,1S00E00AA16,"[E785, E785, E785, E785, B085, E785, E785, J01...","[J0190, E785, B085]",
35,1S00E00AA23,"[J329, E785, J329, J029, J029, J329, J329, J32...","[J029, J209, E785, J329]",
64,1S00E00AA25,"[E669, J0190, J0190, J329, J329, J329, J329, J...","[E669, J0190, S72009, J329, Z3400, J029]",
89,1S00E00AA32,"[I10, J209, J209, J329, J0390, J209, J209, J20...","[I10, J0390, J209, J329]",


In [None]:
#Instantiate wordcloud object and use method to feed it our corpus
wc = WordCloud().generate_from_text(rome_corpus)

#Use matplotlib.pyplot to display the fitted wordcloud
#Turn axis off to get rid of axis numbers
plt.imshow(wc)
plt.axis('off')
plt.show()