In [None]:
import pandas as pd
import numpy as np

# train test split
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

#unsupervised learning methods
# Feature agglomeration uses agglomerative(or hierarchical) clustering to group similar features, so it has its own dimensionality reduction technique
from sklearn.cluster import KMeans, AgglomerativeClustering, FeatureAgglomeration, DBSCAN
from sklearn.mixture import GaussianMixture

# dimensionality reduction methods
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE

# model selection
from sklearn.model_selection import GridSearchCV

# model scores
from sklearn.metrics import silhouette_score, silhouette_samples, calinski_harabasz_score
import setuptools
from yellowbrick.cluster import SilhouetteVisualizer

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image


In [None]:
# Read in the patient-level data from the cleaned-data directory: the label-encoded
# file and the feature file.
# NOTE: read_pickle can execute arbitrary code while unpickling; only load files
# produced by this project's own pipeline.
path = "../data/clean"
df_lab_enc = pd.read_pickle(f"{path}/patient_level_lab_enc.pkl")
df = pd.read_pickle(f"{path}/patient_level_features.pkl")

In [None]:
# Read in the ICD-10 mapper. Later cells look up its 'CODE' column and read the
# matching 'SHORT DESCRIPTION' text.
mapper_path = "../data/mappers"
icd10_mapper = pd.read_pickle(f"{mapper_path}/icd10.pkl")

Word Cloud 

In [None]:
# Patient-level file; it supplies the raw diagnosis / HCPCS list columns selected in the next cell.
df_patient = pd.read_pickle(f"{path}/patient_level.pkl")

In [None]:
# Re-attach the principal-diagnosis (and HCPCS) lists to the learned feature frame `df`.
patient_cols = ['patient_medicare_number', 'age', 'combined_principal_diagnosis_ls', 'combined_hcpcs_ls']

# Apply the same row filters as the end dataset in 04_created_unsupervised_features so
# that the rows line up one-to-one with `df`.
# `.copy()` makes the filtered frame independent, so adding `ls_len` below does not
# trigger SettingWithCopyWarning.
df_patient = df_patient.loc[df_patient['age'].notnull(), patient_cols].copy()
df_patient['ls_len'] = df_patient['combined_hcpcs_ls'].str.len()
df_patient = df_patient[df_patient['ls_len'] < 1000]

# The concat below is purely positional. If the row counts differ, pandas silently pads
# the shorter frame with NaN and every patient is paired with the wrong feature row,
# so fail loudly instead.
if len(df_patient) != len(df):
    raise ValueError(
        f"Row count mismatch: df_patient has {len(df_patient)} rows but df has {len(df)}; "
        "the filters above no longer match the feature-creation notebook."
    )

df_unsupervised = pd.concat([df_patient.reset_index(drop=True), df.reset_index(drop=True)], axis = 1)

In [None]:
# Inspect the last rows of the combined patient + feature frame.
df_unsupervised.tail()

In [None]:
# Collapse each patient's principal-diagnosis list to its distinct codes.
# Going through a set means the original ordering of the codes is not preserved.
df_unsupervised['principal_unq'] = df_unsupervised['combined_principal_diagnosis_ls'].map(
    lambda codes: list(set(codes))
)

In [None]:
# Check the new `principal_unq` column on the first rows.
df_unsupervised.head()

In [None]:
# Create the word-cloud column: for every patient, the distinct leading words of the
# ICD-10 short descriptions of their principal diagnoses.

# Filler words that carry no diagnostic meaning. One shared list is used for every row
# (previously single-code and multi-code patients were filtered with different lists).
word_cloud_stop_words = {
    'unspecified', 'Unspecified', 'unsp', 'the', 'as', 'of', 'or', 'w', 'w/o', 'in',
    'Acute', 'Chronic', 'Essential', '(primary)',
}

# Exact code -> short description lookup, built once instead of scanning the mapper for
# every code of every patient. When a code appears more than once the first row wins.
code_to_description = (
    icd10_mapper
    .drop_duplicates(subset='CODE')
    .set_index('CODE')['SHORT DESCRIPTION']
    .to_dict()
)


def _diagnosis_words(codes, descriptions, stop_words, n_words=4):
    """Return the word-cloud text for one patient's diagnosis codes.

    Parameters
    ----------
    codes : iterable
        The patient's distinct principal-diagnosis codes.
    descriptions : dict
        Maps a code to its short description; codes missing from it are skipped.
    stop_words : collection of str
        Words to leave out of the result.
    n_words : int, default 4
        Number of leading words taken from each description.

    Returns
    -------
    str or float
        The distinct remaining words, sorted and joined with ', ', or NaN when no
        code produced any word.
    """
    words = set()
    for code in codes:
        description = descriptions.get(code)
        # Skip unknown codes. Previously the description of an earlier code, or even
        # of an earlier patient, was re-used here.
        if not isinstance(description, str):
            continue
        for token in description.split()[:n_words]:
            token = token.strip(", '")
            if token and token not in stop_words:
                words.add(token)
    if not words:
        return np.nan
    # Sorting gives every word set one canonical string, so identical sets are counted
    # together by value_counts (the repr of a set has no guaranteed order).
    return ', '.join(sorted(words))


df_unsupervised['word_cloud'] = df_unsupervised['principal_unq'].apply(
    lambda codes: _diagnosis_words(codes, code_to_description, word_cloud_stop_words)
)

In [None]:
# Percentage breakdown: share of patients (in %) for the five most common word-cloud
# strings. value_counts leaves out rows whose word_cloud is missing.
vals = df_unsupervised['word_cloud'].value_counts(normalize=True) * 100
# The column is named after what it holds; it was previously labelled 'age_breakdown'.
vals.head(5).to_frame(name='word_cloud_pct')

In [None]:
# Check the `word_cloud` column on the last rows.
df_unsupervised.tail()

In [None]:
# Draw one word cloud per cluster from the diagnosis words of its patients.
n_clusters = 9  # NOTE(review): assumes the 'cluster' labels are 0..8 - confirm against the clustering step
max_rows_per_cloud = 100  # only the first rows of each cluster feed the cloud

for cluster in range(n_clusters):
    in_cluster = df_unsupervised['cluster'] == cluster
    # Drop patients without a word-cloud string; otherwise the literal text "nan"
    # ends up being drawn in the cloud.
    cluster_words = (
        df_unsupervised.loc[in_cluster, 'word_cloud']
        .dropna()
        .head(max_rows_per_cloud)
    )

    # WordCloud raises ValueError when it is given no words, so skip empty clusters
    # rather than aborting the remaining plots.
    if cluster_words.empty:
        print(f"Cluster {cluster}: no diagnosis words to plot")
        continue
    diag_cloud = cluster_words.astype(str).str.cat(sep=', ')
    if not diag_cloud.strip():
        print(f"Cluster {cluster}: no diagnosis words to plot")
        continue

    # Instantiate the word cloud and feed it this cluster's corpus
    wc = WordCloud().generate_from_text(diag_cloud)

    # Show the fitted cloud; the title says which cluster it is, and the axis is
    # turned off because tick numbers are meaningless for an image.
    fig, ax = plt.subplots()
    ax.imshow(wc)
    ax.set_title(f"Cluster {cluster}")
    ax.axis('off')
    plt.show()
    # Release the figure so looping over many clusters does not accumulate memory.
    plt.close(fig)