In [15]:
#Function that predicts ICD-10 diagnostic label
#Representation: Naive Bayes, defined Y=arg max P(Y=yi)DotProd(P(Xi|yi)*P(Xi))
#Evaluation: AUC (combination of TPR+FPR). Good for balanced classes.
#Predicted label is a ICD-10 insurance code and description (e.g. "A0100", "Cholera due to Vibrio cholerae 01, biovar cholerae")
#Input space is a multinomial bag of n-grams taken from ICD-10 diagnostic descriptions, and weighted by nl(IDF) to favor rare words

In [16]:

#other functions used in alternative versions of app
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.naive_bayes import BernoulliNB
# from sklearn.model_selection import GridSearchCV
# from sklearn.feature_extraction.text import CountVectorizer
# from scipy import sparse

# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# from nltk.tokenize import RegexpTokenizer
# import swifter
# from nltk.metrics.distance import jaccard_distance
# from nltk.util import ngrams
# from nltk.tokenize import word_tokenize
# import psutil
# import memory_profiler
# import sys
# import pandas as pd

########CURRENT APP VERSION #########
from pandas import read_csv
import numpy as np
from numba import jit
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle


In [1]:
print(globals().keys())
print(locals().keys())

dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__builtin__', '__builtins__', '_ih', '_oh', '_dh', 'In', 'Out', 'get_ipython', 'exit', 'quit', '_', '__', '___', '_i', '_ii', '_iii', '_i1'])
dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__builtin__', '__builtins__', '_ih', '_oh', '_dh', 'In', 'Out', 'get_ipython', 'exit', 'quit', '_', '__', '___', '_i', '_ii', '_iii', '_i1'])


In [3]:
#multinomial naive bayes model training
tfidf_model = TfidfVectorizer(ngram_range=(2,2),lowercase = True,analyzer='char_wb')
X_train = tfidf_model.fit_transform(data['description'])
#add 1 to every term x document position (prevents 0 probability problem)
X_train = X_train.toarray()+1
#transform back to sparse matrix
X_train = sparse.csr_matrix(X_train)

y_train = data['code']

MNB_model = BernoulliNB(fit_prior=False,binarize=.5).fit(X=X_train,y=y_train)

NameError: name 'data' is not defined

In [3]:
print(data)
print(tfidf_model.vocabulary_)
print(X_train.toarray())

        code                                        description
0       code                                        description
1       A000  Cholera due to Vibrio cholerae 01 biovar cholerae
2       A009                                Cholera unspecified
3      A0100                          Typhoid fever unspecified
4      A0101                                 Typhoid meningitis
...      ...                                                ...
11378  Z9912  Encounter for respirator [ventilator] dependen...
11379   Z992                       Dependence on renal dialysis
11380   U070                            Vaping-related disorder
11381   U071                                           COVID-19
11382   U099                Post COVID-19 condition unspecified

[11383 rows x 2 columns]
{' d': 14, 'de': 312, 'es': 355, 'sc': 677, 'cr': 296, 'ri': 654, 'ip': 450, 'pt': 630, 'ti': 723, 'io': 449, 'on': 596, 'n ': 543, ' c': 13, 'ch': 288, 'ho': 422, 'ol': 594, 'le': 500, 'er': 354, 'ra': 646

In [4]:
#Model saving
with open('MNB_model', 'wb') as outp:
    pickle.dump(MNB_model, outp, pickle.HIGHEST_PROTOCOL)
    
with open('tfidf_model', 'wb') as outp:
    pickle.dump(tfidf_model, outp, pickle.HIGHEST_PROTOCOL)

NameError: name 'MNB_model' is not defined

In [8]:
#model & data loading

data = pd.read_csv(
    filepath_or_buffer = '/Users/seanmiller/Downloads/Untitled spreadsheet - icd10cm_codes_2022 (1).csv'
    ,sep=','
    ,engine='python'
    ,names=['code','description'])

with open('MNB_model', 'rb') as outp:
    MNB_model = pickle.load(outp)
    
with open('tfidf_model', 'rb') as outp:
    tfidf_model = pickle.load(outp)

EOFError: Ran out of input

In [12]:
#prediction
# sample_data = np.array(['Central Euopean tick-rne encephalitis', #A841
#                         'Cntral Euopean tick-rne encephalitis',
#                         'Central tick-rne encephalitis',
#                         'Central encephalitis',
#                         'Cenoral encaphalitis',
#                        'Shigellosis due to Shagella dysenteriae', #A030
#                        'Shigellosis due to dysenteriae',
#                        'Shigellosis due to Shagella dasenteriae',
#                        'Shigellosis dysenteriae',
#                        'Shigellosis dasenteriae',])

def diag_predict(text):
	try:
		text = np.array([text])
		X_test = tfidf_model.transform(text)
		#add 1 to every term x document position (prevents 0 probability problem)
		X_test = X_test.toarray()+1
		#transform back to sparse matrix
		X_test = sparse.csr_matrix(X_test)
		y_test_predicted = MNB_model.predict(X_test)[0]
		return y_test_predicted, data[data['code']==y_test_predicted]['description'].iloc[0]
	except Exception as err:
		return str(err)
    

In [84]:
#model testing
text='Shagella Shigellosis dasenteriae treatment in the right to own a pillow for sure'
text='Sepsis due to unspecified staphylococcus'
#text='Dislocated jaw'

In [11]:
icd10_code, icd10_description = diag_predict(text)
print(icd10_code)
print(icd10_description)

NameError: name 'diag_predict' is not defined

In [54]:
#alternative: jaccard distance on documents

def diag_predict_jacc(text):
    jacc = []
    gram_num = 2
    rows,columns = data.shape
    for i in range(0,rows):
        jacc.append(jaccard_distance(set(ngrams(data['description'].iloc[i],gram_num)),
                                     set(ngrams(text,gram_num))))
    data['jacc'] = jacc
    data_sorted = data.sort_values(by='jacc',ascending=True).iloc[0]
    print(data_sorted)
    return data_sorted['code'],data_sorted['description']


In [55]:
print(text)
diag_predict_jacc(text)


Dislocated jaw
code                           I401
description    Isolated myocarditis
jacc                           0.72
Name: 3202, dtype: object


('I401', 'Isolated myocarditis')

In [64]:

#alternative: jaccard distance on words prediction
#[CONCLUSION]: Works for short input strings. For long ones, becomes much less accurate and computationally infeasible.

#making more efficient:
# - skip label if first ~2 train tokens produce >= x distance 
# - skip label if longest word has >= x distance to any test word
# - remove meaningless words and symbols from both train and test set

#tokenize the train words just once on page load
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
tokenizer.tokenize('Eighty-seven miles to go, yet.  Onward!')

# A function that returns the length of the value:
def set_count(s):
    return len(s)

def list_of_tokens_as_sets(text):
    set_list = [set(w) for w in tokenizer.tokenize(text) if not w.lower() in stop_words]
    set_list.sort(key=set_count,reverse=True)
    return set_list
    
data['tokens'] = data.apply(lambda row: list_of_tokens_as_sets(row['description']), axis=1)
#function for calculating avg of best matches of train against test, given a single training observation
def avg_min_dist(tokens_train,tokens_test):
    tokens_train_min_distance = []
    for i, single_train in enumerate(tokens_train):
        jacc = []
        #and for each token in the test data...
        for single_test in tokens_test:
            #check distance (note train is already a set)
            jacc.append(jaccard_distance(single_test,single_train))
        #find best match
        min_match = np.nanmin(jacc)
        #if it's the longest word in training data
        if i ==1:
            #append the distance if it's a good match
            if min_match <= .2:
                tokens_train_min_distance.append(np.nanmin(jacc))
                continue
            #otherwise, break loop and don't consider as a potential winner if not good
            else:
                tokens_train_min_distance.append(1)
                break
        #if not longest word in training data, continue calculating as normal
        else:
            tokens_train_min_distance.append(np.nanmin(jacc))
    return np.nanmean(tokens_train_min_distance)


def diag_predict_jacc_w(text):
    tokens_test = list_of_tokens_as_sets(text)
    jacc = []
    rows, columns = data.shape
    data['avg_min_jacc'] = data.swifter.allow_dask_on_strings(enable=True).apply(lambda row: avg_min_dist(row['tokens'],tokens_test), axis=1)
    #data['avg_min_jacc'] = data['tokens'].swifter.apply(lambda tokens_train: avg_min_dist(tokens_train,tokens_test))
    #data['avg_min_jacc'] = data.apply(lambda row: avg_min_dist(row['tokens'],tokens_test), axis=1)
    data_sorted = data.sort_values(by='avg_min_jacc',ascending=True).iloc[0]
    #print(data.sort_values(by='avg_min_jacc',ascending=True).head())
    return data_sorted['code'],data_sorted['description']

In [65]:
%%time

#text='Shagella Shigellosis dasenteriae treatment in the right to own a pillow for sure'
text='Acquired absence of ovaries bilateral'
text='Acquired absence of other genital organ(s)'
diag_predict_jacc_w(text)

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

CPU times: user 1.01 s, sys: 263 ms, total: 1.27 s
Wall time: 8.29 s


('Z9079', 'Acquired absence of other genital organ(s)')

In [59]:
#Alternative: One sided jaccard - existence of train words in test; csm_matrix for sparse data to allow upload to gcloud
#Conclusion: Numba paralellization needs to be turned off due to lack of compatability with csm_matrix package; this makes
#the algorithm much less efficient

tfidf_model = TfidfVectorizer(ngram_range=(3,3),lowercase = True,analyzer='char_wb')
X_train = tfidf_model.fit_transform(data['description'])

#save bag of words model and training data

with open('tfidf_model', 'wb') as out1, open('X_train', 'wb') as out2:
    pickle.dump(tfidf_model, out1, pickle.HIGHEST_PROTOCOL)
    pickle.dump(X_train, out2, pickle.HIGHEST_PROTOCOL)
    
    
with open('tfidf_model', 'rb') as out1, open('X_train', 'rb') as out2:
    tfidf_model = pickle.load(out1) 
    X_train = pickle.load(out2)

#utility: similarity measurement; paralellized
#@jit(nopython=True) # Set "nopython" mode for best performance, equivalent to @njit
def similarity_func(X_train,X_test): # Function is compiled to machine code when called the first time
    pct_train_shared = []
    rows, columns = X_train.shape
    for i in range(0,rows):   # Numba likes loops
        #transform each row of X_train to dense matrix to allow numpy pairwise conditional logic
        X_train_row_dense = X_train.getrow(i).toarray()
        #append similarity measure: Both > 0 / Train(i) > 0
        pct_train_shared.append((np.sum((X_train_row_dense > 0) & (X_test > 0)) / np.sum(X_train_row_dense > 0)))  # Numba likes NumPy functions
    return pct_train_shared

#each time space is pressed
def diag_predict_y_in_x(text):
    #transform test data into bag of words (as csm)
    text = np.array([text])
    X_test = tfidf_model.transform(text).toarray()
    #run similarity func
    data['similarity'] = similarity_func(X_train,X_test)
    data_sorted = data.sort_values(by='similarity',ascending=False)
    return data_sorted['code'].iloc[0],data_sorted['description'].iloc[0]


In [60]:
%%time
texts = ['Huan immunoficiency virus [HIV] disease and patient was earnest in telling me that their grandmother was sick before she got there',
'Cytomgaloviral pneumnitis and patient was earnest in telling me that their grandmother was sick before she got there',
'Cytomegaoviral hepaitis and patient was earnest in telling me that their grandmother was sick before she got there',
'Cytomgaloviral pacreatitis and patient was earnest in telling me that their grandmother was sick before she got there',
'Other ctomegaloviral diseases and patient was earnest in telling me that their grandmother was sick before she got there',
'Cytomegloviral isease unpecified and patient was earnest in telling me that their grandmother was sick before she got there',
'Mumps orhitis and patient was earnest in telling me that their grandmother was sick before she got there',
'Mups menngitis and patient was earnest in telling me that their grandmother was sick before she got there']

for j in texts:
    print(diag_predict_y_in_x(j))



('B20', 'Human immunodeficiency virus [HIV] disease')
('B250', 'Cytomegaloviral pneumonitis')
('B251', 'Cytomegaloviral hepatitis')
('B252', 'Cytomegaloviral pancreatitis')
('B258', 'Other cytomegaloviral diseases')
('B259', 'Cytomegaloviral disease unspecified')
('B260', 'Mumps orchitis')
('B261', 'Mumps meningitis')
CPU times: user 32.6 s, sys: 108 ms, total: 32.7 s
Wall time: 32.7 s


In [1]:
#FINAL PART 1: LOADING / SAVING
from pandas import read_csv
import numpy as np
from numba import jit
import pickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer


#generate initial data and bag of words model
data = read_csv(
    filepath_or_buffer = './static/icd10_all_codes.csv'
    ,sep=','
    ,engine='python')

tfidf_model = TfidfVectorizer(ngram_range=(4,4),lowercase = True,analyzer='char_wb')
X_train = tfidf_model.fit_transform(data['description'])

#save bag of words model and training data

with open('tfidf_model', 'wb') as out:
    pickle.dump(tfidf_model, out, pickle.HIGHEST_PROTOCOL)
    
with open('X_train', 'wb') as out:
    pickle.dump(X_train, out, pickle.HIGHEST_PROTOCOL)
    
with open('data', 'wb') as out:
    pickle.dump(data, out, pickle.HIGHEST_PROTOCOL)

#load data

def load_data():
    if ('X_train_csm_column' not in globals().keys()) & ('X_train_index_pointer' not in globals().keys()):
        with open('X_train', 'rb') as out2:
            X_train  = pickle.load(out2)
            globals()['X_train_csm_column'] = np.array(X_train.indices)
            globals()['X_train_index_pointer'] = np.array(X_train.indptr)
            del X_train
    if 'tfidf_model' not in globals().keys():
        with open('tfidf_model', 'rb') as out1:
            globals()['tfidf_model'] = pickle.load(out1) 
        #transform test data into bag of words (as csm)
    if 'data' not in globals().keys():
        with open('data', 'rb') as out2:
            globals()['data']  = pickle.load(out2)
    from sklearn.feature_extraction.text import TfidfVectorizer
            
load_data()


In [5]:
#FINAL PART 2: USING SPARSE + PARALELLIZATION by implementing a custom numpy based sparse matrix
#THIS CELL IS FUNCTIONS THAT RUN JUST IN TIME IN WEB APP
@jit(nopython=True) # Set "nopython" mode for best performance, equivalent to @njit
def similarity_func(X_train_csm_column
                    ,X_train_index_pointer
                    ,X_train_sparse_row_num
                    ,X_test): # Function is compiled to machine code when called the first time
    
    #initialize storage
    X_train_shared_tokens_compressed = []
    X_train_shared_tokens = []
    X_train_total_tokens = []
    
    #create 1d array of length equal to compressed matrix rows containing 1 if that row (> 0 x_train token) has a match in test
    for X_train_col in X_train_csm_column:   # Numba likes loops
        X_train_shared_tokens_compressed.append(X_test[X_train_col] > 0)
        
    #create 1d array of length equal to original matrix rows containing sum of total and intersecting tokens
    for row_sparse in range(X_train_sparse_row_num):
        total = 0
        intersections = 0
        #for each pointer, iterate through the compressed matrix to add up total & total intersections per row
        for row_compressed in range(X_train_index_pointer[row_sparse],X_train_index_pointer[row_sparse+1]):
            intersections += X_train_shared_tokens_compressed[row_compressed]
            total += 1
        #then insert within arrays that have length of sparse matrix rows
        X_train_total_tokens.append(total)
        X_train_shared_tokens.append(intersections)
        #then return similarity
    return (np.array(X_train_shared_tokens) / np.array(X_train_total_tokens), np.array(X_train_total_tokens))

#each time space is pressed
def diag_predict_y_in_x(text):
    #load global variables as needed
    #load_data()
    X_test = tfidf_model.transform(np.array([text])).toarray()
    tr,tc = X_test.shape
    X_test = X_test.reshape(tc,)
    #create a numpy friendly 3 array compressed sparse matrix
    X_train_sparse_row_num = len(data['code']) #index pointer has length 1 longer than rows in sparse matrix
    #load data and run similarity
    data['similarity'], data['total_tokens'] = similarity_func(X_train_csm_column
                                                               ,X_train_index_pointer
                                                               ,X_train_sparse_row_num
                                                               ,X_test) 
    data['similarity_adjusted'] = data['similarity']*(np.log(data['total_tokens']+10) / np.log(2))
    #sort data twice in output (prioritizing memory over processing)
    return data.sort_values(by='similarity_adjusted',ascending=False)['code'].iloc[0],data.sort_values(by='similarity_adjusted',ascending=False)['description'].iloc[0]

In [8]:
%%time
texts = ['Patient in car accident with dislocation at left side of jaw...'
        ,'Dislocation of jaw ' #should be Dislocation of jaw unspecified side initial encounter
        ,'Pityriasis versicolor'
        ,'Patient has rash on chest suggesting Pityriasis versicolor'
        ,'Papillomavirus causing disease...' #should be Pityriasis versicolor'
        ,'Papillomavirus as the cause of diseases classified elsewhere' #should be Pityriasis versicolor
        ,'Patient has a Malignant neoplasm within digestive system...'
        ,'Malignant neoplasm of ill-defined sites within the digestive system'
        ,'Other injury of extensor muscle fascia'
        ,'Other injury of extensor muscle fascia and tendon of left middle finger at wrist and hand level initial encounter'
        ,'Vasectomy status'
        ,'The patient has a positive Vasectomy status'
        ,'Cholera unspecified'
        ,'Saw patient with Cholera unspecified '
        ,'Gonnocal infection ' #should be
        ] 

for j in texts:
    print(diag_predict_y_in_x(j))
    
    


('S0302XS', 'Dislocation of jaw left side sequela')
('S0302XS', 'Dislocation of jaw left side sequela')
('B360', 'Pityriasis versicolor')
('B360', 'Pityriasis versicolor')
('B0802', 'Orf virus disease')
('B977', 'Papillomavirus as the cause of diseases classified elsewhere')
('C269', 'Malignant neoplasm of ill-defined sites within the digestive system')
('C269', 'Malignant neoplasm of ill-defined sites within the digestive system')
('M62838', 'Other muscle spasm')
('S66393A', 'Other injury of extensor muscle fascia and tendon of left middle finger at wrist and hand level initial encounter')
('Z9852', 'Vasectomy status')
('Z9852', 'Vasectomy status')
('A009', 'Cholera unspecified')
('A009', 'Cholera unspecified')
('T86832', 'Bone graft infection')
CPU times: user 1.11 s, sys: 23.6 ms, total: 1.13 s
Wall time: 1.13 s
