In [1]:
## Summarize a medical history by aggregating the CCS descriptions of all their diagnoses and perform NLP techniques to simplify and produce useful and brief summaries
### - Techniques include removal of special characters, numbers, stop words, tokenizing, and "lemmatizing". The results are then aggregated to a user after removing non-essential codes (e.g. common medical evaluation) from the history
## Lleverage a KNN based recommender engine where drugs will be seen as "users" and diagnoses and "items" in terms of the user-item matrix used in collaborative filtering.
### - The idea here being a that a medical summary of an individual can be thought of as complementary, or a "recommendation", of drug history.
### - Commonly, explicit feedback recommender engines call for a "rating" (e.g. person's rating for a movie). This will be replicated in this use case by getting the frequency of drug category-CCS
### - combination appearing in a same day for a user, then scaled and rounded to an integer value

In [None]:
#!python -m spacy download en
#!pip install scikit-surprise

In [1]:
import pandas as pd
import numpy as np
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# get NLTK resources
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import en_core_web_sm

In [188]:
#Read in data
ccs = pd.read_csv("ccs.csv")
claims = pd.read_csv("claim_lines.csv")
claims['diag'] = claims['diag1'].str.replace('.', '')
drugs = pd.read_csv("prescription_drugs.csv")

# Join together and use ccs_3_desc as main description of diagnosis. This provides the most human-readable explanation
mems = claims.merge(ccs, on = 'diag', how = 'inner')[['member_id','diag','diag_desc','ccs_1_desc','ccs_2_desc', 'ccs_3_desc']]
mems['main_desc'] = mems.ccs_3_desc.str.lower().str.replace(";","").str.replace("/"," ").str.replace(":"," ")

In [189]:
# My approach here is to summarize the CCS and concisely and insightfully as possible

nlp = en_core_web_sm.load()
# tokenize CCS descriptions, top words lemming, etc to get key words
def nlp_pipeline(text):
    # Remove numbers and special characters that weren't removed before
    text = text.replace('/', ' ')
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')
    #text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.replace('\.', ' ')

    # Tokenization
    doc = nlp(text)
    tokens = [token.text for token in doc]
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    # Lemmatization to remove participles but make it human readable
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    return ' '.join(lemmatized_tokens)

In [190]:
mems2 = mems[['member_id','main_desc']].drop_duplicates()

#Find most common descriptions. Eliminate 5% most common since they aren't as important to know from a clinical standpoint e.g. annual physical, common cold, hypertension
ccs_summ = mems2.groupby('main_desc')['member_id'].count().reset_index().sort_values('member_id', ascending = True)
ccs_summ_2=ccs_summ.head(round(ccs_summ.count()[0]*.95))
ccs_summ_2['main_desc_new']= ccs_summ_2['main_desc'].apply(nlp_pipeline)
#combine all of a patients CCS's into one for complete history
mems3 = mems2.merge(ccs_summ_2[['main_desc','main_desc_new']], on = 'main_desc')
summ_df = mems3.groupby('member_id')['main_desc_new'].agg(lambda x: ', '.join(x)).reset_index()


  ccs_summ_2=ccs_summ.head(round(ccs_summ.count()[0]*.95))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ccs_summ_2['main_desc_new']= ccs_summ_2['main_desc'].apply(nlp_pipeline)


In [191]:
#Print the first 100 patients summaries to verify it's brief yet informative
for i in summ_df['main_desc_new'].values[0:100]:
    print(i)

menstrual disorder, complication pregnancy, previous c - section, pregnancy delivery including normal, liveborn
adjustment disorder, nonspecific chest pain, coronary atherosclerosis heart disease
acquired foot deformity
headache including migraine
nonspecific chest pain, thyroid disorder, disorder lipid metabolism
menstrual disorder, complication pregnancy, pregnancy delivery including normal, liveborn, complication birth puerperium affecting management mother, prolonged pregnancy, congenital anomaly, inflammation infection eye   except caused tuberculosis sexually transmitteddisease
nonmalignant breast condition
diabetes mellitus without complication
joint disorder dislocation trauma - related
menstrual disorder, pregnancy delivery including normal, ovarian cyst
nutritional deficiency
disorder lipid metabolism
headache including migraine, administrative social admission
thyroid disorder, disorder lipid metabolism, malaise fatigue, ear sense organ disorder, viral infection, influenza, 

In [192]:
#Now, we'll begin answering question 2.  Let's re-build our data to include dates so that we can tie a diagnosis to a drug via a visit. Summarize drug using drug category
journey = claims.merge(drugs[['drug_category','member_id','date_svc']].drop_duplicates(),on =['member_id','date_svc']).merge(ccs[['diag','ccs_3_desc']].drop_duplicates(), 
                                                                                                                             on ='diag')
journey['main_desc'] = journey.ccs_3_desc.str.lower().str.replace(";","").str.replace("/"," ").str.replace(":"," ")

In [193]:
journey2 = journey.merge(ccs_summ_2[['main_desc','main_desc_new']])

In [194]:
journey3 = journey2[['member_id','drug_category','main_desc_new']].drop_duplicates()

In [195]:
#See how common each combo is. This will be basis for our "rating" system that will be fed though a recommender engine
all_df_2 = journey3.groupby(['drug_category', 'main_desc_new'])['member_id'].count().reset_index()
all_df_2 = all_df_2.rename(columns = {"member_id":"count"})
#Convert frequency to ratings, out of 100
all_df_2['drug_Sum'] = all_df_2.groupby(['drug_category'])['count'].transform('sum')
all_df_2['rating'] = ((all_df_2['count'] / all_df_2.drug_Sum) * 100) + 1
all_df_2['rating'] = all_df_2['rating'].astype(int)

In [196]:
#For all combos that dont exist, we still want to include them as negative feedback for recommender engine
from itertools import product
drugs_all = drugs[['drug_category']].drop_duplicates()
ccs_all = ccs_summ_2[['main_desc_new']].drop_duplicates()

# Get the Cartesian product
cartesian_product = list(product(drugs_all['drug_category'], ccs_all['main_desc_new']))

# Create a df from the Cartesian product
result_df = pd.DataFrame(cartesian_product, columns=['drug_category', 'main_desc_new'])

In [197]:
#Create final drug, ccs, frequency matrix. Which we'll re-label as 'user', 'item', 'rating'
cf_df = all_df_2.merge(result_df, on =['drug_category', 'main_desc_new'], how = 'right').fillna(0)[['drug_category','main_desc_new', 'rating']]
cf_df['rating'] = cf_df['rating'].astype(int) + 1
cf_df.columns = ['user', 'item', 'rating']

In [173]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy

reader = Reader(rating_scale=(1, 101))

# Load the DataFrame into a Surprise Dataset object
dataset = Dataset.load_from_df(cf_df, reader)

In [174]:
# Retrieve the trainset.
trainset = dataset.build_full_trainset()

# Build an algorithm, and train it.
#For simplicity, use KNN distance
algo = KNNBasic()
algo.fit(trainset)

# Then predict ratings for all pairs
# Here, we test on train as a sanity check that it's working
predictions = algo.test(trainset.build_testset())



Computing the msd similarity matrix...
Done computing similarity matrix.


In [186]:
from collections import defaultdict

#This is the actual recommender engine. In practice, a user of this algo can feed in a series of drugs of they've used, and infer their health statuses/conditions
def get_top_n(predictions, n=5):
    """Return the top-N recommendation for each user from a set of predictions.
"""
    # First map the predictions to each drug.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each drug and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

top_n = get_top_n(predictions, n=5)

# Print sample health statuses based on a drug category
for uid, user_ratings in top_n.items():
    if uid in ['Antidepressants','Vaginal Products','Urinary Anti-Infectives','Laxatives','Adhd/Anti-narcolepsy/Anti-obesity/Anorexiants','Dermatologicals']:
        print(uid, [iid for (iid, _) in user_ratings])

Vaginal Products ['inflammatory disease female pelvic organ', 'mycoses', 'urinary tract infection', 'disorder lipid metabolism', 'diabetes mellitus without complication']
Urinary Anti-Infectives ['urinary tract infection', 'genitourinary symptom ill - defined condition', 'diabetes mellitus without complication', 'disorder lipid metabolism', 'diabetes mellitus complication']
Laxatives ['gastrointestinal disorder', 'disorder lipid metabolism', 'diabetes mellitus without complication', 'diabetes mellitus complication', 'esophageal disorder']
Antidepressants ['disorder lipid metabolism', 'diabetes mellitus without complication', 'anxiety disorder', 'mood disorder', 'diabetes mellitus complication']
Dermatologicals ['allergic reaction', 'diabetes mellitus without complication', 'disorder lipid metabolism', 'skin subcutaneous tissue infection', 'diabetes mellitus complication']
Adhd/Anti-narcolepsy/Anti-obesity/Anorexiants ['attention - deficit conduct disruptive behavior disorder', 'disorde