In [1]:
import pandas as pd
import re
import seaborn as sns
import glob
import cloudpickle
from matplotlib import pyplot as plt
%matplotlib inline

In [11]:
# Load the pickled lookup table and the raw disease list.
# `with` closes each file as soon as it has been read (the bare open() calls
# previously left both handles open for the life of the kernel).
# NOTE(review): unpickling runs arbitrary code -- only load .pkl files that
# come from a trusted source.
with open("disease_lookup.pkl", "rb") as lookup_file:
    # Used further down as a mapping from text keys to positions in disease_list.
    disease_lookup = cloudpickle.load(lookup_file)
with open("disease_list.pkl", "rb") as list_file:
    disease_list = cloudpickle.load(list_file)

# Expand every raw entry (indexed as name, clinical reference, synonyms) into
# a record dict whose frequency counters all start at zero.
disease_list = [{'name': a[0], 'clin_ref': a[1], 'synonyms': a[2],
                 'wiki_ref': '', 'clin_freq': 0, 'wiki_freq': 0,
                 'forum_freq': 0, 'doctor_freq': 0}
                for a in disease_list]

# Replace each name by the shortest label among the name and its synonyms.
# The comparison is strict, so on a tie the earlier label is kept.
for dis in disease_list:
    for k in dis['synonyms']:
        if len(k) < len(dis['name']):
            dis['name'] = k

In [12]:
# Collect, from every NNDSS Table II export, the "Cum 2015" values of the
# UNITED STATES row for MMWR week 37, one small frame per file.
# sorted(): glob.glob returns paths in arbitrary order; sorting keeps the row
# order of the combined frame reproducible across machines and runs.
allFiles = sorted(glob.glob("diseasedata/nndss/NNDSS_-_Table_II*.csv"))
frame = pd.DataFrame()  # NOTE(review): unused in the visible cells; kept in case a later cell relies on it
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_, index_col=None, header=0)
    # Strip whitespace from the header names so the column lookups below match.
    df.columns = [col.strip() for col in df.columns]
    df = df.loc[(df['Reporting Area'] == 'UNITED STATES') & (df['MMWR Week'] == 37), :]
    if df.empty:
        # Fail with the offending file name instead of an opaque IndexError
        # from values[0] below.
        raise ValueError("no UNITED STATES / MMWR week 37 row in %s" % file_)
    # Cumulative-2015 value columns, excluding their companion "flag" columns.
    c = [col for col in df.columns if "Cum 2015" in col and "flag" not in col]
    v = df[c].values[0]
    # The disease label is the column name cut at the first ',', '(' or
    # '\xc2' byte, whichever comes first.
    df_to = pd.DataFrame({'Disease': [col.split(',')[0].split('(')[0].split('\xc2')[0] for col in c],
                          'Total cases reported  2015': v})
    list_.append(df_to)

In [13]:
# Table I: infrequently reported notifiable diseases.
df_infreq = pd.read_csv(
    "diseasedata/nndss/NNDSS_-_Table_I._infrequently_reported_notifiable_diseases.csv",
    index_col=None, header=0)
# Rows without a disease label or a 2015 total are dropped.
infreq_counts = df_infreq[['Disease', 'Total cases reported  2015']].dropna()

# Combine with the per-file Table II frames collected in list_.
# A new list is built here instead of appending to list_, so re-running this
# cell does not add the Table I rows a second time (which would double-count
# them in every downstream total).
df_reports = pd.concat(list_ + [infreq_counts])
df_reports.columns = ['Disease', 'Count']
# Lower-case the labels so they can be matched against lower-case lookup keys.
df_reports['Disease'] = df_reports['Disease'].str.lower()

In [14]:
from nltk import ngrams

def get_toks(txt, ngram_length):
    """Return all word n-grams of ``txt`` for n = 1 .. ``ngram_length``.

    ``txt`` is split on whitespace; each n-gram is returned as its words
    joined by a single space.  N-grams are listed by increasing n and, within
    one n, in order of their position in the text.

    The previous implementation iterated ``range(ngram_length)``, i.e.
    n = 0 .. ngram_length - 1, so the longest requested n-gram length was
    never produced and a meaningless n = 0 was requested instead.

    Parameters:
        txt: the text to tokenize.
        ngram_length: the largest n-gram length to include (inclusive).

    Returns:
        A list of strings; empty when ``txt`` has no words or
        ``ngram_length`` is less than 1.
    """
    words = txt.split()
    return [" ".join(words[start:start + n])
            for n in range(1, ngram_length + 1)
            # A negative count (n longer than the text) gives an empty range.
            for start in range(len(words) - n + 1)]

def get_dislist_simple(diseases,text):
    """Return the entries of ``diseases`` found among the tokens of ``text``.

    The tokens are whatever ``get_toks(text, 4)`` produces; an entry matches
    only if it is exactly equal to one of them.  The result keeps the
    iteration order of ``diseases``.
    """
    # A set gives the same constant-time membership test as a dict of zeros.
    seen_tokens = frozenset(get_toks(text, 4))
    matches = []
    for candidate in diseases:
        if candidate in seen_tokens:
            matches.append(candidate)
    return matches

# Add every reported case count to each disease whose lookup key occurs in
# the report's disease label.
#
# The label and its count are read together, row by row.  Looking the count
# up again by label always returned the FIRST row carrying that label, so
# when a label occurred more than once the first row's count was added
# repeatedly and the other rows' counts were lost (and every iteration
# rescanned the whole frame).
lookup_keys = list(disease_lookup)  # hoisted: the keys do not change inside the loop
report_rows = zip(df_reports['Disease'].values, df_reports['Count'].values)
for i, (repo, cnt) in enumerate(report_rows):
    if i % 1000 == 0:
        print(i)  # progress marker; single-argument form prints the same on Python 2 and 3
    if pd.isnull(cnt):
        # Adding a missing count would turn the running total into NaN.
        continue
    txt = repo.lower()
    alldiags = get_dislist_simple(lookup_keys, txt)
    # set(): several matching keys may point at the same disease, which must
    # be credited only once per report row.
    for k in set(disease_lookup[x] for x in alldiags):
        disease_list[k]['clin_freq'] = disease_list[k]['clin_freq'] + cnt

0
1000


In [15]:
# Write the current disease_list to disk; the file is closed when the
# with-block exits.
with open("disease_list_stats.pkl", 'wb') as stats_file:
    cloudpickle.dump(disease_list, stats_file)

In [16]:
import cloudpickle
# Load the mention label objects and the saved disease statistics.
# `with` closes each file once it has been read (the bare open() calls
# previously left all three handles open).
# NOTE(review): unpickling runs arbitrary code -- only load .pkl files that
# come from a trusted source.
with open("alltext_labels.pkl", "rb") as all_labels_file:
    all_mentions = cloudpickle.load(all_labels_file)
with open("doctortext_labels.pkl", "rb") as doctor_labels_file:
    doc_mentions = cloudpickle.load(doctor_labels_file)
with open("disease_list_stats.pkl", "rb") as stats_file:
    disease_list = cloudpickle.load(stats_file)


In [17]:
# Column totals of the two mention objects: sum over axis 0, then take the
# first row of the resulting nested list.
# NOTE(review): presumably position j corresponds to disease_list[j] -- confirm
# against the code that produced the label files.
allment = all_mentions.sum(axis=0).tolist()[0]
docment = doc_mentions.sum(axis=0).tolist()[0]

# Add each total to the matching counter, forum totals first, then doctor totals.
for freq_key, totals in (('forum_freq', allment), ('doctor_freq', docment)):
    for position, total in enumerate(totals):
        disease_list[position][freq_key] += total

In [18]:
# Overwrite the statistics file with the current disease_list; the file is
# closed when the with-block exits.
with open("disease_list_stats.pkl", 'wb') as stats_file:
    cloudpickle.dump(disease_list, stats_file)

In [20]:
from tabulate import tabulate
def _top_five_names(freq_key):
    """Return the names of the five disease_list entries with the largest ``freq_key``."""
    # Sorting on the negated value puts the largest counts first.
    ranked = sorted(disease_list, key=lambda entry: -entry[freq_key])
    return [entry['name'] for entry in ranked[:5]]

topdoc = _top_five_names('doctor_freq')
topfor = _top_five_names('forum_freq')
topclin = _top_five_names('clin_freq')

# One table row per rank, one column per frequency counter.
summary_rows = list(zip(topdoc, topfor, topclin))
print(tabulate(summary_rows, headers=('most frequent doctor','most frequent user','most frequent clinical')))

most frequent doctor    most frequent user    most frequent clinical
----------------------  --------------------  ------------------------
kidney disease          kidney disease        chlamydia
candida infection       burns                 ra
hair loss               aids                  gerd
latex allergy           hair loss             acne
bleeding disorders      adenomyosis           aids


In [21]:
# Show only the ten entries with the highest clinical frequency; displaying
# the whole sorted list floods the notebook with output.
sorted(disease_list, key=lambda x: -x['clin_freq'])[:10]

[{'clin_freq': 1072139.0,
  'clin_ref': 'http://www.cdc.gov/std/chlamydia/default.htm',
  'doctor_freq': 123.0,
  'forum_freq': 217.0,
  'name': 'chlamydia',
  'synonyms': ['chlamydia', 'chlamydia trachomatis disease'],
  'wiki_freq': 0,
  'wiki_ref': ''},
 {'clin_freq': 3154.0,
  'clin_ref': 'http://www.cdc.gov/flu/avianflu/',
  'doctor_freq': 158.0,
  'forum_freq': 195.0,
  'name': 'ra',
  'synonyms': ['avian influenza',
   'influenza',
   'flu',
   'pandemic flu',
   'seasonal flu',
   'h3n2v influenza',
   'haemophilus influenzae infection',
   'haemophilus influenzae serotype b',
   'human parainfluenza viruses',
   'h1n1 flu',
   'swine influenza',
   'h7n9 influenza',
   'influenza and cancer',
   'influenza in pigs',
   'pandemic influenza',
   'parainfluenza',
   'ra',
   'rabies',
   'vfc program',
   'vibration',
   'viral meningitis',
   'vulnerable'],
  'wiki_freq': 0,
  'wiki_ref': ''},
 {'clin_freq': 0,
  'clin_ref': 'http://www.mayoclinic.org/diseases-conditions/disease