In [5]:
%matplotlib inline
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
mort_pubs = pd.read_table('./data/mort_pubs.tsv', sep='\t', encoding='latin-1',
                         dtype={'fips': object})

# drop first column
mort_pubs = mort_pubs.drop(['Unnamed: 0'], 1)

In [6]:
iter_csv = pd.read_table('./data/mapAffil_2016_nonull.tsv', sep='\t', encoding='latin-1',
                         dtype={'PMID': object, 
                                'au_order': np.int64, 
                                'year': np.int64,
                                'type': object, 
                                'city': object, 
                                'state':object, 
                                'country': object,
                                'lat': np.float64, 
                                'lon': np.float64, 
                                'fips': object},
                         iterator=True, chunksize=10**6)

mapAffil = pd.concat([chunk[pd.notnull(chunk['fips'])] for chunk in iter_csv])
mapAffil = mapAffil.drop(['Unnamed: 0'], 1)

### Import mesh data

In [7]:
iter_csv = pd.read_table('./data/MeSH2016.tsv', sep='\t', encoding='latin-1',
                        dtype={'PMID': object},
                        iterator=True, chunksize=10**6)
mesh = pd.concat([chunk for chunk in iter_csv])

In [82]:
len(mesh)

23354735

In [None]:
len(mesh.PMID.unique())

In [9]:
len(mapAffil)

12191076

In [10]:
mesh.head()

Unnamed: 0,PMID,mesh
0,26151948,"Algorithms|Data Mining|Databases, Factual|Gene..."
1,26151946,"Adult|Aged|Aged, 80 and over|Decision Making|F..."
2,26151944,Animals|Biomimetic Materials|Biomimetics|Equip...
3,26151942,Amino Acids|Anti-Bacterial Agents|Escherichia ...
4,26151941,"Action Potentials|Electronics|Humans|Models, N..."


#### Drop rows where PMID is not in mapAffil

In [11]:
mdf = pd.merge(mesh, mapAffil.loc[:,['PMID']].drop_duplicates(), how='inner', on=['PMID'])

In [12]:
mdf.shape

(4984444, 2)

In [None]:
len(mdf.PMID.unique())

In [14]:
mdf['cleantext'] = mdf.loc[:,'mesh'].str.lower().str.split('|')

In [16]:
mdf.loc[:,'mesh_groups'] = None

In [17]:
mdf.head()

Unnamed: 0,PMID,mesh,cleantext,mesh_groups
0,26151925,Adult|Amygdala|Case-Control Studies|Female|Hum...,"[adult, amygdala, case-control studies, female...",
1,26151913,Animals|Cell Differentiation|Gene Expression R...,"[animals, cell differentiation, gene expressio...",
2,26151911,Acoustic Stimulation|Animals|Brain|Conditionin...,"[acoustic stimulation, animals, brain, conditi...",
3,26151906,"Administration, Intranasal|Animals|Area Under ...","[administration, intranasal, animals, area und...",
4,26151904,Animals|Carrier Proteins|Centromere|Chromatin|...,"[animals, carrier proteins, centromere, chroma...",


In [11]:
mdf.to_csv('../data/clean_mesh.tsv', sep='\t', index=False)

In [18]:
mdf_subset = mdf[:500]
pd.set_option('max_colwidth',300)
mdf_subset.head()

Unnamed: 0,PMID,mesh,cleantext,mesh_groups
0,26151925,"Adult|Amygdala|Case-Control Studies|Female|Humans|Male|Neuroimaging|Neurokinin-1 Receptor Antagonists|Phobic Disorders|Piperidines|Positron-Emission Tomography|Receptors, Neurokinin-1|Tetrazoles","[adult, amygdala, case-control studies, female, humans, male, neuroimaging, neurokinin-1 receptor antagonists, phobic disorders, piperidines, positron-emission tomography, receptors, neurokinin-1, tetrazoles]",
1,26151913,"Animals|Cell Differentiation|Gene Expression Regulation|Male|Mice|Mice, Inbred mdx|Mice, Transgenic|MicroRNAs|Muscle, Skeletal|Muscular Dystrophy, Animal|PAX7 Transcription Factor|Physical Conditioning, Animal|Regeneration|Satellite Cells, Skeletal Muscle","[animals, cell differentiation, gene expression regulation, male, mice, mice, inbred mdx, mice, transgenic, micrornas, muscle, skeletal, muscular dystrophy, animal, pax7 transcription factor, physical conditioning, animal, regeneration, satellite cells, skeletal muscle]",
2,26151911,"Acoustic Stimulation|Animals|Brain|Conditioning (Psychology)|Conditioning, Classical|Cues|Dendritic Spines|Electric Stimulation|Extremities|Fear|Male|Mice|Neuronal Plasticity","[acoustic stimulation, animals, brain, conditioning (psychology), conditioning, classical, cues, dendritic spines, electric stimulation, extremities, fear, male, mice, neuronal plasticity]",
3,26151906,"Administration, Intranasal|Animals|Area Under Curve|Central Nervous System|Cerebellar Cortex|Chromatography, High Pressure Liquid|Glutamate Carboxypeptidase II|Glutarates|Half-Life|Injections, Intraperitoneal|Macaca fascicularis|Male|Olfactory Bulb|Organophosphorus Compounds|ROC Curve|Rats|Rats,...","[administration, intranasal, animals, area under curve, central nervous system, cerebellar cortex, chromatography, high pressure liquid, glutamate carboxypeptidase ii, glutarates, half-life, injections, intraperitoneal, macaca fascicularis, male, olfactory bulb, organophosphorus compounds, roc c...",
4,26151904,Animals|Carrier Proteins|Centromere|Chromatin|Chromatin Assembly and Disassembly|DNA-Binding Proteins|Drosophila Proteins|Drosophila melanogaster|Histones|Mitosis,"[animals, carrier proteins, centromere, chromatin, chromatin assembly and disassembly, dna-binding proteins, drosophila proteins, drosophila melanogaster, histones, mitosis]",


In [None]:
mesh_subset = mesh[:500]
pd.set_option('max_colwidth',300)
mesh_subset.head()

- create tsv file with PMID, mental_count, neoplasms_count, nervous_count, respiratory_count, circulatory_count



- Plot individual FIPS codes


### Web Scrape MeSH trees

In [22]:
import requests
from bs4 import BeautifulSoup
import re
from fuzzywuzzy import fuzz
import time
from multiprocessing.dummy import Pool as ThreadPool 


circulatory = 'https://www.nlm.nih.gov/mesh/2015/mesh_trees/C14.html'
respiratory = 'https://www.nlm.nih.gov/mesh/2015/mesh_trees/C08.html'
neoplasms = 'https://www.nlm.nih.gov/mesh/2015/mesh_trees/C04.html'
nervous = 'https://www.nlm.nih.gov/mesh/2015/mesh_trees/C10.html'
mental = 'https://www.nlm.nih.gov/mesh/2015/mesh_trees/F03.html'

#urls = [circulatory,respiratory,neoplasms,nervous,mental]



def scrape_and_clean(url):
    print(url)
    res = requests.get(url)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'lxml')
    toplevel = soup.find('ul', {'class':'Level1'})
    list_all = toplevel.get_text().split('\n')
    print(list_all[0])
    list_all = list(filter(None, list_all))
    regex = re.compile(r'\[.*\]', re.IGNORECASE)
    clean_list = [regex.sub("", line).strip().lower() for line in list_all]
    return clean_list


# for l in urls:
#     scrape_and_clean(l)

In [23]:
tree_dict = dict()
tree_dict['circulatory'] = set(scrape_and_clean(circulatory))
tree_dict['mental'] = set(scrape_and_clean(mental))
tree_dict['respiratory'] = set(scrape_and_clean(respiratory))
tree_dict['nervous'] = set(scrape_and_clean(nervous))
tree_dict['neoplasms'] = set(scrape_and_clean(neoplasms))

https://www.nlm.nih.gov/mesh/2015/mesh_trees/C14.html
Cardiovascular Diseases [C14]
https://www.nlm.nih.gov/mesh/2015/mesh_trees/F03.html
Mental Disorders [F03]
https://www.nlm.nih.gov/mesh/2015/mesh_trees/C08.html
Respiratory Tract Diseases [C08]
https://www.nlm.nih.gov/mesh/2015/mesh_trees/C10.html
Nervous System Diseases [C10]
https://www.nlm.nih.gov/mesh/2015/mesh_trees/C04.html
Neoplasms [C04]


In [14]:
circulatory_list = scrape_and_clean(circulatory)
respiratory_list = scrape_and_clean(respiratory)
mental_list = scrape_and_clean(mental)
neoplasms_list = scrape_and_clean(neoplasms)
nervous_list = scrape_and_clean(nervous)

https://www.nlm.nih.gov/mesh/2015/mesh_trees/C14.html
Cardiovascular Diseases [C14]
https://www.nlm.nih.gov/mesh/2015/mesh_trees/C08.html
Respiratory Tract Diseases [C08]
https://www.nlm.nih.gov/mesh/2015/mesh_trees/F03.html
Mental Disorders [F03]
https://www.nlm.nih.gov/mesh/2015/mesh_trees/C04.html
Neoplasms [C04]
https://www.nlm.nih.gov/mesh/2015/mesh_trees/C10.html
Nervous System Diseases [C10]


In [25]:
def check_fuzzy(check_set, word):
    """
    Compares a term to each term in a set (e.g. mesh terms from `tree_dict`)
    and returns the maximum fuzzy match
    """
    ratios = []
    for i in check_set:
        ratio = fuzz.SequenceMatcher(isjunk=None,seq1=i, seq2=word).quick_ratio()
        ratios.append((i,ratio))
    maxval = max(ratios,key=lambda item:item[1])
    return maxval
#check_fuzzy(mental_list, 'schizophrenia')

set()

In [None]:
def add_mesh_groups(df, dictionary):
    """
    Iterates over each mesh term in `cleantext` column of the df 
    and checks for it in the dictionary of mesh trees. When a match is found, 
    the tree category (e.g. 'circulatory','respiratory', 'neoplasms', 'mental','nervous')
    is added to a set in `mesh_groups` column of the df
    """
    for idx, row in df.iterrows():
        for term in row['cleantext']:
        #print(term)
            for k,v in dictionary.items():
                if term in v:
                    #print(k, term)
                    if pd.isnull(row['mesh_groups']) == True:
                        row['mesh_groups'] = {k}
                    else:
                        row['mesh_groups'] = row['mesh_groups'].union({k})

# uncomment this to also add fuzzy matches over 0.9 to df
# (fn will take about 0.11 seconds per row in df):

#                elif check_fuzzy(v, term)[1] > 0.9:
#                    if pd.isnull(row['mesh_groups']) == True:
#                        row['mesh_groups'] = {k}
#                    else:
#                        row['mesh_groups'] = row['mesh_groups'].union({k})

                else:
                    continue

In [68]:
%%time 

add_mesh_groups(mdf_subset, tree_dict)

CPU times: user 81.8 ms, sys: 7.69 ms, total: 89.5 ms
Wall time: 89.7 ms


In [92]:
mdf_subset.loc[(pd.isnull(mdf_subset.mesh_groups) == False),:]

Unnamed: 0,PMID,mesh,cleantext,mesh_groups
0,26151925,"Adult|Amygdala|Case-Control Studies|Female|Humans|Male|Neuroimaging|Neurokinin-1 Receptor Antagonists|Phobic Disorders|Piperidines|Positron-Emission Tomography|Receptors, Neurokinin-1|Tetrazoles","[adult, amygdala, case-control studies, female, humans, male, neuroimaging, neurokinin-1 receptor antagonists, phobic disorders, piperidines, positron-emission tomography, receptors, neurokinin-1, tetrazoles]",{mental}
7,26151900,"Adult|Aged|Aged, 80 and over|Anomia|Cerebral Infarction|Female|Humans|Male|Mental Recall|Middle Aged|Narration|Neuropsychological Tests|Pattern Recognition, Visual|Psychometrics|Reproducibility of Results|Semantics|Vocabulary","[adult, aged, aged, 80 and over, anomia, cerebral infarction, female, humans, male, mental recall, middle aged, narration, neuropsychological tests, pattern recognition, visual, psychometrics, reproducibility of results, semantics, vocabulary]","{circulatory, nervous}"
11,26151864,Cross-Sectional Studies|Female|Hip Fractures|Hospital Administration|Hospitals|Humans|Male|Myocardial Infarction|Patient Participation|Patient Satisfaction|Patients|Quality Improvement|Stroke|Surveys and Questionnaires,"[cross-sectional studies, female, hip fractures, hospital administration, hospitals, humans, male, myocardial infarction, patient participation, patient satisfaction, patients, quality improvement, stroke, surveys and questionnaires]","{circulatory, nervous}"
12,26151861,"Analysis of Variance|Attitude of Health Personnel|Cognitive Therapy|Education, Continuing|Evidence-Based Practice|Guideline Adherence|Health Personnel|Humans|Mississippi|Practice Guidelines as Topic|Substance-Related Disorders","[analysis of variance, attitude of health personnel, cognitive therapy, education, continuing, evidence-based practice, guideline adherence, health personnel, humans, mississippi, practice guidelines as topic, substance-related disorders]",{mental}
13,26151850,"Animals|History, 20th Century|History, 21st Century|Humans|Neoplasms|Photochemotherapy","[animals, history, 20th century, history, 21st century, humans, neoplasms, photochemotherapy]",{neoplasms}
14,26151848,"Animals|Behavior, Animal|Cerebral Cortex|Chemokines|Cytokines|Hippocampus|Kynurenine|Lupus Vasculitis, Central Nervous System|Memory|Mice|Mice, Inbred C57BL","[animals, behavior, animal, cerebral cortex, chemokines, cytokines, hippocampus, kynurenine, lupus vasculitis, central nervous system, memory, mice, mice, inbred c57bl]","{circulatory, nervous}"
16,26151842,"Animals|Chromosomes, Human|Collagen Type III|Disease Models, Animal|Fibrosis|Gene Regulatory Networks|Humans|Lung Diseases|Mice|Quantitative Trait Loci|Species Specificity|Synteny|Translational Medical Research","[animals, chromosomes, human, collagen type iii, disease models, animal, fibrosis, gene regulatory networks, humans, lung diseases, mice, quantitative trait loci, species specificity, synteny, translational medical research]",{respiratory}
17,26151840,"Algorithms|Brain|Brain Neoplasms|Humans|Image Processing, Computer-Assisted|Magnetic Resonance Imaging|Oligodendroglioma|Phantoms, Imaging","[algorithms, brain, brain neoplasms, humans, image processing, computer-assisted, magnetic resonance imaging, oligodendroglioma, phantoms, imaging]","{neoplasms, nervous}"
18,26151827,"Angiotensin II|Animals|Blood Pressure|Disease Models, Animal|Gene Knockout Techniques|Hypertension|Kidney Tubules, Collecting|Male|Mice|Receptor, Bradykinin B2|Sodium Chloride, Dietary","[angiotensin ii, animals, blood pressure, disease models, animal, gene knockout techniques, hypertension, kidney tubules, collecting, male, mice, receptor, bradykinin b2, sodium chloride, dietary]",{circulatory}
20,26151821,"Case-Control Studies|Colorectal Neoplasms|Genetic Predisposition to Disease|Genome-Wide Association Study|Humans|Odds Ratio|Polymorphism, Single Nucleotide","[case-control studies, colorectal neoplasms, genetic predisposition to disease, genome-wide association study, humans, odds ratio, polymorphism, single nucleotide]",{neoplasms}


In [28]:
mdf.to_csv('./data/mesh_groups.csv', index=False)

In [34]:
len(mdf.loc[(pd.isnull(mdf.mesh_groups) == False),:])

1640238

In [93]:
len(mdf)
# pd.merge()

4984444

In [40]:
pd.set_option('max_colwidth',300)
mdf.head()

Unnamed: 0,PMID,mesh,cleantext,mesh_groups
0,26151925,"Adult|Amygdala|Case-Control Studies|Female|Humans|Male|Neuroimaging|Neurokinin-1 Receptor Antagonists|Phobic Disorders|Piperidines|Positron-Emission Tomography|Receptors, Neurokinin-1|Tetrazoles","[adult, amygdala, case-control studies, female, humans, male, neuroimaging, neurokinin-1 receptor antagonists, phobic disorders, piperidines, positron-emission tomography, receptors, neurokinin-1, tetrazoles]",{mental}
1,26151913,"Animals|Cell Differentiation|Gene Expression Regulation|Male|Mice|Mice, Inbred mdx|Mice, Transgenic|MicroRNAs|Muscle, Skeletal|Muscular Dystrophy, Animal|PAX7 Transcription Factor|Physical Conditioning, Animal|Regeneration|Satellite Cells, Skeletal Muscle","[animals, cell differentiation, gene expression regulation, male, mice, mice, inbred mdx, mice, transgenic, micrornas, muscle, skeletal, muscular dystrophy, animal, pax7 transcription factor, physical conditioning, animal, regeneration, satellite cells, skeletal muscle]",
2,26151911,"Acoustic Stimulation|Animals|Brain|Conditioning (Psychology)|Conditioning, Classical|Cues|Dendritic Spines|Electric Stimulation|Extremities|Fear|Male|Mice|Neuronal Plasticity","[acoustic stimulation, animals, brain, conditioning (psychology), conditioning, classical, cues, dendritic spines, electric stimulation, extremities, fear, male, mice, neuronal plasticity]",
3,26151906,"Administration, Intranasal|Animals|Area Under Curve|Central Nervous System|Cerebellar Cortex|Chromatography, High Pressure Liquid|Glutamate Carboxypeptidase II|Glutarates|Half-Life|Injections, Intraperitoneal|Macaca fascicularis|Male|Olfactory Bulb|Organophosphorus Compounds|ROC Curve|Rats|Rats,...","[administration, intranasal, animals, area under curve, central nervous system, cerebellar cortex, chromatography, high pressure liquid, glutamate carboxypeptidase ii, glutarates, half-life, injections, intraperitoneal, macaca fascicularis, male, olfactory bulb, organophosphorus compounds, roc c...",
4,26151904,Animals|Carrier Proteins|Centromere|Chromatin|Chromatin Assembly and Disassembly|DNA-Binding Proteins|Drosophila Proteins|Drosophila melanogaster|Histones|Mitosis,"[animals, carrier proteins, centromere, chromatin, chromatin assembly and disassembly, dna-binding proteins, drosophila proteins, drosophila melanogaster, histones, mitosis]",


In [63]:
# mdf_subset = mdf[:500]
# mdf_subset.loc[(pd.isnull(mdf_subset.mesh_groups) == False),:]
mdf_subset.is_copy = False
mdf_subset.loc[:,['mesh_groups']] = None

In [64]:
mdf_subset.head()

Unnamed: 0,PMID,mesh,cleantext,mesh_groups
0,26151925,"Adult|Amygdala|Case-Control Studies|Female|Humans|Male|Neuroimaging|Neurokinin-1 Receptor Antagonists|Phobic Disorders|Piperidines|Positron-Emission Tomography|Receptors, Neurokinin-1|Tetrazoles","[adult, amygdala, case-control studies, female, humans, male, neuroimaging, neurokinin-1 receptor antagonists, phobic disorders, piperidines, positron-emission tomography, receptors, neurokinin-1, tetrazoles]",
1,26151913,"Animals|Cell Differentiation|Gene Expression Regulation|Male|Mice|Mice, Inbred mdx|Mice, Transgenic|MicroRNAs|Muscle, Skeletal|Muscular Dystrophy, Animal|PAX7 Transcription Factor|Physical Conditioning, Animal|Regeneration|Satellite Cells, Skeletal Muscle","[animals, cell differentiation, gene expression regulation, male, mice, mice, inbred mdx, mice, transgenic, micrornas, muscle, skeletal, muscular dystrophy, animal, pax7 transcription factor, physical conditioning, animal, regeneration, satellite cells, skeletal muscle]",
2,26151911,"Acoustic Stimulation|Animals|Brain|Conditioning (Psychology)|Conditioning, Classical|Cues|Dendritic Spines|Electric Stimulation|Extremities|Fear|Male|Mice|Neuronal Plasticity","[acoustic stimulation, animals, brain, conditioning (psychology), conditioning, classical, cues, dendritic spines, electric stimulation, extremities, fear, male, mice, neuronal plasticity]",
3,26151906,"Administration, Intranasal|Animals|Area Under Curve|Central Nervous System|Cerebellar Cortex|Chromatography, High Pressure Liquid|Glutamate Carboxypeptidase II|Glutarates|Half-Life|Injections, Intraperitoneal|Macaca fascicularis|Male|Olfactory Bulb|Organophosphorus Compounds|ROC Curve|Rats|Rats,...","[administration, intranasal, animals, area under curve, central nervous system, cerebellar cortex, chromatography, high pressure liquid, glutamate carboxypeptidase ii, glutarates, half-life, injections, intraperitoneal, macaca fascicularis, male, olfactory bulb, organophosphorus compounds, roc c...",
4,26151904,Animals|Carrier Proteins|Centromere|Chromatin|Chromatin Assembly and Disassembly|DNA-Binding Proteins|Drosophila Proteins|Drosophila melanogaster|Histones|Mitosis,"[animals, carrier proteins, centromere, chromatin, chromatin assembly and disassembly, dna-binding proteins, drosophila proteins, drosophila melanogaster, histones, mitosis]",


In [None]:
### Test with threading -- didn't save significant time 

%%time

from multiprocessing.dummy import Pool as ThreadPool 

# Make the Pool of workers
pool = ThreadPool(3) 

start_time = time.time()
for idx, row in mdf_subset.iterrows():
    # bottleneck
    group_list = pool.map(check_group, [term for term in row['cleantext']])
    for group_set in group_list:
        if len(group_set) > 0:
            if pd.isnull(row['mesh_groups']) == True:
                row['mesh_groups'] = group_set
            else:
                row['mesh_groups'] = row['mesh_groups'].union(group_set)

pool.close() 
pool.join() 

# Questions
- F01 Behavioral tree? map mesh to mental?
    - https://www.nlm.nih.gov/mesh/2015/mesh_trees/F01.html
- mesh_subset with fuzzy matching took 55 seconds for 500 rows
    - 162 rows × 4 columns
- without fuzzy matching took 90 ms for 500 rows
    - 156 rows × 4 columns

In [60]:
mdf.shape

(4984444, 4)

In [61]:
mdf_subset.shape

(500, 4)

In [83]:
mesh.shape

(23354735, 2)

In [84]:
secs_per_row_fuzz = 55/500
print(secs_per_row_fuzz)

secs_per_row_nofuzz = .09/500
print(secs_per_row_nofuzz)

0.11
0.00017999999999999998


This would take 6.3 days with fuzzy matching

In [91]:
print("USA mapaffil mesh; with fuzzy, days=",mdf.shape[0] * secs_per_row_fuzz/60/60/24)
print("USA mapaffil mesh; no fuzzy, hours=", mdf.shape[0] * secs_per_row_nofuzz/60/60)
print("all mesh; fuzzy, days=",mesh.shape[0] * secs_per_row_fuzz/60/60/24)
print("all mesh; no fuzzy, hours=",mesh.shape[0] * secs_per_row_nofuzz/60/60)

USA mapaffil mesh; with fuzzy, days= 6.345935648148148
USA mapaffil mesh; no fuzzy, hours= 0.24922219999999998
all mesh; fuzzy, days= 29.734037615740743
all mesh; no fuzzy, hours= 1.1677367499999998
