In [1]:
import pandas as pd
import numpy as np
import csv
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import string
from spacy.lang.en import English
import re
import spacy
nlp = spacy.load('en_core_web_lg')
nlpSci = spacy.load("en_ner_bc5cdr_md")



In [2]:
filename = 'Abs.xlsx'
abstracts = pd.read_excel(filename, header=None, skiprows=[0], names=['pmid', 'abs'])
abstracts.dropna(inplace=True)

### Case reports
- All articles classified by PubMed as case reports
- Publication types are retrived via an API; see "Filtering by PubType"
- These articles are included because they report on novel cases that can contribute to prevalence information
- Article count: 1792

In [3]:
filename = 'pub_types.csv'
pubTypes = pd.read_csv(filename, header=None, skiprows=[0], names=['pmid', 'types'])
pubTypes.dropna(inplace=True)

In [13]:
case_report_mask = pubTypes.types.str.contains('Case Reports', regex=False)

In [14]:
caseReportsMasked = pubTypes[case_report_mask]

In [15]:
len(caseReportsMasked.index)

1792

In [16]:
caseReports = abstracts.merge(caseReportsMasked['pmid'],on=['pmid'])

In [18]:
for pmid, row in caseReports.iterrows():
    print(row['abs'])

Two brothers had retinal degeneration, lens subluxation, and myopia since early life. There was no evidence of Marfan syndrome, homocystinuria, or other systemic disease. They had nystagmus, myopia, inferior dislocation of the lens, and posterior subcapsular opacities in both eyes. Fundus examination showed attenuated retinal vessels, macular atrophy with occasional pigment accumulation as clumps, and perivascular sleeves. Electroretinography revealed decreased photopic and scotopic responses. The visual fields were constricted. We believe this to be the first report of retinal degeneration with bilateral lens subluxation in a family. It appears to be inherited in an autosomal recessive fashion.
The molecular basis of X-linked recessive anhidrotic ectodermal dysplasia with immunodeficiency (EDA-ID) has remained elusive. Here we report hypomorphic mutations in the gene IKBKG in 12 males with EDA-ID from 8 kindreds, and 2 patients with a related and hitherto unrecognized syndrome of EDA-

### Direct keyword mentions
- All articles that mention: prevalence, registry, incidence, epidemiology, PR
- Article count: 1601 total, 1525 excluding case reports

In [21]:
parser = English()
STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS)) 
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”","''"]

def tokenizeText(sample, returnType='set'):
    tokens = parser(sample)
    
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    
    porter = PorterStemmer()
    tokens = [porter.stem(word) for word in tokens]
    
    if len(tokens) == 1:
        return tokens[0]
    return set(tokens)

keywordTokens = {tokenizeText('prevalence'), tokenizeText('incidence')
                 ,tokenizeText('epidemiology'), tokenizeText('PR')}

def includesKeywords(sample, verbose=False):
    tokens = tokenizeText(sample)
    intersection = tokens & keywordTokens
    if intersection != set(): # intersection
        if verbose:
            print(intersection)
        return True
    return False

In [51]:
keywordMentions = abstracts[abstracts['abs'].apply(lambda x: includesKeywords(x))]

In [52]:
len(keywordMentions.index)

1580

In [53]:
keywordMentionsNoCases = keywordMentions[(~keywordMentions.pmid.isin(caseReports.pmid))]

In [60]:
keywordMentionsNoCases.index

Int64Index([   1,    2,   12,   13,   16,   17,   33,   39,   44,   48,
            ...
            5706, 5713, 5717, 5718, 5725, 5737, 5740, 5742, 5744, 5748],
           dtype='int64', length=1505)

In [54]:
len(keywordMentionsNoCases.index)

1505

In [61]:
keywordMentionsNoCases.to_csv('keyword_abstracts.csv', index=False)

In [16]:
positive = keywordMentionsNoCases

### Phrase matching
- Cover the patterns in phrases with more variability

In [22]:
def set_custom_boundaries(doc):
    prev = doc[0].text
    numbers = re.compile('^[0-9]$')
    for token in doc[:-1]:
        if token.text == ',' or token.text == ';' or token.text == ':' and not numbers.match(prev):
            doc[token.i+1].is_sent_start = True
        prev = token.text
    return doc

In [23]:
nlp.add_pipe(set_custom_boundaries, before="parser")

Identifies phrases consisting of a numerical value, an indication of a patient, and a reporting verb (identify, report, find, etc)
- example: "30 male patients with the disease have been reported"
- identifies 680 total articles, 201 not already identified by the above methods

In [24]:
# Checks if the patient entity is directly related to the identified entity
def identifyXCases(text, verbose=False):
    patients = {'patients','cases','subjects','individuals','family','families','children',
                'girls','boys','males','females','adults'}
    identified = {'identified','reported','seen','found','identify','find','report','review','registered'}
    doc = nlp(text)
    nounVerbMatch = False # if linked patient-reporting verb phrase is identified
    numberMatch = False # if a numerical value describing a patient entity is identified
    
    for sent in doc.sents: # check each sentence for the phrase pattern
        nounVerbMatch = False
        numberMatch = False

        for token in sent:
            if token.ent_type_ in {'CARDINAL', 'ORDINAL'} and str(doc.ents[0])!='number':
                if token.head.text in patients: # number describes patients
                    numberMatch = True
            if token.text in patients:
                if token.head.text in identified: # patients are linked to a reporting verb
                    nounVerbMatch = True
                for child in token.children:
                    if child.text in identified: # reporting verb is linked to patients
                        nounVerbMatch = True
                    if child.ent_type in {'CARDINAL', 'ORDINAL'} and str(doc.ents[0])!='number': # patients describe a number
                        numberMatch = True
            if token.text in identified:
                if token.head.text in patients: # reporting verb is linked to patient entity
                    nounVerbMatch = True
        if numberMatch and nounVerbMatch:
            if verbose:
                print(sent)
            return True
                
    return False

In [25]:
df_identifyXCases = abstracts[abstracts['abs'].apply(lambda x: identifyXCases(x))]

In [26]:
len(df_identifyXCases.index)

708

In [28]:
phraseMatches = df_identifyXCases

In [27]:
identifyXCasesNew = df_identifyXCases[(~df_identifyXCases.pmid.isin(positive.pmid))]

NameError: name 'positive' is not defined

In [None]:
len(identifyXCasesNew.index)

In [None]:
positive = pd.concat([positive, identifyXCasesNew])

In [None]:
len(positive.index)

In [None]:
i=0
for index, row in identifyXCasesNew.iterrows():
    identifyXCases(row['abs'], True)
    if i>20:
        break
    i+=1

Detects: previously unreported, newly identified, birth rate

In [29]:
def identifyShortPhrase(text):
    result = re.search(r"(birth rate)|(previously unreported)|(newly identified)", text)
    #print(result.group(0))
    if result!=None:
        return True
    return False

In [30]:
shortPhrases = abstracts[abstracts['abs'].apply(lambda x: identifyShortPhrase(x))]

In [31]:
len(shortPhrases.index)

77

In [33]:
phraseMatches = pd.concat([phraseMatches, shortPhrasesNew])

In [32]:
shortPhrasesNew = shortPhrases[(~shortPhrases.pmid.isin(phraseMatches.pmid))]

In [None]:
len(shortPhrasesNew.index)

In [None]:
positive = pd.concat([positive, shortPhrasesNew])

In [None]:
len(positive.index)

In [37]:
for pmid, row in shortPhrasesNew.iterrows():
    print(row['abs'])
    print('\n')

We describe 3 unrelated newborn males with a previously unreported constellation of congenital anomalies. All 3 died neonatally of hepatic failure. Clinically, they presented with a pattern of malformations characterized by prenatal linear growth deficiency, hypertrophied alveolar ridges, redundant nuchal skin, and postaxial polydactyly. All 3 cases had male external genitalia with cryptorchidism, and 2 of them, a small penis. Necropsies showed similar internal anomalies, consisting of m√ºllerian duct remnants, lymphangiectasis, and renal anomalies. The karyotypes were normal (46, XY) in skin fibroblasts (Case 1) and in peripheral blood lymphocytes (Case 3). Although this pattern of congenital anomalies must be differentiated from several other lethal syndromes, to our knowledge, no similar cases have been described previously. Cause of this syndrome is unknown. Because Case 2 had a previous brother with similar anomalies, we suspect that this new entity probably is an autosomal recess

match "x reports"

In [34]:
# Checks if the patient entity is directly related to the identified entity
def xReports(text, verbose=False):
    doc = nlp(text)
    numberMatch = False # if a numerical value describing a patient entity is identified
    
    for sent in doc.sents: # check each sentence for the phrase pattern
        numberMatch = False

        for token in sent:
            if token.ent_type_ in {'CARDINAL', 'ORDINAL'} and str(doc.ents[0]) not in {'number','first'}:
                if token.head.text == 'reports': # number describes patients
                    numberMatch = True
            if token.text == 'reports':
                for child in token.children:
                    if child.ent_type in {'CARDINAL', 'ORDINAL'} and str(doc.ents[0]) not in {'number','first'}: # patients describe a number
                        numberMatch = True
        if numberMatch:
            if verbose:
                print(sent)
            return True
                
    return False

In [35]:
df_xReports = abstracts[abstracts['abs'].apply(lambda x: xReports(x))]

In [37]:
xReportsNew = df_xReports[(~df_xReports.pmid.isin(phraseMatches.pmid))]

In [38]:
phraseMatches = pd.concat([phraseMatches, xReportsNew])

In [None]:
len(xReportsNew.index)

In [None]:
len(positive.index)

In [44]:
for pmid, row in xReportsNew.iterrows():
    print(pmid)
    xReports(row['abs'], True)

35
Two previous case reports described two sibs affected with both sensorineural hearing loss and oligodontia.
93
there remains a dearth of case reports (currently fewer than 50) in the literature.
244
This is one of the first reports documenting the molecular mechanism of an allosteric enzyme activator using MD simulations.
869
A review of the literature found 34 reports of supernumerary ring chromosome I which are compared to our case.
1023
This review is based on 21 reports published in the English medical literature since 2009.
1281
This familial case report and two other previous reports demonstrate that autosomal-dominant mutations in the DSP gene are associated with hypo/oligodontia in the setting of Carvajal/Naxos syndrome.
1484
We report a 14-year-old adolescent girl with selective mutism (SM) and a 7q11.23 microduplication detected by chromosomal microarray (CMA) analysis and reviewed the literature from 18 published clinical reports.
1725
less than 10 case reports or clinica

Remaining:
- "affect ... x person in x"
- "rate of dz estimated at x"
- "estimated x patients per x"

In [39]:
# Checks if the patient entity is directly related to the identified entity
def affectX(text, verbose=False):
    doc = nlp(text)
    patients = {'people','person','patient','child','birth','pregnancy'}
    affects = {'affects','afflicts','affect','afflict','diagnosed','diagnose','estimate','estimated'}
    patientTokens = set()
    for noun in patients:
        patientTokens.add(tokenizeText(noun))
    
    for sent in doc.sents: # check each sentence for the phrase pattern
        nounVerbMatch = False
        number = False

        for token in sent:
            if token.ent_type_ in {'CARDINAL', 'ORDINAL'} and str(doc.ents[0]) not in {'number','first'}:
                number = True
            if tokenizeText(token.text) in patients:
                if token.head.text in affects:
                    nounVerbMatch = True
            if token.text in affects:
                for child in token.children:
                    if tokenizeText(child.text) in patients:
                        nounVerbMatch = True
        if number and nounVerbMatch:
            if verbose:
                print(sent)
            return True
                
    return False

In [40]:
df_xAffects = abstracts[abstracts['abs'].apply(lambda x: affectX(x))]

In [41]:
df_xAffectsNew = df_xAffects[(~df_xAffects.pmid.isin(phraseMatches.pmid))]

In [42]:
phraseMatches = pd.concat([phraseMatches, df_xAffectsNew])

In [50]:
for index, row in df_xAffectsNew.iterrows():
    affectX(row['abs'],True)

Autosomal recessive ataxias affect about 1 person in 20,000.
Cerebral palsy is estimated to affect nearly 1 in 500 children,
Classical DBA affects about seven per million live births and presents during the first year of life.
One patient with slightly elevated free T4 and normal TSH was diagnosed as having familial dysalbuminemic hyperthyroxinemia (FDH).
21 unscreened patients with metabolic disorders diagnosed after 5 days of life died or had a significant intellectual or physical handicap (1.35/100,000 population) compared with 2 of the screened cohort (0.43/100,000;
DT represents one of the most significant causes of the morbidity and mortality that affects FAP patients following colectomy.
3 patients have been diagnosed with Nager syndrome (NS) during the last 17 years.
One child was diagnosed by cordocentesis at 30 weeks of gestation.
Our study estimated 1.42 recurrent respiratory papillomatosis patients per 100 000 in the general UK population.
eight patients with pentalogy of C

In [43]:
def freq(text):
    doc = nlp(text)
    docSci = nlpSci(text)
    dz = {'disease','condition','syndrome','disorder','diagnosis','cases'}
    for sent in docSci.sents:
        root = [token for token in sent if token.head == token][0]
        subject = list(root.lefts)
        if len(subject)>0:
            subject = subject[0]
            for descendant in subject.subtree:
                if descendant.text in dz or descendant.ent_type_ == 'DISEASE':
                    for ancestor in descendant.ancestors:
                        if ancestor.text == 'frequency':
                            return True
    return False


In [44]:
df_freq = abstracts[abstracts['abs'].apply(lambda x: freq(x))]

In [46]:
new_freq = df_freq[(~df_freq.pmid.isin(phraseMatches.pmid))]

In [47]:
len(phraseMatches.index)

835

In [56]:
for index, row in new_freq.iterrows():
    print(row['abs'],'\n')

Mucolipidosis type IV (MLIV) is a neurodegenerative lysosomal storage disorder that occurs in an increased frequency in the Ashkenazi Jewish (AJ) population. The frequency of the disease in this population has been established by the testing of 66,749 AJ subjects in the Dor Yeshorim program, a unique premarital population-screening program designed for the Orthodox Jewish community. A carrier rate of 0.0104 (95% C.I 0.0097-0.011) was found. The distribution of the 2 AJ founder mutations, namely, c.416-2A>G and c.1_788del, was determined to be 78.15% and 21.85%, respectively. Three novel mutations were identified in non-Jewish MLIV patients, a missense mutation c.1207C>T, p.Arg403Cys; a 2bp deletion, c.302_303delTC; and a nonsense, c.235C>T, Gln79X. 

Mevalonate kinase deficiency (MKD), a very rare autosomal recessive autoinflammatory disease with multiple organ involvement, presents clinically as hyperimmunoglobulinemia D syndrome (HIDS), a less severe phenotype and more common form, a