# Goals (in no particular order)
- Analyze differences between the EBI and NCBI APIs
- Check whether the articles found by a manual PubMed search for the case study appear in the API results (i.e., was the discrepancy an error in our pipeline or in the EBI/NCBI APIs?)
- Convert Orphanet data into a more readable format

### Analyzing the differences between the EBI and NCBI APIs

Results:

"As we noticed that the results returned from the EBI and NCBI APIs differed significantly, we compared their similarity using the Jaccard index. The highest similarity from a small sample of terms was ‘Morphea’ with a 45.0% similarity, the lowest was ‘Santos Mateus Leal syndrome’ with a 2.63% similarity. The EBI API returned more results in every case."

In [1]:
from classify_abs import search_NCBI_API, search_EBI_API, search_getAbs
from extract_abs import autosearch, load_GARD_diseases
import pandas as pd
import csv

# Load the GARD disease lookup that autosearch() takes as its second argument
# in the cells below. max_length is returned alongside it but is not used in
# the code shown in this notebook.
GARD_dict, max_length = load_GARD_diseases()

In [2]:
# Sample queries used to compare the two APIs. The entries deliberately mix
# the forms a query can take: plain disease names, a prefixed GARD ID, a
# zero-padded ID string, and a bare integer ID.
searchlist = [
    # disease name and prefixed GARD ID
    'Morphea',
    'GARD:0007383',
    # further disease names
    'Sandhoff disease',
    'Santos Mateus Leal syndrome',
    'Facioscapulohumeral muscular dystrophy',
    'Fellman syndrome',
    'Mucopolysaccharidosis type 6',
    'Neurofibromatosis type 1',
    'classic homocystinuria',
    # numeric IDs: zero-padded string, then plain int
    '0007730',
    7383,
]

In [3]:
# Compare the PMIDs returned by the NCBI and EBI search helpers for every
# term in `searchlist`: print the size of each result set, their overlap
# (Jaccard index as a percentage), and save both PMID lists side by side in
# one CSV file per term.
import csv
import os
from itertools import zip_longest

# The per-term CSV files are written into this folder; create it up front so
# the first open() does not fail when the folder is missing.
os.makedirs('API-Analysis', exist_ok=True)

# csv header
header = ['EBI API PMIDS', 'NCBI API PMIDS']

for searching in searchlist:
    # Both helpers receive the same autosearch() expansion of the term. 50 is
    # passed as the second argument (presumably the maximum number of results
    # -- confirm against classify_abs).
    NCBI_API = search_NCBI_API(autosearch(searching,GARD_dict), 50)
    EBIAPI = search_EBI_API(autosearch(searching,GARD_dict), 50)

    # Keys of each result are the PMIDs; insertion order is kept for the CSV.
    NCBI_list = list(NCBI_API)
    EBI_list = list(EBIAPI)

    intersect = set(EBI_list) & set(NCBI_list)
    U = set(EBI_list) | set(NCBI_list)

    # Jaccard index = |intersection| / |union|. When neither API returned
    # anything the union is empty, so report 0 instead of dividing by zero.
    jaccard_pct = 100*len(intersect)/len(U) if U else 0.0

    print('EBI Result',len(EBI_list))
    print('NCBI Result',len(NCBI_list))
    print(searching,'intersection: ',len(intersect),', union: ',len(U))
    print(searching,jaccard_pct,'% intersection')
    print('')

    #SAVE RESULTS
    # NOTE(review): terms such as 'GARD:0007383' put a colon into the file
    # name, which Windows file systems do not accept -- confirm this notebook
    # only needs to run on POSIX systems.
    with open(f'API-Analysis/{searching} API Results.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        # The two lists usually differ in length; pad the shorter column with
        # empty cells so every PMID of the longer one is still written.
        writer.writerows(zip_longest(EBI_list, NCBI_list, fillvalue=''))

SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR:  ['morphea']
search_NCBI_API is DEPRECATED. Utilize search_getAbs for most comprehensive results.
SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR:  ['morphea']
search_EBI_API is DEPRECATED. Utilize search_getAbs for most comprehensive results.
EBI Result 16
NCBI Result 16
Morphea intersection:  11 , union:  21
Morphea 52.38095238095238 % intersection

SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR:  ['phenylalanine hydroxylase deficiency', 'oligophrenia phenylpyruvica', 'phenylketonuria', 'folling disease']
search_NCBI_API is DEPRECATED. Utilize search_getAbs for most comprehensive results.
SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR:  ['phenylalanine hydroxylase deficiency', 'oligophrenia phenylpyruvica', 'phenylketonuria', 'folling disease']
search_EBI_API is DEPRECATED. Utilize search_getAbs for most comprehensive results.
EBI Result 49
NCBI Result 37
GARD:0007383 intersection:  12 , union:  74
GARD:000738

EBI Result 50
NCBI Result 27
0007730 intersection:  10 , union:  67
0007730 14.925373134328359 % intersection

SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR:  ['phenylalanine hydroxylase deficiency', 'oligophrenia phenylpyruvica', 'phenylketonuria', 'folling disease']
search_NCBI_API is DEPRECATED. Utilize search_getAbs for most comprehensive results.
SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR:  ['phenylalanine hydroxylase deficiency', 'oligophrenia phenylpyruvica', 'phenylketonuria', 'folling disease']
search_EBI_API is DEPRECATED. Utilize search_getAbs for most comprehensive results.
EBI Result 49
NCBI Result 37
7383 intersection:  12 , union:  74
7383 16.216216216216218 % intersection



### Convert Orphanet data to a readable and searchable format

In [4]:
import xml.etree.ElementTree as ET
import re
import pandas as pd

In [5]:
# This file was downloaded on August 31, 2021. See README.md for details.
# Parse the Orphanet XML export and keep its root element; the <Disorder>
# entries are iterated from `root` in a later cell.
tree = ET.parse('en_product9_prev.xml')
root = tree.getroot()

In [6]:
# Column layout of the flattened Orphanet table: one row per
# (disorder, prevalence record, cited PMID).
_orpha_columns = [
    'OrphaCode',
    'Disease Name',
    'Orpha Epi Type',
    'Orpha Epi Class',
    'Orpha Epi Rate',
    'Orpha Loc',
    'Source',
    'PMID',
]

# Start from an empty frame; the rows are filled in by a later cell.
df = pd.DataFrame(columns=_orpha_columns)

In [7]:
# Flatten the Orphanet XML into `df`: one row per PMID cited by a prevalence
# record. Only records that carry a prevalence class and whose Source string
# mentions a PMID are kept.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end. DataFrame.append was removed in pandas 2.0, and calling it per row
# copied the whole frame on every iteration.

# 6-8 digit runs inside a Source string such as
# 'ORPHANET_11389160[PMID]_9689990[PMID]' are taken to be the PMIDs.
pmid_pattern = re.compile(r'\d{6,8}')

i=0
rows = []
for disorder in root.iter('Disorder'):
    code = disorder.find('./OrphaCode').text
    name = disorder.find('./Name').text
    #Each disorder, w/code and name, has multiple prevalence branches
    for prevalence in disorder.findall('./PrevalenceList/Prevalence'):
        EPtype = prevalence.find('./PrevalenceType/Name').text
        qualification = prevalence.find('./PrevalenceQualification/Name').text
        if 'class' in qualification.lower():
            # An element without text yields None; treat it as "no class" so
            # the length check below skips the record instead of raising.
            EPclss = prevalence.find('./PrevalenceClass/Name').text or ''
        else:
            EPclss = ''
        EPrate = prevalence.find('./ValMoy').text
        geoloc = prevalence.find('./PrevalenceGeographic/Name').text
        source = prevalence.find('./Source').text

        # str() guards against a Source element with no text (None).
        if 'PMID' not in str(source) or len(EPclss) <= 1:
            continue

        #each prevalence, w/geoloc and source, has multiple pmids w/abstracts
        for pmid in pmid_pattern.findall(source):
            rows.append({'OrphaCode':code,
                         'Disease Name':name,
                         'Orpha Epi Type':EPtype,
                         'Orpha Epi Class':EPclss,
                         'Orpha Epi Rate':EPrate,
                         'Orpha Loc':geoloc,
                         'Source':source,
                         'PMID':pmid})
            i+=1
            # Progress indicator: print the running row count every 500 rows.
            if i%500==0:
                print(i)

# Keep the original "append to df" semantics. When df is still empty the new
# rows simply become df, which also avoids concatenating with an empty frame.
new_rows = pd.DataFrame(rows, columns=df.columns)
df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000


In [8]:
# Show the assembled table (a notebook renders the value of a cell's last expression).
df

Unnamed: 0,OrphaCode,Disease Name,Orpha Epi Type,Orpha Epi Class,Orpha Epi Rate,Orpha Loc,Source,PMID
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",Point prevalence,<1 / 1 000 000,0.0,Worldwide,ORPHANET_11389160[PMID]_9689990[PMID],11389160
1,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",Point prevalence,<1 / 1 000 000,0.0,Worldwide,ORPHANET_11389160[PMID]_9689990[PMID],9689990
2,58,Alexander disease,Annual incidence,<1 / 1 000 000,0.037,Japan,21533827[PMID]_[EXPERT],21533827
3,166032,"Multiple epiphyseal dysplasia, with miniepiphyses",Point prevalence,<1 / 1 000 000,0.0,Worldwide,ORPHANET_15523498[PMID],15523498
4,61,Alpha-mannosidosis,Prevalence at birth,<1 / 1 000 000,0.09,Australia,9918480[PMID],9918480
...,...,...,...,...,...,...,...,...
7202,99807,PEHO-like syndrome,Point prevalence,<1 / 1 000 000,0.0,Worldwide,ORPHANET_15968934[PMID]_22408680[PMID],15968934
7203,99807,PEHO-like syndrome,Point prevalence,<1 / 1 000 000,0.0,Worldwide,ORPHANET_15968934[PMID]_22408680[PMID],22408680
7204,99789,Dentin dysplasia type I,Point prevalence,1-9 / 100 000,1.0,Europe,17452557[PMID]_[EXPERT],17452557
7205,99792,Dentin dysplasia-sclerotic bones syndrome,Point prevalence,<1 / 1 000 000,0.0,Worldwide,ORPHANET_264650[PMID],264650


In [9]:
# Duplicate PMIDs are intentionally kept: Orphanet records a different
# statistic, disease, or location for each duplicated entry.
# index=False leaves the DataFrame's row index out of the CSV file.
df.to_csv('All Orphanet Data.csv', header = True, index=False, encoding='utf-8')
print('DONE')

DONE


### Check the API results against manual search of PubMed

In [10]:
# Case study (FSHMD1A): print which of the PMIDs we already know about are
# among the results returned by search_getAbs.
search_results = search_getAbs(autosearch('FSHMD1A', GARD_dict), 100, 'none')

# These four were found in a manual search of PubMed for all of the disease
# names and synonyms.
manual = {33092966, 26218298, 28690764, 11525880}
# All PMIDs cited in Orphanet for this disease (repeats collapse in the set).
orpha = {880742, 1887846, 19320656, 22217918, 19767415, 19320656, 19767415, 1745328, 25122204, 25122204}

# NOTE(review): the recorded run printed no PMIDs at all. The lookup compares
# str(pmid) against the result keys, so it can only match if those keys are
# strings -- confirm the key type returned by search_getAbs.
retrieved_ids = search_results.keys()
for label, known_pmids in (('MANUAL', manual), ('ORPHA', orpha)):
    print(label)
    for known in known_pmids:
        if str(known) in retrieved_ids:
            print(known)

SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR:  ['muscular dystrophy, facioscapulohumeral, type 1a', 'facioscapulohumeral muscular dystrophy 1a', 'muscular dystrophy, facioscapulohumeral', 'facioscapulohumeral muscular dystrophy', 'landouzy-dejerine muscular dystrophy', 'fshmd1a', 'fshd1a']
Found 100 PMIDs. Gathered 91 Relevant Abstracts.
MANUAL
ORPHA


In [11]:
# Case study (GARD ID 6667): print which of the PMIDs we already know about
# are among the results returned by search_getAbs.
search_results = search_getAbs(autosearch(6667, GARD_dict), 60, 'none')

# These four were found in a manual search of PubMed for all of the disease
# names and synonyms.
manual = {32232970, 9819703, 12889665, 19914636}
# All PMIDs cited in Orphanet for this disease (repeats collapse in the set).
orpha = {10328723, 9587032, 9587032, 15192637, 10328723, 646432, 19914636, 9587032}

# NOTE(review): the recorded run printed no PMIDs at all. The lookup compares
# str(pmid) against the result keys, so it can only match if those keys are
# strings -- confirm the key type returned by search_getAbs.
retrieved_ids = search_results.keys()
for label, known_pmids in (('MANUAL', manual), ('ORPHA', orpha)):
    print(label)
    for known in known_pmids:
        if str(known) in retrieved_ids:
            print(known)

SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR:  ['homocystinuria due to cystathionine beta-synthase deficiency', 'cystathionine beta-synthase deficiency', 'homocystinuria due to cbs deficiency', 'classic homocystinuria', 'cbs deficiency']
Found 60 PMIDs. Gathered 58 Relevant Abstracts.
MANUAL
ORPHA


In [12]:
# Case study (GARD:0007383): print which of the PMIDs we already know about
# are among the results returned by search_getAbs.
search_results = search_getAbs(autosearch('GARD:0007383', GARD_dict), 40, 'none')

# These seven were found in a manual search of PubMed for all of the disease
# names and synonyms.
manual = {21659675, 32742934, 33161754, 21555948, 32128737, 16550150, 34017006}
# All PMIDs cited in Orphanet for this disease (repeats collapse in the set).
orpha = {
    11641035, 17616847, 11641035, 20971365, 16690699, 9254847, 8825928,
    11581453, 21659675, 22766612, 11581453, 21659675, 17701285, 19068582,
    10617747, 21659675, 20971365, 6468444, 20971365, 23430943, 19718537,
    15906732, 23430943, 19718537, 15906732, 15906730, 11432505, 15751925,
}

# NOTE(review): the recorded run printed no PMIDs at all. The lookup compares
# str(pmid) against the result keys, so it can only match if those keys are
# strings -- confirm the key type returned by search_getAbs.
retrieved_ids = search_results.keys()
for label, known_pmids in (('MANUAL', manual), ('ORPHA', orpha)):
    print(label)
    for known in known_pmids:
        if str(known) in retrieved_ids:
            print(known)

SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR:  ['phenylalanine hydroxylase deficiency', 'oligophrenia phenylpyruvica', 'phenylketonuria', 'folling disease']
Found 40 PMIDs. Gathered 37 Relevant Abstracts.
MANUAL
ORPHA
