# Comparing Model to Orphanet

Goal: Compare the output of my model to Orphanet's data
Need to make comparisons between the output of my model on multiple levels
 - Disease Name / GARD ID
 - Prevalence Type
 - Prevalence Class
   - Not if Source is Expert
 - Location  

This also requires me to:
1. Input the Orphanet Data
2. XX Write a PMC abstract-fetching function (EBI API) (Tried, did not work: the EBI API and PMC API don't have most full-text articles) XX
3. Input the model
4. Default no location to Worldwide
5. Make Predictions
6. Save Predictions

### Input Orphanet Data

In [2]:
import xml.etree.ElementTree as ET
import re
import pandas as pd
import classify_abs
import extract_abs
import time

In [None]:
#Parse the Orphanet XML export with ElementTree; `root` is the document root that
#the parsing loop later walks for 'Disorder' elements.
#This file was downloaded on August 31, 2021. See README.md for details
tree = ET.parse('en_product9_prev.xml')
root = tree.getroot()

In [None]:
#Column layout of the (initially empty) comparison table, one row per
#Orphanet prevalence record / PMID pair. Order matters: it is the column
#order of the saved CSV.
comparison_columns = [
    #Orphanet side (will show to the left)
    'OrphaCode',
    'Disease Name',
    'Orpha Epi Type',
    'Orpha Epi Class',
    'Orpha Epi Rate',
    'Orpha Loc',
    'PMID',
    'Title+Abstract',
    #Model side (will show to the right)
    'GARD Disease ID',
    'Pipeline Disease',
    'Epi Identifier',
    'Epi Statistics',
    'Model Location',
    'Model Date',
    'Model Sex',
    'Model Ethnicity',
]
df = pd.DataFrame(columns=comparison_columns)

In [None]:
#Initialise the model resources once, outside the parsing loop; all four names
#are passed to extract_abs.abstract_extraction for every abstract.
#NOTE(review): the exact contents of labels / GARD_dict / max_length are defined
#in extract_abs and are not visible here -- confirm there.
NER_pipeline, labels = extract_abs.init_NER_pipeline()
GARD_dict, max_length = extract_abs.load_GARD_diseases()

Parse through the entire Orphanet Database

In [None]:
#Walk every Orphanet disorder -> prevalence record -> cited PMID, run the
#extraction model on each cited abstract, and add one comparison row to `df`
#per (prevalence record, PMID) pair whose abstract could be retrieved.

def _find_text(node, path, default=None):
    """Return the text of the first element matching *path* under *node*.

    Returns *default* when the element is absent or has no text, so a
    malformed record is skipped instead of aborting the whole run.
    """
    found = node.find(path)
    if found is None or found.text is None:
        return default
    return found.text


i = 0  #number of cited PMIDs visited so far (drives the progress log)
pmid_abs = {}  #PMID -> abstract, so each abstract is fetched only once
pmid_extract = {}  #PMID -> model extraction, so each abstract is processed only once
rows = []  #collected row dicts; the DataFrame is built once after the loop
pmid_pattern = re.compile(r'\d{6,8}')  #raw string: '\d' is an invalid escape otherwise

#18000 s = 5 h; presumably shifts the server clock to local time -- TODO confirm
print(i, time.ctime(time.time()-18000))
for disorder in root.iter('Disorder'):
    code = _find_text(disorder, './OrphaCode')
    name = _find_text(disorder, './Name')
    #Each disorder, w/code and name, has multiple prevalence branches
    for prevalence in disorder.findall('./PrevalenceList/Prevalence'):
        EPtype = _find_text(prevalence, './PrevalenceType/Name')
        qualification = _find_text(prevalence, './PrevalenceQualification/Name', '')
        #The prevalence class is only kept when the qualification mentions "class"
        if 'class' in qualification.lower():
            EPclss = _find_text(prevalence, './PrevalenceClass/Name', '')
        else:
            EPclss = ''
        EPrate = _find_text(prevalence, './ValMoy')
        geoloc = _find_text(prevalence, './PrevalenceGeographic/Name')
        source = _find_text(prevalence, './Source', '')

        #Only compare records backed by a PMID, not by expert opinion, and
        #that carry a prevalence class
        if 'PMID' not in source or 'EXPERT' in source or len(EPclss) <= 1:
            continue

        #each prevalence, w/geoloc and source, has multiple pmids w/abstracts
        for pmid in pmid_pattern.findall(source):
            if pmid not in pmid_abs:
                pmid_abs[pmid] = classify_abs.PMID_getAb(pmid)
            abstract = pmid_abs[pmid]

            #Very short results are treated as "no abstract retrieved"
            if len(abstract) > 5:
                if pmid not in pmid_extract:
                    extraction = extract_abs.abstract_extraction(abstract, NER_pipeline, labels, GARD_dict, max_length)
                    #Default no location to Worldwide
                    if len(extraction['LOC']) == 0:
                        extraction['LOC'].update(['worldwide'])
                    pmid_extract[pmid] = extraction
                else:
                    extraction = pmid_extract[pmid]
                #Note: there are duplicate PMIDs next to each other in the dataset,
                #but they are kept in case Orphanet has different extraction data
                rows.append({'OrphaCode': code,
                             'Disease Name': name,
                             'Orpha Epi Type': EPtype,
                             'Orpha Epi Class': EPclss,
                             'Orpha Epi Rate': EPrate,
                             'Orpha Loc': geoloc,
                             'PMID': pmid,
                             'Title+Abstract': abstract,
                             'GARD Disease ID': extraction['IDS'],
                             'Pipeline Disease': extraction['DIS'],
                             'Epi Identifier': extraction['EPI'],
                             'Epi Statistics': extraction['STAT'],
                             'Model Location': extraction['LOC'],
                             'Model Date': extraction['DATE'],
                             'Model Ethnicity': extraction['ETHN'],
                             'Model Sex': extraction['SEX']})
            i += 1
            if i % 500 == 0:
                print(i, time.ctime(time.time()-18000))

#Build the frame once: DataFrame.append was removed in pandas 2.0 and copied
#the whole frame on every call. Rows already in `df` are preserved.
new_rows = pd.DataFrame(rows, columns=df.columns)
df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)

In [None]:
#Show the assembled comparison table as this cell's output
df

In [None]:
#Write the comparison table to CSV (with header row, without the index column).
#Duplicate PMIDs are kept on purpose: Orphanet has different stat, disease, loc
#info for each duplicated entry.
output_path = 'Orphanet-Comparison-FINAL.csv'
df.to_csv(output_path, index=False, header=True, encoding='utf-8')
print('DONE')