### Goal:
Create figures that demonstrate the utility of the transformer model.

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import requests
import xml.etree.ElementTree as ET
import re

from extract_abs import init_NER_pipeline, load_GARD_diseases, PMID_extraction

In [None]:
# Build the NER pipeline objects once; both return values are passed to
# PMID_extraction in the cells below.
NER_pipeline, entities = init_NER_pipeline()
# Load the GARD disease lookup and its accompanying max_length value, which
# are likewise forwarded to PMID_extraction.
GARD_dict, max_length = load_GARD_diseases()

In [None]:
def get_title(pmid, timeout=30):
    """Look up an article title through the Europe PMC REST search service.

    Parameters
    ----------
    pmid : int or str
        Identifier placed after ``EXT_ID:`` in the search query.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` gives up.

    Returns
    -------
    str or None
        Text of the first ``<title>`` element in the XML response with every
        ``[`` and ``]`` removed (an empty string if that element has no text),
        or ``None`` when the response contains no ``<title>`` element.

    Raises
    ------
    requests.HTTPError
        If the service answers with an error status.
    """
    url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'+str(pmid)+'&resulttype=core'
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    r.raise_for_status()
    root = ET.fromstring(r.content)
    title_node = next(root.iter('title'), None)
    if title_node is None:
        return None
    # Strip the square brackets that wrap some titles in the response
    # (see e.g. "[The CHARGE syndrome]" in the case-study data).
    return re.sub(r'[\[\]]', '', title_node.text or '')

In [None]:
# PMIDs of the articles shown in the model-output figure.
pmids = [16001099,2803793,34449519,34272836,10196704,22633354,2309705,3472423,25211237,31294928]
# Other candidate PMIDs, currently left out: 11694544, 6693042, 34315378, 10820168

In [None]:
# One Europe PMC title lookup per PMID, in the same order as `pmids`.
titles = list(map(get_title, pmids))

In [None]:
# Run the extraction pipeline on every PMID and collect one entry per article
# for each column of the output table built in the next cell.
GARD_IDs = []
Diseases = []
Locations = []
Epis = []
Stats = []

for pmid in pmids:
    output_dict = PMID_extraction(pmid, NER_pipeline, entities, GARD_dict, max_length)
    GARD_IDs.append(output_dict['IDS'])
    Diseases.append(output_dict['DIS'])
    Locations.append(output_dict['LOC'])
    Epis.append(output_dict['EPI'])
    # Rates are stored under 'STAT' (the key that appears as a column in the
    # case-study CSVs); 'EPI' holds the epidemiologic identifiers.
    Stats.append(output_dict['STAT'])
# Every PMID must be processed: the DataFrame built from these lists requires
# them to be as long as `pmids` and `titles`.

In [None]:
# Assemble the model-output table: one row per PMID, one column per field.
df = pd.DataFrame.from_dict(
    {
        'PMID': pmids,
        'Titles': titles,
        'GARD IDs': GARD_IDs,
        'Diseases': Diseases,
        'Locations': Locations,
        'Epi Identifiers': Epis,
        'Epi Rates': Stats,
    }
)

In [None]:
# Render the model-output table inline.
df

In [None]:
# Save the model-output table next to the notebook.
# NOTE(review): to_csv writes the index by default, so the file gains an
# unnamed first column; pass index=False if that is not wanted.
df.to_csv('Model Output Picture.csv')

Comparison to Orphanet

In [None]:
# This file was downloaded on August 31, 2021. See README.md for details.
# NOTE(review): get_orpha_entry below parses this same file again on each call
# and does not use this `tree`/`root`.
tree = ET.parse('en_product9_prev.xml')
root = tree.getroot()

In [None]:
# PMIDs used for the Orphanet comparison; each one is looked up in the
# Orphanet prevalence file by get_orpha_entry below.
orpha_pmids = [17994169,19542096,21750884,21782147,17245510,11701258]

In [None]:
# One Europe PMC title lookup per PMID, in the same order as `orpha_pmids`.
orpha_titles = list(map(get_title, orpha_pmids))

In [None]:
# Run the extraction pipeline on the Orphanet PMIDs and flatten each field to
# a comma-separated string for the comparison table.
Model_Diseases = []
Model_Locations = []
Model_Epis = []
Model_Stats = []

for pmid in orpha_pmids:
    # Same call shape as the model-output cell above: `entities` is passed and
    # the result is read as a dict keyed by field name.
    # NOTE(review): assumes each value is an iterable of strings (or None when
    # nothing was extracted) — confirm against extract_abs.PMID_extraction.
    output_dict = PMID_extraction(pmid, NER_pipeline, entities, GARD_dict, max_length)
    ab_dis = list(output_dict['DIS'] or [])
    # No extracted location is reported as 'Worldwide'.
    ab_locs = list(output_dict['LOC'] or []) or ['Worldwide']
    ab_epis = list(output_dict['EPI'] or [])
    ab_stats = list(output_dict['STAT'] or [])
    Model_Diseases.append(', '.join(ab_dis))
    Model_Locations.append(', '.join(ab_locs))
    Model_Epis.append(', '.join(ab_epis))
    Model_Stats.append(', '.join(ab_stats))

In [None]:
def get_orpha_entry(pmid, xml_path='en_product9_prev.xml'):
    """Find the first Orphanet prevalence record that cites ``pmid``.

    The XML file is scanned disorder by disorder. A prevalence record is
    considered only if its source mentions ``PMID`` and not ``EXPERT``, and it
    has a prevalence class longer than one character (the class is read only
    when the qualification name contains "class").

    Parameters
    ----------
    pmid : int or str
        Article identifier; compared as a string against the 6-8 digit
        numbers found in each record's ``Source`` text.
    xml_path : str, optional
        Path of the Orphanet prevalence XML file to search.

    Returns
    -------
    tuple
        ``(name, EPtype, geoloc, EPclss, EPrate)`` for the first matching
        record: disorder name, prevalence type, geographic area, prevalence
        class and the ``ValMoy`` text. An element that is absent yields
        ``None`` in its position.

    Raises
    ------
    ValueError
        With args ``('Not Found', pmid)`` when no record cites ``pmid``.
    """
    def _text(node, path):
        # Text of the first element at `path`, or None if it does not exist.
        found = node.find(path)
        return found.text if found is not None else None

    root = ET.parse(xml_path).getroot()
    target = str(pmid)
    for disorder in root.iter('Disorder'):
        name = _text(disorder, './Name')
        # Each disorder, with code and name, has multiple prevalence branches.
        for prevalence in disorder.findall('./PrevalenceList/Prevalence'):
            qualification = _text(prevalence, './PrevalenceQualification/Name') or ''
            if 'class' in qualification.lower():
                EPclss = _text(prevalence, './PrevalenceClass/Name') or ''
            else:
                EPclss = ''
            source = str(_text(prevalence, './Source'))
            # Each prevalence, with geoloc and source, may cite several PMIDs.
            if 'PMID' not in source or 'EXPERT' in source or len(EPclss) <= 1:
                continue
            if target in re.findall(r'\d{6,8}', source):
                # Read the remaining fields only for the matching record so an
                # incomplete, unrelated record cannot abort the search.
                EPtype = _text(prevalence, './PrevalenceType/Name')
                EPrate = _text(prevalence, './ValMoy')
                geoloc = _text(prevalence, './PrevalenceGeographic/Name')
                return name, EPtype, geoloc, EPclss, EPrate
    raise ValueError('Not Found',pmid)

In [None]:
# Look up every comparison PMID in the Orphanet file and spread the returned
# fields over one list per table column.
Orpha_Diseases, Orpha_Locations, Orpha_Epi_Type, Orpha_Classes, Orpha_Stats = [], [], [], [], []

# Column order mirrors get_orpha_entry's return order:
# (name, EPtype, geoloc, EPclss, EPrate).
orpha_columns = (Orpha_Diseases, Orpha_Epi_Type, Orpha_Locations, Orpha_Classes, Orpha_Stats)

for pmid in orpha_pmids:
    entry = get_orpha_entry(pmid)
    for column, value in zip(orpha_columns, entry):
        column.append(value)

In [None]:
# Column name -> values for the Orphanet-vs-model comparison table.
# Orphanet-derived columns come first, followed by the model ("EIEP") columns.
orpha_dict = dict([
    ('PMID', orpha_pmids),
    ('Title', orpha_titles),
    ('Orphanet Disease', Orpha_Diseases),
    ('Orphanet Location', Orpha_Locations),
    ('Orphanet Epi Type', Orpha_Epi_Type),
    ('Orphanet Epi Class', Orpha_Classes),
    ('Orphanet Epi Rate', Orpha_Stats),
    ('EIEP Disease', Model_Diseases),
    ('EIEP Location', Model_Locations),
    ('EIEP Epi Identifier', Model_Epis),
    ('EIEP Epi Rate', Model_Stats),
])

In [None]:
# Each key of orpha_dict becomes a column of the comparison table.
orpha_compare_df = pd.DataFrame.from_dict(orpha_dict, orient='columns')

In [None]:
# Render the Orphanet comparison table inline.
orpha_compare_df

In [None]:
# Save the comparison table next to the notebook.
# NOTE(review): to_csv writes the index by default, so the file gains an
# unnamed first column; pass index=False if that is not wanted.
orpha_compare_df.to_csv('Orphanet Comparison Figure.csv')

### Create the Case Study Figures

Case Study #1

In [2]:
# Load the saved extraction results for the CHARGE syndrome case study and preview them.
# NOTE(review): the 'Unnamed: 0' column in the preview is presumably the index
# saved by to_csv — confirm; read_csv(index_col=0) would absorb it.
cs1 = pd.read_csv('case_study/CHARGE-syndrome.csv')
cs1

Unnamed: 0,PMID,ABSTRACT,EPI_PROB,IsEpi,IDS,DIS,EPI,STAT,LOC,DATE,SEX,ETHN
0,16959034,"CHARGE syndrome. CHARGE syndrome was initially defined as a non-random association of anomalies (Coloboma, Heart defect, Atresia choanae, Retarded growth and development, Genital hypoplasia, Ear anomalies/deafness). In 1998, an expert group defined the major (the classical 4C's: Choanal atresia, Coloboma, Characteristic ears and Cranial nerve anomalies) and minor criteria of CHARGE syndrome. Individuals with all four major characteristics or three major and three minor characteristics are highly likely to have CHARGE syndrome. However, there have been individuals genetically identified with CHARGE syndrome without the classical choanal atresia and coloboma. The reported incidence of CHARGE syndrome ranges from 0.1-1.2/10,000 and depends on professional recognition. Coloboma mainly affects the retina. Major and minor congenital heart defects (the commonest cyanotic heart defect is tetralogy of Fallot) occur in 75-80% of patients. Choanal atresia may be membranous or bony; bilateral or unilateral. Mental retardation is variable with intelligence quotients (IQ) ranging from normal to profound retardation. Under-development of the external genitalia is a common finding in males but it is less apparent in females. Ear abnormalities include a classical finding of unusually shaped ears and hearing loss (conductive and/or nerve deafness that ranges from mild to severe deafness). Multiple cranial nerve dysfunctions are common. A behavioral phenotype for CHARGE syndrome is emerging. Mutations in the CHD7 gene (member of the chromodomain helicase DNA protein family) are detected in over 75% of patients with CHARGE syndrome. Children with CHARGE syndrome require intensive medical management as well as numerous surgical interventions. They also need multidisciplinary follow up. Some of the hidden issues of CHARGE syndrome are often forgotten, one being the feeding adaptation of these children, which needs an early aggressive approach from a feeding team. 
As the child develops, challenging behaviors become more common and require adaptation of educational and therapeutic services, including behavioral and pharmacological interventions.",0.849687,True,"['GARD:0000029', 'GARD:0002245', 'GARD:0010559']","['charge syndrome', 'tetralogy of fallot', 'child']",['incidence'],"['0 . 1 - 1 . 2 / 10 , 000']",,['1998'],"['males', 'females']",
1,18552902,"[The CHARGE syndrome]. <h4>Background</h4>CHARGE syndrome is a rare congenital condition with multiple malformations. The acronym CHARGE summarizes six cardinal features: Coloboma, Heart defect, Atresia choanae, Retarded growth and development, Genital anomalies and Ear anomalies/deafness. Our aim is to present an update on clinical presentation, genetics and behavioural aspects in the CHARGE syndrome. Furthermore, we give recommendations regarding multidisciplinary management.<h4>Material and method</h4>The article is based on selected references retrieved from PubMed and the authors' own experience in following this patient group.<h4>Results and interpretation</h4>The CHARGE syndrome has an estimated incidence of 1 : 10 000. About 60 % of the patients have mutations in a recently characterized gene ( CHD7: ). C: oloboma, C: hoanal atresia and abnormal semicircular C:anals (3C-triad) are the most specific malformations. Serious cardiovascular and respiratory tract malformations also occur frequently and may be life-threatening, especially in the first year of life. Multiple cranial nerve dysfunctions affect sense of smell, swallowing, facial palsy and sensorineural hearing loss. CHARGE syndrome is recognized as one of the most common causes of dual sensory impairment (vision and hearing). Mental retardation is common, but a substantial group of patients only have limited intellectual impairment. Some patients have a distinct behavioural profile and specific cognitive problems. Coordinated multidisciplinary medical follow-up is needed. The combined sensory loss may render the rehabilitation offered for deaf and blind useful for these patients.",0.670445,True,"['GARD:0000029', 'GARD:0005906']","['charge syndrome', 'facial palsy']",['estimated incidence'],['1 : 10 000'],,,,


In [3]:
# Show missing values as the text 'None' and drop the IsEpi column from the figure.
cs1 = cs1.fillna('None').drop(columns=['IsEpi'])

In [4]:
# Figure-friendly column headers.
cs1 = cs1.rename(columns={'ABSTRACT':'RELEVANT TEXT','IDS':'GARD IDs'})

In [5]:
# Replace each full abstract with a hand-picked excerpt, one string per row in
# row order, so the table fits in a figure.
cs1['RELEVANT TEXT'] = ["“In 1998, an expert group defined the major (the classical 4C's: Choanal atresia, Coloboma, Characteristic ears and Cranial nerve anomalies) and minor criteria of CHARGE syndrome. … The reported incidence of CHARGE syndrome ranges from 0.1-1.2/10,000 and depends on professional recognition.”",
                       "The CHARGE syndrome has an estimated incidence of 1 : 10 000."]

In [6]:
# Render the finished case-study #1 table.
cs1

Unnamed: 0,PMID,RELEVANT TEXT,EPI_PROB,GARD IDs,DIS,EPI,STAT,LOC,DATE,SEX,ETHN
0,16959034,"“In 1998, an expert group defined the major (the classical 4C's: Choanal atresia, Coloboma, Characteristic ears and Cranial nerve anomalies) and minor criteria of CHARGE syndrome. … The reported incidence of CHARGE syndrome ranges from 0.1-1.2/10,000 and depends on professional recognition.”",0.849687,"['GARD:0000029', 'GARD:0002245', 'GARD:0010559']","['charge syndrome', 'tetralogy of fallot', 'child']",['incidence'],"['0 . 1 - 1 . 2 / 10 , 000']",,['1998'],"['males', 'females']",
1,18552902,The CHARGE syndrome has an estimated incidence of 1 : 10 000.,0.670445,"['GARD:0000029', 'GARD:0005906']","['charge syndrome', 'facial palsy']",['estimated incidence'],['1 : 10 000'],,,,


Case Study #2

In [7]:
# Load the saved extraction results for the Bernard-Soulier syndrome case study and preview them.
# NOTE(review): the 'Unnamed: 0' column in the preview is presumably the index
# saved by to_csv — confirm; read_csv(index_col=0) would absorb it.
cs2 = pd.read_csv('case_study/Bernard-Soulier-syndrome.csv')
cs2

Unnamed: 0,PMID,ABSTRACT,EPI_PROB,IsEpi,IDS,DIS,EPI,STAT,LOC,DATE,SEX,ETHN
0,35157230,"Perforated hemorrhagic cholecystitis in a patient with Bernard-Soulier syndrome. Bernard-Soulier syndrome is an inherited coagulopathy, with an incidence of one per million. Hemorrhagic cholecystitis is a rare and life-threatening complication of acute cholecystitis. Less than 50 patients have been reported in the previous literature. Bleeding diathesis and anticoagulant treatment are well-known predisposing factors for hemorrhagic cholecystitis. We present a 57-year-old male patient who was referred to our department with a complaint of right upper quadrant abdominal pain. Contrast-enhanced computed tomography revealed a high-density mass associated with the gallbladder lumen, and blood clot in the gallbladder lumen and hemoperitoneum which were compatible for hemorrhagic cholecystitis and gallbladder perforation. The patient underwent urgent cholecystectomy. Hemorrhagic cholecystitis often manifests as typical acute cholecystitis presentation; but several clinical findings such as fever, lower gastrointestinal bleeding or severe intraabdominal bleeding-related hypovolemic shock may also occur. Most of the described cases in prior literature have been reported to use anticoagulant medications. This report describes the second hemorrhagic cholecystitis patient with inherited bleeding diathesis and the first case with Bernard-Soulier syndrome.",0.998013,True,"['GARD:0000030', 'GARD:0002470']","['cholecystitis', 'bernard-soulier syndrome']",['incidence'],['one per million'],,,['male'],
1,34333846,"The Copenhagen founder variant GP1BA c.58T>G is the most frequent cause of inherited thrombocytopenia in Denmark. <h4>Background</h4>The classic Bernard-Soulier syndrome (BSS) is a rare inherited thrombocytopenia (IT) associated with severe thrombocytopenia, giant platelets, and bleeding tendency caused by homozygous or compound heterozygous variants in GP1BA, GP1BB, or GP9. Monoallelic BSS (mBSS) associated with mild asymptomatic macrothrombocytopenia caused by heterozygous variants in GP1BA or GP1BB may be a frequent cause of mild IT.<h4>Objective</h4>We aimed to examine the frequency of mBSS in a consecutive cohort of patients with IT and to characterize the geno- and phenotype of mBSS probands and their family members. Additionally, we set out to examine if thrombopoietin (TPO) levels differ in mBSS patients.<h4>Patients/methods</h4>We screened 106 patients suspected of IT using whole exome- or whole genome sequencing and performed co-segregation analyses of mBSS families. All probands and family members were phenotypically characterized. Founder mutation analysis was carried out by certifying that the probands were unrelated and the region around the variant was shared by all patients. TPO was measured by solid phase sandwich ELISA.<h4>Results</h4>We diagnosed 14 patients (13%) with mBSS associated with heterozygous variants in GP1BA and GP1BB. Six unrelated probands carried a heterozygous variant in GP1BA (c.58T>G, p.Cys20Gly) and shared a 2.0 Mb region on chromosome 17, confirming that it is a founder variant. No discrepancy of TPO levels between mBSS patients and wild-type family members (P > .05) were identified.<h4>Conclusion</h4>We conclude that the most frequent form of IT in Denmark is mBSS caused by the Copenhagen founder variant.",0.996847,True,['GARD:0002470'],['bernard-soulier syndrome'],,,"['den', '##mark']",,,"['cope', '##nh', '##age']"
2,31789661,"A novel mutation in the GP1BA gene in Bernard-Soulier syndrome. : The Bernard-Soulier syndrome (BSS) is a rare disease with a prevalence of 1/1000 000; it is characterized by macrothrombocytopenia. BSS develops as a result of a defect in the glycoprotein GPIb-IX-V complex on the platelet surface. In this article, we present a pediatric patient with the novel mutation that has been identified for the first time in BSS. A 13-month-old male patient was admitted with severe thrombocytopenia unresponsive to intravenous immunoglobulin in the neonatal period and recurrent mucocutaneous bleeding which initiated at 5 months of age. glycoprotein (GP) IX (CD42a) expression was normal as per flow cytometry results. Genetic analysis revealed a homozygous c.243C>A (p.Cys81) (p.C81) mutation. This novel mutation identified by us presents with severe thrombocytopenia and normal GPIX (CD42a) expression and is mistaken for immune thrombocytopenia in the neonatal period. This mutation creates an early stop codon and possibly leads to loss of function of the receptor.",0.957938,True,"['GARD:0002470', 'GARD:0006768']","['bernard-soulier syndrome', 'immune thrombocytopenia']",['prevalence'],['1 / 1000 000'],,,['male'],
3,34878196,"Invasive procedures in the oral cavity of individuals with Bernard-Soulier syndrome: An integrative review. <h4>Aim</h4>Bernard-Soulier syndrome (BSS) is an inherited bleeding disorder characterized by macroplatelets and thrombocytopenia, prolonged bleeding time, and a prevalence of less than 1 in 1,000,000. In view of the recognition of the risk of bleeding and the management of daily surgical practice in these patients, adequate strategies are necessary to provide the safest care. This article aims to perform an integrative review of the literature on the management of invasive procedures in the oral cavity of individuals with BSS.<h4>Method</h4>The PubMed/Medline and LILACS databases were searched using Boolean operators related to BSS, bleeding disorders, and oral care.<h4>Results</h4>As a result, only five articles with the main theme were included: one letter to the editor and four case reports, described chronologically as to date of publication, classification of the article, and medical/odontological measures taken.<h4>Conclusion</h4>We conclude with this review the need for adequate knowledge of surgeons regarding coagulation disorders and the need to discuss and plan procedures with the hematology team, as well as the importance of the notion of management of possible complications resulting from invasive treatments in the oral cavity of patients with BSS.",0.891553,True,['GARD:0002470'],['bernard-soulier syndrome'],['prevalence'],"['less than 1 in 1 , 000 , 000']",,,,
4,33657022,"A homozygous loss-of-function mutation in GP1BB causing variable clinical phenotypes in a family with Bernard-Soulier syndrome. Bernard-Soulier syndrome is a rare autosomal recessive bleeding disorder and has a low incidence. Bernard-Soulier syndrome is caused by the deficiency of glycoprotein GPIb-V-IX complex, a receptor for von Willebrand factor and is characterized by thrombocytopenia, giant platelets and bleeding tendency. We are reporting three members of a same family with variable phenotypic clinical presentation. The index case is a 20-year-old boy who has a frequent presentation with epistaxis, and low platelet counts (25 × 109/l). He had been hospitalized multiple times and received platelet transfusions. His brother and cousin reported bleeding symptoms with less frequent medical intervention. Genetic analysis by next-generation sequencing identified a homozygous GP1BB variant (c.423C>A:p.Cys141Ter), which segregated amongst the family members. The results led us to an improved insight into the disease for this family with variable phenotypic expression, in addition to the identification of a variant for further structural and functional characterization.",0.865511,True,['GARD:0002470'],['bernard-soulier syndrome'],['incidence'],,,,"['boy', 'brother', 'cousin']",
5,34125163,"Diagnostic Challenges in Children With Congenital Bleeding Disorders: A Developing Country Perspective. <h4>Objectives</h4>To assess the frequency and characteristics of children with inherited bleeding disorders that were initially misdiagnosed, leading to inappropriate disease management.<h4>Methods</h4>This study was conducted at the Haematology/Pathology Department of Fauji Foundation Hospital, Rawalpindi, Pakistan, from August 2014 to August 2018. Children who were diagnosed with an inherited bleeding disorder but did not respond to initial therapy were reevaluated.<h4>Results</h4>In total, 62 children were diagnosed with a bleeding disorder. Of these, 27 were diagnosed with an inherited bleeding disorder and 35 with an acquired bleeding disorder. Of the 27 children with inherited bleeding disorders, 18% (n = 5) were misdiagnosed and treated inappropriately. The median age of the misdiagnosed patients was 9 years (range, 5-13 years). Three patients with Bernard-Soulier syndrome had been misdiagnosed as having immune thrombocytopenic purpura, 1 patient with von Willebrand disease had been misdiagnosed as having hemophilia A, and 1 patient with haemophilia B had been misdiagnosed as having hemophilia A.<h4>Conclusions</h4>There are chances of misdiagnosis and improper or invasive management if comprehensive laboratory evaluation and a thorough clinical evaluation are not performed in children with congenital bleeding disorders.",0.777873,True,"['GARD:0002470', 'GARD:0006591']","['bernard-soulier syndrome', 'hemophilia a']",,,"['rawalpindi , pakistan']",['august 2014 to august 2018'],,


In [8]:
# Show missing values as the text 'None' and drop the IsEpi column from the figure.
cs2 = cs2.fillna('None').drop(columns=['IsEpi'])

In [9]:
# Figure-friendly column headers.
cs2 = cs2.rename(columns={'ABSTRACT':'RELEVANT TEXT','IDS':'GARD IDs'})

In [10]:
# Replace each full abstract with a hand-picked excerpt, one string per row in
# row order, so the table fits in a figure.
cs2['RELEVANT TEXT'] = ["Bernard-Soulier syndrome is an inherited coagulopathy, with an incidence of one per million.",
                       "The Copenhagen founder variant GP1BA c.58T>G is the most frequent cause of inherited thrombocytopenia in Denmark.",
                       "The Bernard-Soulier syndrome (BSS) is a rare disease with a prevalence of 1/1000 000",
                       "Bernard-Soulier syndrome (BSS) is an inherited bleeding disorder characterized by macroplatelets and thrombocytopenia, prolonged bleeding time, and a prevalence of less than 1 in 1,000,000.",
                       "Bernard-Soulier syndrome is a rare autosomal recessive bleeding disorder and has a low incidence. ...The index case is a 20-year-old boy who has a frequent presentation with epistaxis...His brother and cousin reported bleeding symptoms...",
                       "This study was conducted at the Haematology/Pathology Department of Fauji Foundation Hospital, Rawalpindi, Pakistan, from August 2014 to August 2018."]

In [11]:
# Render the finished case-study #2 table.
cs2

Unnamed: 0,PMID,RELEVANT TEXT,EPI_PROB,GARD IDs,DIS,EPI,STAT,LOC,DATE,SEX,ETHN
0,35157230,"Bernard-Soulier syndrome is an inherited coagulopathy, with an incidence of one per million.",0.998013,"['GARD:0000030', 'GARD:0002470']","['cholecystitis', 'bernard-soulier syndrome']",['incidence'],['one per million'],,,['male'],
1,34333846,The Copenhagen founder variant GP1BA c.58T>G is the most frequent cause of inherited thrombocytopenia in Denmark.,0.996847,['GARD:0002470'],['bernard-soulier syndrome'],,,"['den', '##mark']",,,"['cope', '##nh', '##age']"
2,31789661,The Bernard-Soulier syndrome (BSS) is a rare disease with a prevalence of 1/1000 000,0.957938,"['GARD:0002470', 'GARD:0006768']","['bernard-soulier syndrome', 'immune thrombocytopenia']",['prevalence'],['1 / 1000 000'],,,['male'],
3,34878196,"Bernard-Soulier syndrome (BSS) is an inherited bleeding disorder characterized by macroplatelets and thrombocytopenia, prolonged bleeding time, and a prevalence of less than 1 in 1,000,000.",0.891553,['GARD:0002470'],['bernard-soulier syndrome'],['prevalence'],"['less than 1 in 1 , 000 , 000']",,,,
4,33657022,Bernard-Soulier syndrome is a rare autosomal recessive bleeding disorder and has a low incidence. ...The index case is a 20-year-old boy who has a frequent presentation with epistaxis...His brother and cousin reported bleeding symptoms...,0.865511,['GARD:0002470'],['bernard-soulier syndrome'],['incidence'],,,,"['boy', 'brother', 'cousin']",
5,34125163,"This study was conducted at the Haematology/Pathology Department of Fauji Foundation Hospital, Rawalpindi, Pakistan, from August 2014 to August 2018.",0.777873,"['GARD:0002470', 'GARD:0006591']","['bernard-soulier syndrome', 'hemophilia a']",,,"['rawalpindi , pakistan']",['august 2014 to august 2018'],,


Case Study #3

In [12]:
# Load the saved extraction results for the retinitis pigmentosa case study and preview them.
# NOTE(review): the 'Unnamed: 0' column in the preview is presumably the index
# saved by to_csv — confirm; read_csv(index_col=0) would absorb it.
cs3 = pd.read_csv('case_study/Retinitis-Pigmentosa.csv')
cs3

Unnamed: 0,PMID,ABSTRACT,EPI_PROB,IsEpi,IDS,DIS,EPI,STAT,LOC,DATE,SEX,ETHN
0,35070660,"Metabolic rescue of cone photoreceptors in retinitis pigmentosa. Retinitis pigmentosa (RP) encompasses a group of inherited retinal dystrophies characterized by the primary degeneration of rod and cone photoreceptors. It is a leading cause of visual disability, with an incidence of ~1 in 7000 persons. Although most RP is nonsyndromic, 20%-30% of patients with RP also have an associated nonocular condition. The gene mutations responsible for RP occur overwhelmingly in rod photoreceptors. Visual loss frequently begins with night blindness in adolescence, followed by concentric visual field loss, reflecting the principal dysfunction of rod photoreceptors. Although the visual disability from rod dysfunction is significant, it is the subsequent loss of central vision later in life due to cone degeneration that is catastrophic. Until recently, the reason for cone dysfunction in RP was unknown. However, it is now recognized that cones degenerate, losing outer segment (OS) synthesis and inner segment (IS) disassembly because of glucose starvation following rod demise. Rod OS phagocytosis by the apical microvilli of retinal pigment epithelium is necessary to transport glucose from the choriocapillaris to the subretinal space. Although cones lose OS with the onset of rod degeneration in RP, regardless of the gene mutation in rods, cone nuclei remain viable for years (i.e. enter cone dormancy) so that therapies aimed at reversing glucose starvation can prevent and/or recover cone function and central vision.",0.965624,True,"['GARD:0005694', 'GARD:0006825']","['retinitis pigmentosa', 'night blindness']",['incidence'],"['~', '1 in 7000 persons']",,,,
1,34962636,"Rhodopsin as a Molecular Target to Mitigate Retinitis Pigmentosa. Retinitis pigmentosa (RP) is a group of hereditary degenerative diseases affecting 1 of 4000 people worldwide and being the most prevalent cause of visual handicap among working populations in developed countries. These disorders are mainly related to the abnormalities in the rod G protein-coupled receptor (GPCR), rhodopsin reflected in the dysregulated membrane trafficking, stability and phototransduction processes that lead to progressive loss of retina function and eventually blindness. Currently, there is no cure for RP, and the therapeutic options are limited. Targeting rhodopsin with small molecule chaperones to improve the folding and stability of the mutant receptor is one of the most promising pharmacological approaches to alleviate the pathology of RP. This review provides an update on the current knowledge regarding small molecule compounds that have been evaluated as rhodopsin modulators to be considered as leads for the development of novel therapies for RP.",0.96341,True,['GARD:0005694'],['retinitis pigmentosa'],['prevalent'],['1 of 4000 people'],['worldwide'],,,
2,35205402,"Maternal Uniparental Isodisomy of Chromosome 4 and 8 in Patients with Retinal Dystrophy: <i>SRD5A3</i>-Congenital Disorders of Glycosylation and <i>RP1</i>-Related Retinitis Pigmentosa. <h4>Purpose</h4>Uniparental disomy (UPD) is a rare chromosomal abnormality. We performed whole-exosome sequencing (WES) in cases of early-onset retinal dystrophy and identified two cases likely caused by UPD. Herein, we report these two cases and attempt to clarify the clinical picture of retinal dystrophies caused by UPD.<h4>Methods</h4>WES analysis was performed for two patients and their parents, who were not consanguineous. Functional analysis was performed in cases suspected of congenital disorders of glycosylation (CDG). We obtained clinical case data and reviewed the literature.<h4>Results</h4>In case 1, a novel c.57G>C, p.(Trp19Cys) variant in <i>SRD5A3</i> was detected homozygously. Genetic analysis suggested a maternal UPD on chromosome 4, and functional analysis confirmed CDG. Clinical findings showed early-onset retinal dystrophy, intellectual disability, and epilepsy. In case 2, an Alu insertion (c.4052_4053ins328, p.[Tyr1352Alafs]) in <i>RP1</i> was detected homozygously. Maternal UPD on chromosome 8 was suspected. The clinical picture was consistent with <i>RP1</i>-related retinitis pigmentosa. Although the clinical features of retinal dystrophy by UPD may vary, most cases present with childhood onset.<h4>Conclusions</h4>There have been limited reports of retinal dystrophy caused by UPD, suggesting that it is rare. Genetic counseling may be encouraged in pediatric cases of retinal dystrophy.",0.890485,True,"['GARD:0005694', 'GARD:0010307']","['retinitis pigmentosa', 'congenital disorders of glycosylation']",,,,,,
3,34907125,"Retinitis Pigmentosa Sine Pigmento: Clinical Spectrum and Pigment Development. PURPOSE:To investigate the clinical findings, natural course, and pigment development of patients with retinitis pigmentosa (RP) sine pigmento using multimodal imaging. METHODS:We reviewed the medical records of 810 consecutive RP patients and assessed serial ultra-wide-field fundus photography, fundus autofluorescence (FAF), and optical coherence tomography images. Electrophysiological and visual field analysis findings were also reviewed. RESULTS:Of the 774 patients with RP who met the inclusion criteria, 88 were diagnosed with RP sine pigmento, with a prevalence of 11.4%. The mean age of the subjects was 35.57 years, compared to 49.83 years for typical RP patients. Fifty-nine (67%) patients demonstrated minimal color change, whereas 29 (33%) presented with grayish flecks in the retinal pigment epithelium on fundus photography. All patients with RP sine pigmento had abnormalities on FAF, and the commonest FAF findings were punctate or reticular hypoautofluorescence. Of the 62 patients without pigmentation at the first visit and had follow-up visits, 14 (22.6%) had developed pigmentation at their follow-up visit, with an average time of 3.92 years. Most patients retained a visual acuity of ≥20/50 within the age of 50 years. CONCLUSION:Diagnosing RP sine pigmento based solely on ophthalmoscopic findings is more difficult than in more typical cases. Multimodal imaging can provide insights into the clinical characteristics to facilitate the diagnosis, classification, and follow-up of patients.",0.564981,True,['GARD:0005694'],['retinitis pigmentosa'],['prevalence'],['11 . 4 %'],,,,


In [13]:
# Show missing values as the text 'None' and drop the IsEpi column from the figure.
cs3 = cs3.fillna('None').drop(columns=['IsEpi'])

In [14]:
# Figure-friendly column headers.
cs3 = cs3.rename(columns={'ABSTRACT':'RELEVANT TEXT','IDS':'GARD IDs'})

In [15]:
# Replace each full abstract with a hand-picked excerpt, one string per row in
# row order, so the table fits in a figure. Excerpts quote the abstracts
# verbatim ("congenital disorders" is two words in the source abstract).
cs3['RELEVANT TEXT'] = ["It is a leading cause of visual disability, with an incidence of ~1 in 7000 persons.",
                       "Retinitis pigmentosa (RP) is a group of hereditary degenerative diseases affecting 1 of 4000 people worldwide...",
                       "Functional analysis was performed in cases suspected of congenital disorders of glycosylation (CDG). ... The clinical picture was consistent with RP1-related retinitis pigmentosa.",
                       "Of the 774 patients with RP who met the inclusion criteria, 88 were diagnosed with RP sine pigmento, with a prevalence of 11.4%."]

In [16]:
# Render the finished case-study #3 table.
cs3

Unnamed: 0,PMID,RELEVANT TEXT,EPI_PROB,GARD IDs,DIS,EPI,STAT,LOC,DATE,SEX,ETHN
0,35070660,"It is a leading cause of visual disability, with an incidence of ~1 in 7000 persons.",0.965624,"['GARD:0005694', 'GARD:0006825']","['retinitis pigmentosa', 'night blindness']",['incidence'],"['~', '1 in 7000 persons']",,,,
1,34962636,Retinitis pigmentosa (RP) is a group of hereditary degenerative diseases affecting 1 of 4000 people worldwide...,0.96341,['GARD:0005694'],['retinitis pigmentosa'],['prevalent'],['1 of 4000 people'],['worldwide'],,,
2,35205402,Functional analysis was performed in cases suspected of congenitaldisorders of glycosylation (CDG). ... The clinical picture was consistent with RP1-related retinitis pigmentosa.,0.890485,"['GARD:0005694', 'GARD:0010307']","['retinitis pigmentosa', 'congenital disorders of glycosylation']",,,,,,
3,34907125,"Of the 774 patients with RP who met the inclusion criteria, 88 were diagnosed with RP sine pigmento, with a prevalence of 11.4%.",0.564981,['GARD:0005694'],['retinitis pigmentosa'],['prevalence'],['11 . 4 %'],,,,


Case Study #4

In [17]:
# Load the saved extraction results for the Fellman (GRACILE) syndrome case study and preview them.
# NOTE(review): the 'Unnamed: 0' column in the preview is presumably the index
# saved by to_csv — confirm; read_csv(index_col=0) would absorb it.
cs4 = pd.read_csv('case_study/Fellman-syndrome.csv')
cs4

Unnamed: 0,PMID,ABSTRACT,EPI_PROB,IsEpi,IDS,DIS,EPI,STAT,LOC,DATE,SEX,ETHN
0,12547234,"The GRACILE syndrome, a neonatal lethal metabolic disorder with iron overload. GRACILE syndrome (Fellman syndrome, MIM 603358), an autosomal recessive metabolic disorder of the Finnish disease heritage, has been diagnosed in 25 infants of 18 families. The incidence is at least 1/47,000 in Finland. The main findings are fetal growth retardation, Fanconi type aminoaciduria, cholestasis, iron overload (liver hemosiderosis, hyperferritinemia, hypotransferrinemia, increased transferrin iron saturation, and free plasma iron), profound lactic acidosis, and early death. The pathophysiology of the metabolic disturbance is unsolved. No significant deficiency of complex III activity of respiratory chain has been found, although we recently showed that the underlying genetic cause is a missense mutation (S78G) in the BCS1L gene and other mutations in that gene have been associated with complex III deficiency. BCS1L encodes a mitochondrial protein, acting as a chaperone in the assembly of complex III. Iron accumulation in liver, a typical feature being less abundant with increasing age, might be a primary abnormality or a secondary phenomenon due to liver dysfunction. In order to decrease the iron overload, three infants have been repeatedly treated with apotransferrin followed by exchange transfusion. Improvement in iron biochemistry occurred, but no clear beneficial effect on the clinical condition was found. Further studies will elucidate the role of iron in the pathophysiology of the disease.",0.9975,True,"['GARD:0000001', 'GARD:0005794', 'GARD:0006595']","['gracile syndrome', 'fellman syndrome', 'aminoaciduria', 'hemosiderosis']",['incidence'],"['least', '1 / 47 , 000']",['finland'],,,['fin']


In [18]:
# Show missing values as the text 'None' and drop the IsEpi column from the figure.
cs4 = cs4.fillna('None').drop(columns=['IsEpi'])

In [19]:
# Figure-friendly column headers.
cs4 = cs4.rename(columns={'ABSTRACT':'RELEVANT TEXT','IDS':'GARD IDs'})

In [20]:
# Replace the full abstract with a hand-picked excerpt (single-row table) so
# the table fits in a figure.
cs4['RELEVANT TEXT'] = ["The incidence is at least 1/47,000 in Finland. ... The main findings are fetal growth retardation, Fanconi type aminoaciduria, cholestasis, iron overload (liver hemosiderosis, hyperferritinemia, hypotransferrinemia, increased transferrin iron saturation, and free plasma iron)",]

In [21]:
# Render the finished case-study #4 table.
cs4

Unnamed: 0,PMID,RELEVANT TEXT,EPI_PROB,GARD IDs,DIS,EPI,STAT,LOC,DATE,SEX,ETHN
0,12547234,"The incidence is at least 1/47,000 in Finland. ... The main findings are fetal growth retardation, Fanconi type aminoaciduria, cholestasis, iron overload (liver hemosiderosis, hyperferritinemia, hypotransferrinemia, increased transferrin iron saturation, and free plasma iron)",0.9975,"['GARD:0000001', 'GARD:0005794', 'GARD:0006595']","['gracile syndrome', 'fellman syndrome', 'aminoaciduria', 'hemosiderosis']",['incidence'],"['least', '1 / 47 , 000']",['finland'],,,['fin']


Case Study #5

In [None]:
# Change the display just for this case study to take a better picture:
# inject CSS that stretches the notebook's `.container` element to full width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated (IPython >= 7.14
# emits a DeprecationWarning) and may not be available in newer releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [22]:
# Load the saved results for case study #5 and preview the frame as cell output.
cs5_csv_path = 'case_study/Facioscapulohumeral-muscular-dystrophy.csv'
cs5 = pd.read_csv(cs5_csv_path)
cs5

Unnamed: 0,PMID,ABSTRACT,EPI_PROB,IsEpi,IDS,DIS,EPI,STAT,LOC,DATE,SEX,ETHN
0,34542603,"Predictors of functional outcomes in patients with facioscapulohumeral muscular dystrophy. Facioscapulohumeral muscular dystrophy (FSHD) is one of the most prevalent muscular dystrophies characterized by considerable variability in severity, rates of progression and functional outcomes. Few studies follow FSHD cohorts long enough to understand predictors of disease progression and functional outcomes, creating gaps in our understanding, which impacts clinical care and the design of clinical trials. Efforts to identify molecularly targeted therapies create a need to better understand disease characteristics with predictive value to help refine clinical trial strategies and understand trial outcomes. Here we analysed a prospective cohort from a large, longitudinally followed registry of patients with FSHD in the USA to determine predictors of outcomes such as need for wheelchair use. This study analysed de-identified data from 578 individuals with confirmed FSHD type 1 enrolled in the United States National Registry for FSHD Patients and Family members. Data were collected from January 2002 to September 2019 and included an average of 9 years (range 0-18) of follow-up surveys. Data were analysed using descriptive epidemiological techniques, and risk of wheelchair use was determined using Cox proportional hazards models. Supervised machine learning analysis was completed using Random Forest modelling and included all 189 unique features collected from registry questionnaires. A separate medications-only model was created that included 359 unique medications reported by participants. Here we show that smaller allele sizes were predictive of earlier age at onset, diagnosis and likelihood of wheelchair use. Additionally, we show that females were more likely overall to progress to wheelchair use and at a faster rate as compared to males, independent of genetics. 
Use of machine learning models that included all reported clinical features showed that the effect of allele size on progression to wheelchair use is small compared to disease duration, which may be important to consider in trial design. Medical comorbidities and medication use add to the risk for need for wheelchair dependence, raising the possibility for better medical management impacting outcomes in FSHD. The findings in this study will require further validation in additional, larger datasets but could have implications for clinical care, and inclusion criteria for future clinical trials in FSHD.",0.994118,True,['GARD:0009941'],['facioscapulohumeral muscular dystrophy'],['prevalent'],,"['us', 'united states']",['january 2002 to september 2019'],"['females', 'males']",
1,35024656,"Prevalence and disease progression of genetically-confirmed facioscapulohumeral muscular dystrophy type 1 (FSHD1) in China between 2001 and 2020: a nationwide population-based study. <h4>Background</h4>Facioscapulohumeral muscular dystrophy type 1 (FSHD1) is a rare disease, which is often underdiagnosed due to its heterogeneous presentations and complex molecular genetic basis, leading to a lack of population-based epidemiology data, especially of prevalence and disease progression.<h4>Methods</h4>Fujian Neuromedical Centre (FNMC) is a diagnosis centre for clinical-genetic FSHD in China, and the only one employing pulsed-field gel electrophoresis (PFGE)-based Southern blotting for all FSHD1 genetic tests. Three sources distributed across all six spatial zones in China, were used to obtain information regarding FSHD1 events, namely, FNMC, Genetic and Myopathy Group (branches of the Neurology Society of the Chinese Medical Association), and ""FSHD-China"" (an organization supported by FSHD patients). During 2001-2020, all genetically-confirmed FSHD1 from China were registered in FNMC. Follow-up was conducted in the 20-year period to obtain data on disease progression, which was mainly described in terms of independent ambulation loss.<h4>Findings</h4>Of the 1,744 FSHD1 genetic tests (total test number 1,802) included in the analysis, 997 (57.2%) patients from 620 families were diagnosed with FSHD1. The estimated prevalence of genetically-confirmed FSHD1 in China is 0.75 per million (95% confidence interval [CI], 0.70-0.79) during 2001-2020, with 0.78 (95% CI, 0.72-0.85) in males and 0.71 (95% CI, 0.65-0.78) in females. The estimated prevalence increased from 0.22 (95% CI, 0.19-0.26) per million in 2001-2015 to 0.53 (95% CI, 0.49-0.57) per million in 2016-2020 (<i>p</i> < 0.001). The prevalence in Fujian province was 7.10 per million, 4.66 per million, and 2.44 per million, during 2001-2020, 2001-2015, and 2016-2020, respectively. 
Among the 861 symptomatic plus asymptomatic patients of the total 997 patients, the median onset age at first-ever muscle weakness was 16 years of age (range 1-81); the median number of contracted D4Z4 repeats was 5 units (range 1-9); the median 4qA-allele-specific methylation level was 41% (range 14%-69%). Of the 977 symptomatic patients followed-up during 2001-2020, 117 patients (12.0%) lost independent ambulation. The expected duration from onset of first-ever muscle weakness to onset of independent ambulation loss was 40 years. The group with loss of independent ambulation had a smaller number of contracted D4Z4 repeats (<i>p</i> < 0.001) and had an earlier onset age of first-ever muscle weakness (<i>p</i> < 0.001) compared to the group without loss of independent ambulation.<h4>Interpretation</h4>Our research captures the largest genetically-confirmed FSHD1 population worldwide, to calculate its prevalence of 0.75 per million in China from 2001 to 2020. Approximately 12.0% of symptomatic plus asymptomatic patients of FSHD1 will lose independent ambulation in 40 years from onset of first-ever muscle weakness.<h4>Funding</h4>This work has been supported by the grants (U2005201, 81870902, N.W.) and (81974193, 81671237, Z.Q.W.) from the National Natural Science Foundation of China; Joint Funds for the Innovation of Science and Technology of Fujian Province (2018Y9082) (N.W.), and the Key Clinical Specialty Discipline Construction Program of Fujian (N.W.).",0.992254,True,['GARD:0009941'],['facioscapulohumeral muscular dystrophy'],"['prevalence', 'estimated prevalence']","['0 . 75 per million', '0 . 78', 'in', '0 . 71', '0 . 22', 'per million', '0 . 53', 'per million in', '7 . 10 per million , 4 . 66 per million , and 2 . 44 per million']","['chin', '##a', 'f', '##ujian province', 'worldwide', 'fujian province']","['between 2001 and 2020', '2001 - 2020', '2001 - 2015', '2016 - 2020', '2001 to 2020', '2018']","['males', 'females']",['chin']
2,34315378,"Promising Perspective to Facioscapulohumeral Muscular Dystrophy Treatment: Nutraceuticals and Phytochemicals. Facioscapulohumeral muscular dystrophy (FSHD) is in the top three list of all dystrophies with an approximate 1:8000 incidence. It is not a life-threatening disease; however, progression of the disease extends over being wheel-chair bound. Despite some drug trials have been continuing, including DUX4 inhibition, TGF-ß inhibition and resokine which promote healthier muscle, there is not an applicable treatment option for FSHD today. Still, there is a need for new agent or agents to heal, to stop or at least to slow down the muscle wasting. Current FSHD studies with nutraceuticals as vitamin C, vitamin E, coenzyme Q10, zinc, selenium, and phytochemicals as curcumin or genistein, daidzein flavonoids provide promising treatment strategies. In this review we will present the clinical and molecular nature of FSHD and focus on nutraceuticals and phytochemicals that may alleviate FSHD. Via interconnection of impaired pathophysiological FSHD pathways together with nutraceuticals and phytochemicals in the light of literature, we present both studied and novel approaches that can contribute FSHD treatment.",0.92319,True,['GARD:0009941'],['facioscapulohumeral muscular dystrophy'],['incidence'],['approximate 1 : 8000'],,,,


In [23]:
# Replace missing values with the literal string 'None' for display, then drop
# the 'IsEpi' column.
cs5 = (
    cs5
    .fillna('None')
    .drop(columns=['IsEpi'])
)

In [24]:
# Rename the raw column names to the headers used in the displayed table.
# Rebinding the returned frame (instead of inplace=True) follows the pandas
# recommendation and avoids mutating a frame an earlier cell already displayed.
cs5 = cs5.rename(columns={'ABSTRACT':'RELEVANT TEXT','IDS':'GARD IDs'})

In [25]:
# Hand-picked excerpts that replace the full abstract text; the list is assigned
# positionally, so it must hold exactly one entry per row of cs5, in row order.
cs5_excerpts = [
    "Facioscapulohumeral muscular dystrophy (FSHD) is one of the most prevalent muscular dystrophies ... This study analysed de-identified data from 578 individuals with confirmed FSHD type 1 enrolled in the United States National Registry for FSHD Patients and Family members. Data were collected from January 2002 to September 2019 ... Additionally, we show that females were more likely overall to progress to wheelchair use and at a faster rate as compared to males, independent of genetics.",
    "The estimated prevalence of genetically-confirmed FSHD1 in China is 0.75 per million (95% confidence interval [CI], 0.70-0.79) during 2001-2020, with 0.78 (95% CI, 0.72-0.85) in males and 0.71 (95% CI, 0.65-0.78) in females. The estimated prevalence increased from 0.22 (95% CI, 0.19-0.26) per million in 2001-2015 to 0.53 (95% CI, 0.49-0.57) per million in 2016-2020 (p < 0.001). The prevalence in Fujian province was 7.10 per million, 4.66 per million, and 2.44 per million, during 2001-2020, 2001-2015, and 2016-2020, respectively.",
    "Facioscapulohumeral muscular dystrophy (FSHD) is in the top three list of all dystrophies with an approximate 1:8000 incidence.",
]
cs5['RELEVANT TEXT'] = cs5_excerpts

In [26]:
# Bare last expression: renders the current cs5 frame as this cell's output.
cs5

Unnamed: 0,PMID,RELEVANT TEXT,EPI_PROB,GARD IDs,DIS,EPI,STAT,LOC,DATE,SEX,ETHN
0,34542603,"Facioscapulohumeral muscular dystrophy (FSHD) is one of the most prevalent muscular dystrophies ... This study analysed de-identified data from 578 individuals with confirmed FSHD type 1 enrolled in the United States National Registry for FSHD Patients and Family members. Data were collected from January 2002 to September 2019 ... Additionally, we show that females were more likely overall to progress to wheelchair use and at a faster rate as compared to males, independent of genetics.",0.994118,['GARD:0009941'],['facioscapulohumeral muscular dystrophy'],['prevalent'],,"['us', 'united states']",['january 2002 to september 2019'],"['females', 'males']",
1,35024656,"The estimated prevalence of genetically-confirmed FSHD1 in China is 0.75 per million (95% confidence interval [CI], 0.70-0.79) during 2001-2020, with 0.78 (95% CI, 0.72-0.85) in males and 0.71 (95% CI, 0.65-0.78) in females. The estimated prevalence increased from 0.22 (95% CI, 0.19-0.26) per million in 2001-2015 to 0.53 (95% CI, 0.49-0.57) per million in 2016-2020 (p < 0.001). The prevalence in Fujian province was 7.10 per million, 4.66 per million, and 2.44 per million, during 2001-2020, 2001-2015, and 2016-2020, respectively.",0.992254,['GARD:0009941'],['facioscapulohumeral muscular dystrophy'],"['prevalence', 'estimated prevalence']","['0 . 75 per million', '0 . 78', 'in', '0 . 71', '0 . 22', 'per million', '0 . 53', 'per million in', '7 . 10 per million , 4 . 66 per million , and 2 . 44 per million']","['chin', '##a', 'f', '##ujian province', 'worldwide', 'fujian province']","['between 2001 and 2020', '2001 - 2020', '2001 - 2015', '2016 - 2020', '2001 to 2020', '2018']","['males', 'females']",['chin']
2,34315378,Facioscapulohumeral muscular dystrophy (FSHD) is in the top three list of all dystrophies with an approximate 1:8000 incidence.,0.92319,['GARD:0009941'],['facioscapulohumeral muscular dystrophy'],['incidence'],['approximate 1 : 8000'],,,,
