In [None]:
!pip install requests



In [None]:
import xml.etree.ElementTree as ET
import os
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive; the test and UBERON directories read
# below live under /content/drive/MyDrive. This import only exists on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def get_test_file_bases(directory):
    """Collect the extension-less names of the ``.txt`` entries in a directory.

    Args:
        directory: Path of the directory to list.

    Returns:
        A set with one item per entry whose name ends in ``.txt``; each item
        is the entry name with its final extension removed.
    """
    return {
        os.path.splitext(entry)[0]
        for entry in os.listdir(directory)
        if entry.endswith(".txt")
    }

In [None]:
# Directory on the mounted Drive whose .txt files identify the held-out test documents.
test_directory = "/content/drive/MyDrive/test"

# Base names (file name without the .txt extension) of every test document.
test_file_bases = get_test_file_bases(test_directory)

In [None]:
def exclude_test_files(xml_directory, test_file_bases):
    """List the ``.xml`` files in a directory that do not belong to the test set.

    The base name of a file is obtained by dropping the ``.xml`` extension and
    then up to two further dot-separated suffixes, e.g.
    ``"123.txt.knowtator.xml"`` -> ``"123"``.

    Args:
        xml_directory: Path of the directory to list.
        test_file_bases: Collection of base names to leave out.

    Returns:
        A tuple ``(remaining_files, remaining_base_name)`` of two parallel
        lists: the kept file names and their base names. Both lists follow the
        sorted order of the file names.
    """
    remaining_files = []
    remaining_base_name = []
    # os.listdir yields entries in arbitrary order; sorting keeps the result
    # (and everything derived from it) reproducible across runs and machines.
    for filename in sorted(os.listdir(xml_directory)):
        if not filename.endswith(".xml"):
            continue
        base_name = os.path.splitext(filename)[0].rsplit(".", 2)[0]
        if base_name not in test_file_bases:
            remaining_files.append(filename)
            remaining_base_name.append(base_name)
    return remaining_files, remaining_base_name

In [None]:
# Directory on the mounted Drive holding the UBERON annotation XML files.
uberon_xml_directory = "/content/drive/MyDrive/uberon_xml"

# Keep only the XML files whose base name is not one of the test documents.
remaining_xml_files, remaining_base_name = exclude_test_files(uberon_xml_directory, test_file_bases)
# Number of non-test XML files that will be parsed.
len(remaining_xml_files)

67

In [None]:
def append_extension(bases, extension=".xml"):
    """Return a new list with ``extension`` concatenated onto every item of ``bases``.

    Args:
        bases: Iterable of name strings.
        extension: Suffix to add to each name; defaults to ``".xml"``.

    Returns:
        A list of the suffixed names, in the iteration order of ``bases``.
    """
    extended = []
    for stem in bases:
        extended.append(stem + extension)
    return extended

In [None]:
# Turn the bare base names into "<base>.xml" file names; they are later passed
# to parse_xml_files for the MONDO directory. Any ".xml" suffix already present
# is stripped first so that re-running this cell does not yield "<base>.xml.xml".
remaining_base_name = append_extension(
    [base[:-len(".xml")] if base.endswith(".xml") else base for base in remaining_base_name],
    extension=".xml",
)

In [None]:
def _annotation_record(annotation):
    """Flatten one ``<annotation>`` element into a dict of plain values."""
    mention_element = annotation.find('mention')
    mention_id = mention_element.attrib.get('id', '') if mention_element is not None else ''

    annotator_element = annotation.find('annotator')
    if annotator_element is not None:
        annotator_id = annotator_element.attrib.get('id', '')
        annotator_name = annotator_element.text
    else:
        annotator_id = ''
        annotator_name = ''

    # An annotation may carry several <span> children; keep them all, in document order.
    span_start_end = [
        (span.attrib.get('start', ''), span.attrib.get('end', ''))
        for span in annotation.findall('span')
    ]

    spanned_text_element = annotation.find('spannedText')
    spanned_text = spanned_text_element.text if spanned_text_element is not None else ''

    return {
        "mention_id": mention_id,
        "annotator_id": annotator_id,
        "annotator_name": annotator_name,
        "span_start_end": span_start_end,
        "spanned_text": spanned_text
    }


def _class_mention_record(class_mention):
    """Flatten one ``<classMention>`` element into a dict of plain values."""
    mention_class = class_mention.find('mentionClass')
    if mention_class is not None:
        mention_class_id = mention_class.attrib.get('id', '')
        mention_class_name = mention_class.text
    else:
        mention_class_id = ''
        mention_class_name = ''

    return {
        "class_mention_id": class_mention.attrib.get('id', ''),
        "mention_class_id": mention_class_id,
        "mention_class_name": mention_class_name
    }


def parse_xml_files(xml_directory, xml_files):
    """Extract annotation and class-mention records from a set of XML files.

    For every file, the direct ``<annotation>`` children of the root element
    become records with the keys ``mention_id``, ``annotator_id``,
    ``annotator_name``, ``span_start_end`` (list of ``(start, end)`` attribute
    strings) and ``spanned_text``. The direct ``<classMention>`` children
    become records with the keys ``class_mention_id``, ``mention_class_id``
    and ``mention_class_name``.

    A missing child element (``<mention>``, ``<annotator>``, ``<spannedText>``
    or ``<mentionClass>``) or a missing ``id``/``start``/``end`` attribute
    yields ``''`` for the affected fields instead of aborting the whole run.

    Args:
        xml_directory: Directory that contains the files.
        xml_files: Iterable of file names relative to ``xml_directory``.

    Returns:
        A tuple ``(train1_data, train2_data)``: the list of annotation records
        and the list of class-mention records, accumulated over all files.

    Raises:
        OSError: If a listed file cannot be opened (e.g. it does not exist).
            Only XML syntax errors are reported via ``print`` and skipped.
    """
    train1_data = []
    train2_data = []

    for filename in xml_files:
        filepath = os.path.join(xml_directory, filename)
        try:
            root = ET.parse(filepath).getroot()
        except ET.ParseError as e:
            # A malformed file contributes nothing; report it and move on.
            print(f"Error parsing {filename}: {e}")
            continue

        train1_data.extend(
            _annotation_record(annotation) for annotation in root.findall('annotation')
        )
        train2_data.extend(
            _class_mention_record(class_mention) for class_mention in root.findall('classMention')
        )

    return train1_data, train2_data


In [None]:
# Parse every non-test UBERON file into annotation records and class-mention records.
train1_data, train2_data = parse_xml_files(uberon_xml_directory, remaining_xml_files)

uberon1_df = pd.DataFrame(train1_data)
uberon2_df = pd.DataFrame(train2_data)

# Pair each annotation with the class mention that shares its id; rows without
# a partner on the other side are dropped by the inner join.
uberon_df = uberon1_df.merge(
    uberon2_df,
    left_on='mention_id',
    right_on='class_mention_id',
    how='inner',
)


In [None]:
# Preview the first rows of the merged UBERON annotation / class-mention table.
uberon_df.head()

Unnamed: 0,mention_id,annotator_id,annotator_name,span_start_end,spanned_text,class_mention_id,mention_class_id,mention_class_name
0,UBERON+extension_classes_Instance_20020,UBERON_Instance_20002,Nicole Vasilevsky,"[(2993, 3001)]",skeletal,UBERON+extension_classes_Instance_20020,UBERON:0004288,skeleton
1,UBERON_Instance_100000,UBERON_Instance_20003,Michael Bada,"[(2853, 2860)]",tissues,UBERON_Instance_100000,UBERON:0000479,tissue
2,UBERON_Instance_100006,UBERON_Instance_20003,Michael Bada,"[(15965, 15979)]",haematological,UBERON_Instance_100006,UBERON:0000178,blood
3,UBERON_Instance_100010,UBERON_Instance_20003,Michael Bada,"[(27389, 27412)]",right cardiac ventricle,UBERON_Instance_100010,UBERON:0002080,heart right ventricle
4,UBERON_Instance_100012,UBERON_Instance_20003,Michael Bada,"[(27455, 27463)]",cervical,UBERON_Instance_100012,UBERON:0005434,cervical region


In [None]:
# Number of merged rows per (class id, class name) pair, most frequent first.
uberon_class_counts = (
    uberon_df
    .groupby(['mention_class_id', 'mention_class_name'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

total_count_uberon = uberon_class_counts['count'].sum()

# Fraction of all counted rows that each class accounts for.
uberon_class_counts = uberon_class_counts.assign(
    proportion=uberon_class_counts['count'] / total_count_uberon
)

In [None]:
# Sum of the per-class counts in uberon_class_counts.
total_count_uberon

12269

In [None]:
# Per-class counts and proportions for UBERON, sorted by descending count.
uberon_class_counts

Unnamed: 0,mention_class_id,mention_class_name,count,proportion
71,UBERON:0000922,embryo,1272,0.103676
65,UBERON:0000479,tissue,460,0.037493
87,UBERON:0000966,retina,447,0.036433
307,UBERON:0002048,lung,306,0.024941
333,UBERON:0002107,liver,302,0.024615
...,...,...,...,...
483,UBERON:0003481,tail vein,1,0.000082
482,UBERON:0003451,lower jaw incisor,1,0.000082
481,UBERON:0003450,upper jaw incisor,1,0.000082
478,UBERON:0003283,mesentery of oesophagus,1,0.000082


In [None]:
!git clone https://github.com/UCDenver-ccp/CRAFT.git

Cloning into 'CRAFT'...
remote: Enumerating objects: 17965, done.[K
remote: Counting objects: 100% (235/235), done.[K
remote: Compressing objects: 100% (103/103), done.[K
remote: Total 17965 (delta 152), reused 197 (delta 127), pack-reused 17730[K
Receiving objects: 100% (17965/17965), 258.71 MiB | 22.05 MiB/s, done.
Resolving deltas: 100% (15311/15311), done.
Updating files: 100% (3078/3078), done.


In [None]:
# CHEBI: parse the knowtator files, join annotations to class mentions, count per class.
chebi_dir = '/content/CRAFT/concept-annotation/CHEBI/CHEBI/knowtator'

# The file names come from the UBERON listing and are looked up in the CHEBI directory.
train1_data, train2_data = parse_xml_files(chebi_dir, remaining_xml_files)

chebi1_df = pd.DataFrame(train1_data)
chebi2_df = pd.DataFrame(train2_data)

chebi_df = chebi1_df.merge(
    chebi2_df,
    left_on='mention_id',
    right_on='class_mention_id',
    how='inner',
)

# Rows per (class id, class name) pair, most frequent first.
chebi_class_counts = (
    chebi_df
    .groupby(['mention_class_id', 'mention_class_name'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

total_count_chebi = chebi_class_counts['count'].sum()

chebi_class_counts = chebi_class_counts.assign(
    proportion=chebi_class_counts['count'] / total_count_chebi
)

print(total_count_chebi)

chebi_class_counts

4548


Unnamed: 0,mention_class_id,mention_class_name,count,proportion
14,CHEBI:15377,water,134,0.029464
29,CHEBI:15889,sterol,123,0.027045
35,CHEBI:16113,cholesterol,114,0.025066
392,CHEBI:60425,amyloid fibril,105,0.023087
237,CHEBI:33290,food,104,0.022867
...,...,...,...,...
243,CHEBI:33848,polycyclic arene,1,0.000220
252,CHEBI:35204,tracer,1,0.000220
255,CHEBI:35225,buffer,1,0.000220
257,CHEBI:35341,steroid,1,0.000220


In [None]:
# CL: parse the knowtator files, join annotations to class mentions, count per class.
cl_dir = '/content/CRAFT/concept-annotation/CL/CL/knowtator'

# The file names come from the UBERON listing and are looked up in the CL directory.
train1_data, train2_data = parse_xml_files(cl_dir, remaining_xml_files)

cl1_df = pd.DataFrame(train1_data)
cl2_df = pd.DataFrame(train2_data)

cl_df = cl1_df.merge(
    cl2_df,
    left_on='mention_id',
    right_on='class_mention_id',
    how='inner',
)

# Rows per (class id, class name) pair, most frequent first.
cl_class_counts = (
    cl_df
    .groupby(['mention_class_id', 'mention_class_name'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

total_count_cl = cl_class_counts['count'].sum()

cl_class_counts = cl_class_counts.assign(
    proportion=cl_class_counts['count'] / total_count_cl
)

print(total_count_cl)

cl_class_counts


4043


Unnamed: 0,mention_class_id,mention_class_name,count,proportion
179,CL:0002322,embryonic stem cell,456,0.112788
101,CL:0000540,neuron,343,0.084838
62,CL:0000210,photoreceptor cell,229,0.056641
109,CL:0000573,retinal cone cell,198,0.048974
117,CL:0000604,retinal rod cell,171,0.042295
...,...,...,...,...
36,CL:0000110,peptidergic neuron,1,0.000247
111,CL:0000581,peritoneal macrophage,1,0.000247
107,CL:0000558,reticulocyte,1,0.000247
106,CL:0000556,megakaryocyte,1,0.000247


In [None]:
# MOP: parse the knowtator files, join annotations to class mentions, count per class.
mop_dir = '/content/CRAFT/concept-annotation/MOP/MOP/knowtator'

# The file names come from the UBERON listing and are looked up in the MOP directory.
train1_data, train2_data = parse_xml_files(mop_dir, remaining_xml_files)

mop1_df = pd.DataFrame(train1_data)
mop2_df = pd.DataFrame(train2_data)

mop_df = mop1_df.merge(
    mop2_df,
    left_on='mention_id',
    right_on='class_mention_id',
    how='inner',
)

# Rows per (class id, class name) pair, most frequent first.
mop_class_counts = (
    mop_df
    .groupby(['mention_class_id', 'mention_class_name'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

total_count_mop = mop_class_counts['count'].sum()

mop_class_counts = mop_class_counts.assign(
    proportion=mop_class_counts['count'] / total_count_mop
)

print(total_count_mop)

mop_class_counts

240


Unnamed: 0,mention_class_id,mention_class_name,count,proportion
3,MOP:0000568,oxidation,66,0.275
11,MOP:0000779,formation of covalent bond,66,0.275
12,MOP:0000780,breaking of covalent bond,41,0.170833
1,MOP:0000093,biotinylation,15,0.0625
4,MOP:0000569,reduction,13,0.054167
0,MOP:0000030,acetylation,9,0.0375
8,MOP:0000629,polymerisation,8,0.033333
6,MOP:0000615,electron transfer,5,0.020833
5,MOP:0000590,dehydrogenation,4,0.016667
7,MOP:0000619,hydrolysis,3,0.0125


In [None]:
# NCBITaxon: parse the knowtator files, join annotations to class mentions, count per class.
NCBITaxon_dir = '/content/CRAFT/concept-annotation/NCBITaxon/NCBITaxon/knowtator'

# The file names come from the UBERON listing and are looked up in the NCBITaxon directory.
train1_data, train2_data = parse_xml_files(NCBITaxon_dir, remaining_xml_files)

NCBITaxon1_df = pd.DataFrame(train1_data)
NCBITaxon2_df = pd.DataFrame(train2_data)

NCBITaxon_df = NCBITaxon1_df.merge(
    NCBITaxon2_df,
    left_on='mention_id',
    right_on='class_mention_id',
    how='inner',
)

# Rows per (class id, class name) pair, most frequent first.
NCBITaxon_class_counts = (
    NCBITaxon_df
    .groupby(['mention_class_id', 'mention_class_name'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

total_count_NCBITaxon = NCBITaxon_class_counts['count'].sum()

NCBITaxon_class_counts = NCBITaxon_class_counts.assign(
    proportion=NCBITaxon_class_counts['count'] / total_count_NCBITaxon
)

print(total_count_NCBITaxon)

NCBITaxon_class_counts

7362


Unnamed: 0,mention_class_id,mention_class_name,count,proportion
2,NCBITaxon:10088,Mus,4469,0.607036
42,NCBITaxon:33208,Metazoa,669,0.090872
121,NCBITaxon:9606,Homo sapiens,599,0.081364
52,NCBITaxon:40674,Mammalia,184,0.024993
0,NCBITaxon:1,root,180,0.024450
...,...,...,...,...
71,NCBITaxon:6040,Porifera,1,0.000136
70,NCBITaxon:5774,Amoeba,1,0.000136
69,NCBITaxon:5690,Trypanosoma,1,0.000136
1,NCBITaxon:10040,Peromyscus,1,0.000136


In [None]:
# PR: parse the knowtator files, join annotations to class mentions, count per class.
pr_dir = '/content/CRAFT/concept-annotation/PR/PR/knowtator'

# The file names come from the UBERON listing and are looked up in the PR directory.
train1_data, train2_data = parse_xml_files(pr_dir, remaining_xml_files)

pr1_df = pd.DataFrame(train1_data)
pr2_df = pd.DataFrame(train2_data)

pr_df = pr1_df.merge(
    pr2_df,
    left_on='mention_id',
    right_on='class_mention_id',
    how='inner',
)

# Rows per (class id, class name) pair, most frequent first.
pr_class_counts = (
    pr_df
    .groupby(['mention_class_id', 'mention_class_name'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

total_count_pr = pr_class_counts['count'].sum()

pr_class_counts = pr_class_counts.assign(
    proportion=pr_class_counts['count'] / total_count_pr
)

print(total_count_pr)

pr_class_counts

17038


Unnamed: 0,mention_class_id,mention_class_name,count,proportion
606,PR:000013059,peroxisome proliferator-activated receptor gam...,412,0.024181
208,PR:000004503,transcriptional regulator ATRX,361,0.021188
451,PR:000009218,bifunctional arginine demethylase and lysyl-hy...,321,0.018840
397,PR:000008234,glutamate receptor 2,309,0.018136
184,PR:000004080,annexin A7,269,0.015788
...,...,...,...,...
492,PR:000010142,microtubule-associated protein 4,1,0.000059
490,PR:000010030,lysozyme C-2,1,0.000059
486,PR:000009884,lysyl oxidase homolog 1,1,0.000059
483,PR:000009863,rhombotin-2,1,0.000059


In [None]:
# SO: parse the knowtator files, join annotations to class mentions, count per class.
so_dir = '/content/CRAFT/concept-annotation/SO/SO/knowtator'

# The file names come from the UBERON listing and are looked up in the SO directory.
train1_data, train2_data = parse_xml_files(so_dir, remaining_xml_files)

so1_df = pd.DataFrame(train1_data)
so2_df = pd.DataFrame(train2_data)

so_df = so1_df.merge(
    so2_df,
    left_on='mention_id',
    right_on='class_mention_id',
    how='inner',
)

# Rows per (class id, class name) pair, most frequent first.
so_class_counts = (
    so_df
    .groupby(['mention_class_id', 'mention_class_name'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

total_count_so = so_class_counts['count'].sum()

so_class_counts = so_class_counts.assign(
    proportion=so_class_counts['count'] / total_count_so
)

print(total_count_so)

so_class_counts

8797


Unnamed: 0,mention_class_id,mention_class_name,count,proportion
85,SO:0000704,gene,2679,0.304536
124,SO:0001023,allele,693,0.078777
126,SO:0001026,genome,484,0.055019
68,SO:0000417,polypeptide_domain,406,0.046152
17,SO:0000147,exon,360,0.040923
...,...,...,...,...
132,SO:0001075,intramembrane_polypeptide_region,1,0.000114
135,SO:0001104,catalytic_residue,1,0.000114
138,SO:0001183,morpholino_backbone,1,0.000114
142,SO:0001250,fingerprint_map,1,0.000114


In [None]:
# GO_BP: parse the knowtator files, join annotations to class mentions, count per class.
go_bp_dir = '/content/CRAFT/concept-annotation/GO_BP/GO_BP/knowtator'

# The file names come from the UBERON listing and are looked up in the GO_BP directory.
train1_data, train2_data = parse_xml_files(go_bp_dir, remaining_xml_files)

go_bp1_df = pd.DataFrame(train1_data)
go_bp2_df = pd.DataFrame(train2_data)

go_bp_df = go_bp1_df.merge(
    go_bp2_df,
    left_on='mention_id',
    right_on='class_mention_id',
    how='inner',
)

# Rows per (class id, class name) pair, most frequent first.
go_bp_class_counts = (
    go_bp_df
    .groupby(['mention_class_id', 'mention_class_name'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

total_count_go_bp = go_bp_class_counts['count'].sum()

go_bp_class_counts = go_bp_class_counts.assign(
    proportion=go_bp_class_counts['count'] / total_count_go_bp
)

print(total_count_go_bp)

go_bp_class_counts

9280


Unnamed: 0,mention_class_id,mention_class_name,count,proportion
246,GO:0010467,gene expression,2556,0.275431
561,GO:0065007,biological regulation,522,0.056250
205,GO:0007608,sensory perception of smell,443,0.047737
193,GO:0007567,parturition,312,0.033621
273,GO:0016265,death,169,0.018211
...,...,...,...,...
132,GO:0006913,nucleocytoplasmic transport,1,0.000108
406,GO:0043065,positive regulation of apoptotic process,1,0.000108
408,GO:0043241,protein complex disassembly,1,0.000108
131,GO:0006911,"phagocytosis, engulfment",1,0.000108


In [None]:
# MONDO: parse the files into annotation and class-mention frames.
# Unlike the other ontology cells, this one stops after building the two frames
# (no merge, no per-class counts).
mondo_dir = '/content/CRAFT/concept-annotation/MONDO/MONDO_without_genotype_annotations/knowtator-2'

# Files are looked up as "<base>.xml" (remaining_base_name) rather than by the
# UBERON file names used for the other ontologies.
# NOTE(review): this directory is named knowtator-2 while the others are named
# knowtator. parse_xml_files only reads <annotation> and <classMention> elements
# that are direct children of the root - confirm these files share that layout,
# otherwise both frames below come out empty.
train1_data, train2_data = parse_xml_files(mondo_dir, remaining_base_name)

mondo1_df = pd.DataFrame(train1_data)
mondo2_df = pd.DataFrame(train2_data)

# Display the annotation records parsed for MONDO.
mondo1_df

In [None]:
# Display the class-mention records parsed for MONDO.
mondo2_df