In [None]:
!pip install requests



In [1]:
import xml.etree.ElementTree as ET
import os
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def get_test_file_bases(directory):
    test_file_bases = set()
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            base_name = os.path.splitext(filename)[0]
            test_file_bases.add(base_name)
    return test_file_bases

In [4]:
test_directory = "/content/drive/MyDrive/test"

test_file_bases = get_test_file_bases(test_directory)

In [5]:
test_file_bases

{'11319941',
 '11604102',
 '14624252',
 '14675480',
 '14691534',
 '15018652',
 '15070402',
 '15238161',
 '15328538',
 '15560850',
 '15615595',
 '15619330',
 '15784609',
 '15850489',
 '15882093',
 '16026622',
 '16027110',
 '16410827',
 '16517939',
 '16611361',
 '16787536',
 '16800892',
 '16968134',
 '17029558',
 '17201918',
 '17206865',
 '17465682',
 '17503968',
 '17565376',
 '17677002'}

In [6]:
def exclude_test_files(xml_directory, test_file_bases):
    remaining_files = []
    remaining_base_name = []
    for filename in os.listdir(xml_directory):
        if filename.endswith(".xml"):
            base_name = os.path.splitext(filename)[0].rsplit(".", 2)[0]
            if base_name not in test_file_bases:
                remaining_files.append(filename)
                remaining_base_name.append(base_name)
    return remaining_files, remaining_base_name

In [7]:
uberon_xml_directory = "/content/drive/MyDrive/uberon_xml"

remaining_xml_files, remaining_base_name = exclude_test_files(uberon_xml_directory, test_file_bases)
len(remaining_xml_files)

67

In [8]:
remaining_xml_files

['12925238.txt.knowtator.xml',
 '15345036.txt.knowtator.xml',
 '15492776.txt.knowtator.xml',
 '12079497.txt.knowtator.xml',
 '14609438.txt.knowtator.xml',
 '14723793.txt.knowtator.xml',
 '15314655.txt.knowtator.xml',
 '11597317.txt.knowtator.xml',
 '15314659.txt.knowtator.xml',
 '11897010.txt.knowtator.xml',
 '15040800.txt.knowtator.xml',
 '15061865.txt.knowtator.xml',
 '15005800.txt.knowtator.xml',
 '15630473.txt.knowtator.xml',
 '14611657.txt.knowtator.xml',
 '15328533.txt.knowtator.xml',
 '12546709.txt.knowtator.xml',
 '15760270.txt.knowtator.xml',
 '12585968.txt.knowtator.xml',
 '15676071.txt.knowtator.xml',
 '11532192.txt.knowtator.xml',
 '14737183.txt.knowtator.xml',
 '15320950.txt.knowtator.xml',
 '15550985.txt.knowtator.xml',
 '15207008.txt.knowtator.xml',
 '15819996.txt.knowtator.xml',
 '15921521.txt.knowtator.xml',
 '16670015.txt.knowtator.xml',
 '15588329.txt.knowtator.xml',
 '15938754.txt.knowtator.xml',
 '15836427.txt.knowtator.xml',
 '15917436.txt.knowtator.xml',
 '162557

In [9]:
def append_extension(bases, extension=".xml"):
    return [base + extension for base in bases]

In [10]:
remaining_base_name = append_extension(remaining_base_name, extension=".xml")

In [11]:
remaining_base_name

['12925238.xml',
 '15345036.xml',
 '15492776.xml',
 '12079497.xml',
 '14609438.xml',
 '14723793.xml',
 '15314655.xml',
 '11597317.xml',
 '15314659.xml',
 '11897010.xml',
 '15040800.xml',
 '15061865.xml',
 '15005800.xml',
 '15630473.xml',
 '14611657.xml',
 '15328533.xml',
 '12546709.xml',
 '15760270.xml',
 '12585968.xml',
 '15676071.xml',
 '11532192.xml',
 '14737183.xml',
 '15320950.xml',
 '15550985.xml',
 '15207008.xml',
 '15819996.xml',
 '15921521.xml',
 '16670015.xml',
 '15588329.xml',
 '15938754.xml',
 '15836427.xml',
 '15917436.xml',
 '16255782.xml',
 '16098226.xml',
 '15876356.xml',
 '16279840.xml',
 '16462940.xml',
 '16504174.xml',
 '16504143.xml',
 '16109169.xml',
 '16507151.xml',
 '16121256.xml',
 '16121255.xml',
 '16433929.xml',
 '16221973.xml',
 '16110338.xml',
 '16362077.xml',
 '16539743.xml',
 '16870721.xml',
 '16628246.xml',
 '16216087.xml',
 '16579849.xml',
 '16700629.xml',
 '16103912.xml',
 '17447844.xml',
 '17244351.xml',
 '17069463.xml',
 '17696610.xml',
 '17020410.xml

In [129]:
def parse_xml_files(xml_directory, xml_files):
    train1_data = []
    train2_data = []

    for filename in xml_files:
        filepath = os.path.join(xml_directory, filename)
        try:
            tree = ET.parse(filepath)
            root = tree.getroot()

            annotations = root.findall('annotation')
            for annotation in annotations:
                mention_id = annotation.find('mention').attrib.get('id', '')

                annotator_element = annotation.find('annotator')
                if annotator_element is not None:
                    annotator_id = annotator_element.attrib.get('id', '')
                    annotator_name = annotator_element.text
                else:
                    annotator_id = ''
                    annotator_name = ''

                spans = annotation.findall('span')
                span_start_end = [(span.attrib.get('start', ''), span.attrib.get('end', '')) for span in spans]
                spanned_text = annotation.find('spannedText').text
                filename = os.path.splitext(filename)[0]

                annotation_data = {
                    "mention_id": mention_id,
                    "annotator_id": annotator_id,
                    "annotator_name": annotator_name,
                    "span_start_end": span_start_end,
                    "spanned_text": spanned_text,
                    "article": filename
                }

                train1_data.append(annotation_data)

            class_mentions = root.findall('classMention')
            for class_mention in class_mentions:
                class_mention_id = class_mention.attrib.get('id', '')
                mention_class = class_mention.find('mentionClass')
                mention_class_id = mention_class.attrib.get('id', '')
                mention_class_name = mention_class.text
                filename = os.path.splitext(filename)[0]

                class_mention_data = {
                    "class_mention_id": class_mention_id,
                    "mention_class_id": mention_class_id,
                    "mention_class_name": mention_class_name,
                    "article": filename
                }

                train2_data.append(class_mention_data)

        except ET.ParseError as e:
            print(f"Error parsing {filename}: {e}")

    return train1_data, train2_data


In [130]:
def stats(dir):
  train1_data, train2_data = parse_xml_files(dir, remaining_xml_files)

  df1 = pd.DataFrame(train1_data)
  df2 = pd.DataFrame(train2_data)
  #df = pd.merge(df1, df2, left_on='mention_id', right_on = 'class_mention_id', how='inner')
  annotation_counts = df1.groupby(['article','spanned_text']).size().reset_index(name='count').sort_values(by='count', ascending=False)
  total_annotation = annotation_counts['count'].sum()
  annotation_counts['proportion'] = annotation_counts['count'] / total_annotation
  annotation_max =  annotation_counts[annotation_counts['count'] == annotation_counts['count'].max()]
  annotation_min =  annotation_counts[annotation_counts['count'] == annotation_counts['count'].min()]
  annotation_avg = annotation_counts['count'].mean()

  concept_counts = df2.groupby(['article','mention_class_id']).size().reset_index(name='count').sort_values(by='count', ascending=False)
  total_concept = concept_counts['count'].sum()
  concept_counts['proportion'] = concept_counts['count'] / total_concept
  concept_max =  concept_counts[concept_counts['count'] == concept_counts['count'].max()]
  concept_min =  concept_counts[concept_counts['count'] == concept_counts['count'].min()]
  concept_avg = concept_counts['count'].mean()


  return annotation_counts, concept_counts, annotation_max, annotation_min, annotation_avg, concept_max, concept_min, concept_avg, total_annotation, total_concept


In [133]:
uberon_annotation_counts, uberon_concept_counts, uberon_annotation_max, uberon_annotation_min, uberon_annotation_avg, uberon_concept_max, uberon_concept_min, uberon_concept_avg, uberon_total_annotation, uberon_total_concept = stats('/content/drive/MyDrive/PhenoTagger/CRAFT/concept-annotation/UBERON/UBERON/knowtator')

chebi_annotation_counts, chebi_concept_counts, chebi_annotation_max, chebi_annotation_min, chebi_annotation_avg, chebi_concept_max, chebi_concept_min, chebi_concept_avg, chebi_total_annotation, chebi_total_concept = stats('/content/drive/MyDrive/PhenoTagger/CRAFT/concept-annotation/CHEBI/CHEBI/knowtator')

cl_annotation_counts, cl_concept_counts, cl_annotation_max, cl_annotation_min, cl_annotation_avg, cl_concept_max, cl_concept_min, cl_concept_avg, cl_total_annotation, cl_total_concept = stats('/content/drive/MyDrive/PhenoTagger/CRAFT/concept-annotation/CL/CL/knowtator')

mop_annotation_counts, mop_concept_counts, mop_annotation_max, mop_annotation_min, mop_annotation_avg, mop_concept_max, mop_concept_min, mop_concept_avg, mop_total_annotation, mop_total_concept = stats('/content/drive/MyDrive/PhenoTagger/CRAFT/concept-annotation/MOP/MOP/knowtator')

NCBITaxon_annotation_counts, NCBITaxon_concept_counts, NCBITaxon_annotation_max, NCBITaxon_annotation_min, NCBITaxon_annotation_avg, NCBITaxon_concept_max, NCBITaxon_concept_min, NCBITaxon_concept_avg, NCBITaxon_total_annotation, NCBITaxon_total_concept = stats('/content/drive/MyDrive/PhenoTagger/CRAFT/concept-annotation/NCBITaxon/NCBITaxon/knowtator')

pr_annotation_counts, pr_concept_counts, pr_annotation_max, pr_annotation_min, pr_annotation_avg, pr_concept_max, pr_concept_min, pr_concept_avg, pr_total_annotation, pr_total_concept = stats('/content/drive/MyDrive/PhenoTagger/CRAFT/concept-annotation/PR/PR/knowtator')

so_annotation_counts, so_concept_counts, so_annotation_max, so_annotation_min, so_annotation_avg, so_concept_max, so_concept_min, so_concept_avg, so_total_annotation, so_total_concept = stats('/content/drive/MyDrive/PhenoTagger/CRAFT/concept-annotation/SO/SO/knowtator')

go_bp_annotation_counts, go_bp_concept_counts, go_bp_annotation_max, go_bp_annotation_min, go_bp_annotation_avg, go_bp_concept_max, go_bp_concept_min, go_bp_concept_avg, go_bp_total_annotation, go_bp_total_concept = stats('/content/drive/MyDrive/PhenoTagger/CRAFT/concept-annotation/GO_BP/GO_BP/knowtator')

In [112]:
uberon_annotation_counts

Unnamed: 0,article,spanned_text,count,proportion
2479,16700629,gonads,128,0.010433
1793,16109169,embryos,126,0.010270
219,12925238,blood,101,0.008232
1220,15588329,Cb,100,0.008151
3283,17608565,retina,97,0.007906
...,...,...,...,...
1416,15760270,tibialis anterior,1,0.000082
1419,15760270,trachea,1,0.000082
1421,15819996,Ammon's horn,1,0.000082
1423,15819996,Embryo,1,0.000082


In [113]:
uberon_concept_counts

Unnamed: 0,article,mention_class_id,count,proportion
1559,16700629,UBERON:0000991,204,0.016627
1112,16109169,UBERON:0000922,182,0.014834
305,15005800,UBERON:0002048,162,0.013204
2001,17608565,UBERON:0000966,161,0.013123
693,15492776,UBERON:0004905,159,0.012959
...,...,...,...,...
1009,15938754,UBERON:0001017,1,0.000082
1006,15921521,UBERON:0007023,1,0.000082
1004,15921521,UBERON:0005169,1,0.000082
1001,15921521,UBERON:0002330,1,0.000082


In [None]:
#mondo
mondo_annotation_counts, mondo_concept_counts, mondo_annotation_max, mondo_annotation_min, mondo_annotation_avg, mondo_concept_max, mondo_concept_min, mondo_concept_avg, mondo_total_annotation, mondo_total_concept = stats('/content/drive/MyDrive/PhenoTagger/CRAFT/concept-annotation/MONDO/MONDO_without_genotype_annotations/knowtator-2')


In [134]:
import numpy as np

In [153]:
data = np.array([[uberon_total_annotation, uberon_annotation_max['count'].iloc[0], uberon_annotation_min['count'].min(), uberon_annotation_avg, uberon_concept_max['count'].iloc[0], uberon_concept_min['count'].min(), uberon_concept_avg, uberon_total_concept],
                 [chebi_total_annotation,chebi_annotation_max['count'].iloc[0], chebi_annotation_min['count'].min(), chebi_annotation_avg, chebi_concept_max['count'].iloc[0], chebi_concept_min['count'].min(), chebi_concept_avg, chebi_total_concept],
                 [cl_total_annotation, cl_annotation_max['count'].iloc[0], cl_annotation_min['count'].min(), cl_annotation_avg, cl_concept_max['count'].iloc[0], cl_concept_min['count'].min(), cl_concept_avg, cl_total_concept],
                 [mop_total_annotation, mop_annotation_max['count'].iloc[0], mop_annotation_min['count'].min(), mop_annotation_avg, mop_concept_max['count'].iloc[0], mop_concept_min['count'].min(), mop_concept_avg, mop_total_concept],
                 [NCBITaxon_total_annotation, NCBITaxon_annotation_max['count'].iloc[0], NCBITaxon_annotation_min['count'].min(), NCBITaxon_annotation_avg, NCBITaxon_concept_max['count'].iloc[0], NCBITaxon_concept_min['count'].min(), NCBITaxon_concept_avg, NCBITaxon_total_concept],
                 [pr_total_annotation, pr_annotation_max['count'].iloc[0], pr_annotation_min['count'].min(), pr_annotation_avg, pr_concept_max['count'].iloc[0], pr_concept_min['count'].min(), pr_concept_avg, pr_total_concept],
                 [so_total_annotation, so_annotation_max['count'].iloc[0], so_annotation_min['count'].min(), so_annotation_avg, so_concept_max['count'].iloc[0], so_concept_min['count'].min(), so_concept_avg, so_total_concept]
                 ])
dataframe = pd.DataFrame(data, index= ['UBERON', 'CHEBI', 'CL', 'MOP', 'NCBITaxon', 'PR', 'SO'], columns = ['annot_total', 'annot_max', 'annot_min', 'annot_avg', 'concept_max', 'concept_min', 'concept_avg', 'concept_total'])

In [155]:
dataframe

Unnamed: 0,annot_total,annot_max,annot_min,annot_avg,concept_max,concept_min,concept_avg,concept_total
UBERON,12269.0,128.0,1.0,3.597947,204.0,1.0,5.990723,12269.0
CHEBI,4548.0,89.0,1.0,2.473083,101.0,1.0,3.108681,4548.0
CL,4043.0,102.0,1.0,3.255233,152.0,1.0,6.958692,4043.0
MOP,240.0,18.0,1.0,1.621622,24.0,1.0,2.823529,240.0
NCBITaxon,7362.0,223.0,1.0,6.160669,272.0,1.0,10.669565,7362.0
PR,17038.0,398.0,1.0,8.18741,403.0,1.0,13.331768,17038.0
SO,8797.0,120.0,1.0,4.293314,205.0,1.0,7.234375,8797.0
