# Dataset Statistics

## Scraped Textbook Data

In [2]:
import pandas as pd
# Per-textbook corpus statistics: how many processed terms are attributed to
# each textbook and how many rows its cleaned-sentences pickle contains.
terms = pd.read_pickle("data/preprocessed/terms/processed_terms.pkl")
textbooks = [
    'Anatomy_and_Physiology',
    'Astronomy',
    'Biology_2e',
    'Chemistry_2e',
    'Life_Biology',
    'Microbiology',
    'Psychology',
    'University_Physics_Volume_1',
    'University_Physics_Volume_2',
    'University_Physics_Volume_3',
]

dataset_rows = []
for textbook in textbooks:
    # A term belongs to a textbook when its `source` value equals the textbook name.
    textbook_terms = terms.loc[terms.source == textbook]
    sentences = pd.read_pickle(f"data/preprocessed/clean_sentences/{textbook}_sentences.pkl")
    dataset_rows.append({
        'textbook': textbook,
        'num_terms': len(textbook_terms),
        'num_sentences': sentences.shape[0],
    })

# One row per textbook, columns in the order they are reported.
dataset_df = pd.DataFrame(dataset_rows, columns=['textbook', 'num_terms', 'num_sentences'])
dataset_df

Anatomy_and_Physiology
Astronomy
Biology_2e
Chemistry_2e
Life_Biology
Microbiology
Psychology
University_Physics_Volume_1
University_Physics_Volume_2
University_Physics_Volume_3


Unnamed: 0,textbook,num_terms,num_sentences
0,Anatomy_and_Physiology,3169,21706
1,Astronomy,810,18844
2,Biology_2e,2757,24544
3,Chemistry_2e,954,13799
4,Life_Biology,0,16673
5,Microbiology,4149,16190
6,Psychology,1086,9967
7,University_Physics_Volume_1,462,15005
8,University_Physics_Volume_2,466,11779
9,University_Physics_Volume_3,580,9250


## Bio101 KB Data

In [5]:
import pandas as pd
# Number of processed terms whose `source` is the Bio101 knowledge base.
terms = pd.read_pickle("data/preprocessed/terms/processed_terms.pkl")
kb_bio101_terms = terms.loc[terms.source == 'kb_bio101']
print(f"# of Bio101 KB terms: {len(kb_bio101_terms)}")

# of Bio101 KB terms: 8136


In [15]:
import pickle
# Tabulate how many entries are stored under each relation key of the pickled
# Bio101 relations database, listed from the most to the least frequent.
with open("data/preprocessed/kb_bio101_relations_db.pkl", 'rb') as fid:
    db = pickle.load(fid)

# `db` is iterated for its keys and indexed by them; each value must support len().
relation_names = list(db)
df = pd.DataFrame({
    'relation': relation_names,
    'count': [len(db[name]) for name in relation_names],
})
df = df.sort_values(['count'], ascending=False)
df

Unnamed: 0,relation,count
1,subclass-of,18360
0,synonym,12308
2,has-part,6542
47,result,3648
56,raw-material,3526
49,object,3021
4,has-region,2976
51,subevent,2894
46,base,2396
50,agent,1807


## Term Extraction Data Splits

In [16]:
import pandas as pd
# Size of every term-extraction split: number of rows (sentences) in the split
# and number of distinct terms obtained by unioning the `terms` set of each row.
# NOTE: this cell rebinds `terms` and `df`, names that earlier cells use for
# other objects.
splits_df = {'split': [], 'num_sentences': [], 'num_terms': []}
for split in ['train', 'dev', 'test']:
    df = pd.read_pickle(f"data/term_extraction/term_extraction_{split}.pkl")
    # Accumulate in place: `terms = terms | t` built a brand-new copy of the
    # accumulated set for every row, which is needlessly quadratic on the
    # large train split. `|=` yields the same set (and, like `|`, requires
    # each `t` to be a set).
    terms = set()
    for t in df.terms:
        terms |= t
    # Per-split term sets kept as notebook globals; they are not read in this cell.
    if split == 'train':
        tts = terms
    elif split == 'dev':
        dts = terms
    splits_df['split'].append(split)
    splits_df['num_sentences'].append(df.shape[0])
    splits_df['num_terms'].append(len(terms))
splits_df = pd.DataFrame(splits_df)
splits_df

Unnamed: 0,split,num_sentences,num_terms
0,train,57178,7085
1,dev,206,283
2,test,608,395


# Term Extraction Results

In [7]:
import json
# Load the saved dev-set metrics of the BertSoftmax term-extraction run and display them.
metrics_path = 'term_extraction/saved/models/BertSoftmax/0610_044955/dev-metrics.json'
with open(metrics_path, 'r') as metrics_file:
    metrics = json.load(metrics_file)
metrics

{'loss': 0.5417171896082683,
 'accuracy': 0.9911635315533981,
 'token_macro_f1': 0.751783699251718,
 'token_macro_precision': 0.6911132162223421,
 'token_macro_recall': 0.8363895652535847,
 'token_micro_f1': 0.7589478223372144,
 'token_micro_precision': 0.7148659626320065,
 'token_micro_recall': 0.8088235294117647,
 'term_f1': 0.7060755336617405,
 'term_recall': 0.7597173144876325,
 'term_precision': 0.6595092024539877}

# Relation Extraction Data Splits

In [1]:
import pandas as pd
# Size of every relation-extraction split: number of rows (instances), number
# of distinct sentences and number of distinct term pairs.
split_rows = []
for split in ['train', 'dev', 'test']:
    df = pd.read_pickle(f"data/relation_extraction/{split}.pkl")
    split_rows.append({
        'split': split,
        'instances': df.shape[0],
        'sentences': len(df['sentence'].unique()),
        'term_pairs': len(df['term_pair'].unique()),
    })
splits_df = pd.DataFrame(split_rows, columns=['split', 'instances', 'sentences', 'term_pairs'])
splits_df

Unnamed: 0,split,instances,sentences,term_pairs
0,train,168579,29918,106012
1,dev,11983,2066,8687
2,test,3111,491,2537


In [4]:
# Gold-label distribution of the dev split: non-null `term_pair` count per
# label, divided by the total number of rows.
dev = pd.read_pickle("data/relation_extraction/dev.pkl")
dev_label_counts = dev.groupby('gold_label')['term_pair'].count()
dev_label_counts / len(dev)

gold_label
HAS-PART/REGION    0.038137
OTHER              0.880164
PART/REGION-OF     0.037720
SUBCLASS           0.017274
SUPERCLASS         0.021781
SYNONYM            0.004924
Name: term_pair, dtype: float64

In [5]:
# Gold-label distribution of the test split: non-null `term_pair` count per
# label, divided by the total number of rows.
test = pd.read_pickle("data/relation_extraction/test.pkl")
test_label_counts = test.groupby('gold_label')['term_pair'].count()
test_label_counts / len(test)

gold_label
HAS-PART/REGION    0.005464
OTHER              0.965284
PART/REGION-OF     0.005464
SUBCLASS           0.007072
SUPERCLASS         0.015751
SYNONYM            0.000964
Name: term_pair, dtype: float64

# Relation Extraction Results

In [8]:
# Print the saved instance-level dev metrics report of the soft_label run.
report_path = 'relation_extraction/saved/models/soft_label/0614_012543/dev-instance-metrics.txt'
with open(report_path, 'r') as report_file:
    report_text = report_file.read()
print(report_text)

                 precision    recall  f1-score   support

       SUBCLASS       0.68      0.83      0.75       207
     SUPERCLASS       0.70      0.72      0.71       261
        SYNONYM       0.67      0.81      0.73        59
HAS-PART/REGION       0.62      0.48      0.54       456
 PART/REGION-OF       0.59      0.33      0.43       452

      micro avg       0.65      0.54      0.59      1435
      macro avg       0.65      0.64      0.63      1435
   weighted avg       0.64      0.54      0.57      1435



In [9]:
# Print the saved term-pair-level dev metrics report of the soft_label run.
report_path = 'relation_extraction/saved/models/soft_label/0614_012543/dev-term-pair-metrics.txt'
with open(report_path, 'r') as report_file:
    report_text = report_file.read()
print(report_text)

Predicted Term Pair Metrics:
                 precision    recall  f1-score   support

       SUBCLASS       0.66      0.85      0.74       165
     SUPERCLASS       0.69      0.74      0.71       223
        SYNONYM       0.63      0.79      0.70        47
HAS-PART/REGION       0.59      0.63      0.61       227
 PART/REGION-OF       0.50      0.38      0.43       207

      micro avg       0.62      0.65      0.63       869
      macro avg       0.61      0.68      0.64       869
   weighted avg       0.61      0.65      0.62       869

Bio101 Term Pair Metrics:
                 precision    recall  f1-score   support

       SUBCLASS       0.63      0.25      0.36       165
     SUPERCLASS       0.71      0.23      0.35       223
        SYNONYM       0.48      0.21      0.29        47
HAS-PART/REGION       0.89      0.66      0.75       227
 PART/REGION-OF       0.94      0.61      0.74       207

      micro avg       0.82      0.44      0.57       869
      macro avg       0.73  

In [10]:
# Print the saved instance-level dev metrics report of the hard_label run.
report_path = 'relation_extraction/saved/models/hard_label/0614_161610/dev-instance-metrics.txt'
with open(report_path, 'r') as report_file:
    report_text = report_file.read()
print(report_text)

                 precision    recall  f1-score   support

       SUBCLASS       0.71      0.66      0.68       207
     SUPERCLASS       0.65      0.70      0.68       261
        SYNONYM       0.65      0.56      0.60        59
HAS-PART/REGION       0.70      0.23      0.35       456
 PART/REGION-OF       0.63      0.10      0.18       452

      micro avg       0.67      0.35      0.46      1435
      macro avg       0.67      0.45      0.50      1435
   weighted avg       0.67      0.35      0.41      1435



In [11]:
# Print the saved term-pair-level dev metrics report of the hard_label run.
report_path = 'relation_extraction/saved/models/hard_label/0614_161610/dev-term-pair-metrics.txt'
with open(report_path, 'r') as report_file:
    report_text = report_file.read()
print(report_text)

Predicted Term Pair Metrics:
                 precision    recall  f1-score   support

       SUBCLASS       0.70      0.70      0.70       165
     SUPERCLASS       0.68      0.72      0.70       223
        SYNONYM       0.61      0.53      0.57        47
HAS-PART/REGION       0.70      0.35      0.46       227
 PART/REGION-OF       0.60      0.17      0.27       207

      micro avg       0.68      0.48      0.56       869
      macro avg       0.66      0.50      0.54       869
   weighted avg       0.67      0.48      0.53       869

Bio101 Term Pair Metrics:
                 precision    recall  f1-score   support

       SUBCLASS       0.63      0.25      0.36       165
     SUPERCLASS       0.71      0.23      0.35       223
        SYNONYM       0.48      0.21      0.29        47
HAS-PART/REGION       0.89      0.66      0.75       227
 PART/REGION-OF       0.94      0.61      0.74       207

      micro avg       0.82      0.44      0.57       869
      macro avg       0.73  