# Dataset Statistics

## Scraped Textbook Data

In [2]:
import pandas as pd

# Processed term table; each row carries a `source` value naming where the term came from.
terms = pd.read_pickle("data/preprocessed/terms/processed_terms.pkl")
textbooks = [
    'Anatomy_and_Physiology',
    'Astronomy',
    'Biology_2e',
    'Chemistry_2e',
    'Life_Biology',
    'Microbiology',
    'Psychology',
    'University_Physics_Volume_1',
    'University_Physics_Volume_2',
    'University_Physics_Volume_3',
]

# Build one summary record per textbook: how many term rows are attributed to it
# and how many entries its cleaned-sentences pickle holds.
# NOTE(review): the recorded output shows 0 terms for 'Life_Biology' -- presumably the
# `source` label used in the term table differs from this name; verify.
textbook_rows = []
for textbook in textbooks:
    textbook_terms = terms[terms.source == textbook]
    textbook_sentences = pd.read_pickle(f"data/preprocessed/clean_sentences/{textbook}_sentences.pkl")
    textbook_rows.append({
        'textbook': textbook,
        'num_terms': len(textbook_terms),
        'num_sentences': textbook_sentences.shape[0],
    })

dataset_df = pd.DataFrame(textbook_rows, columns=['textbook', 'num_terms', 'num_sentences'])
dataset_df

Unnamed: 0,textbook,num_terms,num_sentences
0,Anatomy_and_Physiology,3169,21706
1,Astronomy,810,18844
2,Biology_2e,2757,24544
3,Chemistry_2e,954,13799
4,Life_Biology,0,16673
5,Microbiology,4149,16190
6,Psychology,1086,9967
7,University_Physics_Volume_1,462,15005
8,University_Physics_Volume_2,466,11779
9,University_Physics_Volume_3,580,9250


## Bio101 KB Data

In [3]:
import pandas as pd

# Reload the processed term table and report how many rows have source == 'kb_bio101'.
terms = pd.read_pickle("data/preprocessed/terms/processed_terms.pkl")
kb_terms = terms[terms.source == 'kb_bio101']
print(f"# of Bio101 KB terms: {len(kb_terms)}")

# of Bio101 KB terms: 8136


In [4]:
import pickle

# Load the pickled Bio101 relations database (indexed by relation name below).
with open("data/preprocessed/kb_bio101_relations_db.pkl", 'rb') as db_file:
    db = pickle.load(db_file)

# One row per relation key with the number of entries stored under it, largest first.
relation_names = list(db)
df = pd.DataFrame({
    'relation': relation_names,
    'count': [len(db[name]) for name in relation_names],
}).sort_values(by='count', ascending=False)
df

Unnamed: 0,relation,count
1,subclass-of,18360
0,synonym,12308
2,has-part,6542
47,result,3648
56,raw-material,3526
49,object,3021
4,has-region,2976
51,subevent,2894
46,base,2396
50,agent,1807


## Term Extraction Data Splits

In [5]:
import pandas as pd

# Summarise each term-extraction split: number of rows (sentences) and number of
# distinct terms across all rows of that split.
split_rows = []
for split in ['train', 'dev', 'test']:
    df = pd.read_pickle(f"data/term_extraction/term_extraction_{split}.pkl")
    # Each entry of `df.terms` is a set of terms; merge them with a single union call
    # so the accumulated set is not copied once per sentence.
    terms = set().union(*df.terms)
    split_rows.append({
        'split': split,
        'num_sentences': df.shape[0],
        'num_terms': len(terms),
    })

splits_df = pd.DataFrame(split_rows, columns=['split', 'num_sentences', 'num_terms'])
splits_df

Unnamed: 0,split,num_sentences,num_terms
0,train,58057,7041
1,dev,206,283
2,test,608,500


# Term Extraction Results

In [6]:
import json

# Load and display the saved dev-set metrics JSON for the BertSoftmax run 0624_025137.
dev_metrics_path = 'term_extraction/saved/models/BertSoftmax/0624_025137/dev-metrics.json'
with open(dev_metrics_path) as metrics_file:
    metrics = json.load(metrics_file)
metrics

{'loss': 0.6661314779114955,
 'accuracy': 0.9911635315533981,
 'token_macro_f1': 0.7390851418763833,
 'token_macro_precision': 0.7494545950033281,
 'token_macro_recall': 0.7348825726831907,
 'token_micro_f1': 0.7456382001836548,
 'token_micro_precision': 0.744954128440367,
 'token_micro_recall': 0.7463235294117647,
 'term_f1': 0.6594982078853046,
 'term_recall': 0.6501766784452296,
 'term_precision': 0.6690909090909091}

In [7]:
import json

# Load and display the saved test-set metrics JSON for the BertSoftmax run 0624_025137.
test_metrics_path = 'term_extraction/saved/models/BertSoftmax/0624_025137/test-metrics.json'
with open(test_metrics_path) as metrics_file:
    metrics = json.load(metrics_file)
metrics

{'loss': 1.40711688367944,
 'accuracy': 0.986681486430921,
 'token_macro_f1': 0.5589522568377555,
 'token_macro_precision': 0.604545818324427,
 'token_macro_recall': 0.523563109543713,
 'token_micro_f1': 0.5822563744324136,
 'token_micro_precision': 0.6409073433294886,
 'token_micro_recall': 0.53344,
 'term_f1': 0.551341350601295,
 'term_recall': 0.596,
 'term_precision': 0.5129087779690189}

# Relation Extraction Data Splits

In [8]:
import pandas as pd

# Summarise each relation-extraction split: total rows, distinct sentences,
# and distinct term pairs.
split_rows = []
for split in ['train', 'dev', 'test']:
    df = pd.read_pickle(f"data/relation_extraction/{split}.pkl")
    split_rows.append({
        'split': split,
        'instances': len(df),
        'sentences': len(df.sentence.unique()),
        'term_pairs': len(df.term_pair.unique()),
    })

splits_df = pd.DataFrame(split_rows, columns=['split', 'instances', 'sentences', 'term_pairs'])
splits_df

Unnamed: 0,split,instances,sentences,term_pairs
0,train,167093,29830,105236
1,dev,11935,2065,8642
2,test,3106,491,2582


In [9]:
# Fraction of dev rows carrying each gold label (non-null term_pair count per label / total rows).
dev = pd.read_pickle("data/relation_extraction/dev.pkl")
dev_label_counts = dev.groupby('gold_label')['term_pair'].count()
dev_label_counts / len(dev)

gold_label
HAS-PART/REGION    0.037956
OTHER              0.880938
PART/REGION-OF     0.037453
SUBCLASS           0.017009
SUPERCLASS         0.021701
SYNONYM            0.004943
Name: term_pair, dtype: float64

In [10]:
# Fraction of test rows carrying each gold label (non-null term_pair count per label / total rows).
test = pd.read_pickle("data/relation_extraction/test.pkl")
test_label_counts = test.groupby('gold_label')['term_pair'].count()
test_label_counts / len(test)

gold_label
HAS-PART/REGION    0.004829
OTHER              0.969736
PART/REGION-OF     0.004507
SUBCLASS           0.005795
SUPERCLASS         0.014166
SYNONYM            0.000966
Name: term_pair, dtype: float64

# Relation Extraction Results

In [15]:
# Print the saved dev instance-level metrics report from the soft_label run 0626_160708.
report_path = 'relation_extraction/saved/models/soft_label/0626_160708/dev-instance-metrics.txt'
with open(report_path) as report_file:
    report_text = report_file.read()
print(report_text)

                 precision    recall  f1-score   support

       SUBCLASS       0.71      0.78      0.75       203
     SUPERCLASS       0.61      0.67      0.64       259
        SYNONYM       0.62      0.75      0.68        59
HAS-PART/REGION       0.63      0.34      0.44       452
 PART/REGION-OF       0.46      0.15      0.23       447

      micro avg       0.62      0.42      0.50      1420
      macro avg       0.61      0.54      0.55      1420
   weighted avg       0.58      0.42      0.47      1420



In [17]:
# Print the saved test instance-level metrics report from the soft_label run 0626_160708.
report_path = 'relation_extraction/saved/models/soft_label/0626_160708/test-instance-metrics.txt'
with open(report_path) as report_file:
    report_text = report_file.read()
print(report_text)

                 precision    recall  f1-score   support

       SUBCLASS       0.31      0.89      0.46        18
     SUPERCLASS       0.52      0.77      0.62        44
        SYNONYM       0.25      1.00      0.40         3
HAS-PART/REGION       0.14      0.40      0.21        15
 PART/REGION-OF       0.21      0.86      0.33        14

      micro avg       0.31      0.76      0.44        94
      macro avg       0.29      0.78      0.40        94
   weighted avg       0.36      0.76      0.47        94



In [16]:
# Print the saved dev term-pair metrics report from the soft_label run 0626_160708.
report_path = 'relation_extraction/saved/models/soft_label/0626_160708/dev-term-pair-metrics.txt'
with open(report_path) as report_file:
    report_text = report_file.read()
print(report_text)

Predicted Term Pair Metrics:
                 precision    recall  f1-score   support

       SUBCLASS       0.71      0.80      0.75       163
     SUPERCLASS       0.62      0.69      0.66       221
        SYNONYM       0.58      0.70      0.63        47
HAS-PART/REGION       0.60      0.50      0.55       225
 PART/REGION-OF       0.39      0.19      0.26       206

      micro avg       0.61      0.54      0.57       862
      macro avg       0.58      0.58      0.57       862
   weighted avg       0.58      0.54      0.55       862

Bio101 Term Pair Metrics:
                 precision    recall  f1-score   support

       SUBCLASS       0.65      0.25      0.36       163
     SUPERCLASS       0.71      0.24      0.35       221
        SYNONYM       0.50      0.21      0.30        47
HAS-PART/REGION       0.88      0.65      0.75       225
 PART/REGION-OF       0.93      0.61      0.74       206

      micro avg       0.82      0.44      0.57       862
      macro avg       0.73  

In [18]:
# Print the saved test term-pair metrics report from the soft_label run 0626_160708.
report_path = 'relation_extraction/saved/models/soft_label/0626_160708/test-term-pair-metrics.txt'
with open(report_path) as report_file:
    report_text = report_file.read()
print(report_text)

Predicted Term Pair Metrics:
                 precision    recall  f1-score   support

       SUBCLASS       0.32      0.88      0.47        17
     SUPERCLASS       0.53      0.76      0.62        41
        SYNONYM       0.25      1.00      0.40         3
HAS-PART/REGION       0.14      0.50      0.22        12
 PART/REGION-OF       0.17      0.80      0.28        10

      micro avg       0.30      0.76      0.43        83
      macro avg       0.28      0.79      0.40        83
   weighted avg       0.37      0.76      0.48        83

Bio101 Term Pair Metrics:
                 precision    recall  f1-score   support

       SUBCLASS       0.33      0.06      0.10        17
     SUPERCLASS       0.69      0.22      0.33        41
        SYNONYM       0.00      0.00      0.00         3
HAS-PART/REGION       0.55      0.50      0.52        12
 PART/REGION-OF       0.50      0.40      0.44        10

      micro avg       0.57      0.24      0.34        83
      macro avg       0.41  

In [19]:
# Print the saved dev instance-level metrics report from the hard_label run 0626_231929.
report_path = 'relation_extraction/saved/models/hard_label/0626_231929/dev-instance-metrics.txt'
with open(report_path) as report_file:
    report_text = report_file.read()
print(report_text)

                 precision    recall  f1-score   support

       SUBCLASS       0.65      0.79      0.71       203
     SUPERCLASS       0.73      0.55      0.63       259
        SYNONYM       0.57      0.58      0.57        59
HAS-PART/REGION       0.57      0.35      0.43       452
 PART/REGION-OF       0.47      0.08      0.13       447

      micro avg       0.62      0.37      0.47      1420
      macro avg       0.60      0.47      0.50      1420
   weighted avg       0.58      0.37      0.42      1420



In [20]:
# Print the saved test instance-level metrics report from the hard_label run 0626_231929.
report_path = 'relation_extraction/saved/models/hard_label/0626_231929/test-instance-metrics.txt'
with open(report_path) as report_file:
    report_text = report_file.read()
print(report_text)

                 precision    recall  f1-score   support

       SUBCLASS       0.36      0.94      0.52        18
     SUPERCLASS       0.70      0.70      0.70        44
        SYNONYM       0.38      1.00      0.55         3
HAS-PART/REGION       0.19      0.33      0.24        15
 PART/REGION-OF       0.25      0.21      0.23        14

      micro avg       0.43      0.63      0.51        94
      macro avg       0.38      0.64      0.45        94
   weighted avg       0.48      0.63      0.52        94



In [21]:
# Print the saved dev term-pair metrics report from the hard_label run 0626_231929.
report_path = 'relation_extraction/saved/models/hard_label/0626_231929/dev-term-pair-metrics.txt'
with open(report_path) as report_file:
    report_text = report_file.read()
print(report_text)

Predicted Term Pair Metrics:
                 precision    recall  f1-score   support

       SUBCLASS       0.64      0.81      0.72       163
     SUPERCLASS       0.76      0.56      0.64       221
        SYNONYM       0.51      0.53      0.52        47
HAS-PART/REGION       0.55      0.52      0.54       225
 PART/REGION-OF       0.45      0.15      0.22       206

      micro avg       0.61      0.50      0.55       862
      macro avg       0.58      0.51      0.53       862
   weighted avg       0.60      0.50      0.52       862

Bio101 Term Pair Metrics:
                 precision    recall  f1-score   support

       SUBCLASS       0.65      0.25      0.36       163
     SUPERCLASS       0.71      0.24      0.35       221
        SYNONYM       0.50      0.21      0.30        47
HAS-PART/REGION       0.88      0.65      0.75       225
 PART/REGION-OF       0.93      0.61      0.74       206

      micro avg       0.82      0.44      0.57       862
      macro avg       0.73  

In [22]:
# Print the saved test term-pair metrics report from the hard_label run 0626_231929.
report_path = 'relation_extraction/saved/models/hard_label/0626_231929/test-term-pair-metrics.txt'
with open(report_path) as report_file:
    report_text = report_file.read()
print(report_text)

Predicted Term Pair Metrics:
                 precision    recall  f1-score   support

       SUBCLASS       0.36      0.94      0.52        17
     SUPERCLASS       0.72      0.68      0.70        41
        SYNONYM       0.38      1.00      0.55         3
HAS-PART/REGION       0.19      0.42      0.26        12
 PART/REGION-OF       0.27      0.30      0.29        10

      micro avg       0.43      0.66      0.52        83
      macro avg       0.38      0.67      0.46        83
   weighted avg       0.50      0.66      0.54        83

Bio101 Term Pair Metrics:
                 precision    recall  f1-score   support

       SUBCLASS       0.33      0.06      0.10        17
     SUPERCLASS       0.69      0.22      0.33        41
        SYNONYM       0.00      0.00      0.00         3
HAS-PART/REGION       0.55      0.50      0.52        12
 PART/REGION-OF       0.50      0.40      0.44        10

      micro avg       0.57      0.24      0.34        83
      macro avg       0.41  