In [30]:
import json
import os
import re
import pandas as pd
import numpy as np
import sys
import stanfordnlp
from spacy_stanfordnlp import StanfordNLPLanguage
import warnings
# Make the sibling `data_processing` directory importable from this notebook.
# (`os.path.join` with a single argument is a no-op, so resolve the path directly.)
module_path = os.path.abspath('../data_processing')
if module_path not in sys.path:
    sys.path.append(module_path)
from data_processing_utils import read_spacy_docs


# Build the English StanfordNLP pipeline and wrap it with spacy_stanfordnlp;
# `nlp` is what gets passed to `read_spacy_docs` when loading saved docs.
snlp = stanfordnlp.Pipeline(lang="en")
nlp = StanfordNLPLanguage(snlp)
# Silence all Python warnings from this point on. This is installed after the
# pipeline has been constructed, so it does not affect anything emitted above.
warnings.filterwarnings('ignore')

# Input directories, relative to the notebook's working directory.
# Both end with a trailing slash.
pre_data_dir = "../data/preprocessed_data/"
te_data_dir = "../data/term_extraction/"

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/Users/mattboggess/sta

# Summary of Preprocessed Textbook Data

In [45]:
# Collect, per textbook, the number of sentences and the set of lemmatized
# terms found in the preprocessed spaCy files.
#   textbook_data[textbook] -> {"sentence_count": int, "terms": set, "term_count": int}
textbook_data = {}
files = os.listdir(pre_data_dir)
for file in files:
    # These two files are deliberately left out of the summary.
    if file == "Life_Biology_sentences_spacy" or file == "Life_Biology_kb_lexicon.csv":
        continue

    # The textbook name is everything before the `_key...` / `_sentences...` suffix.
    name_match = re.match("(.*)_(key|sentences).*", file)
    if name_match is None:
        # Stray entries that do not follow the naming scheme (e.g. hidden OS
        # files) would otherwise raise AttributeError on `.group(1)`; skip them.
        continue
    textbook = name_match.group(1)
    textbook_stats = textbook_data.setdefault(textbook, {})

    if "sentences" in file:
        sentences = read_spacy_docs(f"{pre_data_dir}/{file}", nlp)
        textbook_stats["sentence_count"] = len(sentences)
    elif "key_terms" in file or "kb_terms" in file:
        terms = read_spacy_docs(f"{pre_data_dir}/{file}", nlp)
        # Represent each term by its space-joined token lemmas; the set removes
        # terms that collapse to the same lemmatized form.
        terms = {" ".join(t.lemma_ for t in term) for term in terms}
        textbook_stats["terms"] = terms
        textbook_stats["term_count"] = len(terms)

## Textbook Term & Sentence Counts

In [46]:
# One row per textbook: its name plus the sentence and term counts gathered above.
summary_columns = {
    "textbook": list(textbook_data),
    "sentence_count": [stats["sentence_count"] for stats in textbook_data.values()],
    "term_count": [stats["term_count"] for stats in textbook_data.values()],
}
df = pd.DataFrame(summary_columns)
df

Unnamed: 0,textbook,sentence_count,term_count
0,Psychology,10429,848
1,Biology_2e,25432,2225
2,University_Physics_Volume_3,12671,382
3,Anatomy_and_Physiology,22440,2553
4,Chemistry_2e,16720,556
5,Astronomy,21668,303
6,University_Physics_Volume_2,16539,279
7,Life_Biology_kb,7430,8284
8,University_Physics_Volume_1,21310,190
9,Microbiology,19830,1792


## Textbook Term Overlap (Fraction of Shared Terms)

In [51]:
# Pairwise term overlap between textbooks. Entry (row, column) is the fraction
# of the row textbook's terms that also appear in the column textbook, so the
# matrix is not symmetric.
# Materialize the keys as a list so they can be used directly as axis labels.
textbooks = list(textbook_data)
df_frac = pd.DataFrame(
    np.zeros((len(textbooks), len(textbooks))),
    index=textbooks,
    columns=textbooks,
)

for i, split1 in enumerate(textbooks):
    row_terms = textbook_data[split1]["terms"]
    for j, split2 in enumerate(textbooks):
        count_overlap = len(row_terms.intersection(textbook_data[split2]["terms"]))
        # An empty row term set has no defined overlap fraction; record NaN
        # rather than raising ZeroDivisionError.
        df_frac.iloc[i, j] = count_overlap / len(row_terms) if row_terms else np.nan
print("Fraction of terms in row that overlap with column")
df_frac

Fraction of terms in row that overlap with column


Unnamed: 0,Psychology,Biology_2e,University_Physics_Volume_3,Anatomy_and_Physiology,Chemistry_2e,Astronomy,University_Physics_Volume_2,Life_Biology_kb,University_Physics_Volume_1,Microbiology
Psychology,1.0,0.108491,0.003538,0.104953,0.004717,0.005896,0.001179,0.136792,0.005896,0.028302
Biology_2e,0.041348,1.0,0.004944,0.235506,0.017528,0.008989,0.004494,0.378876,0.002247,0.124045
University_Physics_Volume_3,0.007853,0.028796,1.0,0.020942,0.065445,0.054974,0.007853,0.054974,0.005236,0.013089
Anatomy_and_Physiology,0.034861,0.205249,0.003134,1.0,0.010967,0.0047,0.0047,0.187622,0.001958,0.058754
Chemistry_2e,0.007194,0.070144,0.044964,0.05036,1.0,0.014388,0.034173,0.136691,0.01259,0.019784
Astronomy,0.016502,0.066007,0.069307,0.039604,0.026403,1.0,0.026403,0.125413,0.016502,0.042904
University_Physics_Volume_2,0.003584,0.035842,0.010753,0.043011,0.0681,0.028674,1.0,0.086022,0.010753,0.0
Life_Biology_kb,0.014003,0.101762,0.002535,0.057822,0.009174,0.004587,0.002897,1.0,0.001931,0.039715
University_Physics_Volume_1,0.026316,0.026316,0.010526,0.026316,0.036842,0.026316,0.015789,0.084211,1.0,0.010526
Microbiology,0.013393,0.154018,0.00279,0.083705,0.006138,0.007254,0.0,0.183594,0.001116,1.0


# Summary of Data Splits

- Number of sentences
- Number of terms
- Average term appearances

In [26]:
# Summarize each term-extraction split: how many sentences and how many terms
# its JSON file contains.
split_rows = []
for split in ["train", "validation", "life_test", "psych_test"]:
    split_path = f"{te_data_dir}/term_extraction_{split}.json"
    with open(split_path, "r") as f:
        split_data = json.load(f)
    split_rows.append(
        {
            "split": split,
            "sentence_count": len(split_data["sentences"]),
            "term_count": len(split_data["terms"]),
        }
    )
df = pd.DataFrame(split_rows, columns=["split", "sentence_count", "term_count"])
df

Unnamed: 0,split,sentence_count,term_count
0,train,131178,6179
1,validation,25432,3305
2,life_test,26160,1926
3,psych_test,10429,1270


# Term Overlap Amongst Splits

In [34]:
# Map each split name to the set of term keys stored in its JSON file.
split_terms = {}
for split in ("train", "validation", "life_test", "psych_test"):
    split_path = f"{te_data_dir}/term_extraction_{split}.json"
    with open(split_path, "r") as f:
        split_data = json.load(f)
    term_keys = split_data["terms"].keys()
    split_terms[split] = set(term_keys)

In [41]:
# Pairwise term overlap between data splits.
#   df_count[row, col] = number of terms shared by the row and column splits
#   df_frac[row, col]  = that count divided by the number of terms in the row split
splits = ["train", "validation", "life_test", "psych_test"]
n_splits = len(splits)  # size the matrices from the list instead of a literal 4
df_count = pd.DataFrame(np.zeros((n_splits, n_splits)), index=splits, columns=splits)
df_frac = pd.DataFrame(np.zeros((n_splits, n_splits)), index=splits, columns=splits)

for i, split1 in enumerate(splits):
    row_terms = split_terms[split1]
    # Reuse `splits` for the columns so rows and columns cannot drift apart.
    for j, split2 in enumerate(splits):
        count_overlap = len(row_terms.intersection(split_terms[split2]))
        df_count.iloc[i, j] = count_overlap
        # An empty row split has no defined overlap fraction; record NaN rather
        # than raising ZeroDivisionError.
        df_frac.iloc[i, j] = count_overlap / len(row_terms) if row_terms else np.nan
# NOTE: the values are fractions in [0, 1], not percentages, despite the label.
print("Percent of terms for row split that overlap with column split")
df_frac

Percent of terms for row split that overlap with column split


Unnamed: 0,train,validation,life_test,psych_test
train,1.0,0.410261,0.291147,0.127691
validation,0.76702,1.0,0.507716,0.209985
life_test,0.93406,0.871236,1.0,0.323988
psych_test,0.62126,0.546457,0.491339,1.0


# Distribution of Term Counts