AKA T1

In [1]:
import re

In [2]:
def remove_comments(text):
    """Drop full-line LaTeX comments from ``text``.

    A line is discarded when, ignoring surrounding whitespace, it begins
    with ``%``. Lines that merely contain a ``%`` further along are kept
    untouched.

    Args:
        text: The raw document as a single string.

    Returns:
        The document with the comment lines removed, still joined by
        newline characters.
    """
    kept = []
    for raw_line in text.split('\n'):
        if raw_line.strip().startswith('%'):
            # Whole-line comment: leave it out of the result.
            continue
        kept.append(raw_line)
    return '\n'.join(kept)

In [75]:
def remove_simples(text):
    """Apply a fixed sequence of literal substitutions to LaTeX source.

    Display/inline math delimiters and the equation/align environments are
    rewritten to ``$``; every ``$`` and ``-`` is then padded with spaces;
    ``%`` characters and the item macro are deleted.

    Args:
        text: LaTeX source as a string.

    Returns:
        The text after all substitutions have been applied in order.
    """
    # The substitutions are applied top to bottom; later entries operate on
    # the output of earlier ones (e.g. every "$" produced above gets padded).
    substitutions = (
        ("$$", "$"),
        ("\\begin{equation}", "$"),
        ("\\end{equation}", " $ ."),
        ("\\begin{equation*}", " $ "),
        ("\\end{equation*}", " $ ."),
        ("\\begin{align*}", " $"),
        ("\\end{align*}", " $ ."),
        ("\\begin{align}", " $"),
        ("\\end{align}", " $ ."),
        ("\\[", "$"),
        ("\\]", "$"),
        ("\\(", "$"),
        ("\\)", "$"),
        (".$", "$."),
        ("$", " $ "),
        ("-", " - "),
        ("%", ""),
        ("\\item", ""),
    )
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    return text

In [74]:
def remove_figures_and_exercises(text):
    """Delete whole figure, exercises, minipage and center environments.

    For each of those environment names, everything from the opening
    begin-marker through the nearest matching end-marker (inclusive) is
    removed, across line breaks.

    Args:
        text: LaTeX source as a string.

    Returns:
        The text with those environments and their contents removed.
    """
    # Environments are stripped one kind at a time, in this order.
    for env in ("figure", "exercises", "minipage", "center"):
        # [\s\S]*? (zero or more, lazy) lets an *empty* environment match its
        # own end-marker. Requiring one character first would swallow the
        # backslash of that end-marker and run on to the next one, deleting
        # the text in between.
        pattern = r"\\begin\{" + env + r"\}[\s\S]*?\\end\{" + env + r"\}"
        text = re.sub(pattern, "", text)
    return text

In [6]:
def remove_environments(text):
    r"""Strip ``\begin{...}`` and ``\end{...}`` markers, keeping their contents.

    Only the markers themselves are deleted; the text between them stays.
    The braces must enclose at least one character for a marker to match.

    Args:
        text: LaTeX source as a string.

    Returns:
        The text without the environment markers.
    """
    # Two separate passes: every begin-marker first, then every end-marker.
    for keyword in ("begin", "end"):
        marker = re.compile(r'\\' + keyword + r'\{[^\}]+\}')
        text = marker.sub('', text)
    return text

In [18]:
def remove_emphasis(text):
    r"""Unwrap emphasis-style macros, leaving only their argument.

    Handles ``\demph``, ``\emph``, ``\definend``, ``\textit`` and
    ``\textbf``. A macro is only unwrapped when its argument contains no
    braces of its own.

    Args:
        text: LaTeX source as a string.

    Returns:
        The text with the matching macros replaced by their argument.
    """
    # Processed in this order; one pass per macro.
    for macro in ("demph", "emph", "definend", "textit", "textbf"):
        wrapper = r"\\" + macro + r"\{([^{}]*)\}"
        text = re.sub(wrapper, r"\1", text)
    return text

In [60]:
def remove_formatting(text):
    r"""Remove structural and cross-reference macros from LaTeX source.

    First, a ``~`` immediately followed by a macro, and any ``\nearby...``
    macro, are replaced (up to the next closing brace) by the placeholder
    `` $ X $ ``. Then a fixed list of single-argument macros (headings,
    labels, index entries, references, citations) is deleted together with
    its argument.

    Args:
        text: LaTeX source as a string.

    Returns:
        The text after the replacements and deletions.
    """
    placeholder = ' $ X $ '
    text = re.sub(r'~\\[^}]+}', placeholder, text)
    text = re.sub(r'\\nearby[^\}]+\}', placeholder, text)

    # Macros deleted outright, one pass each, in this order.
    dropped_macros = (
        "chapter",
        "section",
        "subsection",
        "subsectionoptional",
        "label",
        "ntn",
        "index",
        "ref",
        "bref",
        "eqref",
        "cite",
    )
    for macro in dropped_macros:
        text = re.sub(r"\\" + macro + r"\{.*?\}", "", text)

    return text

In [10]:
def prep_w_spaces(text):
    r"""Normalise a few LaTeX escapes and put a space after every comma.

    * a double backslash (LaTeX line break) becomes a space plus a single
      backslash,
    * ``\{`` becomes ``{``,
    * ``\}`` becomes `` } `` (padded with spaces),
    * ``,`` becomes ``, ``.

    Args:
        text: LaTeX source as a string.

    Returns:
        The text after the four replacements, applied in that order.
    """
    text = text.replace("\\\\", " \\")
    # The backslashes are escaped explicitly: a bare "\{" or "\}" is an
    # invalid escape sequence that Python warns about (and will reject).
    text = text.replace("\\{", "{")
    text = text.replace("\\}", " } ")
    text = text.replace(",", ", ")
    return text

In [11]:
def list_fix(text):
    """Rewrite numeric list markers ``1.`` to ``10.`` as ``(i)`` to ``(x)``.

    Only a standalone number from 1 to 10 directly followed by a period is
    rewritten. The number must not be preceded by a digit or a period and
    the period must not be followed by a digit, so multi-digit numbers
    (``12.``), decimals (``3.14``) and dotted references (``1.2.``) are
    left untouched instead of being partially replaced.

    Args:
        text: The text to process.

    Returns:
        The text with the list markers converted to roman numerals.
    """
    numerals = {
        "1": "(i)",
        "2": "(ii)",
        "3": "(iii)",
        "4": "(iv)",
        "5": "(v)",
        "6": "(vi)",
        "7": "(vii)",
        "8": "(viii)",
        "9": "(ix)",
        "10": "(x)",
    }
    # "10" is tried before the single digits so that "10." maps to "(x)".
    marker = r"(?<![\d.])(10|[1-9])\.(?!\d)"
    return re.sub(marker, lambda match: numerals[match.group(1)], text)

In [39]:
def final_sweep(text):
    """Final tidy-up: collapse whitespace and strip leftover markup.

    The text is split on whitespace; words beginning with ``Helvetica`` are
    discarded, backslashes are deleted from the remaining words, and the
    words are re-joined with single spaces. Finally every ``~`` becomes a
    space.

    Args:
        text: The partially cleaned text.

    Returns:
        The tidied text.
    """
    kept = []
    for token in text.split():
        # The prefix check runs on the word *before* backslashes are removed.
        if token.startswith("Helvetica"):
            continue
        kept.append(token.replace("\\", ""))
    return " ".join(kept).replace('~', " ")

In [76]:
def clean_everything(text):
    """Run the complete cleaning pipeline over ``text``.

    Each stage receives the output of the previous one.

    Args:
        text: Raw LaTeX source as a string.

    Returns:
        The text after every cleaning stage has been applied.
    """
    # Stages run strictly in this order.
    stages = (
        remove_figures_and_exercises,
        remove_comments,
        remove_simples,
        remove_environments,
        remove_emphasis,
        remove_formatting,
        prep_w_spaces,
        list_fix,
        final_sweep,
    )
    for stage in stages:
        text = stage(text)
    return text

In [14]:
import spacy

from spacy_conll import init_parser

# Build the pipeline through spacy_conll's init_parser, passing the
# "en_core_web_sm" model name and "spacy" as the parser.
nlp = init_parser("en_core_web_sm", "spacy")
from spacy.language import Language
@Language.component("detextor")
def detextor(doc):
    """Merge each ``$`` ... ``$`` stretch of tokens into a single token.

    While the document still holds at least two tokens whose text is
    exactly ``$``, the span running from the first such token through the
    second (inclusive) is merged. The positions are recomputed after every
    merge because merging changes the token indices.

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same document, retokenized in place.
    """
    def dollar_positions():
        # Indices of the tokens that are a bare "$".
        return [position for position, token in enumerate(doc) if token.text == "$"]

    positions = dollar_positions()
    while len(positions) >= 2:
        opening, closing = positions[0], positions[1]
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[opening:closing + 1])
        positions = dollar_positions()
    return doc
# Adding a component under a name the pipeline already holds raises an
# error, so drop any earlier "detextor" first. This keeps the statement safe
# to re-run without editing the cell by hand.
if "detextor" in nlp.pipe_names:
    nlp.remove_pipe("detextor")
nlp.add_pipe("detextor", after="tagger")

<function __main__.detextor(doc)>

In [79]:
import os

# Clean every LaTeX source file in the folder and append the results to a
# single text file, one cleaned document per line.
tex_dir = "/Users/lucyhorowitz/Documents/GitHub/definition-extraction/textbooks/linalg/tex"
clean_path = "/Users/lucyhorowitz/Documents/GitHub/definition-extraction/textbooks/linalg/linalg_clean.txt"

# os.listdir returns entries in arbitrary order; sorting makes the output
# order reproducible between runs.
tex_filenames = sorted(os.listdir(tex_dir))

# NOTE: the output is opened in append mode, so running this cell again adds
# a second copy of every document to the file.
with open(clean_path, "a") as f:
    for filename in tex_filenames:
        full_path = os.path.join(tex_dir, filename)
        # Skip hidden entries (e.g. .DS_Store) and sub-directories, which
        # are not LaTeX sources and may not even be readable as text.
        if filename.startswith(".") or not os.path.isfile(full_path):
            continue
        # Read through a context manager so each input file is closed.
        with open(full_path) as tex_file:
            f.write(clean_everything(tex_file.read()) + '\n')


In [80]:
# Parse the cleaned corpus sentence by sentence and append the result to a
# CoNLL-U file.
# Read all lines up front inside a context manager so the input file is
# closed instead of being left open.
with open("/Users/lucyhorowitz/Documents/GitHub/definition-extraction/textbooks/linalg/linalg_clean.txt", "r") as clean_file:
    text = clean_file.readlines()

# Token count of every sentence written below.
sent_lengths = []

# NOTE: append mode - running this cell again adds the sentences a second time.
with open("/Users/lucyhorowitz/Documents/GitHub/definition-extraction/textbooks/linalg/linalg.conllu", "a") as f:
    # Running sentence id, starting at 1 and shared across all lines.
    j = 1
    for line in text:
        text2 = clean_everything(line)
        doc = nlp(text2)

        for sent in doc.sents:
            # Each sentence is parsed again on its own and the CoNLL string
            # is taken from that per-sentence parse.
            doc2 = nlp(sent.text)
            conll = doc2._.conll_str
            sent_lengths.append(len(doc2))
            f.write("# sent_id = " + str(j) + "\n")
            f.write("# text = " + sent.text + "\n")
            f.write(conll + "\n")
            j = j + 1

In [81]:
# Copy the sentence text of every "# text = ..." header in the CoNLL-U file
# into a plain text file, one sentence per line.
text_prefix = "# text = "

# Both files are managed by `with` so the output is flushed and closed;
# the output is opened first and in append mode, as before.
with open("textbooks/linalg/linalg_sents.txt", "a") as sents:
    with open("textbooks/linalg/linalg.conllu", "r") as a:
        for line in a:
            # Match the marker only at the start of the line, so the slice
            # below always removes exactly the marker.
            if line.startswith(text_prefix):
                sents.write(line[len(text_prefix):])