AKA T1

In [1]:
import re

In [60]:
def remove_comments(text):
    # Split the text into lines
    lines = text.split('\n')
    # Filter out lines that don't start with "%"
    filtered_lines = [line for line in lines if not line.strip().startswith('%') and not line.strip().startswith("#") and not line.strip().startswith("*") and not line.strip().startswith("+--") and not line.strip().startswith("=--") and not line.strip().startswith('{:')]
    # Join the filtered lines back together
    cleaned_text = '\n'.join(filtered_lines)
    return cleaned_text

In [69]:
def clean_links(text):
    text = re.sub(r'\[\[![^\]]+\]\]', '', text)
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
    start_indices = [match.start() for match in re.finditer(r'\[\[', text)]
    
    for index in start_indices:
        if text[index - 1] == '!':
            close_index = text.find(']', index)
            text = text.replace(text[index - 1:close_index + 3], (close_index + 3 - index)*'#')
        else:
            next_pipe = text.find('|', index)
            close_index = text.find(']', index)
            if next_pipe != -1:
                if next_pipe - close_index < 0:
                    text = text.replace(text[index:next_pipe + 1], (next_pipe - index + 1)*'#')
    text = text.replace('#', '')
    text = text.replace('[', '')
    text = text.replace(']', '')
    
    return text

In [4]:
def remove_simples(text):
    text = text.replace("$$", "$")
    text = text.replace("\\begin{equation}", "$")
    text = text.replace("\\end{equation}", " $ .")
    text = text.replace("\\begin{equation*}", " $ ")
    text = text.replace("\\end{equation*}", " $ .")
    text = text.replace("\\begin{align*}", " $")
    text = text.replace("\\end{align*}", " $ .")
    text = text.replace("\\begin{align}", " $")
    text = text.replace("\\end{align}", " $ .")
    text = text.replace("\\[", "$")
    text = text.replace("\\]", "$")
    text = text.replace("\\(", "$")
    text = text.replace("\\)", "$")
    text = text.replace(".$", "$.")
    text = text.replace("$", " $ ")
    text = text.replace("-", " - ")
    text = text.replace("%", "")
    text = text.replace("\\item", "")
    return text

In [5]:
def remove_figures_and_exercises(text):
    pattern = r"\\begin\{figure\}[\s\S]*?\\end\{figure\}"
    # Use the sub() function to remove all matches of the pattern
    text = re.sub(pattern, "", text, flags=re.DOTALL)
    pattern2 = r"\\begin\{exercises\}[\s\S].*?\\end\{exercises\}"
    # Use the sub() function to remove all matches of the pattern
    text = re.sub(pattern2, "", text,flags=re.DOTALL)
    pattern3 = r"\\begin\{minipage\}[\s\S].*?\\end\{minipage\}"
    # Use the sub() function to remove all matches of the pattern
    text = re.sub(pattern3, "", text,flags=re.DOTALL)
    pattern3 = r"\\begin\{center\}[\s\S].*?\\end\{center\}"
    return text

In [6]:
def remove_environments(text):
    pattern = r'\\begin\{[^\}]+\}'
    # Substitute the pattern with an empty string
    cleaned_text = re.sub(pattern, '', text)
    pattern2 = r'\\end\{[^\}]+\}'
    # Substitute the pattern with an empty string
    cleaned_text = re.sub(pattern2, '', cleaned_text)
    return cleaned_text

In [7]:
def remove_emphasis(text):
    text = re.sub(r"\\demph\{([^{}]*)\}", r"\1", text)
    text = re.sub(r"\\emph\{([^{}]*)\}", r"\1", text)
    text = re.sub(r"\\definend\{([^{}]*)\}", r"\1", text)
    text = re.sub(r"\\textit\{([^{}]*)\}", r"\1", text)
    text = re.sub(r"\\textbf\{([^{}]*)\}", r"\1", text)
    return text

In [34]:
def remove_formatting(text):
    text = re.sub(r'~\\[^}]+}', ' $ X $ ', text)
    text = re.sub(r'\\nearby[^\}]+\}', ' $ X $ ', text)

    text = re.sub(r"\\chapter\{.*?\}", "", text)
    text = re.sub(r"\\section\{.*?\}", "", text)
    text = re.sub(r"\\subsection\{.*?\}", "", text)
    text = re.sub(r"\\subsectionoptional\{.*?\}", "", text)
    text = re.sub(r"\\label\{.*?\}", "", text)
    text = re.sub(r"\\ntn\{.*?\}", "", text)
    text = re.sub(r"\\index\{.*?\}", "", text)
    text = re.sub(r"\\ref\{.*?\}", "", text)
    text = re.sub(r"\\bref\{.*?\}", "", text)
    text = re.sub(r"\\eqref\{.*?\}", "", text)
    text = re.sub(r"\\cite\{.*?\}", "", text)

    return text

In [9]:
def prep_w_spaces(text):
    text = text.replace("\\\\", " \\")
    text = text.replace("\{", "{")
    text = text.replace("\}", " } ") 
    text = text.replace(",", ", ")
    return text

In [10]:
def list_fix(text):
    text = text.replace("1.", "(i)")
    text = text.replace("2.", "(ii)")
    text = text.replace("3.", "(iii)")
    text = text.replace("4.", "(iv)")
    text = text.replace("5.", "(v)")
    text = text.replace("6.", "(vi)")
    text = text.replace("7.", "(vii)")
    text = text.replace("8.", "(viii)")
    text = text.replace("9.", "(ix)")
    text = text.replace("10.", "(x)")

    return text

In [76]:
def final_sweep(text):
    words = [word.replace("\\", "") for word in text.split() if not word.startswith("Helvetica")]
    text = " ".join(words)
    text = text.replace('~', " ")
    return text

In [73]:
def clean_everything(text):
    text1 = remove_comments(text)
    text2 = clean_links(text1)
    text3 = remove_figures_and_exercises(text2)
    text4 = remove_simples(text3)
    text5 = remove_environments(text4)
    text6 = remove_emphasis(text5)
    text7 = remove_formatting(text6)
    text8 = prep_w_spaces(text7)
    text9 = list_fix(text8)
    text10 = final_sweep(text9)
    return text10

In [13]:
import spacy

from spacy_conll import init_parser

nlp = init_parser("en_core_web_sm", "spacy")
from spacy.language import Language
@Language.component("detextor")
def detextor(doc):
    dollar_indices = [index for index, token in enumerate(doc) if token.text == "$"]
    while len(dollar_indices) > 1:
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[dollar_indices[0]:dollar_indices[1] + 1])
        dollar_indices = [index for index, token in enumerate(doc) if token.text == "$"]
    return doc
# nlp.remove_pipe("detextor") 
# you might need to add the above line back in if you run this block more than once
nlp.add_pipe("detextor", after="tagger")           

<function __main__.detextor(doc)>

In [17]:
import os

# Loop through every file name in the folder
for filename in os.listdir("/Users/lucyhorowitz/Documents/GitHub/definition-extraction/textbooks/bct/tex"):
    # Construct the full path to the file
    full_path = os.path.join("/Users/lucyhorowitz/Documents/GitHub/definition-extraction/textbooks/bct/tex", filename)
    with open("/Users/lucyhorowitz/Documents/GitHub/definition-extraction/textbooks/bct/bct_clean.txt","a") as f:
        f.write(clean_everything(open(full_path).read()) + '\n')


AttributeError: 'str' object has no attribute 'remove'

In [77]:
clean = open("/Users/lucyhorowitz/Documents/GitHub/definition-extraction/textbooks/nlab/nlab_clean.txt","a")
clean.write(clean_everything(open("/Users/lucyhorowitz/Documents/GitHub/definition-extraction/textbooks/nlab/nlab_mathgloss.txt","r").read()))

1211881

In [80]:
text = open("/Users/lucyhorowitz/Documents/GitHub/definition-extraction/textbooks/nlab/nlab_clean.txt","r").readlines()
sent_lengths = []
with open("/Users/lucyhorowitz/Documents/GitHub/definition-extraction/textbooks/nlab/nlab.conllu", "a") as f:
        j = 1
        i=1
        for line in text:
            text2 = clean_everything(line)
            doc = nlp(text2)
            print(i)
            for sent in doc.sents:
                doc2 = nlp(sent.text)
                conll = doc2._.conll_str
                sent_lengths.append(len(doc2))
                f.write("# sent_id = " + str(j) + "\n")
                f.write("# text = " + sent.text + "\n")
                f.write(conll + "\n")
                j = j + 1
            i = i+1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15


In [81]:
sents = open("textbooks/nlab/nlab_sents.txt","a")
with open("textbooks/nlab/nlab.conllu","r") as a:
    for line in a.readlines():
        if "# text = " in line:
            sents.write(line[9:])