In [19]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
# only need to do these one time
# nltk.download("punkt")
# nltk.download("stopwords")

In [17]:
with open("preprocessed/lexi.txt", "r") as file:
    # returns one string with all of the text
    raw_text = file.read()

# now we need to do some pre-processing (more can be done after tokenizing)

# this removes any instance of [<number>] as some papers use this as a way of citing
clean_text = re.sub(r"\[\d+\]", "", raw_text)

# removes any instance of an in-text citation following any of these formats:
# (Smith & Johnson, 2019), (Smith, 2019), (Smith et al., 2019), (Smith & Johnson, 2019; James 2019)

# this still leaves in ones that have extra text before the ciation, like:
# (e.g., Smith et al., 2019). Only one out of the 8 preprocessed papers I looked at had extra citations like these
# so I'm sure it shouldn't affect the outcome too much
clean_text = re.sub(r"\((?:(?:[\w \.&]+\, )+[0-9]{4}[;|:]*\s*)+\)", "", clean_text)

# removes any URLs
clean_text = re.sub(r"http\S+", "", clean_text)

clean_text = re.sub(" +", " ", clean_text)
    
# this turns my chunk of text into a list of sentences
# some of the sentences aren't quite right, as I think it counts a sentence as words in between two
# sets of punctuation, so if some text got extracted that wasn't supposed to be there, it may get
# squished into a sentence
# I think that's just the nature of using a ML approach, it's not always going to be 100% accurate
sentences = sent_tokenize(clean_text)
print(sentences)

["Joachim Bingel' Gustavo H. Paetzold?", 'Ande Text simplification is a diverse task, or perhaps rather a family of tasks, with a number of different target audiences that different papers and research projects have focused on.', 'Among the most prominent target audiences are foreign language learners, for whom various approaches to simplifying text have been pursued, often focusing on lexical but also sentence-level simplification .', 'Other notable groups that have been specifically targeted in text simplification research include dyslexics , and the aphasic , for whom particularly long words and sentences, but also certain surface forms such as specific character combinations, may pose difficulties.', 'People on the autism spectrum have also been addressed, with the focus lying on reducing the amount of figurative expressions in a text or reducing syntactic complexity .', 'Reading beginners (both children and adults) are another group with very particular needs, and text simplificat

In [21]:
# remove punctuation and make all letters lowercase
clean_sentences = [re.sub(r'[^\w\s]','',sentence.lower()) for sentence in sentences]

stop_words = stopwords.words('english')

# Removes stop words (using the list of stop words from NLTK) and returns
# A list of lists, with each list containing the words in each sentence
sentence_tokens = [[words for words in word_tokenize(sentence) if words not in 
                    stop_words] for sentence in clean_sentences]
print(sentence_tokens)

[['joachim', 'bingel', 'gustavo', 'h', 'paetzold'], ['ande', 'text', 'simplification', 'diverse', 'task', 'perhaps', 'rather', 'family', 'tasks', 'number', 'different', 'target', 'audiences', 'different', 'papers', 'research', 'projects', 'focused'], ['among', 'prominent', 'target', 'audiences', 'foreign', 'language', 'learners', 'various', 'approaches', 'simplifying', 'text', 'pursued', 'often', 'focusing', 'lexical', 'also', 'sentencelevel', 'simplification'], ['notable', 'groups', 'specifically', 'targeted', 'text', 'simplification', 'research', 'include', 'dyslexics', 'aphasic', 'particularly', 'long', 'words', 'sentences', 'also', 'certain', 'surface', 'forms', 'specific', 'character', 'combinations', 'may', 'pose', 'difficulties'], ['people', 'autism', 'spectrum', 'also', 'addressed', 'focus', 'lying', 'reducing', 'amount', 'figurative', 'expressions', 'text', 'reducing', 'syntactic', 'complexity'], ['reading', 'beginners', 'children', 'adults', 'another', 'group', 'particular', 