In [22]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import re
# only need to do these one time
# nltk.download("punkt")
# nltk.download("stopwords")

In [34]:
# Load the extracted paper text as a single string. The `with` block closes
# the file on exit, so no explicit close() call is needed.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm the file's encoding and pass it explicitly if portability matters.
with open("preprocessed/textrank.txt", "r") as file:
    raw_text = file.read()

# ---- Pre-processing (more can be done after tokenizing) ----

# Remove numeric bracket citations such as "[12]", which some papers use.
clean_text = re.sub(r"\[\d+\]", "", raw_text)

# Remove parenthesised author-year citations in any of these formats:
#   (Smith & Johnson, 2019), (Smith, 2019), (Smith et al., 2019),
#   (Smith & Johnson, 2019; James, 2019)
# Citations with extra text before the author list, e.g.
# "(e.g., Smith et al., 2019)", are NOT removed. Only one of the 8
# preprocessed papers inspected contained such citations, so the effect on the
# outcome should be small.
# NOTE(review): inside the character class `[;|:]` the `|` is a literal pipe,
# not alternation — harmless, but probably not what was intended.
clean_text = re.sub(r"\((?:(?:[\w \.&]+\, )+[0-9]{4}[;|:]*\s*)+\)", "", clean_text)

# TODO: remove in-sentence citations (e.g. "Smith et al. (2018) said that ..."),
# since they cause sentences to be split where they should not be.
# TODO: remove numbers.

# Remove URLs: any run of non-whitespace characters starting with "http".
clean_text = re.sub(r"http\S+", "", clean_text)

# Collapse runs of spaces into one space (e.g. "I like      cats   .").
# Only the space character is affected; tabs and newlines are left alone.
clean_text = re.sub(" +", " ", clean_text)

# Split the cleaned text into a list of sentences.
# Some sentences come out imperfect: text that was extracted by mistake can
# end up merged into a neighbouring sentence. That is accepted here as a
# limitation of a trained tokenizer — it will not be 100% accurate.
sentences = sent_tokenize(clean_text)
print(sentences)

['Rada Mihalcea and Paul Tarau Department of Computer Science University of North Texas {rada,tarau} @cs.unt.edu Rada Mihalcea and Paul Tarau In this paper, we introduce TextRank — a graph-based ranking model for text processing, and show how this model can be successfully used in natural language applications.', 'In particular, we propose two innovative unsupervised methods for keyword and sentence extraction, and show that the results obtained compare favorably with previously published results on established benchmarks.', 'Graph-based ranking algorithms like Kleinberg’s HITS algorithm or Google’s PageRank have been successfully used in citation analysis, social networks, and the analysis of the link-structure of the World Wide Web.', 'Arguably, these algorithms can be singled out as key elements of the paradigm-shift triggered in the field of Web search technology, by providing a Web page ranking mechanism that relies on the collective knowledge of Web architects rather than individ

In [27]:
# remove punctuation and make all letters lowercase
# Lowercase every sentence and delete each character that is neither a word
# character nor whitespace. This drops punctuation without inserting a space,
# so hyphenated words are joined ("graph-based" -> "graphbased").
clean_sentences = [re.sub(r'[^\w\s]','',sentence.lower()) for sentence in sentences]

# NLTK's English stop-word list. Kept as a list under its original name;
# the set below is used for membership tests so each lookup is O(1) instead
# of a linear scan of the list for every token.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)

# NOTE(review): punctuation is stripped before this filter runs, so a
# contraction such as "don't" arrives here as "dont" and will only be removed
# if that exact form is in the stop list — confirm whether that matters.

# Tokenize each cleaned sentence and drop stop words. The result is a list of
# lists: one inner list of remaining words per sentence.
sentence_tokens = [
    [word for word in word_tokenize(sentence) if word not in stop_word_set]
    for sentence in clean_sentences
]
print(sentence_tokens)

[['rada', 'mihalcea', 'paul', 'tarau', 'department', 'computer', 'science', 'university', 'north', 'texas', 'radatarau', 'csuntedu', 'rada', 'mihalcea', 'paul', 'tarau', 'paper', 'introduce', 'textrank', 'graphbased', 'ranking', 'model', 'text', 'processing', 'show', 'model', 'successfully', 'used', 'natural', 'language', 'applications'], ['particular', 'propose', 'two', 'innovative', 'unsupervised', 'methods', 'keyword', 'sentence', 'extraction', 'show', 'results', 'obtained', 'compare', 'favorably', 'previously', 'published', 'results', 'established', 'benchmarks'], ['graphbased', 'ranking', 'algorithms', 'like', 'kleinbergs', 'hits', 'algorithm', 'googles', 'pagerank', 'successfully', 'used', 'citation', 'analysis', 'social', 'networks', 'analysis', 'linkstructure', 'world', 'wide', 'web'], ['arguably', 'algorithms', 'singled', 'key', 'elements', 'paradigmshift', 'triggered', 'field', 'web', 'search', 'technology', 'providing', 'web', 'page', 'ranking', 'mechanism', 'relies', 'colle

In [33]:
# Build a Word2Vec model from the per-sentence token lists produced above.
# No hyperparameters are passed, so every setting is the gensim default.
model = Word2Vec(sentences=sentence_tokens)
