In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import re
# only need to do these one time
# nltk.download("punkt")
# nltk.download("stopwords")

In [23]:
with open("preprocessed/sleep.txt", "r") as file:
    # returns one string with all of the text
    raw_text = file.read()
file.close()
# now we need to do some pre-processing (more can be done after tokenizing)

# this removes any instance of [<number>] as some papers use this as a way of citing
clean_text = re.sub(r"\[\d+\]", "", raw_text)

# removes any instance of an in-text citation following any of these formats:
# (Smith & Johnson, 2019), (Smith, 2019), (Smith et al., 2019), (Smith & Johnson, 2019; James, 2019)

# this still leaves in ones that have extra text before the ciation, like:
# (e.g., Smith et al., 2019). Only one out of the 8 preprocessed papers I looked at had extra citations like these
# so I'm sure it shouldn't affect the outcome too much
clean_text = re.sub(r"\((?:(?:[\w \.&]+\, )+[0-9]{4}[;|:]*\s*)+\)", "", clean_text)

# need to remove instances of citations within sentences (e.g "Smith et al. (2018) said that....")
# as these cause the sentences to get split up where they aren't supposed to
clean_text = re.sub("(et al.)", "et al", clean_text)

# also need to remove numbers, includes decimals
clean_text = re.sub(r"\d+\.*", " ", clean_text)

# removes any URLs
clean_text = re.sub(r"http\S+", "", clean_text)
clean_text = re.sub(r"www\.\S+", "", clean_text)

# removes any additional white space (e.g: "I like      cats   .") 
clean_text = re.sub(" +", " ", clean_text)
    
# this turns my chunk of text into a list of sentences
# some of the sentences aren't quite right, as I think it counts a sentence as words in between two
# sets of punctuation, so if some text got extracted that wasn't supposed to be there, it may get
# squished into a sentence
# I think that's just the nature of using a ML approach, it's not always going to be 100% accurate
sentences = sent_tokenize(clean_text)
#print(sentences)

#raw_sentences = sent_tokenize(raw_text)


In [9]:
# remove punctuation and make all letters lowercase
clean_sentences = [re.sub(r'[^\w\s]','',sentence.lower()) for sentence in sentences]

stop_words = stopwords.words('english')

# Removes stop words (using the list of stop words from NLTK) and returns
# A list of lists, with each list containing the words in each sentence
sentence_tokens = [[words for words in word_tokenize(sentence) if words not in 
                    stop_words] for sentence in clean_sentences]
print(sentence_tokens)

[['pransh', 'khemka', 'puja', 'dhanuka', 'radhika', 'bhutta', 'raghav', 'narang', 'raj', 'jakharia', 'narsee', 'monje', 'school', 'management', 'studies', 'mumbai', 'anil', 'surendra', 'school', 'commerce', 'mumbai', 'maharashtra', 'daytime', 'tiredness', 'unpredictable', 'sleep', 'schedules', 'lack', 'sleep', 'exceptionally', 'predominant', 'among', 'school', 'college', 'understudies'], ['outcomes', 'lack', 'sleep', 'daytime', 'sluggishness', 'particularly', 'risky', 'grads', 'may', 'end', 'lower', 'grades', 'expanded', 'danger', 'terrible', 'academic', 'performance', 'traded', 'learning', 'mood', 'swings', 'expanded', 'danger', 'liquor', 'drugs'], ['paper', 'surveys', 'situation', 'lack', 'sleep', 'among', 'college', 'understudies', 'contributing', 'factors', 'bring', 'lack', 'sleep', 'manner', 'significance', 'sleep', 'better', 'learning', 'memory'], ['paper', 'proposes', 'tending', 'sleep', 'issues', 'arent', 'frequently', 'considered', 'hazard', 'factor', 'depression', 'academic',

In [33]:
model = Word2Vec(sentence_tokens)
