## Using NLTK

In [1]:
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist

In [2]:
# Path of the plain-text file to analyse, relative to the working directory.
fileName = 'ARMAGEDDON.txt'

# Read the whole file into memory as one string. The context manager closes
# the handle even if read() raises, which the manual open/close pair did not.
with open(fileName, "r", encoding="UTF-8") as file:
    text = file.read()

In [3]:
# Flatten the text onto a single line: every newline becomes one space.
text = " ".join(text.split("\n"))

In [4]:
# Split the text into sentences with NLTK's English Punkt model.
# sent_tokenize is the public entry point and locates the model itself, so
# this cell no longer depends on the hard-coded pickle resource path.
# NOTE(review): recent NLTK releases reportedly no longer load the punkt
# pickle directly - confirm against the installed version.
sentences = nltk.tokenize.sent_tokenize(text, language="english")

In [5]:
# Tokenize the whole text into a flat list of word-level tokens.
words = nltk.tokenize.word_tokenize(text)

In [6]:
# Run NLTK's part-of-speech tagger over the token list.
words_with_pos = nltk.pos_tag(words)

In [7]:
# Snowball stemmer configured for English.
stemmer = SnowballStemmer('english')

In [8]:
# Small hand-picked sample used to compare stemming and lemmatization.
# NOTE: this rebinds `words`, discarding the tokens produced from the text.
words = ['leaf', 'leaves', 'booking', 'writing', 'completed', 'stemming', 'skies']

In [9]:
# Stem every sample word, preserving the input order.
stemmed_words = list(map(stemmer.stem, words))

In [10]:
# WordNet-backed lemmatizer created with its default settings.
lemmatizer = WordNetLemmatizer()

In [11]:
# Lemmatize every sample word with the lemmatizer's default arguments,
# preserving the input order.
lemmatized_words = list(map(lemmatizer.lemmatize, words))

In [12]:
# English stop-word list from the NLTK corpus.
# NOTE(review): not referenced again in the visible cells - confirm it is
# used further on, or apply it when counting word frequencies.
stopwords = nltk.corpus.stopwords.words('english')

In [13]:
# Re-tokenize the text: `words` was rebound to the stemming sample list
# earlier, so the document tokens have to be rebuilt before counting.
words = nltk.tokenize.word_tokenize(text)

In [14]:
# Count case-insensitively by lower-casing every token before tallying.
freq_dist = FreqDist(map(str.lower, words))

In [15]:
# Bare expression: in a notebook this displays the distribution (see the
# captured output below, which also counts punctuation tokens).
freq_dist

FreqDist({'the': 23, 'and': 15, '.': 14, ',': 13, 'of': 10, 'a': 9, 'they': 7, 'to': 7, 'alan': 6, 'he': 6, ...})

In [16]:
# Materialize the distribution as a list of (word, count) tuples.
# items() yields each key with its count in one pass, replacing the
# keys()-then-index lookup while producing the same pairs in the same order.
words_with_frequencies = list(freq_dist.items())

In [17]:
# Order the (word, count) pairs by count, ascending: least frequent first.
# Work on a copy so words_with_frequencies keeps its original order.
sorted_words = list(words_with_frequencies)
sorted_words.sort(key=lambda entry: entry[1])

## Using spaCy

In [18]:
import spacy

In [19]:
# Path of the plain-text file to analyse, relative to the working directory.
fileName = 'ARMAGEDDON.txt'

# Read the whole file into memory as one string. The context manager closes
# the handle even if read() raises, which the manual open/close pair did not.
with open(fileName, "r", encoding="UTF-8") as file:
    text = file.read()

In [20]:
# Flatten the text onto a single line: every newline becomes one space.
text = " ".join(text.split("\n"))

In [21]:
# Load the spaCy pipeline named "en_core_web_sm" and run it over the text.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Collect the plain-string text of every sentence span the pipeline found.
sentences = [span.text for span in doc.sents]

In [22]:
# Surface text of every token in the parsed document, in document order.
words = [tok.text for tok in doc]

In [23]:
# Part-of-speech label (the string-valued `pos_` attribute) of every token,
# in the same order as the token list.
pos = [tok.pos_ for tok in doc]

In [24]:
# Pair each token's text with its part-of-speech label, position by position.
words_pos_tuples = [(word, tag) for word, tag in zip(words, pos)]