# Dividing text into sentances using nltk

In [1]:
import nltk
# nltk.download('punkt')
filename = "sherlock_holmes_1.txt"
file = open(filename, "r", encoding="utf-8")
text = file.read()
print(text)
print()

text = text.replace("\n", " ")
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
sentences = tokenizer.tokenize(text)

for (i, sentence) in enumerate(sentences):
    print("Sentence", i + 1, ":", sentence)

To Sherlock Holmes she is always _the_ woman. I have seldom heard him
mention her under any other name. In his eyes she eclipses and
predominates the whole of her sex. It was not that he felt any emotion
akin to love for Irene Adler. All emotions, and that one particularly,
were abhorrent to his cold, precise but admirably balanced mind. He
was, I take it, the most perfect reasoning and observing machine that
the world has seen, but as a lover he would have placed himself in a
false position. He never spoke of the softer passions, save with a gibe
and a sneer. They were admirable things for the observer—excellent for
drawing the veil from men’s motives and actions. But for the trained
reasoner to admit such intrusions into his own delicate and finely
adjusted temperament was to introduce a distracting factor which might
throw a doubt upon all his mental results. Grit in a sensitive
instrument, or a crack in one of his own high-power lenses, would not
be more disturbing than a strong em

# Dividing text into sentances using spacy

In [2]:
import spacy

#!python -m spacy download en_core_web_sm

filename = "sherlock_holmes_1.txt"
file = open(filename, "r", encoding="utf-8")
text = file.read()
print(text)

text = text.replace("\n", " ")
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
sentences = [sentence.text for sentence in doc.sents]
print(sentences)

To Sherlock Holmes she is always _the_ woman. I have seldom heard him
mention her under any other name. In his eyes she eclipses and
predominates the whole of her sex. It was not that he felt any emotion
akin to love for Irene Adler. All emotions, and that one particularly,
were abhorrent to his cold, precise but admirably balanced mind. He
was, I take it, the most perfect reasoning and observing machine that
the world has seen, but as a lover he would have placed himself in a
false position. He never spoke of the softer passions, save with a gibe
and a sneer. They were admirable things for the observer—excellent for
drawing the veil from men’s motives and actions. But for the trained
reasoner to admit such intrusions into his own delicate and finely
adjusted temperament was to introduce a distracting factor which might
throw a doubt upon all his mental results. Grit in a sensitive
instrument, or a crack in one of his own high-power lenses, would not
be more disturbing than a strong em

Dividing sentences into words – tokenization

In [3]:
import nltk

filename = "sherlock_holmes_1.txt"
file = open(filename, "r", encoding="utf-8")
text = file.read()

text = text.replace("\n", " ")
words = nltk.tokenize.word_tokenize(text)
print(words)


tweet = "@EmpireStateBldg Central Park Tower is reaaaally hiiiigh"

words = nltk.tokenize.casual.casual_tokenize(tweet,
                                                preserve_case=True,
                                                reduce_len=True,
                                                strip_handles=True)

print(words)

['To', 'Sherlock', 'Holmes', 'she', 'is', 'always', '_the_', 'woman', '.', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name', '.', 'In', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex', '.', 'It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'Irene', 'Adler', '.', 'All', 'emotions', ',', 'and', 'that', 'one', 'particularly', ',', 'were', 'abhorrent', 'to', 'his', 'cold', ',', 'precise', 'but', 'admirably', 'balanced', 'mind', '.', 'He', 'was', ',', 'I', 'take', 'it', ',', 'the', 'most', 'perfect', 'reasoning', 'and', 'observing', 'machine', 'that', 'the', 'world', 'has', 'seen', ',', 'but', 'as', 'a', 'lover', 'he', 'would', 'have', 'placed', 'himself', 'in', 'a', 'false', 'position', '.', 'He', 'never', 'spoke', 'of', 'the', 'softer', 'passions', ',', 'save', 'with', 'a', 'gibe', 'and', 'a', 'sneer', '.', 'They', 'were', 'admirable', 'things', 'for', 'the', 'observe

Parts of speech tagging

In [4]:
import spacy

filename = "sherlock_holmes_1.txt"
file = open(filename, "r", encoding="utf-8")
text = file.read()

text = text.replace("\n", " ")
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
words = [token.text for token in doc]
pos = [token.pos_ for token in doc]
word_pos_tuples = [(token.text, token.pos_) for token in doc]
print(words)
print(pos)

['To', 'Sherlock', 'Holmes', 'she', 'is', 'always', '_', 'the', '_', 'woman', '.', 'I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name', '.', 'In', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex', '.', 'It', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'Irene', 'Adler', '.', 'All', 'emotions', ',', 'and', 'that', 'one', 'particularly', ',', 'were', 'abhorrent', 'to', 'his', 'cold', ',', 'precise', 'but', 'admirably', 'balanced', 'mind', '.', 'He', 'was', ',', 'I', 'take', 'it', ',', 'the', 'most', 'perfect', 'reasoning', 'and', 'observing', 'machine', 'that', 'the', 'world', 'has', 'seen', ',', 'but', 'as', 'a', 'lover', 'he', 'would', 'have', 'placed', 'himself', 'in', 'a', 'false', 'position', '.', 'He', 'never', 'spoke', 'of', 'the', 'softer', 'passions', ',', 'save', 'with', 'a', 'gibe', 'and', 'a', 'sneer', '.', 'They', 'were', 'admirable', 'things', 'for', 'the', 

Word stemming

In [5]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
words = ['leaf', 'leaves', 'booking', 'writing', 'completed', 'stemming', 'skies']

stemmed_words = [stemmer.stem(word) for word in words]

print(stemmed_words)

['leaf', 'leav', 'book', 'write', 'complet', 'stem', 'sky']


lemmatization

In [6]:
from nltk.stem import WordNetLemmatizer
# import nltk
# nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
words = ['duck', 'geese', 'cats', 'books']

lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

print(lemmatized_words)

print(lemmatizer.lemmatize('loved', 'v')) # part of speech tag: v for verb
print(lemmatizer.lemmatize('worse', 'a')) # part of speech tag: a for adjective

['duck', 'goose', 'cat', 'book']
love
bad


combine similar words

In [7]:
import nltk

nltk.download('averaged_perceptron_tagger')

def pos_tag_nltk(text):
    tokens = nltk.tokenize.word_tokenize(text)
    tagged_tokens = nltk.pos_tag(tokens)
    return tagged_tokens

def read_text_file(filename):
    file = open(filename, "r", encoding="utf-8")
    text = file.read()
    return text

pos_mapping = {'JJ':'a', 'JJR':'a', 'JJS':'a', 'NN':'n',
    'NNS':'n', 'VBD':'v', 'VBG':'v', 'VBN':'v', 'VBP':'v', 'VBZ':'v'}
accepted_pos = {'a', 'v', 'n'}

def lemmatize_long_text(text):
    words = pos_tag_nltk(text)
    words = [(word_tuple[0], pos_mapping[word_tuple[1]] if word_tuple[1] in pos_mapping.keys()
              else word_tuple[0], word_tuple[1]) for word_tuple in words]
    words = [(lemmatizer.lemmatize(word_tuple[0]) if word_tuple[1] in accepted_pos else word_tuple[0], word_tuple[1]) for word_tuple in words]
    return words


text = read_text_file("sherlock_holmes_1.txt")
lem_words = lemmatize_long_text(text)
print(lem_words)

[('To', 'To'), ('Sherlock', 'Sherlock'), ('Holmes', 'Holmes'), ('she', 'she'), ('is', 'v'), ('always', 'always'), ('_the_', 'a'), ('woman', 'n'), ('.', '.'), ('I', 'I'), ('have', 'v'), ('seldom', 'v'), ('heard', 'heard'), ('him', 'him'), ('mention', 'mention'), ('her', 'her'), ('under', 'under'), ('any', 'any'), ('other', 'a'), ('name', 'n'), ('.', '.'), ('In', 'In'), ('his', 'his'), ('eye', 'n'), ('she', 'she'), ('eclipse', 'v'), ('and', 'and'), ('predominates', 'v'), ('the', 'the'), ('whole', 'n'), ('of', 'of'), ('her', 'her'), ('sex', 'n'), ('.', '.'), ('It', 'It'), ('wa', 'v'), ('not', 'not'), ('that', 'that'), ('he', 'he'), ('felt', 'v'), ('any', 'any'), ('emotion', 'n'), ('akin', 'n'), ('to', 'to'), ('love', 'love'), ('for', 'for'), ('Irene', 'Irene'), ('Adler', 'Adler'), ('.', '.'), ('All', 'All'), ('emotion', 'n'), (',', ','), ('and', 'and'), ('that', 'that'), ('one', 'one'), ('particularly', 'particularly'), (',', ','), ('were', 'v'), ('abhorrent', 'a'), ('to', 'to'), ('his', 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/a/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Removing stop words

In [8]:
import csv
import nltk
# nltk.download('stopwords')

csv_file = "stopwords.csv"

with open(csv_file, "r") as f:
    reader = csv.reader(f)
    stopwords = [row[0] for row in reader]
    
# stopwords = nltk.corpus.stopwords.words("english")
# print(stopwords)

filename = "sherlock_holmes_1.txt"
file = open(filename, "r", encoding="utf-8")
text = file.read()

text = text.replace("\n", " ")
words = nltk.tokenize.word_tokenize(text)

words = [word for word in words if word.lower() not in stopwords]
print(words)

['Sherlock', 'Holmes', '_the_', 'woman', '.', 'seldom', 'heard', 'mention', 'name', '.', 'eyes', 'eclipses', 'predominates', 'whole', 'sex', '.', 'felt', 'emotion', 'akin', 'love', 'Irene', 'Adler', '.', 'emotions', ',', ',', 'abhorrent', 'cold', ',', 'precise', 'admirably', 'balanced', 'mind', '.', ',', 'take', ',', 'perfect', 'reasoning', 'observing', 'machine', 'world', ',', 'lover', 'placed', 'false', 'position', '.', 'spoke', 'softer', 'passions', ',', 'save', 'gibe', 'sneer', '.', 'admirable', 'observer—excellent', 'drawing', 'veil', 'men', '’', 'motives', 'actions', '.', 'trained', 'reasoner', 'admit', 'intrusions', 'own', 'delicate', 'finely', 'adjusted', 'temperament', 'introduce', 'distracting', 'factor', 'throw', 'doubt', 'mental', 'results', '.', 'Grit', 'sensitive', 'instrument', ',', 'crack', 'own', 'high-power', 'lenses', ',', 'disturbing', 'strong', 'emotion', 'nature', '.', 'woman', ',', 'woman', 'late', 'Irene', 'Adler', ',', 'dubious', 'questionable', 'memory', '.']


Removing stopwords - FreqDist

In [9]:
import nltk
from nltk.probability import FreqDist

filename = "sherlock_holmes_1.txt"
file = open(filename, "r", encoding="utf-8")
text = file.read()

text = text.replace("\n", " ")

words = nltk.tokenize.word_tokenize(text)

freq_dist = FreqDist(word.lower() for word in words)

print(freq_dist.most_common(10))

words_with_frequencies = [(word, freq_dist[word]) for word in freq_dist.keys()]

print(words_with_frequencies)

sorted_words = sorted(words_with_frequencies, key=lambda x: x[1])

print(sorted_words)

# Option 1: use the n most frequent words as stopwords
stopwords = [word[0] for word in sorted_words if word[1] > 10]

print(stopwords)

# Option 2: use the n% most frequent words as stopwords
length_cutoff = int(0.02 * len(sorted_words))
stopwords = [tuple[0] for tuple in sorted_words[-length_cutoff:]]
print(stopwords)

[('.', 11), (',', 11), ('a', 10), ('and', 9), ('the', 8), ('to', 6), ('his', 6), ('in', 5), ('was', 5), ('of', 4)]
[('to', 6), ('sherlock', 1), ('holmes', 1), ('she', 2), ('is', 1), ('always', 1), ('_the_', 1), ('woman', 3), ('.', 11), ('i', 2), ('have', 2), ('seldom', 1), ('heard', 1), ('him', 2), ('mention', 1), ('her', 2), ('under', 1), ('any', 2), ('other', 1), ('name', 1), ('in', 5), ('his', 6), ('eyes', 1), ('eclipses', 1), ('and', 9), ('predominates', 1), ('the', 8), ('whole', 1), ('of', 4), ('sex', 1), ('it', 2), ('was', 5), ('not', 2), ('that', 4), ('he', 4), ('felt', 1), ('emotion', 2), ('akin', 1), ('love', 1), ('for', 4), ('irene', 2), ('adler', 2), ('all', 2), ('emotions', 1), (',', 11), ('one', 3), ('particularly', 1), ('were', 2), ('abhorrent', 1), ('cold', 1), ('precise', 1), ('but', 4), ('admirably', 1), ('balanced', 1), ('mind', 1), ('take', 1), ('most', 1), ('perfect', 1), ('reasoning', 1), ('observing', 1), ('machine', 1), ('world', 1), ('has', 1), ('seen', 1), ('as

Counting nouns – plural and singular nouns

In [10]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import inflect
from nltk import pos_tag

# nltk.download('averaged_perceptron_tagger')

file = open("sherlock_holmes_1.txt", "r", encoding="utf-8")
text = file.read()

text = text.replace("\n", " ")
words_with_pos = pos_tag(word_tokenize(text))

print(words_with_pos)

def get_nouns(words_with_pos):
    nouns_set = ['NN', 'NNS']
    nouns = [word for word in words_with_pos if word[1] in nouns_set]
    return nouns

def is_plural_nltk(noun_info):
    pos = noun_info[1]
    if pos == 'NNS':
        return True
    else:
        return False

def is_plural_wn(noun):
    wnl = WordNetLemmatizer()
    lemma = wnl.lemmatize(noun, 'n')
    plural = True if noun is not lemma else False
    
    return plural

def get_plural(singular_noun):
    p = inflect.engine()
    plural = p.plural(singular_noun)
    return plural

def get_singular(plural_noun):
    p = inflect.engine()
    singular = p.singular_noun(plural_noun)
    return singular
    

[('To', 'TO'), ('Sherlock', 'NNP'), ('Holmes', 'NNP'), ('she', 'PRP'), ('is', 'VBZ'), ('always', 'RB'), ('_the_', 'JJ'), ('woman', 'NN'), ('.', '.'), ('I', 'PRP'), ('have', 'VBP'), ('seldom', 'VBN'), ('heard', 'RB'), ('him', 'PRP'), ('mention', 'VB'), ('her', 'PRP'), ('under', 'IN'), ('any', 'DT'), ('other', 'JJ'), ('name', 'NN'), ('.', '.'), ('In', 'IN'), ('his', 'PRP$'), ('eyes', 'NNS'), ('she', 'PRP'), ('eclipses', 'VBZ'), ('and', 'CC'), ('predominates', 'VBZ'), ('the', 'DT'), ('whole', 'NN'), ('of', 'IN'), ('her', 'PRP$'), ('sex', 'NN'), ('.', '.'), ('It', 'PRP'), ('was', 'VBD'), ('not', 'RB'), ('that', 'IN'), ('he', 'PRP'), ('felt', 'VBD'), ('any', 'DT'), ('emotion', 'NN'), ('akin', 'NN'), ('to', 'TO'), ('love', 'VB'), ('for', 'IN'), ('Irene', 'NNP'), ('Adler', 'NNP'), ('.', '.'), ('All', 'DT'), ('emotions', 'NNS'), (',', ','), ('and', 'CC'), ('that', 'IN'), ('one', 'CD'), ('particularly', 'RB'), (',', ','), ('were', 'VBD'), ('abhorrent', 'JJ'), ('to', 'TO'), ('his', 'PRP$'), ('co

Get dependency parse

In [11]:
import spacy

sentence = 'I have seldom heard him mention her under any other name.'

nlp = spacy.load("en_core_web_sm")

doc = nlp(sentence)

# for token in doc:
#     print(token.text, '\t', token.dep_, "\t", spacy.explain(token.dep_))
    
    
# for token in doc:
#     print(token.text)
#     ancestors = [t.text for t in token.ancestors]
#     print(ancestors)
    
# for token in doc:
#     print(token.text)
#     children = [t.text for t in token.children]
#     print(children)
    
# for token in doc:
#     print(token.text)
#     subtree = [t.text for t in token.subtree]
#     print(subtree)

Splitting sentences into clauses

In [12]:
import spacy

nlp = spacy.load("en_core_web_sm")
sentence = "He eats cheese, but he won't eat ice cream."
doc = nlp(sentence)

for token in doc:
    ancestors = [t.text for t in token.ancestors]
    children = [t.text for t in token.children]
    print(token.text, "\t", token.i, "\t", token.pos_, "\t", token.dep_, "\t", ancestors, "\t", children)

def find_root_of_sentence(doc):
    root_token = None
    for token in doc:
        if token.dep_ == 'ROOT':
            root_token = token
            break
    return root_token

root_token = find_root_of_sentence(doc)

def find_other_verbs(doc, root_token):
    other_verbs = []
    for token in doc:
        ancestors = list(token.ancestors)
        if (token.pos_ == 'VERB' and len(ancestors) == 1 and ancestors[0] == root_token):
            other_verbs.append(token)
    return other_verbs

other_verbs = find_other_verbs(doc, root_token)

def get_clause_token_span_for_verb(verb, doc, all_verbs):
    fist_token_index = len(doc)
    last_token_index = 0
    this_verb_children = list(verb.children)
    for child in this_verb_children:
        if (child not in all_verbs):
            if (child.i < fist_token_index):
                fist_token_index = child.i
            if (child.i > last_token_index):
                last_token_index = child.i
    
    return (fist_token_index, last_token_index)

token_spans = []
all_verbs = [root_token] + other_verbs
for other_verb in all_verbs:
    (first_token_index, last_token_index) = get_clause_token_span_for_verb(other_verb, doc, all_verbs)
    token_spans.append((first_token_index, last_token_index))
    
sentence_clauses = []
for token_span in token_spans:
    start = token_span[0]
    end = token_span[1]
    if (start < end):
        clause = doc[start:end]
        sentence_clauses.append(clause)

sentence_clauses = sorted(sentence_clauses, key=lambda tup: tup[0])

causes_text = [clause.text for clause in sentence_clauses]

print(causes_text)
           
    

He 	 0 	 PRON 	 nsubj 	 ['eats'] 	 []
eats 	 1 	 VERB 	 ROOT 	 [] 	 ['He', 'cheese', ',', 'but', 'eat']
cheese 	 2 	 NOUN 	 dobj 	 ['eats'] 	 []
, 	 3 	 PUNCT 	 punct 	 ['eats'] 	 []
but 	 4 	 CCONJ 	 cc 	 ['eats'] 	 []
he 	 5 	 PRON 	 nsubj 	 ['eat', 'eats'] 	 []
wo 	 6 	 AUX 	 aux 	 ['eat', 'eats'] 	 []
n't 	 7 	 PART 	 neg 	 ['eat', 'eats'] 	 []
eat 	 8 	 VERB 	 conj 	 ['eats'] 	 ['he', 'wo', "n't", 'cream', '.']
ice 	 9 	 NOUN 	 compound 	 ['cream', 'eat', 'eats'] 	 []
cream 	 10 	 NOUN 	 dobj 	 ['eat', 'eats'] 	 ['ice']
. 	 11 	 PUNCT 	 punct 	 ['eat', 'eats'] 	 []
['He eats cheese,', "he won't eat ice cream"]
