# Text Preprocessing

In [9]:
text = "So many squids are jumping out of suitcases these days that you can barely go anywhere without seeing one burst forth from a tightly packed valise. I went to the dentist the other day, and sure enough I saw an angry one jump out of my dentist's bag within minutes of arriving. She hardly even noticed."
print(text)

So many squids are jumping out of suitcases these days that you can barely go anywhere without seeing one burst forth from a tightly packed valise. I went to the dentist the other day, and sure enough I saw an angry one jump out of my dentist's bag within minutes of arriving. She hardly even noticed.


In [10]:
# Removing punctuation
import re
cleaned = re.sub('\W+', ' ', text)
print(cleaned)

So many squids are jumping out of suitcases these days that you can barely go anywhere without seeing one burst forth from a tightly packed valise I went to the dentist the other day and sure enough I saw an angry one jump out of my dentist s bag within minutes of arriving She hardly even noticed 


In [11]:
# Tokenization = Breaking text into individual words
import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize
tokenized = word_tokenize(cleaned)
print(tokenized)

['So', 'many', 'squids', 'are', 'jumping', 'out', 'of', 'suitcases', 'these', 'days', 'that', 'you', 'can', 'barely', 'go', 'anywhere', 'without', 'seeing', 'one', 'burst', 'forth', 'from', 'a', 'tightly', 'packed', 'valise', 'I', 'went', 'to', 'the', 'dentist', 'the', 'other', 'day', 'and', 'sure', 'enough', 'I', 'saw', 'an', 'angry', 'one', 'jump', 'out', 'of', 'my', 'dentist', 's', 'bag', 'within', 'minutes', 'of', 'arriving', 'She', 'hardly', 'even', 'noticed']


In [12]:
# Stemming = Cop off word prefixes and suffixes ('singing' becomes 'sing')
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmed = [stemmer.stem(token) for token in tokenized]
print(stemmed)

['So', 'mani', 'squid', 'are', 'jump', 'out', 'of', 'suitcas', 'these', 'day', 'that', 'you', 'can', 'bare', 'go', 'anywher', 'without', 'see', 'one', 'burst', 'forth', 'from', 'a', 'tightli', 'pack', 'valis', 'I', 'went', 'to', 'the', 'dentist', 'the', 'other', 'day', 'and', 'sure', 'enough', 'I', 'saw', 'an', 'angri', 'one', 'jump', 'out', 'of', 'my', 'dentist', 's', 'bag', 'within', 'minut', 'of', 'arriv', 'she', 'hardli', 'even', 'notic']


After stemming the words 'go' and 'went' are identified as different words. Also, what`s up with 'mani' and 'hardli'? A lemmatizer will fix these issues.

In [13]:
# Lemmatization = Bring words down to their root forms ('are' becomes 'be')
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(token) for token in tokenized]
print(lemmatized)

['So', 'many', 'squid', 'are', 'jumping', 'out', 'of', 'suitcase', 'these', 'day', 'that', 'you', 'can', 'barely', 'go', 'anywhere', 'without', 'seeing', 'one', 'burst', 'forth', 'from', 'a', 'tightly', 'packed', 'valise', 'I', 'went', 'to', 'the', 'dentist', 'the', 'other', 'day', 'and', 'sure', 'enough', 'I', 'saw', 'an', 'angry', 'one', 'jump', 'out', 'of', 'my', 'dentist', 's', 'bag', 'within', 'minute', 'of', 'arriving', 'She', 'hardly', 'even', 'noticed']


Why are after the lemmatization still some verbs like 'went' conjugated? 'part_of_speech'-package will solve that issue! This will tell the lemmatizer what part of speech the word is.

In [14]:
# Define get_part_of_speech
from nltk.corpus import wordnet
from collections import Counter
def get_part_of_speech(word):
  probable_part_of_speech = wordnet.synsets(word)
  pos_counts = Counter()
  pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
  pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
  pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
  pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  
  most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
  return most_likely_part_of_speech

# Apply get_part_of_speech
# from part_of_speech import get_part_of_speech
lemmatized = [lemmatizer.lemmatize(get_part_of_speech(token)) for token in tokenized]
print(lemmatized)

['r', 'a', 'n', 'v', 'v', 'v', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'r', 'v', 'r', 'n', 'v', 'n', 'v', 'r', 'n', 'n', 'r', 'v', 'n', 'n', 'v', 'n', 'n', 'n', 'n', 'a', 'n', 'n', 'a', 'n', 'n', 'v', 'n', 'a', 'n', 'v', 'v', 'n', 'n', 'n', 'n', 'n', 'r', 'n', 'n', 'v', 'n', 'r', 'r', 'v']


# Parsing Text
Parsing is a stage of NLP concerned with segmenting text based on syntax.
<br>
<br>
__Part-of-speech tagging (POS tagging)__
<br>
Identifies parts of speech as verbs, nouns or adjectives.
<br>
<br>
__Named entity recognition (NER)__
<br>
Identifies proper nouns: E.g. 'Natalia' vs. 'Berlin'.
<br>
<br>
__Dependency grammar__
<br>
Identifies relationship between words in a sentence.
<br>
<br>
__Regex parsing__
<br>
Combined with POS tagging one can identify specific phrase chunks.

In [6]:
text = "So many squids are jumping out of suitcases these days. You can barely go anywhere without seeing one. I went to the dentist the other day. Sure enough, I saw an angry one jump out of my dentist's bag. She hardly even noticed."
print(text)

So many squids are jumping out of suitcases these days. You can barely go anywhere without seeing one. I went to the dentist the other day. Sure enough, I saw an angry one jump out of my dentist's bag. She hardly even noticed.


In [7]:
# Make dependency parsing on the text
import spacy
dependency_parser = spacy.load('en_core_web_sm')
parsed_text = dependency_parser(text)
print(parsed_text)

So many squids are jumping out of suitcases these days. You can barely go anywhere without seeing one. I went to the dentist the other day. Sure enough, I saw an angry one jump out of my dentist's bag. She hardly even noticed.


In [10]:
# Depict dependencies
from nltk import Tree

def to_nltk_tree(node):
  if node.n_lefts + node.n_rights > 0:
    parsed_child_nodes = [to_nltk_tree(child) for child in node.children]
    return Tree(node.orth_, parsed_child_nodes)
  else:
    return node.orth_

for sent in parsed_text.sents:
    to_nltk_tree(sent.root).pretty_print()

jumping                
  _________|________________    
 |   |   squids    out      |  
 |   |     |        |       |   
 |   |    many      of     days
 |   |     |        |       |   
are  .     So   suitcases these

          go                       
  ________|____________________     
 |   |    |       |      |  without
 |   |    |       |      |     |    
 |   |    |       |      |   seeing
 |   |    |       |      |     |    
You can barely anywhere  .    one  

          went               
  _________|_________         
 |   |     to        |       
 |   |     |         |        
 |   |  dentist     day      
 |   |     |      ___|____    
 I   .    the   the     other

                   saw                           
  __________________|_________                    
 |   |   |    |              jump                
 |   |   |    |      _________|__________         
 |   |   |    |     |    |    |         out      
 |   |   |    |     |    |    |          |        
 |   | 

# Language Models - Bag-of-Words Approach
One can help computers make predictions about a language by training a language model on a corpus.
<br>
<br>
__Language Models__ are probabilistic models of a language. Model gets use to figure out the liklehood that a given sound, letter, word, or phrase will be used.
<br>
<br>
One of the most common (statistical) language models is known as __bag-of-words__. Bag-of-words does not have an order but a tally count of each instance for each word.
<br>
<br>
Bag-of-words can be a excellent way of looking at language when one wants to make prediction concerning topic or sentiment of a text. When grammar and word orderare irrelevant, this is probably a good model to use.

In [22]:
looking_glass_text = """
 However, the egg only got larger and larger, and more and more human: when she had come within a few yards of it, she saw that it had eyes and a nose and mouth; and when she had come close to it, she saw clearly that it was HUMPTY DUMPTY himself. It cant be anybody else! she said to herself. Im as certain of it, as if his name were written all over his face.

It might have been written a hundred times, easily, on that enormous face. Humpty Dumpty was sitting with his legs crossed, like a Turk, on the top of a high wallsuch a narrow one that Alice quite wondered how he could keep his balanceand, as his eyes were steadily fixed in the opposite direction, and he didnt take the least notice of her, she thought he must be a stuffed figure after all.

And how exactly like an egg he is! she said aloud, standing with her hands ready to catch him, for she was every moment expecting him to fall.

Its very provoking, Humpty Dumpty said after a long silence, looking away from Alice as he spoke, to be called an eggVery!

I said you looked like an egg, Sir, Alice gently explained. And some eggs are very pretty, you know she added, hoping to turn her remark into a sort of a compliment.

Some people, said Humpty Dumpty, looking away from her as usual, have no more sense than a baby!

Alice didnt know what to say to this: it wasnt at all like conversation, she thought, as he never said anything to her; in fact, his last remark was evidently addressed to a treeso she stood and softly repeated to herself:

     Humpty Dumpty sat on a wall:
     Humpty Dumpty had a great fall.
     All the Kings horses and all the Kings men
     Couldnt put Humpty Dumpty in his place again.

That last line is much too long for the poetry, she added, almost out loud, forgetting that Humpty Dumpty would hear her.

Dont stand there chattering to yourself like that, Humpty Dumpty said, looking at her for the first time, but tell me your name and your business.

My name is Alice, but

Its a stupid enough name! Humpty Dumpty interrupted impatiently. What does it mean?

Must a name mean something? Alice asked doubtfully.

Of course it must, Humpty Dumpty said with a short laugh: my name means the shape I amand a good handsome shape it is, too. With a name like yours, you might be any shape, almost.

Why do you sit out here all alone? said Alice, not wishing to begin an argument.

Why, because theres nobody with me! cried Humpty Dumpty. Did you think I didnt know the answer to that? Ask another.

Dont you think youd be safer down on the ground? Alice went on, not with any idea of making another riddle, but simply in her good-natured anxiety for the queer creature. That wall is so very narrow!

What tremendously easy riddles you ask! Humpty Dumpty growled out. Of course I dont think so! Why, if ever I did fall offwhich theres no chance ofbut if I did Here he pursed his lips and looked so solemn and grand that Alice could hardly help laughing. If I did fall, he went on, The King has promised mewith his very own mouthtoto

To send all his horses and all his men, Alice interrupted, rather unwisely.

Now I declare thats too bad! Humpty Dumpty cried, breaking into a sudden passion. Youve been listening at doorsand behind treesand down chimneysor you couldnt have known it!

I havent, indeed! Alice said very gently. Its in a book.

Ah, well! They may write such things in a book, Humpty Dumpty said in a calmer tone. Thats what you call a History of England, that is. Now, take a good look at me! Im one that has spoken to a King, I am: mayhap youll never see such another: and to show you Im not proud, you may shake hands with me! And he grinned almost from ear to ear, as he leant forwards (and as nearly as possible fell off the wall in doing so) and offered Alice his hand. She watched him a little anxiously as she took it. If he smiled much more, the ends of his mouth might meet behind, she thought: and then I dont know what would happen to his head! Im afraid it would come off!

Yes, all his horses and all his men, Humpty Dumpty went on. Theyd pick me up again in a minute, they would! However, this conversation is going on a little too fast: lets go back to the last remark but one.

Im afraid I cant quite remember it, Alice said very politely.

In that case we start fresh, said Humpty Dumpty, and its my turn to choose a subject (He talks about it just as if it was a game! thought Alice.) So heres a question for you. How old did you say you were?

Alice made a short calculation, and said Seven years and six months.

Wrong! Humpty Dumpty exclaimed triumphantly. You never said a word like it!

I though you meant How old are you? Alice explained.

If Id meant that, Id have said it, said Humpty Dumpty. 
"""
print(looking_glass_text[:500])


 However, the egg only got larger and larger, and more and more human: when she had come within a few yards of it, she saw that it had eyes and a nose and mouth; and when she had come close to it, she saw clearly that it was HUMPTY DUMPTY himself. It cant be anybody else! she said to herself. Im as certain of it, as if his name were written all over his face.

It might have been written a hundred times, easily, on that enormous face. Humpty Dumpty was sitting with his legs crossed, like a Turk,


In [14]:
# 'get_part_of_speech' will tell what part of speech the word is
from nltk.corpus import wordnet
from collections import Counter
def get_part_of_speech(word):
  probable_part_of_speech = wordnet.synsets(word)
  pos_counts = Counter()
  pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
  pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
  pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
  pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  
  most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
  return most_likely_part_of_speech

In [21]:
# Removing punctuation
import re
cleaned = re.sub('\W+', ' ', looking_glass_text).lower()
print(cleaned[:500])

however the egg only got larger and larger and more and more human when she had come within a few yards of it she saw that it had eyes and a nose and mouth and when she had come close to it she saw clearly that it was humpty dumpty himself it cant be anybody else she said to herself im as certain of it as if his name were written all over his face it might have been written a hundred times easily on that enormous face humpty dumpty was sitting with his legs crossed like a turk on the top of a h


In [26]:
# Tokenization = Breaking text into individual words
from nltk.tokenize import word_tokenize
tokenized = word_tokenize(cleaned)
print(tokenized[:100])

['however', 'the', 'egg', 'only', 'got', 'larger', 'and', 'larger', 'and', 'more', 'and', 'more', 'human', 'when', 'she', 'had', 'come', 'within', 'a', 'few', 'yards', 'of', 'it', 'she', 'saw', 'that', 'it', 'had', 'eyes', 'and', 'a', 'nose', 'and', 'mouth', 'and', 'when', 'she', 'had', 'come', 'close', 'to', 'it', 'she', 'saw', 'clearly', 'that', 'it', 'was', 'humpty', 'dumpty', 'himself', 'it', 'cant', 'be', 'anybody', 'else', 'she', 'said', 'to', 'herself', 'im', 'as', 'certain', 'of', 'it', 'as', 'if', 'his', 'name', 'were', 'written', 'all', 'over', 'his', 'face', 'it', 'might', 'have', 'been', 'written', 'a', 'hundred', 'times', 'easily', 'on', 'that', 'enormous', 'face', 'humpty', 'dumpty', 'was', 'sitting', 'with', 'his', 'legs', 'crossed', 'like', 'a', 'turk', 'on']


In [30]:
# Stopword removal
from nltk.corpus import stopwords
#nltk.download('stopwords')
stop_words = stopwords.words('english')
filtered = [word for word in tokenized if word not in stop_words]
print(filtered[:100])

['however', 'egg', 'got', 'larger', 'larger', 'human', 'come', 'within', 'yards', 'saw', 'eyes', 'nose', 'mouth', 'come', 'close', 'saw', 'clearly', 'humpty', 'dumpty', 'cant', 'anybody', 'else', 'said', 'im', 'certain', 'name', 'written', 'face', 'might', 'written', 'hundred', 'times', 'easily', 'enormous', 'face', 'humpty', 'dumpty', 'sitting', 'legs', 'crossed', 'like', 'turk', 'top', 'high', 'wallsuch', 'narrow', 'one', 'alice', 'quite', 'wondered', 'could', 'keep', 'balanceand', 'eyes', 'steadily', 'fixed', 'opposite', 'direction', 'didnt', 'take', 'least', 'notice', 'thought', 'must', 'stuffed', 'figure', 'exactly', 'like', 'egg', 'said', 'aloud', 'standing', 'hands', 'ready', 'catch', 'every', 'moment', 'expecting', 'fall', 'provoking', 'humpty', 'dumpty', 'said', 'long', 'silence', 'looking', 'away', 'alice', 'spoke', 'called', 'eggvery', 'said', 'looked', 'like', 'egg', 'sir', 'alice', 'gently', 'explained', 'eggs']


In [32]:
# Lemmatization = Bring words down to their root forms ('are' becomes 'be')
from nltk.stem import WordNetLemmatizer
normalizer = WordNetLemmatizer()
normalized = [normalizer.lemmatize(token, get_part_of_speech(token)) for token in filtered]
print(normalized[:100])

['however', 'egg', 'get', 'large', 'large', 'human', 'come', 'within', 'yard', 'saw', 'eye', 'nose', 'mouth', 'come', 'close', 'saw', 'clearly', 'humpty', 'dumpty', 'cant', 'anybody', 'else', 'say', 'im', 'certain', 'name', 'write', 'face', 'might', 'write', 'hundred', 'time', 'easily', 'enormous', 'face', 'humpty', 'dumpty', 'sit', 'leg', 'cross', 'like', 'turk', 'top', 'high', 'wallsuch', 'narrow', 'one', 'alice', 'quite', 'wonder', 'could', 'keep', 'balanceand', 'eye', 'steadily', 'fix', 'opposite', 'direction', 'didnt', 'take', 'least', 'notice', 'think', 'must', 'stuff', 'figure', 'exactly', 'like', 'egg', 'say', 'aloud', 'stand', 'hand', 'ready', 'catch', 'every', 'moment', 'expect', 'fall', 'provoke', 'humpty', 'dumpty', 'say', 'long', 'silence', 'look', 'away', 'alice', 'speak', 'call', 'eggvery', 'say', 'look', 'like', 'egg', 'sir', 'alice', 'gently', 'explain', 'egg']


In [35]:
# Create bag-of-words out of normalized text
bag_of_looking_glass_words = Counter(normalized)
print(bag_of_looking_glass_words)

Counter({'humpty': 19, 'dumpty': 19, 'say': 19, 'alice': 16, 'name': 7, 'like': 7, 'think': 7, 'look': 6, 'im': 5, 'know': 5, 'mean': 5, 'go': 5, 'egg': 4, 'fall': 4, 'king': 4, 'would': 4, 'dont': 4, 'come': 3, 'write': 3, 'might': 3, 'sit': 3, 'one': 3, 'didnt': 3, 'take': 3, 'must': 3, 'stand': 3, 'hand': 3, 'remark': 3, 'never': 3, 'last': 3, 'wall': 3, 'horse': 3, 'men': 3, 'almost': 3, 'ask': 3, 'shape': 3, 'good': 3, 'another': 3, 'however': 2, 'large': 2, 'saw': 2, 'eye': 2, 'mouth': 2, 'cant': 2, 'face': 2, 'time': 2, 'narrow': 2, 'quite': 2, 'could': 2, 'long': 2, 'away': 2, 'speak': 2, 'call': 2, 'gently': 2, 'explain': 2, 'add': 2, 'turn': 2, 'conversation': 2, 'couldnt': 2, 'much': 2, 'interrupt': 2, 'course': 2, 'short': 2, 'laugh': 2, 'there': 2, 'cry': 2, 'make': 2, 'riddle': 2, 'thats': 2, 'behind': 2, 'book': 2, 'may': 2, 'ear': 2, 'little': 2, 'afraid': 2, 'old': 2, 'id': 2, 'get': 1, 'human': 1, 'within': 1, 'yard': 1, 'nose': 1, 'close': 1, 'clearly': 1, 'anybody':

# Language Models - N-Grams and NLM
Unlike bag-of-words, n-grams models considers a sequence of some number (n) units and calculates the probability of each unit in a body of language given the preceding sequence of length n.
<br>
<br>
N-gram probabilities with larger (n) values can be impressive at language prediction.
<br>
<br>
But there are a couple problems with the n-gram model:
<br>
1. How can a model make sense from a sentence with yet unseen words. The same issue also pretains for bag-of-words. Language smoothig can help adjusting probabilities for unknown words.
<br>
2. For a model that accurately predicts human language patterns, one want to be (n) as large as possible. As the sequence length grows, the number of examples of each sequence within your training corpus shrinks.
<br>
<br>
NLM equals Neural Language Models.

In [1]:
# Loadtext dataset
from looking_glass import looking_glass_full_text
print(looking_glass_full_text[:100])


 CHAPTER I. Looking-Glass house

One thing was certain, that the white kitten had had nothing to do


In [2]:
# Removing punctuation
import re
cleaned = re.sub('\W+', ' ', looking_glass_full_text).lower()
print(cleaned[:100])

chapter i looking glass house one thing was certain that the white kitten had had nothing to do wit


In [3]:
# Tokenization = Breaking text into individual words
from nltk.tokenize import word_tokenize
tokenized = word_tokenize(cleaned)
print(tokenized[:100])

['chapter', 'i', 'looking', 'glass', 'house', 'one', 'thing', 'was', 'certain', 'that', 'the', 'white', 'kitten', 'had', 'had', 'nothing', 'to', 'do', 'with', 'it', 'it', 'was', 'the', 'black', 'kittens', 'fault', 'entirely', 'for', 'the', 'white', 'kitten', 'had', 'been', 'having', 'its', 'face', 'washed', 'by', 'the', 'old', 'cat', 'for', 'the', 'last', 'quarter', 'of', 'an', 'hour', 'and', 'bearing', 'it', 'pretty', 'well', 'considering', 'so', 'you', 'see', 'that', 'it', 'couldnt', 'have', 'had', 'any', 'hand', 'in', 'the', 'mischief', 'the', 'way', 'dinah', 'washed', 'her', 'childrens', 'faces', 'was', 'this', 'first', 'she', 'held', 'the', 'poor', 'thing', 'down', 'by', 'its', 'ear', 'with', 'one', 'paw', 'and', 'then', 'with', 'the', 'other', 'paw', 'she', 'rubbed', 'its', 'face', 'all']


In [5]:
# Change the n value to 3
from nltk.util import ngrams
from collections import Counter
looking_glass_trigrams = ngrams(tokenized, 3)
looking_glass_trigrams_frequency = Counter(looking_glass_trigrams)
print(looking_glass_trigrams_frequency.most_common(10))

[(('the', 'red', 'queen'), 54), (('the', 'white', 'queen'), 31), (('said', 'in', 'a'), 21), (('she', 'went', 'on'), 18), (('said', 'the', 'red'), 17), (('thought', 'to', 'herself'), 16), (('the', 'queen', 'said'), 16), (('said', 'to', 'herself'), 14), (('said', 'humpty', 'dumpty'), 14), (('the', 'knight', 'said'), 14)]
