# Course: Natural Language Processing
## Week 9: N-Grams and Collocations

Laboratory: Tweets Classification - Part III (Changing Data Representation)

**Author:** Andrés Felipe Zapata Palacio  


In [None]:
import nltk

# Contains different sample datasets
from nltk.corpus import gutenberg

# This is required to visualize Bag of Words
import pandas as pd

# Regular Expressions for Text Cleaning
import re


# NLTK resources for text cleaning and text processing, fetched in this order
for _nltkResource in (
    'gutenberg',
    'punkt',
    'wordnet',
    'averaged_perceptron_tagger',  # POS tags
    'universal_tagset',            # POS tags
    'stopwords',
):
  nltk.download(_nltkResource)

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## 1. Load the Example Book

![](https://images.cdn1.buscalibre.com/fit-in/360x360/26/52/26527ee19219f760ca567acf15eafc36.jpg)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw, untokenized text of 'austen-emma.txt' from the NLTK Gutenberg corpus
book = gutenberg.raw('austen-emma.txt')

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Extra words to treat as stop words on top of NLTK's English list
myOwnStopWords = ['shall', 'rather']

# NLTK's English stop words followed by the custom additions
stopWords = [*stopwords.words('english'), *myOwnStopWords]

def preprocessDocument(text):
  """Normalize raw text before tokenization.

  The text is lower-cased, every run of digits is replaced by a single
  space, and every run of the symbols _ * # ! $ % & ' " ( ) [ ] - + ? /
  is replaced by a single space. Other punctuation (e.g. commas and
  periods) is left untouched.

  Args:
    text: The string to clean.

  Returns:
    The cleaned string.
  """
  text = text.lower()
  # Raw strings keep the backslashes for the regex engine; in a normal
  # string literal, sequences such as '\d' are invalid escapes and trigger
  # a SyntaxWarning on recent Python versions.
  text = re.sub(r'\d+', ' ', text)
  text = re.sub(r"[_*#!$%&'\"()\[\]\-+?/]+", ' ', text)
  return text

def tokenizeDocument(text):
  """Split `text` into tokens with NLTK's `word_tokenize` and return them."""
  return word_tokenize(text)

In [None]:
# Normalize the raw book text, then split the cleaned text into tokens
cleanedBook = preprocessDocument(book)
words = tokenizeDocument(cleanedBook)

In [None]:
from nltk.collocations import BigramCollocationFinder

# Association measure used for ranking: pointwise mutual information (PMI)
pmi = nltk.collocations.BigramAssocMeasures().pmi
# Build the bigram collocation finder from the token sequence
finder = BigramCollocationFinder.from_words(words)
# Keep the 30 best-scoring bigrams.
# NOTE(review): no frequency filter is applied before ranking, so very rare
# pairs may dominate the list; consider finder.apply_freq_filter(...) first.
importantBigrams = finder.nbest(pmi, 30)
importantBigrams

[('adequate', 'restoratives'),
 ('al', 'fresco'),
 ('amor', 'patriae'),
 ('baly', 'craig'),
 ('base', 'aspersion'),
 ('beet', 'root'),
 ('bulky', 'forms'),
 ('carte', 'blanche'),
 ('cherries', 'currants'),
 ('christened', 'catherine'),
 ('coarser', 'featured'),
 ('comments', 'undoubting'),
 ('dated', 'sept.'),
 ('de', 'genlis'),
 ('designedly', 'suppress'),
 ('dexterously', 'throwing'),
 ('dispiriting', 'cogitation'),
 ('eatable', 'hautboys'),
 ('en', 'passant'),
 ('fearing', 'adoring'),
 ('fore', 'shortening'),
 ('gold', 'reticule'),
 ('goodly', 'heritage'),
 ('hood', 'wink'),
 ('idlest', 'haunts'),
 ('inconveniently', 'shy'),
 ('infamous', 'fraud'),
 ('inspect', 'anything'),
 ('jeffereys', 'clara'),
 ('knight', 'errantry')]

In [None]:
def filterNgrams(frequencies, minFreq):
  """Tabulate the n-grams that occur at least `minFreq` times.

  Args:
    frequencies: Mapping from n-gram to its count (for example an
      nltk.FreqDist).
    minFreq: Minimum count (inclusive) an n-gram needs in order to be kept.

  Returns:
    A pandas DataFrame with the columns 'ngram' and 'freq', sorted by
    'freq' in descending order. When no n-gram qualifies, the frame is
    empty but still has both columns.
  """
  cols = ['ngram', 'freq']
  # Collect all rows first and build the frame once: DataFrame.append was
  # removed in pandas 2.0, and appending row by row copies the whole frame
  # on every iteration.
  rows = [
      {'ngram': ngram, 'freq': frequency}
      for ngram, frequency in frequencies.items()
      if frequency >= minFreq
  ]
  df = pd.DataFrame(rows, columns=cols)
  return df.sort_values(by='freq', axis=0, ascending=False)

In [None]:
# All consecutive token pairs in the book, and how often each one occurs
bigrams = list(nltk.bigrams(words))
frequencies = nltk.FreqDist(bigrams)
# Only bigrams seen at least this many times are shown
minFreq = 40

filterNgrams(frequencies, minFreq)

In [None]:
from nltk.collocations import TrigramCollocationFinder

# Same ranking as for bigrams, now over three-token sequences.
# Note that this rebinds `pmi` and `finder` to their trigram counterparts.
pmi = nltk.collocations.TrigramAssocMeasures().pmi
finder = TrigramCollocationFinder.from_words(words)
# Keep the 30 best-scoring trigrams.
# NOTE(review): no frequency filter is applied before ranking, so very rare
# triples may dominate the list; consider finder.apply_freq_filter(...) first.
importantTrigrams = finder.nbest(pmi,30)
importantTrigrams

[('cameos', 'corals', 'shells'),
 ('madame', 'de', 'genlis'),
 ('medals', 'cameos', 'corals'),
 ('pastures', 'spreading', 'flocks'),
 ('de', 'genlis', 'adelaide'),
 ('proportions', 'fore', 'shortening'),
 ('spreading', 'flocks', 'orchard'),
 ('touches', 'malt', 'liquor'),
 ('unmarked', 'wavering', 'dubious'),
 ('commonplace', 'threadbare', 'stale'),
 ('jeffereys', 'clara', 'partridge'),
 ('listlessness', 'weariness', 'stupidity'),
 ('disguise', 'equivocation', 'mystery'),
 ('thick', 'leather', 'gaiters'),
 ('dated', 'sept', 'th'),
 ('doth', 'affliction', 'denote'),
 ('woollen', 'draper', 'linen'),
 ('hoping', 'fearing', 'adoring'),
 ('unequal', 'inconsistent', 'incongruous'),
 ('comments', 'undoubting', 'decision'),
 ('gossips', 'tiresome', 'wretches'),
 ('clearest', 'headed', 'longest'),
 ('serle', 'understands', 'boiling'),
 ('hedges', 'gates', 'pools'),
 ('lengths', 'pencil', 'crayon'),
 ('north', 'east', 'wind'),
 ('playing', 'robin', 'adair'),
 ('steaks', 'nicely', 'fried'),
 ('se

In [None]:
# All consecutive token triples in the book, and how often each one occurs.
# `frequencies` and `minFreq` are rebound here, replacing the bigram values.
trigrams = list(nltk.trigrams(words))
frequencies = nltk.FreqDist(trigrams)
# Only trigrams seen at least this many times are shown
minFreq = 10

filterNgrams(frequencies, minFreq)

## 2. CountVectorizer

In [None]:
# Bag-of-words vectorizer that reuses this notebook's own cleaning and
# tokenizing functions and counts both single tokens and token pairs
# (ngram_range=(1,2)).
# NOTE(review): the stop-word list is applied to the output of the custom
# tokenizer; confirm every stop word survives that tokenization unchanged.
counter = CountVectorizer(
    preprocessor=preprocessDocument,
    stop_words=stopWords,
    tokenizer=tokenizeDocument,
    ngram_range=(1,2)
  )

# The whole book is passed as a single document, so the matrix has one row
bagOfWordsMatrix = counter.fit_transform([book])
bagOfWordsMatrix

<1x59762 sparse matrix of type '<class 'numpy.int64'>'
	with 59762 stored elements in Compressed Sparse Row format>

In [None]:
# Feature names (the n-grams) learned by the vectorizer, one per matrix column
vocabulary = counter.get_feature_names_out()
print(vocabulary)

['`' '` aimable' '` augusta' ... 'zeal pursuing' 'zigzags'
 'zigzags embarrassment']


In [None]:
# Dense table of counts with one column per vocabulary entry
bagOfWords = pd.DataFrame( data=bagOfWordsMatrix.toarray() , columns=vocabulary )
# Reorder the columns by their count in row 0, largest first
bagOfWords = bagOfWords.sort_values(by=0, axis=1, ascending=False)
bagOfWords

Unnamed: 0,mr,emma,could,would,mrs,miss,must,harriet,much,said,...,goddard till,goddard twelvemonth,goddard unavoidable,goddard voices,goddard want,goddard way,goddard would,goddard written,goes end,zigzags embarrassment
0,1154,865,837,821,701,602,571,506,486,484,...,1,1,1,1,1,1,1,1,1,1
