## CSC 583 Ngrams -- Example Scratch Code

In [1]:
import nltk

In [2]:
# Toy corpus: five short, lower-cased sentences with no punctuation.
s1, s2, s3 = "i like sam", "sam is great", "sam is funny"
s4, s5 = "i like funny people", "i know many funny people like sam"

In [3]:
# Whitespace-tokenize every sentence, giving a list of token lists.
sentences = list(map(str.split, (s1, s2, s3, s4, s5)))
print(sentences)

[['i', 'like', 'sam'], ['sam', 'is', 'great'], ['sam', 'is', 'funny'], ['i', 'like', 'funny', 'people'], ['i', 'know', 'many', 'funny', 'people', 'like', 'sam']]


In [21]:
# flatten the sentences (a nested list) to a flat list
def flatten(sents):
    """Collapse a list of lists into a single flat list.

    Only one level of nesting is removed: the elements of each inner
    sequence are appended, in order, to the result.
    """
    flat = []
    for sent in sents:
        flat.extend(sent)
    return flat

# One flat token list for the whole corpus (sentence boundaries are dropped).
newlist = flatten(sentences)
print(newlist)

['i', 'like', 'sam', 'sam', 'is', 'great', 'sam', 'is', 'funny', 'i', 'like', 'funny', 'people', 'i', 'know', 'many', 'funny', 'people', 'like', 'sam']


### NLTK FreqDist
NLTK **FreqDist** is a **dictionary** that, given a single (flattened) list of items,
stores how many times each item occurs in the list as (item, its-frequency) pairs, where **item is the key and its-frequency is the value**.

(1) Unigrams

In [9]:
# Count how often each token occurs in the flattened corpus.
fdist1 = nltk.FreqDist(newlist)
print(fdist1)

<FreqDist with 9 samples and 20 outcomes>


In [45]:
# A FreqDist is indexed like a dict: fdist1[item] gives the count of item.
# An item that never occurred ('noriko') gives 0.
for word in ('i', 'sam', 'people', 'noriko'):
    print(fdist1[word])

# Every (item, count) pair, followed by the sum of all counts.
print(fdist1.items())
print(f"The total frequency is {sum(fdist1.values())}")

3
4
2
0
dict_items([('i', 3), ('like', 3), ('sam', 4), ('is', 2), ('great', 1), ('funny', 3), ('people', 2), ('know', 1), ('many', 1)])
The total frequency is 20


#### (!) Convert frequencies to **probabilities** using .freq() -- for unigrams

In [47]:
# .freq(w) is count(w) / N, where N = 20 is the total number of tokens:
#   'i'      -> 3/20 = 0.15
#   'sam'    -> 4/20 = 0.2
#   'people' -> 2/20 = 0.1
#   'noriko' -> 0/20 = 0.0
for word in ('i', 'sam', 'people', 'noriko'):
    print(fdist1.freq(word))

0.15
0.2
0.1
0.0


## NLTK ngrams()
This NLTK **function**, given a list of items/tokens, returns an iterator over the ngrams obtained by sliding a window of n items across the list; wrap the result in `list()` to get an actual list.

(2) Bigrams

In [22]:
from nltk.util import ngrams

# Bigrams, built sentence by sentence. Left-padding puts one '<START>' symbol
# in front of each sentence, so its first word is also the tail of a bigram.
bigram_list = [
    list(ngrams(tokens, 2, pad_left=True, left_pad_symbol='<START>'))
    for tokens in sentences
]
print(bigram_list)

[[('<START>', 'i'), ('i', 'like'), ('like', 'sam')], [('<START>', 'sam'), ('sam', 'is'), ('is', 'great')], [('<START>', 'sam'), ('sam', 'is'), ('is', 'funny')], [('<START>', 'i'), ('i', 'like'), ('like', 'funny'), ('funny', 'people')], [('<START>', 'i'), ('i', 'know'), ('know', 'many'), ('many', 'funny'), ('funny', 'people'), ('people', 'like'), ('like', 'sam')]]


## **NLTK ConditionalFreqDist**
NLTK **ConditionalFreqDist** is essentially a **dictionary** too: given a list of 2-tuples \[(key1, value1), (key2, value2),...\], for each distinct 'key' (called a *condition*), it stores the frequency counts of the 'value's paired with that key in an NLTK FreqDist dictionary.

In [24]:
# Each bigram (w1, w2) is read as a (condition, sample) pair: for every
# first word w1, the CFD keeps a FreqDist of the words w2 that followed it.
cfd = nltk.ConditionalFreqDist(flatten(bigram_list))
print(cfd)

<ConditionalFreqDist with 9 conditions>


### The FreqDist as the value for the condition `<START>` in bigrams

In [28]:
# Indexing the CFD by a condition gives that condition's FreqDist --
# here, the counts of the words that followed '<START>'.
fd_start = cfd['<START>']
print(fd_start)

<FreqDist with 2 samples and 5 outcomes>


In [48]:
# Count of each bigram ('<START>', w) in bigram_list, i.e. how many
# sentences begin with w; 'people' never starts a sentence, so it gives 0.
for word in ('i', 'sam', 'people'):
    print(fd_start[word])

# All (word, count) pairs recorded after '<START>', then the sum of the counts.
print(fd_start.items())
print(f"The total frequency is {sum(fd_start.values())}")

3
2
0
dict_items([('i', 3), ('sam', 2)])
The total frequency is 5


In [49]:
## Same lookups written as chained indexing on cfd: cfd[condition][sample]
for word in ('i', 'sam', 'people'):
    print(cfd['<START>'][word])

# All (word, count) pairs under the '<START>' condition, then their total.
print(cfd['<START>'].items())
print(f"The total frequency is {sum(cfd['<START>'].values())}")


3
2
0
dict_items([('i', 3), ('sam', 2)])
The total frequency is 5


#### (!!) Convert frequencies to probabilities -- Ngrams (N > 1)

In [50]:
# .freq(w) turns a count into a relative frequency: count(w) / total count,
# e.g. 'i' -> 3/5 = 0.6.  Equivalent spelling: cfd['<START>'].freq(w).
for word in ('i', 'sam', 'people'):
    print(fd_start.freq(word))

0.6
0.4
0.0


In [51]:
# Cross-check: the same probabilities, reached through cfd directly
# (only the syntax differs from fd_start.freq(w)).
for word in ('i', 'sam', 'people'):
    print(cfd['<START>'].freq(word))

0.6
0.4
0.0


### (3) Trigrams

In [23]:
# Trigrams, built sentence by sentence. With n = 3, left-padding puts two
# '<START>' symbols in front of each sentence.
trigram_list = [
    list(ngrams(tokens, 3, pad_left=True, left_pad_symbol='<START>'))
    for tokens in sentences
]
print(trigram_list)

[[('<START>', '<START>', 'i'), ('<START>', 'i', 'like'), ('i', 'like', 'sam')], [('<START>', '<START>', 'sam'), ('<START>', 'sam', 'is'), ('sam', 'is', 'great')], [('<START>', '<START>', 'sam'), ('<START>', 'sam', 'is'), ('sam', 'is', 'funny')], [('<START>', '<START>', 'i'), ('<START>', 'i', 'like'), ('i', 'like', 'funny'), ('like', 'funny', 'people')], [('<START>', '<START>', 'i'), ('<START>', 'i', 'know'), ('i', 'know', 'many'), ('know', 'many', 'funny'), ('many', 'funny', 'people'), ('funny', 'people', 'like'), ('people', 'like', 'sam')]]


Conditional frequency dictionary

In [53]:
# Step 1: merge the per-sentence trigram lists into one flat list.
trigrams = flatten(trigram_list)
print(trigrams)

# Step 2: split each trigram into (context, prediction), where the context is
# the tuple of all words but the last and the prediction is the last word.
tupled3 = [(tuple(context), predicted) for *context, predicted in trigrams]
print(tupled3)

# Step 3: build the CFD, conditioning on the two-word context.
cfd3 = nltk.ConditionalFreqDist(tupled3)
print(cfd3)

[('<START>', '<START>', 'i'), ('<START>', 'i', 'like'), ('i', 'like', 'sam'), ('<START>', '<START>', 'sam'), ('<START>', 'sam', 'is'), ('sam', 'is', 'great'), ('<START>', '<START>', 'sam'), ('<START>', 'sam', 'is'), ('sam', 'is', 'funny'), ('<START>', '<START>', 'i'), ('<START>', 'i', 'like'), ('i', 'like', 'funny'), ('like', 'funny', 'people'), ('<START>', '<START>', 'i'), ('<START>', 'i', 'know'), ('i', 'know', 'many'), ('know', 'many', 'funny'), ('many', 'funny', 'people'), ('funny', 'people', 'like'), ('people', 'like', 'sam')]
[(('<START>', '<START>'), 'i'), (('<START>', 'i'), 'like'), (('i', 'like'), 'sam'), (('<START>', '<START>'), 'sam'), (('<START>', 'sam'), 'is'), (('sam', 'is'), 'great'), (('<START>', '<START>'), 'sam'), (('<START>', 'sam'), 'is'), (('sam', 'is'), 'funny'), (('<START>', '<START>'), 'i'), (('<START>', 'i'), 'like'), (('i', 'like'), 'funny'), (('like', 'funny'), 'people'), (('<START>', '<START>'), 'i'), (('<START>', 'i'), 'know'), (('i', 'know'), 'many'), (('k

In [54]:
# All words seen after the two-token context ('<START>', 'i'), with their
# counts, followed by the sum of those counts.
context = ('<START>', 'i')
print(cfd3[context].items())
print(f"The total frequency is {sum(cfd3[context].values())}")


dict_items([('like', 2), ('know', 1)])
The total frequency is 3


Convert frequencies to probabilities

In [56]:
# Relative frequency of each candidate word given the context ('<START>', 'i'):
# count(context, word) / count(context).
followers = cfd3[('<START>', 'i')]
for word in ('like', 'know', 'people'):
    print(followers.freq(word))

0.6666666666666666
0.3333333333333333
0.0
