<a href="https://colab.research.google.com/github/mithunkumarsr/NLPNov21/blob/main/S6_PCFG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.tokenize import word_tokenize
import string

In [None]:

from nltk import pos_tag

# Fetch the tokenizer and POS-tagger models used by word_tokenize / pos_tag.
# Recent NLTK releases look up the "punkt_tab" and
# "averaged_perceptron_tagger_eng" resources, while older releases use the
# "punkt" and "averaged_perceptron_tagger" ones. Requesting both sets keeps
# this cell working on a fresh kernel regardless of the installed version.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

text = "I am studying NLP"
# Last expression of the cell: displayed as the list of (token, tag) pairs.
pos_tag(word_tokenize(text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('I', 'PRP'), ('am', 'VBP'), ('studying', 'VBG'), ('NLP', 'NNP')]

In [None]:
# Toy context-free grammar for short "X is Y" sentences.
# "very" appears under both Adj and Adv, so a sentence containing it can be
# derived in more than one way.
toy_cfg_rules = """
    S -> NP VP
    NP -> Det NP | Adj NP | Adj | N | Adv NP
    VP -> V NP  
    Det -> "a" | "an" | "the" 
    V -> "is"
    Adj -> "interesting" | "very"
    N -> "NLP" | "subject"
    Adv -> "very"
"""
grammar = nltk.CFG.fromstring(toy_cfg_rules)

# Tokenize the example sentence, then show the tokens and their POS tags.
statement = "NLP is very interesting"
sentence = word_tokenize(statement)
print(sentence)
tagged_sentence = nltk.pos_tag(sentence)
print(tagged_sentence)

['NLP', 'is', 'very', 'interesting']
[('NLP', 'NNP'), ('is', 'VBZ'), ('very', 'RB'), ('interesting', 'JJ')]


In [None]:
# A recursive-descent parser is a top-down parser made of mutually recursive
# procedures, one for each nonterminal of the grammar.
rd_parser = nltk.RecursiveDescentParser(grammar)

# Print every parse tree; total_trees ends up holding how many were produced
# (it stays 0 when the parser yields nothing).
total_trees = 0
for total_trees, tree in enumerate(rd_parser.parse(sentence), start=1):
    print(tree)
    # Optional: tree.draw()

(S (NP (N NLP)) (VP (V is) (NP (Adj very) (NP (Adj interesting)))))
(S (NP (N NLP)) (VP (V is) (NP (Adv very) (NP (Adj interesting)))))


In [None]:
# Classify the result by the number of parse trees counted in total_trees.
# Zero trees means the grammar could not derive the sentence at all; that
# case must not be reported as "unambiguous".
if total_trees == 0:
    print("No parse found: the grammar does not cover this sentence")
elif total_trees > 1:
    print("Ambiguious grammar")
else:
    # NOTE(review): one parse for one sentence does not prove the whole
    # grammar is unambiguous. Message text (including its spelling) is left
    # unchanged so the recorded cell output stays valid; consider rewording.
    print("Unambiguious grammar")

Ambiguious grammar


 Chart Parser

In [None]:
# How a chart parser works:
#   1. It starts from a new, empty chart spanning the text.
#   2. It incrementally adds edges to that chart.
#   3. A set of "chart rules" decides when a new edge may be added.
#   4. Parsing is complete once no chart rule adds any further edge.
chart_demo_rules = """
    S -> NP VP 
    VP -> V NP | Aux VP | V
    NP -> Det NP | N | Adj NP | Adj
    N -> "girl" | "boy"
    Det -> "The"
    Aux -> "is"
    V -> "laughing" | "playing" 
    Adj -> "laughing" | "well"
"""
grammar1 = nltk.CFG.fromstring(chart_demo_rules)

# Note: `statement` holds the token list here, not the raw string.
statement = nltk.word_tokenize("The girl is laughing")
tagged_statement = nltk.pos_tag(statement)
print(tagged_statement)

[('The', 'DT'), ('girl', 'NN'), ('is', 'VBZ'), ('laughing', 'VBG')]


In [None]:
# Parse the tokenized statement with a chart parser and print every tree.
# The parser is bound to `chart_parser` (not `rd_parser`): it is a
# ChartParser, and reusing the old name would overwrite the
# recursive-descent parser created earlier in the notebook.
total_trees = 0
chart_parser = nltk.ChartParser(grammar1)
for tree in chart_parser.parse(statement):
    total_trees += 1
    print(tree)
    # Optional: tree.draw()

(S (NP (Det The) (NP (N girl))) (VP (Aux is) (VP (V laughing))))


In [None]:
# Classify the result by the number of parse trees counted in total_trees.
# Zero trees means the grammar could not derive the sentence at all; that
# case must not be reported as "unambiguous".
if total_trees == 0:
    print("No parse found: the grammar does not cover this sentence")
elif total_trees > 1:
    print("Ambiguious grammar")
else:
    # NOTE(review): one parse for one sentence does not prove the whole
    # grammar is unambiguous. Message text (including its spelling) is left
    # unchanged so the recorded cell output stays valid; consider rewording.
    print("Unambiguious grammar")

Unambiguious grammar


In [None]:
# Extended grammar: adds prepositional phrases (PP) and lists "can", "hold"
# and "water" under more than one category (N, V and, for "can", Aux).
extended_rules = """
    S -> NP VP 
    VP -> V NP | Aux VP | V NP PP | V
    PP -> P NP
    NP -> Det NP | N | Adj NP | Adj | Det N PP
    N -> "girl" | "boy" | "Omkar" | "can" | "hold" | "water"
    Det -> "The" | "a" | "the"
    Aux -> "is" | "can"
    P -> "of" | "with"
    V -> "laughing" | "playing" | "can" | "hold" | "water"
    Adj -> "laughing" | "well"
"""
grammar1 = nltk.CFG.fromstring(extended_rules)

# Tokenize the example and show the tags assigned by NLTK's POS tagger.
statement = nltk.word_tokenize("The can can hold a can of water")
tagged_statement = nltk.pos_tag(statement)
print(tagged_statement)

[('The', 'DT'), ('can', 'MD'), ('can', 'MD'), ('hold', 'VB'), ('a', 'DT'), ('can', 'MD'), ('of', 'IN'), ('water', 'NN')]


In [None]:
# Chart-parse the tokenized statement with the extended grammar and print
# each tree; tree_count ends up as the number of trees (0 if none).
chart_parser = nltk.ChartParser(grammar1)
tree_count = 0
for tree_count, tree in enumerate(chart_parser.parse(statement), start=1):
    print(tree)
    # Optional: tree.draw()

(S
  (NP (Det The) (NP (N can)))
  (VP
    (Aux can)
    (VP
      (V hold)
      (NP (Det a) (NP (N can)))
      (PP (P of) (NP (N water))))))
(S
  (NP (Det The) (NP (N can)))
  (VP
    (Aux can)
    (VP (V hold) (NP (Det a) (N can) (PP (P of) (NP (N water)))))))


In [None]:
# Classify the sentence by the number of parse trees counted in tree_count.
# Zero trees means the grammar could not derive the sentence at all; that
# case must not be reported as "unambiguous".
if tree_count == 0:
    print("No parse found: the grammar does not cover this sentence")
elif tree_count > 1:
    # NOTE(review): message spelling is left unchanged so the recorded cell
    # output stays valid; consider correcting it to "Ambiguous".
    print("Ambiguos Sentence")
else:
    print("Unambiguos Sentence")

Ambiguos Sentence


Probabilistic Context-Free Grammar (PCFG)

A PCFG consists of a start state and a set of productions with probabilities. The set of terminals and nonterminals is implicitly specified by the productions.

In [None]:
from nltk.parse import pchart
from nltk.grammar import toy_pcfg2
# PCFG demo setup. Note that `grammar` is rebound here from the earlier CFG
# to NLTK's toy_pcfg2.
sent = "Jack saw Bob with the telescope"
grammar = toy_pcfg2
parser = pchart.InsideChartParser(grammar)
times = []  # elapsed parse times get appended by the timing cell

# Show the sentence, the parser object and the grammar's productions.
report_template = '\n sentence: %s\n parser: %s\n grammar_rules: %s'
print(report_template % (sent, parser, grammar))

# Set the parser's trace level to 1 before parsing.
parser.trace(1)


 sentence: Jack saw Bob with the telescope
 parser: <nltk.parse.pchart.InsideChartParser object at 0x7fa131eb5790>
 grammar_rules: Grammar with 23 productions (start state = S)
    S -> NP VP [1.0]
    VP -> V NP [0.59]
    VP -> V [0.4]
    VP -> VP PP [0.01]
    NP -> Det N [0.41]
    NP -> Name [0.28]
    NP -> NP PP [0.31]
    PP -> P NP [1.0]
    V -> 'saw' [0.21]
    V -> 'ate' [0.51]
    V -> 'ran' [0.28]
    N -> 'boy' [0.11]
    N -> 'cookie' [0.12]
    N -> 'table' [0.13]
    N -> 'telescope' [0.14]
    N -> 'hill' [0.5]
    Name -> 'Jack' [0.52]
    Name -> 'Bob' [0.48]
    P -> 'with' [0.61]
    P -> 'under' [0.39]
    Det -> 'the' [0.41]
    Det -> 'a' [0.31]
    Det -> 'my' [0.28]


In [None]:
import time

# Time one full parse of the demo sentence with the inside chart parser.
# Tokenization happens before the timer starts so that only the parser is
# measured, as the printed message claims. perf_counter() is used because it
# is a monotonic, high-resolution clock intended for measuring intervals,
# whereas time.time() follows the (adjustable) system clock.
tokens = word_tokenize(sent)
t = time.perf_counter()
parses = parser.parse_all(tokens)
times.append(time.perf_counter() - t)
# `times` is a list shared with the setup cell, so re-running this cell
# appends another measurement and the whole list is printed.
print("the time required by the Inside Chart parser  %s "%(times))

  |. . . . . [-]| [5:6] 'telescope'                  [1.0]
  |. . . . [-] .| [4:5] 'the'                        [1.0]
  |. . . [-] . .| [3:4] 'with'                       [1.0]
  |. . [-] . . .| [2:3] 'Bob'                        [1.0]
  |. [-] . . . .| [1:2] 'saw'                        [1.0]
  |[-] . . . . .| [0:1] 'Jack'                       [1.0]
  |. . . [-] . .| [3:4] P  -> 'with' *               [0.61]
  |. . . > . . .| [3:3] PP -> * P NP                 [1.0]
  |. . . [-> . .| [3:4] PP -> P * NP                 [0.61]
  |. . . > . . .| [3:3] P  -> * 'with'               [0.61]
  |[-] . . . . .| [0:1] Name -> 'Jack' *             [0.52]
  |> . . . . . .| [0:0] Name -> * 'Jack'             [0.52]
  |. . [-] . . .| [2:3] Name -> 'Bob' *              [0.48]
  |. . > . . . .| [2:2] Name -> * 'Bob'              [0.48]
  |. . . . [-] .| [4:5] Det -> 'the' *               [0.41]
  |. . . . > . .| [4:4] NP -> * Det N                [0.41]
  |. . . . > . .| [4:4] Det -> * 'the'         

In [None]:
import sys, time
from nltk import tokenize
from nltk.grammar import toy_pcfg1
from nltk.parse import pchart
from nltk.parse import ViterbiParser

# One demo case: an example sentence paired with NLTK's toy_pcfg1 grammar.
demos = [('I saw John with my telescope', toy_pcfg1)]
sent, grammar = demos[0]

# Whitespace tokenization is enough for this sentence.
tokens = sent.split()

# Build the list of parsers to compare: Viterbi first, then the four pchart
# variants with default settings, then an inside chart parser whose beam
# size is len(tokens) + 1.
default_chart_parsers = (
    pchart.InsideChartParser,
    pchart.RandomChartParser,
    pchart.UnsortedChartParser,
    pchart.LongestChartParser,
)
parsers = [ViterbiParser(grammar)]
parsers.extend(parser_cls(grammar) for parser_cls in default_chart_parsers)
parsers.append(pchart.InsideChartParser(grammar, beam_size=len(tokens) + 1))

In [None]:
# Run every parser on the tokenized sentence, record per-parser statistics,
# then print a summary table followed by the distinct parse trees.
# NOTE(review): `reduce` is no longer used in this cell; the import is kept
# in case cells outside this view rely on it.
from functools import reduce

times = []        # elapsed seconds per parser
average_p = []    # mean parse probability per parser
num_parses = []   # number of parses per parser
all_parses = {}   # frozen tree -> 1; the dict acts as an ordered set
for parser in parsers:
    print('\ns: %s\nparser: %s\ngrammar: %s' % (sent,parser,grammar))
    parser.trace(3)
    # perf_counter() is the clock intended for measuring intervals.
    t = time.perf_counter()
    parses = parser.parse_all(tokens)
    times.append(time.perf_counter() - t)
    # Mean probability of this parser's trees. The previous code stored the
    # plain sum under the "Average" heading (the divisor was computed but
    # never applied).
    if parses:
        mean_prob = sum(parse_tree.prob() for parse_tree in parses) / len(parses)
    else:
        mean_prob = 0
    average_p.append(mean_prob)
    num_parses.append(len(parses))
    for parse_tree in parses:
        # freeze() gives a hashable tree, so duplicates across parsers collapse.
        all_parses[parse_tree.freeze()] = 1

# Print summary statistics
print()
print('-------------------------+------------------------------------------')
print('   Parser           Beam | Time (secs)   # Parses   Average P(parse)')
print('-------------------------+------------------------------------------')
for i in range(len(parsers)):
    # Beam size is read from the parser on this row (previously it was always
    # taken from parsers[0]); parsers without the attribute show 0.
    print('%19s %4d |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
      getattr(parsers[i], "beam_size", 0),
      times[i],
      num_parses[i],
      average_p[i]))

# Aggregate over the distinct trees found by any parser.
parses = list(all_parses)
if parses:
    p = sum(parse_tree.prob() for parse_tree in parses) / len(parses)
else:
    p = 0
print('-------------------------+------------------------------------------')
print('%19s      |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p))
print()

for parse in parses:
    print(parse)


s: I saw John with my telescope
parser: <ViterbiParser for <Grammar with 17 productions>>
grammar: Grammar with 17 productions (start state = S)
    S -> NP VP [1.0]
    NP -> Det N [0.5]
    NP -> NP PP [0.25]
    NP -> 'John' [0.1]
    NP -> 'I' [0.15]
    Det -> 'the' [0.8]
    Det -> 'my' [0.2]
    N -> 'man' [0.5]
    N -> 'telescope' [0.5]
    VP -> VP PP [0.1]
    VP -> V NP [0.7]
    VP -> V [0.2]
    V -> 'ate' [0.35]
    V -> 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61]
    P -> 'under' [0.39]
Inserting tokens into the most likely constituents table...
   Insert: |=.....| I
   Insert: |.=....| saw
   Insert: |..=...| John
   Insert: |...=..| with
   Insert: |....=.| my
   Insert: |.....=| telescope
Finding the most likely constituents spanning 1 text elements...
   Insert: |=.....| NP -> 'I' [0.15]                0.1500000000 
   Insert: |.=....| V -> 'saw' [0.65]               0.6500000000 
   Insert: |.=....| VP -> V [0.2]                   0.1300000000 
   Ins