<img src="../Pics/MLSb-T.png" width="160">
<br><br>
<center><u><H1>Parsing structure in text</H1></u></center>

In [None]:
# A parser processes an input string by using a set of grammatical rules
# and builds one or more rules that construct a grammar concept.
# Grammar is a declarative specification of a well-formed sentence and
# finally a parser is a procedural interpretation of grammar.

## CFG: Context-free-grammar

In [3]:
#toy CFG
from nltk import CFG
import nltk

In [9]:
toy_grammar = nltk.CFG.fromstring(
"""
    S -> NP VP  
    VP -> V NP  
    V -> "eats"| "drinks" 
    NP -> Det N
    Det -> "a" | "an" | "the" 
    N -> "president" |"Trump"|"orange"| "wine" 
""")

In [10]:
# S indicate the entire sentence
# VP is verb phrase the
# V is verb
# NP is noun phrase (chunk that has noun in it)
# Det is determiner used in the sentences
# N are nouns

In [11]:
toy_grammar.productions()

[S -> NP VP,
 VP -> V NP,
 V -> 'eats',
 V -> 'drinks',
 NP -> Det N,
 Det -> 'a',
 Det -> 'an',
 Det -> 'the',
 N -> 'president',
 N -> 'Trump',
 N -> 'orange',
 N -> 'wine']

In [12]:
#Examples: 
# president eats orange
# Trump drinks wine
# The same grammar can construct meaningless sentences such as:
# orange eats wine or president drinks Trump

## Regex parser

In [14]:
from nltk.chunk.regexp import *
chunk_rules = ChunkRule("<.*>+","chunk everthing")

In [15]:
reg_parser = RegexpParser('''
NP : {<DT>? <JJ>* <NN>*}
P : {<IN>}
V : {<V.*>}
PP : {<P> <NP>}
VP : {<V> <NP|PP>*}
''')

In [16]:
# NP
# Preposition
# Verb
# PP -> P NP
# VP -> V {NP|PP}*

In [17]:
test_ = "Mr. Carl Sagan played a big role in the science disclosure"
test_pos = nltk.pos_tag(nltk.word_tokenize(test_))

In [18]:
_out = reg_parser.parse(test_pos)

In [19]:
print(_out)

(S
  Mr./NNP
  Carl/NNP
  Sagan/NNP
  (VP
    (V played/VBD)
    (NP a/DT big/JJ role/NN)
    (PP (P in/IN) (NP the/DT science/NN disclosure/NN))))


## Dependency parsing

In [20]:
#The concept is that each word is connected with each other by a directed link.

In [21]:
#Stanford Parser
from nltk.parse.stanford import StanfordParser

In [37]:
jar = '../Resources/stanford-parser-full/stanford-parser.jar'
model = '../Resources/stanford-parser-full/stanford-parser-3.8.0-models.jar'
parser = StanfordParser(model, jar)
output = parser.raw_parse_sents(("This is english parse test").split())

In [38]:
print([list(i)[0] for i in output])

[Tree('ROOT', [Tree('FRAG', [Tree('NP', [Tree('DT', ['This'])])])]), Tree('ROOT', [Tree('FRAG', [Tree('VP', [Tree('VBZ', ['is'])])])]), Tree('ROOT', [Tree('S', [Tree('VP', [Tree('VB', ['english'])])])]), Tree('ROOT', [Tree('NP', [Tree('NN', ['parse'])])]), Tree('ROOT', [Tree('NP', [Tree('NN', ['test'])])])]


## Reference:

https://nlp.stanford.edu/software/lex-parser.shtml