In [1]:
import nltk
from nltk.util import ngrams
from nltk import pos_tag, word_tokenize, RegexpParser, Tree
from nltk.tokenize import PunktSentenceTokenizer

In [2]:
# nltk.download('tagsets')
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [3]:
grammar = r"""
  NP: {<DT>?<JJ>*<NN|NNS|NNP|NNPS><NN|NNS|NNP|NNPS>*}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """

chunker = RegexpParser(grammar)

In [4]:
def chunk(corpus):
    tagged = pos_tag(word_tokenize(corpus))
    output = chunker.parse(tagged)
    return str(output)

In [5]:
# output.draw()

In [6]:
tokenizer = PunktSentenceTokenizer()

def tag_pos(corpus):

    tokenized = tokenizer.tokenize(corpus)

    try:
        for sent in tokenized:
            words = nltk.word_tokenize(sent)
            tagged = nltk.pos_tag(words)
            return tagged
    except Exception as e:
        return str(e)

In [7]:
def tag_and_chunk(corpus):
    print(chunk(corpus))
    print("\n")
    print(tag_pos(corpus))

# Noun Phrase

Linguistics for English Language Teaching: Sounds, Words, and Sentences 

In [8]:
noun_phrases = [ "John",
 "mailmen",
 "most students", 
 "many Americans",
 "a huge, loveable bear",
 "a student from brazil",
 "the table in the corner",
 "the people we interviewed",
"John and his friends"]

for np in noun_phrases:
    np = np.capitalize()
    np += " laughed at the cat."
    print(tag_and_chunk(np))
    print("\n")


(S (NP John/NNP) laughed/VBD (PP at/IN (NP the/DT cat/NN)) ./.)


[('John', 'NNP'), ('laughed', 'VBD'), ('at', 'IN'), ('the', 'DT'), ('cat', 'NN'), ('.', '.')]
None


(S (NP Mailmen/NNS) laughed/VBD (PP at/IN (NP the/DT cat/NN)) ./.)


[('Mailmen', 'NNS'), ('laughed', 'VBD'), ('at', 'IN'), ('the', 'DT'), ('cat', 'NN'), ('.', '.')]
None


(S
  Most/JJS
  (NP students/NNS)
  laughed/VBN
  (PP at/IN (NP the/DT cat/NN))
  ./.)


[('Most', 'JJS'), ('students', 'NNS'), ('laughed', 'VBN'), ('at', 'IN'), ('the', 'DT'), ('cat', 'NN'), ('.', '.')]
None


(S
  (NP Many/JJ americans/NNS)
  laughed/VBN
  (PP at/IN (NP the/DT cat/NN))
  ./.)


[('Many', 'JJ'), ('americans', 'NNS'), ('laughed', 'VBN'), ('at', 'IN'), ('the', 'DT'), ('cat', 'NN'), ('.', '.')]
None


(S
  A/DT
  huge/JJ
  ,/,
  (NP loveable/JJ bear/NN)
  laughed/VBN
  (PP at/IN (NP the/DT cat/NN))
  ./.)


[('A', 'DT'), ('huge', 'JJ'), (',', ','), ('loveable', 'JJ'), ('bear', 'NN'), ('laughed', 'VBN'), ('at', 'IN'), ('the', 'DT'), ('cat

Many and most are considred as adverbs, but they can also be quantifiers.
His can be pronoun and determiner, here it is treated as pronoun.

English Syntax: An Introduction

In [9]:
sentences = [ "His friend learned dancing.",
 "My bother’s friend learned dancing.",
 "The president’s bodyguard learned surveillance.", 
 "The King of Rock and Roll’s records led to dancing."]

for s in sentences:
    print(tag_and_chunk(s))
    print("\n")

(S His/PRP$ (NP friend/NN) learned/VBD (NP dancing/NN) ./.)


[('His', 'PRP$'), ('friend', 'NN'), ('learned', 'VBD'), ('dancing', 'NN'), ('.', '.')]
None


(S
  My/PRP$
  (NP bother/NN ’/NNP s/NN friend/NN)
  learned/VBD
  (NP dancing/NN)
  ./.)


[('My', 'PRP$'), ('bother', 'NN'), ('’', 'NNP'), ('s', 'NN'), ('friend', 'NN'), ('learned', 'VBD'), ('dancing', 'NN'), ('.', '.')]
None


(S
  (NP The/DT president/NN ’/NNP)
  s/VBZ
  bodyguard/RB
  learned/VBN
  (NP surveillance/NN)
  ./.)


[('The', 'DT'), ('president', 'NN'), ('’', 'NNP'), ('s', 'VBZ'), ('bodyguard', 'RB'), ('learned', 'VBN'), ('surveillance', 'NN'), ('.', '.')]
None


(S
  (NP The/DT King/NNP)
  (PP of/IN (NP Rock/NNP))
  and/CC
  (NP Roll/NNP ’/NNP s/NN records/NNS)
  led/VBD
  to/TO
  dancing/VBG
  ./.)


[('The', 'DT'), ('King', 'NNP'), ('of', 'IN'), ('Rock', 'NNP'), ('and', 'CC'), ('Roll', 'NNP'), ('’', 'NNP'), ('s', 'NN'), ('records', 'NNS'), ('led', 'VBD'), ('to', 'TO'), ('dancing', 'VBG'), ('.', '.')]
None




In [11]:
compund_noun = "I am at the bus stop."
tag_and_chunk(compund_noun)

(S I/PRP am/VBP (PP at/IN (NP the/DT bus/NN stop/NN)) ./.)


[('I', 'PRP'), ('am', 'VBP'), ('at', 'IN'), ('the', 'DT'), ('bus', 'NN'), ('stop', 'NN'), ('.', '.')]


## Possesive Determiner

Possesive determiner should be seperated from possesive pronoun. His, hers, mine is not followed by another noun, but his, her and my are.

In [25]:
pos_dets = ["my", "our", "your", "his", "her", "their", "its"]

for pos_dt in pos_dets:
    sent = pos_dt + " learned dancing"
    sent = sent.capitalize()
    tag_and_chunk(sent)

(S My/PRP$ learned/VBD dancing/VBG)


[('My', 'PRP$'), ('learned', 'VBD'), ('dancing', 'VBG')]
(S Our/PRP$ (VP learned/VBD (NP dancing/NN)))


[('Our', 'PRP$'), ('learned', 'VBD'), ('dancing', 'NN')]
(S Your/PRP$ (VP learned/VBD (NP dancing/NN)))


[('Your', 'PRP$'), ('learned', 'VBD'), ('dancing', 'NN')]
(S His/PRP$ (VP learned/VBD (NP dancing/NN)))


[('His', 'PRP$'), ('learned', 'VBD'), ('dancing', 'NN')]
(S Her/PRP$ (VP learned/VBD (NP dancing/NN)))


[('Her', 'PRP$'), ('learned', 'VBD'), ('dancing', 'NN')]
(S Their/PRP$ (VP learned/VBD (NP dancing/NN)))


[('Their', 'PRP$'), ('learned', 'VBD'), ('dancing', 'NN')]
(S Its/PRP$ (VP learned/VBD (NP dancing/NN)))


[('Its', 'PRP$'), ('learned', 'VBD'), ('dancing', 'NN')]


In every instance, considering possesive determiners as pronouns causes chunking issues in noun phrase.

In [26]:
tag_and_chunk("Cat friend learned dancing.")

(S (NP Cat/NNP friend/NN) learned/VBD (NP dancing/NN) ./.)


[('Cat', 'NNP'), ('friend', 'NN'), ('learned', 'VBD'), ('dancing', 'NN'), ('.', '.')]


Because the grammar set earlier already includes compound noun, converting the possesive determiners to a random noun fixes the noun phrase chunking issue in possesive determiner + noun.

## Demonstrative determiners

In [28]:
s_dem_dets = ["this", "that"]

for s_dem in s_dem_dets:
    sent = s_dem + " cat can swim."
    sent = sent.capitalize()
    tag_and_chunk(sent)

(S (NP This/DT cat/NN) can/MD swim/VB ./.)


[('This', 'DT'), ('cat', 'NN'), ('can', 'MD'), ('swim', 'VB'), ('.', '.')]
(S (NP That/DT cat/NN) can/MD swim/VB ./.)


[('That', 'DT'), ('cat', 'NN'), ('can', 'MD'), ('swim', 'VB'), ('.', '.')]


In [31]:
p_dem_dets = ["these", "those"]

for p_dem in p_dem_dets:
    sent = p_dem + " cats can swim."
    sent = sent.capitalize()
    tag_and_chunk(sent)

(S (NP These/DT cats/NNS) can/MD swim/VB ./.)


[('These', 'DT'), ('cats', 'NNS'), ('can', 'MD'), ('swim', 'VB'), ('.', '.')]
(S (NP Those/DT cats/NNS) can/MD swim/VB ./.)


[('Those', 'DT'), ('cats', 'NNS'), ('can', 'MD'), ('swim', 'VB'), ('.', '.')]


No issue for demonstrative determiners.

## Articles (Can be considered as determiners)

In [32]:
art_nouns = ["the cat", "a cat", "an orange"]

for art_noun in art_nouns:
    sent = art_noun + " can fly."
    sent = sent.capitalize()
    tag_and_chunk(sent)

(S (NP The/DT cat/NN) can/MD fly/VB ./.)


[('The', 'DT'), ('cat', 'NN'), ('can', 'MD'), ('fly', 'VB'), ('.', '.')]
(S (NP A/DT cat/NN) can/MD fly/VB ./.)


[('A', 'DT'), ('cat', 'NN'), ('can', 'MD'), ('fly', 'VB'), ('.', '.')]
(S (NP An/DT orange/NN) can/MD fly/VB ./.)


[('An', 'DT'), ('orange', 'NN'), ('can', 'MD'), ('fly', 'VB'), ('.', '.')]


No issue for articles.

## Quantifiers (as determiners)

In [None]:
quantifiers = ["all", "some", "many", "lot", "lots", "ton", "tons", "bit", "no", "every", "enough", "little",
 "much", "more", "most", "plenty", "several", "few", "fewer" ]

In [13]:
tag_and_chunk("I love blue.")
tag_and_chunk("The blue car is gone.")

(S I/PRP love/VBP (NP blue/NN) ./.)


[('I', 'PRP'), ('love', 'VBP'), ('blue', 'NN'), ('.', '.')]
(S (NP The/DT blue/NN car/NN) is/VBZ gone/VBN ./.)


[('The', 'DT'), ('blue', 'NN'), ('car', 'NN'), ('is', 'VBZ'), ('gone', 'VBN'), ('.', '.')]


It cannot distinguish between colour as a noun vs. as an adjective.

In [12]:
tag_and_chunk("I can help you.")
tag_and_chunk("The tin can is red.")
tag_and_chunk("The tin cans are red.")

(S I/PRP can/MD help/VB you/PRP ./.)


[('I', 'PRP'), ('can', 'MD'), ('help', 'VB'), ('you', 'PRP'), ('.', '.')]
(S (NP The/DT tin/NN) can/MD is/VBZ red/JJ ./.)


[('The', 'DT'), ('tin', 'NN'), ('can', 'MD'), ('is', 'VBZ'), ('red', 'JJ'), ('.', '.')]
(S (NP The/DT tin/NN cans/NNS) are/VBP red/JJ ./.)


[('The', 'DT'), ('tin', 'NN'), ('cans', 'NNS'), ('are', 'VBP'), ('red', 'JJ'), ('.', '.')]


It can detect cans, but not distinguish between can as a modal verb and can as a singular noun.

In [23]:
tag_and_chunk("Test your might.")
tag_and_chunk("Test your mights.")
tag_and_chunk("I might be late.")

(S (NP Test/NNP) your/PRP$ might/MD ./.)


[('Test', 'NNP'), ('your', 'PRP$'), ('might', 'MD'), ('.', '.')]
(S Test/VB your/PRP$ (NP mights/NNS) ./.)


[('Test', 'VB'), ('your', 'PRP$'), ('mights', 'NNS'), ('.', '.')]
(S I/PRP might/MD be/VB late/RB ./.)


[('I', 'PRP'), ('might', 'MD'), ('be', 'VB'), ('late', 'RB'), ('.', '.')]


# Coordinating Conjunction

https://grammar.yourdictionary.com/parts-of-speech/conjunctions/coordinating-conjunctions.html

In [15]:
tag_and_chunk("I go to the park every Sunday.")
tag_and_chunk("I long to see his face.")
for_conj = "I go to the park every Sunday, for I long to see his face."
tag_and_chunk(for_conj)

(S
  I/PRP
  go/VBP
  to/TO
  (NP the/DT park/NN)
  (NP every/DT Sunday/NNP)
  ./.)


[('I', 'PRP'), ('go', 'VBP'), ('to', 'TO'), ('the', 'DT'), ('park', 'NN'), ('every', 'DT'), ('Sunday', 'NNP'), ('.', '.')]
(S I/PRP long/RB to/TO see/VB his/PRP$ (NP face/NN) ./.)


[('I', 'PRP'), ('long', 'RB'), ('to', 'TO'), ('see', 'VB'), ('his', 'PRP$'), ('face', 'NN'), ('.', '.')]
(S
  I/PRP
  go/VBP
  to/TO
  (NP the/DT park/NN)
  (NP every/DT Sunday/NNP)
  ,/,
  for/IN
  I/PRP
  long/JJ
  to/TO
  see/VB
  his/PRP$
  (NP face/NN)
  ./.)


[('I', 'PRP'), ('go', 'VBP'), ('to', 'TO'), ('the', 'DT'), ('park', 'NN'), ('every', 'DT'), ('Sunday', 'NNP'), (',', ','), ('for', 'IN'), ('I', 'PRP'), ('long', 'JJ'), ('to', 'TO'), ('see', 'VB'), ('his', 'PRP$'), ('face', 'NN'), ('.', '.')]


NLTK treats for as an adverb, but it can also be coordinating conjuction.

In [16]:
and_conj = "I like to read, and I write in my journal every night."
tag_and_chunk(and_conj)

(S
  I/PRP
  like/VBP
  to/TO
  read/VB
  ,/,
  and/CC
  I/PRP
  write/VBP
  in/IN
  my/PRP$
  (NP journal/NN)
  (NP every/DT night/NN)
  ./.)


[('I', 'PRP'), ('like', 'VBP'), ('to', 'TO'), ('read', 'VB'), (',', ','), ('and', 'CC'), ('I', 'PRP'), ('write', 'VBP'), ('in', 'IN'), ('my', 'PRP$'), ('journal', 'NN'), ('every', 'DT'), ('night', 'NN'), ('.', '.')]


In [17]:
and_conj = "You should invite Mario and Estefan to the party."
tag_and_chunk(and_conj)

(S
  You/PRP
  should/MD
  invite/VB
  (NP Mario/NNP)
  and/CC
  (NP Estefan/NNP)
  to/TO
  (NP the/DT party/NN)
  ./.)


[('You', 'PRP'), ('should', 'MD'), ('invite', 'VB'), ('Mario', 'NNP'), ('and', 'CC'), ('Estefan', 'NNP'), ('to', 'TO'), ('the', 'DT'), ('party', 'NN'), ('.', '.')]


And is accurate.

In [18]:
nor_conj = "My sister doesn’t like to study, nor does she take notes in class."
tag_and_chunk(nor_conj)

(S
  My/PRP$
  (NP sister/NN doesn/NN ’/NNP t/NN)
  like/IN
  to/TO
  study/VB
  ,/,
  nor/CC
  does/VBZ
  she/PRP
  take/VB
  (NP notes/NNS)
  (PP in/IN (NP class/NN))
  ./.)


[('My', 'PRP$'), ('sister', 'NN'), ('doesn', 'NN'), ('’', 'NNP'), ('t', 'NN'), ('like', 'IN'), ('to', 'TO'), ('study', 'VB'), (',', ','), ('nor', 'CC'), ('does', 'VBZ'), ('she', 'PRP'), ('take', 'VB'), ('notes', 'NNS'), ('in', 'IN'), ('class', 'NN'), ('.', '.')]


Nor is accurate

In [19]:
but_conj = "Television is a wonderful escape, but it interferes with my writing."
tag_and_chunk(but_conj)

(S
  (NP Television/NN)
  is/VBZ
  (NP a/DT wonderful/JJ escape/NN)
  ,/,
  but/CC
  it/PRP
  interferes/VBZ
  with/IN
  my/PRP$
  (NP writing/NN)
  ./.)


[('Television', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('wonderful', 'JJ'), ('escape', 'NN'), (',', ','), ('but', 'CC'), ('it', 'PRP'), ('interferes', 'VBZ'), ('with', 'IN'), ('my', 'PRP$'), ('writing', 'NN'), ('.', '.')]


In [20]:
or_conj = "We could have dinner before the movie, or we could grab a bite afterward."
tag_and_chunk(or_conj)

(S
  We/PRP
  could/MD
  have/VB
  dinner/VBN
  (PP before/IN (NP the/DT movie/NN))
  ,/,
  or/CC
  we/PRP
  could/MD
  grab/VB
  (NP a/DT bite/JJ afterward/NN)
  ./.)


[('We', 'PRP'), ('could', 'MD'), ('have', 'VB'), ('dinner', 'VBN'), ('before', 'IN'), ('the', 'DT'), ('movie', 'NN'), (',', ','), ('or', 'CC'), ('we', 'PRP'), ('could', 'MD'), ('grab', 'VB'), ('a', 'DT'), ('bite', 'JJ'), ('afterward', 'NN'), ('.', '.')]


In [21]:
or_conj = "I can’t decide if I should study economics or political science."
tag_and_chunk(or_conj)

(S
  I/PRP
  can/MD
  ’/VB
  t/JJ
  decide/IN
  if/IN
  I/PRP
  should/MD
  study/VB
  (NP economics/NNS)
  or/CC
  (NP political/JJ science/NN)
  ./.)


[('I', 'PRP'), ('can', 'MD'), ('’', 'VB'), ('t', 'JJ'), ('decide', 'IN'), ('if', 'IN'), ('I', 'PRP'), ('should', 'MD'), ('study', 'VB'), ('economics', 'NNS'), ('or', 'CC'), ('political', 'JJ'), ('science', 'NN'), ('.', '.')]


In [22]:
yet_conj = "I always take a book to the beach, yet I never seem to turn a single page."
tag_and_chunk(yet_conj)

(S
  I/PRP
  always/RB
  take/VBP
  (NP a/DT book/NN)
  to/TO
  (NP the/DT beach/NN)
  ,/,
  yet/RB
  I/PRP
  never/RB
  seem/VBP
  to/TO
  turn/VB
  (NP a/DT single/JJ page/NN)
  ./.)


[('I', 'PRP'), ('always', 'RB'), ('take', 'VBP'), ('a', 'DT'), ('book', 'NN'), ('to', 'TO'), ('the', 'DT'), ('beach', 'NN'), (',', ','), ('yet', 'RB'), ('I', 'PRP'), ('never', 'RB'), ('seem', 'VBP'), ('to', 'TO'), ('turn', 'VB'), ('a', 'DT'), ('single', 'JJ'), ('page', 'NN'), ('.', '.')]
