## 1. Import the libraries

In [1]:
from nltk.tokenize import word_tokenize
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk

# Downloads the NLTK package
nltk.download('stopwords')
nltk.download('punkt')

corpus = [
    'Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.',
    "Apple is an American multinational technology company headquartered in Cupertino, California.",
    "Pollock also served as Chairman and CEO of the Toronto Blue Jays baseball club.",
]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leticiachoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/leticiachoo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 2. Some idea of POS tagging and CFG

Sources: [nltk (CFG)](https://www.nltk.org/book/ch08.html)

Some ideas of the POS tags,

| Tag | Description |
| -- | -- |
| PRP | Personal pronoun |
| PRP$ |  |
| VBP |	Verb, non-3rd person singular present |
| DT | Determiner | 
| NN | Noun, singular or mass |

For more information, [read here (as it uses the Penn data bank)](https://cs.nyu.edu/~grishman/jet/guide/PennPOS.html)

In [2]:
sent = word_tokenize('I shot an elephant in my pajamas')

# Display the POS tagging.
print(pd.DataFrame(nltk.pos_tag(sent), columns=['Word', 'Tag']))

# Using the current produced rule to produce the tree.
grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")

print('\n\nTree:\n')

for tree in nltk.ChartParser(grammar).parse(sent):
    print(tree)

       Word   Tag
0         I   PRP
1      shot   VBP
2        an    DT
3  elephant    NN
4        in    IN
5        my  PRP$
6   pajamas    NN


Tree:

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


## 3. Using the CoreNLP (Stanford) Parser to determine structure

Credits: [Benjamin Bengfort](https://bbengfort.github.io/2018/06/corenlp-nltk-parses/)

In [3]:
from nltk.internals import find_jars_within_path
from nltk.parse.corenlp import CoreNLPServer, CoreNLPParser
import os

# Modify to own stanford ner. 
models_dir_path = "stanford-corenlp-4.2.2"

# Create the server
server = CoreNLPServer(
   os.path.join(models_dir_path, "stanford-corenlp-4.2.2.jar"),
   os.path.join(models_dir_path, "stanford-corenlp-4.2.2-models.jar")
)

# Start the server in the background
server.start()

In [4]:
from nltk.parse.corenlp import CoreNLPParser

parser = CoreNLPParser()

for sent in corpus:
    print(f'Sentence: \n"{sent}"\n')
    print('POS tag: ', pd.DataFrame(nltk.pos_tag(word_tokenize(sent))), '\n')
    print('Tree:\n')

    for tree in parser.raw_parse(sent):
        print(tree)

    print('\n\n')

Sentence: 
"Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal."

POS tag:               0    1
0    Christine  NNP
1      Lagarde  NNP
2    discussed  VBD
3   short-term   JJ
4     stimulus   NN
5      efforts  NNS
6           in   IN
7            a   DT
8       recent   JJ
9    interview   NN
10        with   IN
11         the   DT
12        Wall  NNP
13      Street  NNP
14     Journal  NNP
15           .    . 

Tree:

(ROOT
  (S
    (NP (NNP Christine) (NNP Lagarde))
    (VP
      (VBD discussed)
      (NP
        (NML (JJ short) (HYPH -) (NN term))
        (NN stimulus)
        (NNS efforts))
      (PP (IN in) (NP (DT a) (JJ recent) (NN interview)))
      (PP
        (IN with)
        (NP (DT the) (NML (NNP Wall) (NNP Street)) (NNP Journal))))
    (. .)))



Sentence: 
"Apple is an American multinational technology company headquartered in Cupertino, California."

POS tag:                  0    1
0           Apple  NNP
1      

In [5]:
server.stop()