In [3]:
import nltk
from nltk import RegexpParser

# Download the missing resource
nltk.download('averaged_perceptron_tagger')

# Examples
sentences = [
    "George Orwell was an influential British writer and journalist.",
    "Orwell served in the Indian Imperial Police in Burma.",
    "Orwell's literary career flourished in the 1940s.",
    "Orwell was a master of the essay form.",
    "Orwell wrote influential essays on a wide range of topics."
]

# Constituency Tests
constituency_results = {
    "i": ["an influential British writer and journalist"],
    "ii": ["George Orwell", "an influential British writer and journalist"],
    "iii": ["in the Indian Imperial Police in Burma"],
    "iv": ["Orwell", "in the Indian Imperial Police in Burma"],
    "v": ["in the 1940s"],
    "vi": ["Orwell's literary career", "in the 1940s"],
    "vii": ["a master of the essay form"],
    "viii": ["Orwell", "a master of the essay form"],
    "ix": ["influential essays", "on a wide range of topics"],
    "x": ["Orwell", "influential essays", "on a wide range of topics"]
}

# NLTK Constituency Parser
parser = RegexpParser('''
    NP: {<DT>?<JJ>*<NN.*>+}
    VP: {<VB.*><NP|PP|CLAUSE>+$}
    PP: {<IN><NP>}
    CLAUSE: {<NP><VP>}
''')

# Perform constituency parsing and compare with expected results
for key, sentence in zip(constituency_results.keys(), sentences):
    print(f"{key}. {sentence}")

    # Tokenize the sentence
    words = nltk.word_tokenize(sentence)

    # Part-of-speech tagging
    pos_tags = nltk.pos_tag(words)

    # Constituency parsing
    tree = parser.parse(pos_tags)

    # Print constituents
    print("Constituents:", [subtree.leaves() for subtree in tree.subtrees() if subtree.label() in ['NP', 'VP', 'PP', 'CLAUSE']])
    print("Expected:", constituency_results[key])
    print("\n")




[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


i. George Orwell was an influential British writer and journalist.
Constituents: [[('George', 'NNP'), ('Orwell', 'NNP')], [('an', 'DT'), ('influential', 'JJ'), ('British', 'JJ'), ('writer', 'NN')], [('journalist', 'NN')]]
Expected: ['an influential British writer and journalist']


ii. Orwell served in the Indian Imperial Police in Burma.
Constituents: [[('Orwell', 'NNP')], [('in', 'IN'), ('the', 'DT'), ('Indian', 'JJ'), ('Imperial', 'NNP'), ('Police', 'NNP')], [('the', 'DT'), ('Indian', 'JJ'), ('Imperial', 'NNP'), ('Police', 'NNP')], [('in', 'IN'), ('Burma', 'NNP')], [('Burma', 'NNP')]]
Expected: ['George Orwell', 'an influential British writer and journalist']


iii. Orwell's literary career flourished in the 1940s.
Constituents: [[('Orwell', 'NNP')], [('literary', 'JJ'), ('career', 'NN')]]
Expected: ['in the Indian Imperial Police in Burma']


iv. Orwell was a master of the essay form.
Constituents: [[('Orwell', 'NNP')], [('a', 'DT'), ('master', 'NN')], [('of', 'IN'), ('the', 'DT'),