In [12]:
import spacy

def extract_noun_phrases(sentence):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(sentence)
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    for c in doc.noun_chunks:
        print(c.text, c.root.text, c.root.dep_, c.root.head.text)
    return noun_phrases

sentence = "A blue horse took his gun and shot at the bank official"
noun_phrases = extract_noun_phrases(sentence)
print(noun_phrases)

A blue horse horse nsubj took
his gun gun dobj took
the bank official official pobj at
['A blue horse', 'his gun', 'the bank official']


In [3]:
import nltk

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [5]:
from nltk import pos_tag, word_tokenize

def extract_phrases_nltk(sentence):
    tokens = word_tokenize(sentence)
    tagged_tokens = pos_tag(tokens)

    phrases = []
    current_phrase = []

    for i, (word, pos) in enumerate(tagged_tokens):
        current_phrase.append(word)
        
        # Check if the current word is a noun
        if pos.startswith("NN"):
            # Check if the previous word is a verb
            if i > 0 and tagged_tokens[i - 1][1].startswith("VB"):
                phrases.append(' '.join(current_phrase[-2:]))
            else:
                phrases.append(word)

            current_phrase = []

    return phrases

sentence = "A blue horse took his gun and shot at the bank official"
phrases = extract_phrases_nltk(sentence)
print(phrases)

['horse', 'gun', 'shot', 'bank', 'official']


In [10]:
import spacy

def extract_phrases(sentence):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(sentence)

    
    phrases = []
    for chunk in doc.noun_chunks:
        # Check if the root of the noun chunk has a left sibling that is a verb
        left_sibling = chunk.root.head
        if left_sibling.dep_ == 'ROOT' or left_sibling.pos_ == 'VERB':
            # Combine the verb and its associated words with the noun chunk
            combined_phrase = ' '.join([t.text for t in left_sibling.lefts] + [left_sibling.text] + [chunk.text])
            phrases.append(combined_phrase)
        else:
            phrases.append(chunk.text)

    return phrases

sentence = "A blue horse took his gun and shot at the bank official"
phrases = extract_phrases(sentence)
print(phrases)

['horse took A blue horse', 'horse took his gun', 'the bank official']


In [11]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
            chunk.root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward
