In [1]:
import spacy
from spacy.matcher import PhraseMatcher
import time

# Load the medium English spaCy pipeline once; the analysis cells below all
# reuse this single `nlp` object both to tokenize patterns and to parse queries.
# NOTE(review): the "en_core_web_md" model package has to be installed in the
# kernel's environment for this call to succeed.
nlp = spacy.load("en_core_web_md")

In [46]:
# Vocabulary of known nouns, all lowercase, that the analysis cells below
# look for in the query sentence. Entries may span several words.
known_nouns = [
    "range",
    "price",
    "battery capacity",
    "end",
    "weight",
    "capacity",
]

# PhraseMatcher

In [24]:
start_time = time.time()

# Query under analysis. This cell previously read `sentence` from leftover
# kernel state, so it failed with NameError on a fresh kernel. The value below
# is reconstructed from the outputs recorded in this notebook (matched nouns,
# noun chunks and the cleaned query).
# NOTE(review): confirm this is the query the recorded outputs were produced from.
sentence = "light weight cheap phone in low price"

# Case-insensitive matcher: patterns and text are compared on the LOWER attribute.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# make_doc only runs the tokenizer, which is all a LOWER-based pattern needs;
# the full pipeline (tagger, parser, ...) is not run for every vocabulary entry.
patterns = [nlp.make_doc(text) for text in known_nouns]
matcher.add("KNOWN_NOUN", patterns)

# Parse the sentence
doc = nlp(sentence)
# The dependency root is the token that is its own head. `None` (instead of an
# IndexError) when the sentence is empty; in that case nothing below matches.
root = next((token for token in doc if token.head == token), None)

# Match known phrases in the sentence
matches = matcher(doc)

# Token indices to drop from the cleaned query; a set, so a token reached
# through several matches is only recorded once.
tokens_to_remove = set()
results = []

for match_id, start, end in matches:
    span = doc[start:end]
    matched_text = span.text
    matched_noun = span.text.lower()

    # True when any matched token is the root itself or hangs directly off it.
    is_part_of_root_noun = any(token == root or token.head == root for token in span)

    related_adjs = []
    related_adps = []

    for token in span:
        # The matched noun itself is always removed from the cleaned query.
        tokens_to_remove.add(token.i)
        for child in token.children:
            # Accept a child as an adjective by POS tag or by the "amod"
            # dependency label (covers modifiers the tagger did not label ADJ).
            if child.pos_ == "ADJ" or child.dep_ == "amod":
                related_adjs.append(child.text)
                tokens_to_remove.add(child.i)
            if child.pos_ == "ADP":
                related_adps.append(child.text)
                tokens_to_remove.add(child.i)

    results.append({
        "matched_noun": matched_noun,
        "matched_text": matched_text,
        # dict.fromkeys removes duplicates but keeps first-seen order, so the
        # printed lists are identical from run to run (a set's order is not).
        "adjectives": list(dict.fromkeys(related_adjs)),
        "adpositions": list(dict.fromkeys(related_adps)),
        "is_part_of_root": is_part_of_root_noun
    })
end_time = time.time()

# Print results
print(f"Inference time: {(end_time - start_time) * 1000:.2f} ms")

# Build cleaned query: every token that was not marked for removal, in order.
modified_query = " ".join(token.text for token in doc if token.i not in tokens_to_remove)

# Output
for entry in results:
    print(f"\n--- Info for matched noun: '{entry['matched_noun']}' ---")
    print("Matched Text:", entry["matched_text"])
    print("Adjectives:", entry["adjectives"])
    print("Adpositions:", entry["adpositions"])
    print("Is Part of Root:", entry["is_part_of_root"])

print("\n--- Modified Query (removed matched nouns and modifiers) ---")
print(modified_query)


Inference time: 39.80 ms

--- Info for matched noun: 'weight' ---
Matched Text: weight
Adjectives: ['light']
Adpositions: []
Is Part of Root: True

--- Info for matched noun: 'price' ---
Matched Text: price
Adjectives: ['low']
Adpositions: []
Is Part of Root: False

--- Modified Query (removed matched nouns and modifiers) ---
cheap phone in


# noun_chunks

In [13]:
start_time = time.time()

# Process the sentence
doc = nlp(sentence)
# The dependency root is the token that is its own head. `None` (instead of an
# IndexError) when the sentence is empty; in that case no chunk is visited.
root = next((token for token in doc if token.head == token), None)

# De-duplicated vocabulary.
known_nouns_set = set(known_nouns)
# Each known noun tokenized with the same tokenizer as the query, lowercased.
# Iterating in sorted order keeps the order of `results` stable between runs
# (plain iteration over a set of strings is not).
known_word_lists = {
    known: [tok.lower_ for tok in nlp.make_doc(known)]
    for known in sorted(known_nouns_set)
}

# Result storage
results = []
# Start from an empty set so this cell neither depends on nor mutates the
# `tokens_to_remove` left behind by another cell.
tokens_to_remove = set()

# Step 1: Extract all noun chunks from the query
for chunk in doc.noun_chunks:
    chunk_words = [token.lower_ for token in chunk]

    # A known noun matches when its words appear as a contiguous run of whole
    # tokens inside the chunk. A raw substring test would also fire on e.g.
    # "end" inside "weekend" or "range" inside "orange".
    for known, known_words in known_word_lists.items():
        width = len(known_words)
        found = width > 0 and any(
            chunk_words[offset:offset + width] == known_words
            for offset in range(len(chunk_words) - width + 1)
        )
        if not found:
            continue

        # True when any chunk token is the root itself or hangs directly off it.
        is_part_of_root_noun = any(token == root or token.head == root for token in chunk)

        related_adjs = []
        related_adps = []

        # Check children of every chunk token for adjectives and adpositions.
        for token in chunk:
            for child in token.children:
                # Same adjective test as the other analysis cells: POS tag
                # ADJ or dependency label "amod".
                if child.pos_ == "ADJ" or child.dep_ == "amod":
                    related_adjs.append(child.text)
                    tokens_to_remove.add(child.i)
                if child.pos_ == "ADP":
                    related_adps.append(child.text)
                    tokens_to_remove.add(child.i)

        results.append({
            "matched_noun": known,
            "matched_chunk": chunk.text,
            # dict.fromkeys removes duplicates but keeps first-seen order.
            "adjectives": list(dict.fromkeys(related_adjs)),
            "adpositions": list(dict.fromkeys(related_adps)),
            "is_part_of_root": is_part_of_root_noun
        })

end_time = time.time()

# Print results
print(f"Inference time: {(end_time - start_time) * 1000:.2f} ms")

# Output results
for entry in results:
    print(f"\n--- Info for noun: '{entry['matched_noun']}' ---")
    print("Matched Text:", entry["matched_chunk"])
    print("Adjectives:", entry["adjectives"])
    print("Adpositions:", entry["adpositions"])
    print("Is Part of Root:", entry["is_part_of_root"])

Inference time: 13.09 ms

--- Info for noun: 'weight' ---
Matched Text: light weight cheap phone
Adjectives: ['light', 'cheap']
Adpositions: ['in']
Is Part of Root: True

--- Info for noun: 'price' ---
Matched Text: low price
Adjectives: ['low']
Adpositions: []
Is Part of Root: False


# Standalone Token

In [48]:
start_time = time.time()
sentence = "light weight phone"
# Set for O(1) membership tests on single tokens. Multi-word entries in
# `known_nouns` can never equal a single token, so this approach skips them.
known_nouns_set = set(known_nouns)

doc = nlp(sentence)
# The dependency root is the token that is its own head. `None` (instead of an
# IndexError) when the sentence is empty; in that case no token is visited.
root = next((token for token in doc if token.head == token), None)

# Results and indices to remove
results = []
tokens_to_remove = set()

# Process each token (not chunks)
for token in doc:
    known = token.lower_
    if known not in known_nouns_set:
        continue

    # Computed per token. Previously this assignment was commented out while
    # the name was still read below, so the cell either raised NameError or
    # reported a stale value left over from another cell.
    is_part_of_root_noun = (token == root or token.head == root)

    related_adjs = []
    related_adps = []

    # Check for adjective and adposition children
    for child in token.children:
        if child.pos_ == "ADJ" or child.dep_ == "amod":
            related_adjs.append(child.text)
            tokens_to_remove.add(child.i)
        if child.pos_ == "ADP":
            related_adps.append(child.text)
            tokens_to_remove.add(child.i)

    # Mark noun itself for removal
    tokens_to_remove.add(token.i)

    results.append({
        "matched_noun": known,
        "matched_token": token.text,
        # dict.fromkeys removes duplicates but keeps first-seen order.
        "adjectives": list(dict.fromkeys(related_adjs)),
        "adpositions": list(dict.fromkeys(related_adps)),
        "is_part_of_root": is_part_of_root_noun
    })

end_time = time.time()

# Print results
print(f"Inference time: {(end_time - start_time) * 1000:.2f} ms")

# Reconstruct modified query (excluding removed tokens)
modified_query = " ".join(
    token.text for token in doc if token.i not in tokens_to_remove
)

# Output results
for entry in results:
    print(f"\n--- Info for noun: '{entry['matched_noun']}' ---")
    print("Matched Token:", entry["matched_token"])
    print("Adjectives:", entry["adjectives"])
    print("Adpositions:", entry["adpositions"])
    print("Is Part of Root:", entry["is_part_of_root"])

print("\n--- Modified Query (nouns and modifiers removed) ---")
print(modified_query)

Inference time: 8.72 ms

--- Info for noun: 'weight' ---
Matched Token: weight
Adjectives: []
Adpositions: []
Is Part of Root: False

--- Modified Query (nouns and modifiers removed) ---
light phone
