# Generic Noun

In [35]:
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_md")

In [59]:
# User's input query
sentence = "show me mid range phone with low price"

# Dictionary or list of known nouns (lowercased)
known_nouns = ["range", "price", "end"]

doc = nlp(sentence)
root = [token for token in doc if token.head == token][0]
known_nouns_set = set(known_nouns)

# Results and indices to remove
results = []
tokens_to_remove = set()

#print(doc.noun_chunks)
# Process noun chunks
for chunk in doc.noun_chunks:
    chunk_text = chunk.text.lower().strip()

    for known in known_nouns_set:
        if known in chunk_text:
            related_adjs = []
            related_adps = []
            is_part_of_root_noun = False

            # Check root relation
            for token in chunk:
                if token == root or token.head == root:
                    is_part_of_root_noun = True

            # Find ADJ/ADP from children of chunk tokens
            for token in chunk:
                for child in token.children:
                    if child.pos_ == "ADJ":
                        related_adjs.append(child.text)
                        tokens_to_remove.add(child.i)
                    elif child.pos_ == "ADP":
                        related_adps.append(child.text)
                        tokens_to_remove.add(child.i)

            # Mark entire noun chunk for removal
            for token in chunk:
               if str(token)==known:
                    tokens_to_remove.add(token.i)

            results.append({
                "matched_noun": known,
                "matched_chunk": chunk.text,
                "adjectives": list(set(related_adjs)),
                "adpositions": list(set(related_adps)),
                "is_part_of_root": is_part_of_root_noun
            })

# Reconstruct sentence without removed tokens
modified_query = " ".join([
    token.text for i, token in enumerate(doc) if i not in tokens_to_remove
])

# Output results
for entry in results:
    print(f"\n--- Info for noun: '{entry['matched_noun']}' ---")
    print("Matched Text:", entry["matched_noun"])
    print("Adjectives:", entry["adjectives"])
    print("Adpositions:", entry["adpositions"])
    print("Is Part of Root:", entry["is_part_of_root"])

print("\n--- Modified Query (noun chunks and modifiers removed) ---")
print(modified_query)


--- Info for noun: 'range' ---
Matched Text: range
Adjectives: ['mid']
Adpositions: ['with']
Is Part of Root: True

--- Info for noun: 'price' ---
Matched Text: price
Adjectives: ['low']
Adpositions: []
Is Part of Root: False

--- Modified Query (noun chunks and modifiers removed) ---
show me phone


In [57]:
type(str(token))

str

In [39]:
# User's input query
sentence = "show me mid range phone with low price"

# Dictionary or list of known nouns (lowercased)
known_nouns = ["range", "price", "end"]

# Process the sentence
doc = nlp(sentence)
root = [token for token in doc if token.head == token][0]
known_nouns_set = set(known_nouns)

# Results and indices to remove
results = []
tokens_to_remove = set()

# Process noun chunks
for chunk in doc.noun_chunks:
    for token in chunk:
        # Only check noun or proper noun tokens
        if token.pos_ in ["NOUN", "PROPN"] and token.text.lower() in known_nouns_set:
            related_adjs = []
            related_adps = []
            is_part_of_root_noun = token == root or token.head == root

            # Check children for adjectives/adpositions
            for child in token.children:
                if child.pos_ == "ADJ":
                    related_adjs.append(child.text)
                    tokens_to_remove.add(child.i)
                elif child.pos_ == "ADP":
                    related_adps.append(child.text)
                    tokens_to_remove.add(child.i)

            # Mark the matched noun itself
            tokens_to_remove.add(token.i)

            results.append({
                "matched_noun": token.text,
                "matched_chunk": chunk.text,
                "adjectives": list(set(related_adjs)),
                "adpositions": list(set(related_adps)),
                "is_part_of_root": is_part_of_root_noun
            })

# Reconstruct sentence without removed tokens
modified_query = " ".join([
    token.text for i, token in enumerate(doc) if i not in tokens_to_remove
])

# Output results
for entry in results:
    print(f"\n--- Info for noun: '{entry['matched_noun']}' ---")
    print("Matched Text:", entry["matched_chunk"])
    print("Adjectives:", entry["adjectives"])
    print("Adpositions:", entry["adpositions"])
    print("Is Part of Root:", entry["is_part_of_root"])

print("\n--- Modified Query (only matched nouns + modifiers removed) ---")
print(modified_query)



--- Info for noun: 'range' ---
Matched Text: mid range phone
Adjectives: []
Adpositions: []
Is Part of Root: False

--- Info for noun: 'price' ---
Matched Text: low price
Adjectives: ['low']
Adpositions: []
Is Part of Root: False

--- Modified Query (only matched nouns + modifiers removed) ---
show me mid phone with


# Dependency Tree Approach

In [3]:
import spacy
from spacy import displacy

# Load medium English model
nlp = spacy.load("en_core_web_md")

In [44]:
import spacy
from spacy import displacy

# Load medium English model
#nlp = spacy.load("en_core_web_md")
# Input sentence
sentence = "find stylish sunglasses for men"

# Process the sentence
doc = nlp(sentence)

# Define target noun phrase (case insensitive)
target_noun = "sunglasses"

# Variables to store results
related_adjs = []
related_adps = []
is_part_of_root_noun = False

# Identify root token
root = [token for token in doc if token.head == token][0]

# Find noun chunks to check for the target noun
for chunk in doc.noun_chunks:
    if target_noun in chunk.text.lower():
        
        # Check if it's directly the root or attached to the root
        for token in chunk:
            if token == root or token.head == root:
                is_part_of_root_noun = True
        
        # Find related ADJ and ADP via children
        for token in chunk:
            for child in token.children:
                if child.pos_ == "ADJ":
                    related_adjs.append(child.text)
                if child.pos_ == "ADP":
                    related_adps.append(child.text)

# Remove duplicates
related_adjs = list(set(related_adjs))
related_adps = list(set(related_adps))

# Print results
print(f"Adjectives related to '{target_noun}': {related_adjs}")
print(f"Adpositions related to '{target_noun}': {related_adps}")
print(f"Is '{target_noun}' part of the root noun or attached to the root? {is_part_of_root_noun}")


Adjectives related to 'sunglasses': ['stylish']
Adpositions related to 'sunglasses': ['for']
Is 'sunglasses' part of the root noun or attached to the root? True


In [None]:
import json
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Load Ground Truth Data
with open('test_queries.json', 'r') as f:
    test_data = json.load(f)

# 🔧 Dummy Infer Function (Replace with your actual extraction logic)
def infer(sentence):
    doc = nlp(sentence)
    
    # Identify root token
    root = [token for token in doc if token.head == token][0]
    
    results = []
    seen_nouns = set()
    id_counter = 1

    for chunk in doc.noun_chunks:
        noun_text = chunk.text.lower()

        # Avoid duplicates for multi-word noun chunks
        if noun_text in seen_nouns:
            continue
        seen_nouns.add(noun_text)

        related_adjs = []
        related_adps = []
        is_part_of_root_noun = False

        # Check root attachment
        for token in chunk:
            if token == root or token.head == root:
                is_part_of_root_noun = True

        # Find related adjectives and adpositions
        for token in chunk:
            for child in token.children:
                if child.pos_ == "ADJ":
                    related_adjs.append(child.text)
                if child.pos_ == "ADP":
                    related_adps.append(child.text)

        results.append({
            "id": id_counter,
            "noun": chunk.text.strip(),
            "adj": related_adjs[0] if related_adjs else "",
            "adp": related_adps[0] if related_adps else "",
            "part_of_root": is_part_of_root_noun
        })

        id_counter += 1

    return results
    return [noun_entry for noun_entry in next(item['nouns'] for item in test_data if item['input'] == query)]

# Evaluation Containers
true_adj = []
pred_adj = []

true_adp = []
pred_adp = []

true_root = []
pred_root = []

# Evaluation Loop
for item in test_data:
    query = item['input']
    gt_nouns = item['nouns']

    # Run your model's extraction logic
    predicted_nouns = infer(query)  # This should mirror the format: [{'noun': ..., 'adj': ..., 'adp': ..., 'part_of_root': ...}]

    # Match by noun text
    for gt in gt_nouns:
        # Find matching noun in predictions
        match = next((pred for pred in predicted_nouns if pred['noun'].lower() == gt['noun'].lower()), None)

        if match:
            true_adj.append(gt['adj'])
            pred_adj.append(match['adj'])

            true_adp.append(gt['adp'])
            pred_adp.append(match['adp'])

            true_root.append(gt['part_of_root'])
            pred_root.append(match['part_of_root'])
        else:
            # Noun not found in prediction - count as miss
            true_adj.append(gt['adj'])
            pred_adj.append('')  # No match predicted

            true_adp.append(gt['adp'])
            pred_adp.append('')

            true_root.append(gt['part_of_root'])
            pred_root.append(False)

# Helper to convert string matches to binary labels
def to_binary(true_list, pred_list):
    return [1 if t else 0 for t in true_list], [1 if p else 0 for p in pred_list]

# 🎯 Calculate Metrics for Adjectives
adj_true_bin, adj_pred_bin = to_binary([bool(a) for a in true_adj], [bool(a) for a in pred_adj])
print("Adjective Metrics:")
print(f"Precision: {precision_score(adj_true_bin, adj_pred_bin):.2f}")
print(f"Recall: {recall_score(adj_true_bin, adj_pred_bin):.2f}")
print(f"F1 Score: {f1_score(adj_true_bin, adj_pred_bin):.2f}")
print(f"Accuracy: {accuracy_score(adj_true_bin, adj_pred_bin):.2f}")
print()

# 🎯 Calculate Metrics for Adpositions
adp_true_bin, adp_pred_bin = to_binary([bool(a) for a in true_adp], [bool(a) for a in pred_adp])
print("Adposition Metrics:")
print(f"Precision: {precision_score(adp_true_bin, adp_pred_bin):.2f}")
print(f"Recall: {recall_score(adp_true_bin, adp_pred_bin):.2f}")
print(f"F1 Score: {f1_score(adp_true_bin, adp_pred_bin):.2f}")
print(f"Accuracy: {accuracy_score(adp_true_bin, adp_pred_bin):.2f}")
print()

# 🎯 Calculate Metrics for part_of_root
root_true_bin, root_pred_bin = to_binary(true_root, pred_root)
print("Part of Root Metrics:")
print(f"Precision: {precision_score(root_true_bin, root_pred_bin):.2f}")
print(f"Recall: {recall_score(root_true_bin, root_pred_bin):.2f}")
print(f"F1 Score: {f1_score(root_true_bin, root_pred_bin):.2f}")
print(f"Accuracy: {accuracy_score(root_true_bin, root_pred_bin):.2f}")


In [37]:
import spacy
import json
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Load spaCy model
#nlp = spacy.load("en_core_web_md")

# Inference Function
def infer(sentence, target_noun):
    doc = nlp(sentence)
    
    root = [token for token in doc if token.head == token][0]
    
    related_adjs = []
    related_adps = []
    is_part_of_root_noun = False

    target_noun = target_noun.lower().strip()

    for chunk in doc.noun_chunks:
        chunk_text = chunk.text.lower().strip()
        
        if target_noun in chunk_text:
            for token in chunk:
                if token == root or token.head == root:
                    is_part_of_root_noun = True

            for token in chunk:
                for child in token.children:
                    if child.pos_ == "ADJ":
                        related_adjs.append(child.text.lower().strip())
                    if child.pos_ == "ADP":
                        related_adps.append(child.text.lower().strip())
    
    # Remove duplicates
    related_adjs = list(set(related_adjs))
    related_adps = list(set(related_adps))

    return {
        "adj": related_adjs,
        "adp": related_adps,
        "part_of_root": is_part_of_root_noun
    }

# Load Ground Truth Data
with open('test_json.json', 'r') as f:
    test_data = json.load(f)

# Evaluation Containers
true_adj = []
pred_adj = []

true_adp = []
pred_adp = []

true_root = []
pred_root = []

# Evaluation Loop
for item in test_data:
    query = item['input']
    gt_nouns = item['nouns']

    for gt in gt_nouns:
        target_noun = gt['noun']
        
        pred = infer(query, target_noun)
        
        gt_adjs = [a.lower().strip() for a in gt['adj']]
        gt_adps = [a.lower().strip() for a in gt['adp']]

        pred_adjs = [a.lower().strip() for a in pred['adj']]
        pred_adps = [a.lower().strip() for a in pred['adp']]

        true_adj.extend([1] * len(gt_adjs))
        pred_adj.extend([1 if adj in pred_adjs else 0 for adj in gt_adjs])

        true_adp.extend([1] * len(gt_adps))
        pred_adp.extend([1 if adp in pred_adps else 0 for adp in gt_adps])

        true_root.append(gt['part_of_root'])
        pred_root.append(pred['part_of_root'])

# Metric Helper
def print_metrics(name, true_list, pred_list):
    if len(true_list) == 0:
        print(f"{name} Metrics: No ground truth labels provided.\n")
        return
    print(f"{name} Metrics:")
    print(f"Precision: {precision_score(true_list, pred_list):.2f}")
    print(f"Recall: {recall_score(true_list, pred_list):.2f}")
    print(f"F1 Score: {f1_score(true_list, pred_list):.2f}")
    print(f"Accuracy: {accuracy_score(true_list, pred_list):.2f}\n")

# 🎯 Output Metrics
print_metrics("Adjective", true_adj, pred_adj)
#print_metrics("Adposition", true_adp, pred_adp)
print_metrics("Part of Root", [int(r) for r in true_root], [int(r) for r in pred_root])


Adjective Metrics:
Precision: 1.00
Recall: 0.76
F1 Score: 0.87
Accuracy: 0.76

Part of Root Metrics:
Precision: 1.00
Recall: 0.80
F1 Score: 0.89
Accuracy: 0.91



# Final Script -- Batch Infer

In [40]:
import spacy
import json
import csv
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Load spaCy model
nlp = spacy.load("en_core_web_md")

# Inference Function
def infer(sentence, target_noun):
    doc = nlp(sentence)
    
    root = [token for token in doc if token.head == token][0]
    
    related_adjs = []
    related_adps = []
    is_part_of_root_noun = False

    target_noun = target_noun.lower().strip()

    for chunk in doc.noun_chunks:
        chunk_text = chunk.text.lower().strip()
        
        if target_noun in chunk_text:
            for token in chunk:
                if token == root or token.head == root:
                    is_part_of_root_noun = True

            for token in chunk:
                for child in token.children:
                    if child.pos_ == "ADJ":
                        related_adjs.append(child.text.lower().strip())
                    if child.pos_ == "ADP":
                        related_adps.append(child.text.lower().strip())
    
    related_adjs = list(set(related_adjs))
    related_adps = list(set(related_adps))

    return {
        "adj": related_adjs,
        "adp": related_adps,
        "part_of_root": is_part_of_root_noun
    }

# Load Ground Truth Data
with open('test_json.json', 'r') as f:
    test_data = json.load(f)

# Containers for Metrics and Reports
true_adj = []
pred_adj = []
adj_report = []

true_adp = []
pred_adp = []
adp_report = []

true_root = []
pred_root = []
root_report = []

# Evaluation Loop
for item in test_data:
    query = item['input']
    gt_nouns = item['nouns']

    for gt in gt_nouns:
        target_noun = gt['noun']
        
        pred = infer(query, target_noun)
        
        gt_adjs = [a.lower().strip() for a in gt['adj']]
        gt_adps = [a.lower().strip() for a in gt['adp']]

        pred_adjs = [a.lower().strip() for a in pred['adj']]
        pred_adps = [a.lower().strip() for a in pred['adp']]

        # Adjective Evaluation & Report
        for adj in gt_adjs:
            true_adj.append(1)
            pred_adj.append(1 if adj in pred_adjs else 0)
            adj_report.append({
                "input_query": query,
                "noun": target_noun,
                "gt_adj": adj,
                "pred_adj": ", ".join(pred_adjs),
                "match": adj in pred_adjs
            })

        # Adposition Evaluation & Report
        for adp in gt_adps:
            true_adp.append(1)
            pred_adp.append(1 if adp in pred_adps else 0)
            adp_report.append({
                "input_query": query,
                "noun": target_noun,
                "gt_adp": adp,
                "pred_adp": ", ".join(pred_adps),
                "match": adp in pred_adps
            })

        # Part of Root Evaluation & Report
        true_root.append(gt['part_of_root'])
        pred_root.append(pred['part_of_root'])
        root_report.append({
            "input_query": query,
            "noun": target_noun,
            "gt_part_of_root": gt['part_of_root'],
            "pred_part_of_root": pred['part_of_root'],
            "match": gt['part_of_root'] == pred['part_of_root']
        })

# Metric Helper
def print_metrics(name, true_list, pred_list):
    if len(true_list) == 0:
        print(f"{name} Metrics: No ground truth labels provided.\n")
        return
    print(f"{name} Metrics:")
    print(f"Precision: {precision_score(true_list, pred_list):.2f}")
    print(f"Recall: {recall_score(true_list, pred_list):.2f}")
    print(f"F1 Score: {f1_score(true_list, pred_list):.2f}")
    print(f"Accuracy: {accuracy_score(true_list, pred_list):.2f}\n")

# 🎯 Output Metrics
print_metrics("Adjective", true_adj, pred_adj)
print_metrics("Adposition", true_adp, pred_adp)
print_metrics("Part of Root", [int(r) for r in true_root], [int(r) for r in pred_root])

# Write CSV Reports
with open("adj_report.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["input_query", "noun", "gt_adj", "pred_adj", "match"])
    writer.writeheader()
    writer.writerows(adj_report)

with open("adp_report.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["input_query", "noun", "gt_adp", "pred_adp", "match"])
    writer.writeheader()
    writer.writerows(adp_report)

with open("part_of_root_report.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["input_query", "noun", "gt_part_of_root", "pred_part_of_root", "match"])
    writer.writeheader()
    writer.writerows(root_report)

print("✅ Reports generated: adj_report.csv, adp_report.csv, part_of_root_report.csv")


Adjective Metrics:
Precision: 1.00
Recall: 0.76
F1 Score: 0.87
Accuracy: 0.76

Adposition Metrics:
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
Accuracy: 0.00

Part of Root Metrics:
Precision: 1.00
Recall: 0.80
F1 Score: 0.89
Accuracy: 0.91

✅ Reports generated: adj_report.csv, adp_report.csv, part_of_root_report.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Standalone Test

In [11]:
import spacy
import json
import csv
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import time

# Load spaCy model
nlp = spacy.load("en_core_web_md")

# Inference Function
def infer(sentence, target_noun):
    doc = nlp(sentence)
    
    root = [token for token in doc if token.head == token][0]
    
    related_adjs = []
    related_adps = []
    is_part_of_root_noun = False

    target_noun = target_noun.lower().strip()

    for chunk in doc.noun_chunks:
        chunk_text = chunk.text.lower().strip()
        
        if target_noun in chunk_text:
            for token in chunk:
                if token == root or token.head == root:
                    is_part_of_root_noun = True

            for token in chunk:
                for child in token.children:
                    if child.pos_ == "ADJ":
                        related_adjs.append(child.text.lower().strip())
                    if child.pos_ == "ADP":
                        related_adps.append(child.text.lower().strip())
    
    related_adjs = list(set(related_adjs))
    related_adps = list(set(related_adps))

    return {
        "adj": related_adjs,
        "adp": related_adps,
        "part_of_root": is_part_of_root_noun
    }

In [15]:
query = "show me cheap and affordable samsung s24 with compatible charger"
target_noun = "samsung s24"
start_time = time.time()
pred = infer(query, target_noun)
end_time = time.time()
print(pred)
inference_time = (end_time - start_time) * 1000  # in milliseconds
print(f"Inference Time: {inference_time:.3f} ms")

{'adj': ['affordable', 'cheap'], 'adp': ['with'], 'part_of_root': True}
Inference Time: 8.258 ms


# Test1

In [18]:
import spacy
import time

# Load spaCy model
nlp = spacy.load("en_core_web_md")

# Bulk Inference Function
def infer_bulk(sentence, target_nouns):
    doc = nlp(sentence)
    root = [token for token in doc if token.head == token][0]

    results = {}

    # Preprocess: lowercase and strip target nouns
    target_nouns = [noun.lower().strip() for noun in target_nouns]

    for target_noun in target_nouns:
        related_adjs = []
        related_adps = []
        is_part_of_root_noun = False

        for chunk in doc.noun_chunks:
            chunk_text = chunk.text.lower().strip()
            if target_noun in chunk_text:
                for token in chunk:
                    if token == root or token.head == root:
                        is_part_of_root_noun = True

                for token in chunk:
                    for child in token.children:
                        if child.pos_ == "ADJ":
                            related_adjs.append(child.text.lower().strip())
                        if child.pos_ == "ADP":
                            related_adps.append(child.text.lower().strip())

        results[target_noun] = {
            "adj": list(set(related_adjs)),
            "adp": list(set(related_adps)),
            "part_of_root": is_part_of_root_noun
        }

    return results


In [22]:
query = "s24 ans s25 and s26 and s27"
target_nouns = ["s24", "s25", "s26", "s27"]

start_time = time.time()
predictions = infer_bulk(query, target_nouns)
end_time = time.time()

for noun, result in predictions.items():
    print(f"{noun} -> {result}")

inference_time = (end_time - start_time) * 1000  # in milliseconds
print(f"\nTotal Inference Time: {inference_time:.3f} ms")


s24 -> {'adj': [], 'adp': [], 'part_of_root': True}
s25 -> {'adj': [], 'adp': [], 'part_of_root': True}
s26 -> {'adj': [], 'adp': [], 'part_of_root': True}
s27 -> {'adj': [], 'adp': [], 'part_of_root': False}

Total Inference Time: 15.829 ms


# Test2

In [26]:
import spacy
import time

# Load spaCy model
nlp = spacy.load("en_core_web_md")

def infer_bulk_dep(sentence, target_nouns):
    doc = nlp(sentence)
    root = [token for token in doc if token.head == token][0]
    print(root)
    # Normalize target nouns
    target_nouns = [tn.lower().strip() for tn in target_nouns]
    results = {tn: {"adj": [], "adp": [], "part_of_root": False} for tn in target_nouns}

    # Build noun spans for all target nouns
    target_token_map = {}
    for noun in target_nouns:
        for i in range(len(doc)):
            for j in range(i+1, len(doc)+1):
                span = doc[i:j]
                if span.text.lower().strip() == noun:
                    for token in span:
                        target_token_map.setdefault(noun, set()).add(token)
    
    # Traverse tokens only once
    for token in doc:
        for noun, tokens in target_token_map.items():
            if token in tokens:
                # Check for adjective modifier
                for child in token.children:
                    if child.dep_ == "amod":  # adjectival modifier
                        results[noun]["adj"].append(child.text.lower())

                # Check for prepositional modifier
                for child in token.children:
                    if child.dep_ == "prep":
                        results[noun]["adp"].append(child.text.lower())

                # Check if this noun is part of the root phrase
                if token == root or token.head == root:
                    results[noun]["part_of_root"] = True

    # Remove duplicates
    for noun in results:
        results[noun]["adj"] = list(set(results[noun]["adj"]))
        results[noun]["adp"] = list(set(results[noun]["adp"]))

    return results


In [32]:
query = "cheap s24 and best s25 and latest s26 and s27"
target_nouns = ["s24", "s25", "s26", "s27"]

start_time = time.time()
predictions = infer_bulk_dep(query, target_nouns)
end_time = time.time()

for noun, result in predictions.items():
    print(f"{noun} -> {result}")

inference_time = (end_time - start_time) * 1000
print(f"\nTotal Inference Time: {inference_time:.3f} ms")


s24 -> {'adj': ['cheap'], 'adp': [], 'part_of_root': True}
s25 -> {'adj': ['best'], 'adp': [], 'part_of_root': True}
s26 -> {'adj': ['latest'], 'adp': [], 'part_of_root': False}
s27 -> {'adj': [], 'adp': [], 'part_of_root': False}

Total Inference Time: 11.883 ms


# Test3

In [34]:
import spacy
import time
from spacy.matcher import PhraseMatcher

# Load spaCy model
nlp = spacy.load("en_core_web_md")

def infer_optimized(sentence, target_nouns):
    doc = nlp(sentence)

    # Normalize target nouns
    target_nouns_norm = [noun.lower().strip() for noun in target_nouns]

    # Setup PhraseMatcher to find target noun spans
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    noun_patterns = [nlp.make_doc(noun) for noun in target_nouns_norm]
    matcher.add("TARGET_NOUNS", noun_patterns)

    matches = matcher(doc)

    # Map matched noun spans to their string keys
    span_map = {}  # noun_text -> set of token indices
    for match_id, start, end in matches:
        span_text = doc[start:end].text.lower().strip()
        span_map.setdefault(span_text, set()).update(range(start, end))

    # Preprocess: Find root token once
    root = next((token for token in doc if token.head == token), None)

    # Prepare results
    results = {noun: {"adj": [], "adp": [], "part_of_root": False} for noun in target_nouns_norm}

    # One-pass token analysis
    for token in doc:
        for noun, indices in span_map.items():
            if token.i in indices:
                # Check if part of root phrase
                if token == root or token.head == root:
                    results[noun]["part_of_root"] = True

                # Collect modifiers
                for child in token.children:
                    if child.dep_ == "amod" and child.pos_ == "ADJ":
                        results[noun]["adj"].append(child.text.lower())
                    elif child.dep_ == "prep" and child.pos_ == "ADP":
                        results[noun]["adp"].append(child.text.lower())

    # Remove duplicates
    for noun in results:
        results[noun]["adj"] = list(set(results[noun]["adj"]))
        results[noun]["adp"] = list(set(results[noun]["adp"]))

    return results


In [38]:
if __name__ == "__main__":
    query = "cheap s24 and best s25 and latest s26 and s27"
    target_nouns = ["s24", "s25", "s26", "s27"]

    start_time = time.time()
    predictions = infer_optimized(query, target_nouns)
    end_time = time.time()

    for noun, result in predictions.items():
        print(f"{noun} -> {result}")

    inference_time = (end_time - start_time) * 1000
    print(f"\nTotal Inference Time: {inference_time:.3f} ms")


s24 -> {'adj': ['cheap'], 'adp': [], 'part_of_root': True}
s25 -> {'adj': ['best'], 'adp': [], 'part_of_root': True}
s26 -> {'adj': ['latest'], 'adp': [], 'part_of_root': False}
s27 -> {'adj': [], 'adp': [], 'part_of_root': False}

Total Inference Time: 7.514 ms


# Similarity Check

In [4]:
!pip install Levenshtein

Collecting Levenshtein
  Downloading levenshtein-0.27.1-cp312-cp312-win_amd64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein)
  Downloading rapidfuzz-3.13.0-cp312-cp312-win_amd64.whl.metadata (12 kB)
Downloading levenshtein-0.27.1-cp312-cp312-win_amd64.whl (100 kB)
Downloading rapidfuzz-3.13.0-cp312-cp312-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ---------------------------------------- 1.6/1.6 MB 12.4 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.27.1 rapidfuzz-3.13.0


In [14]:
import Levenshtein

def char_level_similarity(gt_noun, ner_noun, threshold=0.5):
    """
    Computes character-level similarity using normalized Levenshtein distance.
    
    Args:
        gt_noun (str): Ground truth string.
        ner_noun (str): Predicted string.
        threshold (float): Similarity threshold.
        
    Returns:
        str: 'yes' if similarity >= threshold, else 'no'.
    """
    gt = gt_noun.lower().strip()
    ner = ner_noun.lower().strip()

    if not gt or not ner:
        return "no"

    similarity = 1 - (Levenshtein.distance(gt, ner) / max(len(gt), len(ner)))

    return "yes" if similarity >= threshold else "no"


In [16]:
print(char_level_similarity("samsung s25 ultra", "s25 ultra"))         # likely 'yes'

yes


# Instatnt Test

In [13]:

# Your e-commerce query
query = "case for iphone"

# Process the query
doc = nlp(query)

# Print dependency tree details
print(f"{'Token':<10} {'Dep':<10} {'Head':<10} {'POS':<10} {'Children'}")
print("-" * 50)
for token in doc:
    children = [child.text for child in token.children]
    print(f"{token.text:<10} {token.dep_:<10} {token.head.text:<10} {token.pos_:<10} {children}")

# OPTIONAL: Visualize dependency tree (works in Jupyter or web contexts)
displacy.serve(doc, style="dep")

Token      Dep        Head       POS        Children
--------------------------------------------------
case       ROOT       case       NOUN       ['for']
for        prep       case       ADP        ['iphone']
iphone     pobj       for        NOUN       []



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


# POS Approach

In [3]:
import spacy

#nlp = spacy.load("en_core_web_sm")

# Define connectors with grammar-based rules
left_primary = {"with", "for", "under"}
right_primary = { "of", "to", "from", "about", "on", "by"}

def extract_info(text):
    doc = nlp(text)

    # Initialize outputs
    primary = {"main_noun": "", "prev_nouns": "", "adj": "", "adp": ""}
    secondary = {"main_noun": "", "prev_nouns": "", "adj": "", "adp": ""}

    connector_token = None

    # Step 1: Find the first relevant connector
    for token in doc:
        if token.text.lower() in left_primary.union(right_primary) and token.pos_ == "ADP":
            connector_token = token
            break

    if connector_token:
        # Process left side
        left_tokens = list(doc[:connector_token.i])
        left_main_noun, left_prev_nouns = "", ""
        for i in reversed(range(len(left_tokens))):
            token = left_tokens[i]
            if token.pos_ in {"NOUN", "PROPN","NUM"}:
                left_main_noun = token.text
                prev_nouns = []
                for j in reversed(range(i)):
                    if left_tokens[j].pos_ in {"NOUN", "PROPN","NUM"}:
                        prev_nouns.insert(0, left_tokens[j].text)
                    else:
                        break
                left_prev_nouns = " ".join(prev_nouns)
                break

        # Process right side
        right_tokens = list(doc[connector_token.i + 1:])
        right_main_noun, right_prev_nouns = "", ""
        for i, token in enumerate(right_tokens):
            if token.pos_ in {"NOUN", "PROPN","NUM"}:
                right_main_noun = token.text
                prev_nouns = []
                for j in reversed(range(i)):
                    if right_tokens[j].pos_ in {"NOUN", "PROPN","NUM"}:
                        prev_nouns.insert(0, right_tokens[j].text)
                    else:
                        break
                right_prev_nouns = " ".join(prev_nouns)
                break

        connector = connector_token.text.lower()

        # Assign primary and secondary based on connector
        if connector in left_primary:
            primary["main_noun"] = left_main_noun
            primary["prev_nouns"] = left_prev_nouns
            secondary["main_noun"] = right_main_noun
            secondary["prev_nouns"] = right_prev_nouns
        elif connector in right_primary:
            primary["main_noun"] = right_main_noun
            primary["prev_nouns"] = right_prev_nouns
            secondary["main_noun"] = left_main_noun
            secondary["prev_nouns"] = left_prev_nouns

        # Step 2: Find adjectives, adpositions, referring to nouns
        for token in doc:
            if token.head.text == primary["main_noun"]:
                if token.pos_ == "ADJ":
                    primary["adj"] = token.text
                if token.pos_ == "ADP":
                    primary["adp"] = token.text
            if token.head.text == secondary["main_noun"]:
                if token.pos_ == "ADJ":
                    secondary["adj"] = token.text
                if token.pos_ == "ADP":
                    secondary["adp"] = token.text

    return {
        "Primary main noun": primary["main_noun"],
        "Primary prev nouns": primary["prev_nouns"],
        "Adj ref Primary noun": primary["adj"],
        "ADP ref Primary noun": primary["adp"],
        "Secondary main noun": secondary["main_noun"],
        "Secondary prev nouns": secondary["prev_nouns"],
        "Adj ref Secondary noun": secondary["adj"],
        "ADP ref Secondary noun": secondary["adp"],
    }

# Test Example
text = "phone sort by price"
result = extract_info(text)
for key, value in result.items():
    print(f"{key}: {value if value else 'N/A'}")

Primary main noun: price
Primary prev nouns: N/A
Adj ref Primary noun: N/A
ADP ref Primary noun: N/A
Secondary main noun: sort
Secondary prev nouns: phone
Adj ref Secondary noun: N/A
ADP ref Secondary noun: by


In [33]:
import spacy

#nlp = spacy.load("en_core_web_sm")

# Define connectors with grammar-based rules
left_primary = {"with", "for", "under","by","on"}
right_primary = { }

noun_pos_tags = {"NOUN", "PROPN", "NUM"}

def extract_info(text):
    doc = nlp(text)

    primary = {"main_noun": "", "prev_nouns": "", "adj": "", "adp": ""}
    secondary = {"main_noun": "", "next_nouns": "", "adj": "", "adp": ""}

    connector_token = None

    # Find connector
    for token in doc:
        if token.text.lower() in left_primary.union(right_primary) and token.pos_ == "ADP":
            connector_token = token
            break

    if connector_token:
        left_tokens = list(doc[:connector_token.i])
        right_tokens = list(doc[connector_token.i + 1:])

        left_main_noun, left_prev_nouns = "", ""
        right_main_noun, right_next_nouns = "", ""

        # Find left main noun & previous consecutive nouns
        for i in reversed(range(len(left_tokens))):
            token = left_tokens[i]
            if token.pos_ in noun_pos_tags:
                left_main_noun = token.text
                prev_nouns = []
                for j in reversed(range(i)):
                    if left_tokens[j].pos_ in noun_pos_tags:
                        prev_nouns.insert(0, left_tokens[j].text)
                    else:
                        break
                left_prev_nouns = " ".join(prev_nouns)
                break

        # Find right main noun & next consecutive nouns
        for i, token in enumerate(right_tokens):
            if token.pos_ in noun_pos_tags:
                right_main_noun = token.text
                next_nouns = []
                for j in range(i + 1, len(right_tokens)):
                    if right_tokens[j].pos_ in noun_pos_tags:
                        next_nouns.append(right_tokens[j].text)
                    else:
                        break
                right_next_nouns = " ".join(next_nouns)
                break

        connector = connector_token.text.lower()

        if connector in left_primary:
            primary["main_noun"] = left_main_noun
            primary["prev_nouns"] = left_prev_nouns
            secondary["main_noun"] = right_main_noun
            secondary["next_nouns"] = right_next_nouns
        elif connector in right_primary:
            primary["main_noun"] = right_main_noun
            primary["prev_nouns"] = right_next_nouns
            secondary["main_noun"] = left_main_noun
            secondary["next_nouns"] = left_prev_nouns

        # Find adjectives, adpositions, verbs referring to nouns
        for token in doc:
            if token.head.text == primary["main_noun"]:
                if token.pos_ == "ADJ":
                    primary["adj"] = token.text
                if token.pos_ == "ADP":
                    primary["adp"] = token.text

            if token.head.text == secondary["main_noun"]:
                if token.pos_ == "ADJ":
                    secondary["adj"] = token.text
                if token.pos_ == "ADP":
                    secondary["adp"] = token.text


    return {
        "Primary main noun": primary["main_noun"],
        "Primary prev nouns": primary["prev_nouns"],
        "Adj ref Primary noun": primary["adj"],
        "ADP ref Primary noun": primary["adp"],
        "Secondary main noun": secondary["main_noun"],
        "Secondary next nouns": secondary["next_nouns"],
        "Adj ref Secondary noun": secondary["adj"],
        "ADP ref Secondary noun": secondary["adp"],
    }

# Test Example
text = "s24 with comapatible charger"
result = extract_info(text)

# Pretty print
for key, value in result.items():
    print(f"{key}: {value if value else '-'}")


Primary main noun: s24
Primary prev nouns: -
Adj ref Primary noun: -
ADP ref Primary noun: with
Secondary main noun: charger
Secondary next nouns: -
Adj ref Secondary noun: comapatible
ADP ref Secondary noun: -


In [11]:
import spacy

# Input sentence
sentence = "sort by price"

# Process the sentence
doc = nlp(sentence)

# Print tokens and their POS tags
print(f"{'Token':15} {'POS':10} {'Explanation'}")
print("-" * 40)
for token in doc:
    print(f"{token.text:15} {token.pos_:10} {spacy.explain(token.pos_)}")


Token           POS        Explanation
----------------------------------------
sort            ADV        adverb
by              ADP        adposition
price           NOUN       noun


In [21]:
import random
import json

# Word banks
nouns = ["phone", "laptop", "charger", "headphones", "tablet", "watch", "camera", "speaker", "mic", "router", "printer", "monitor", "keyboard", "mouse", "projector", "powerbank", "case", "screen", "earbuds", "gamepad"]
adjs = ["cheap", "fast", "wireless", "affordable", "powerful", "compatible", "new", "slim", "durable", "portable", "lightweight", "stylish", "reliable", "advanced", "smart"]
nums = ["10000", "500", "65w", "256gb", "2tb", "50", "100", "2000", "300", "150"]
adps = ["with", "for", "under", "above", "beside", "without"]

# 50 sentence templates
templates = [
    "Give me {adj1} {noun1} with {adj2} {noun2}.",
    "I want {noun1} for {adj2} {noun2}.",
    "Find {adj1} {noun1} under {num}.",
    "Looking for {adj1} {noun1} with {adj2} {noun2}.",
    "Buy {adj1} {noun1} with {adj2} {noun2}.",
    "Get {noun1} for {adj2} {noun2}.",
    "Search {adj1} {noun1} under {num}.",
    "Order {adj1} {noun1} with {adj2} {noun2}.",
    "Cheap {noun1} for {adj2} {noun2}.",
    "Looking for {adj1} {noun1} above {num}.",
    "I need {adj1} {noun1} beside {adj2} {noun2}.",
    "Get {noun1} without {adj2} {noun2}.",
    "Find {adj1} {noun1} above {num}.",
    "Order {adj1} {noun1} without {adj2} {noun2}.",
    "New {noun1} with {adj2} {noun2}.",
    "Affordable {noun1} for {adj2} {noun2}.",
    "Looking for {adj1} {noun1} with {adj2} {noun2}.",
    "Buy {adj1} {noun1} for {adj2} {noun2}.",
    "Search for {adj1} {noun1} under {num}.",
    "Get {adj1} {noun1} above {num}.",
    "Order {adj1} {noun1} beside {adj2} {noun2}.",
    "Find {adj1} {noun1} without {adj2} {noun2}.",
    "Need {adj1} {noun1} with {adj2} {noun2}.",
    "Looking for {adj1} {noun1} for {adj2} {noun2}.",
    "Cheap {adj1} {noun1} under {num}.",
    "Buy {adj1} {noun1} with {adj2} {noun2}.",
    "Order {adj1} {noun1} for {adj2} {noun2}.",
    "I want {adj1} {noun1} with {adj2} {noun2}.",
    "Need {adj1} {noun1} under {num}.",
    "Searching for {adj1} {noun1} above {num}.",
    "Purchase {adj1} {noun1} without {adj2} {noun2}.",
    "Get {adj1} {noun1} with {adj2} {noun2}.",
    "Find {adj1} {noun1} beside {adj2} {noun2}.",
    "Affordable {adj1} {noun1} above {num}.",
    "New {adj1} {noun1} for {adj2} {noun2}.",
    "Order {adj1} {noun1} under {num}.",
    "Need {adj1} {noun1} with {adj2} {noun2}.",
    "Searching {adj1} {noun1} for {adj2} {noun2}.",
    "Get {adj1} {noun1} under {num}.",
    "Looking for {adj1} {noun1} without {adj2} {noun2}.",
    "Purchase {adj1} {noun1} for {adj2} {noun2}.",
    "I want {adj1} {noun1} above {num}.",
    "Find {adj1} {noun1} beside {adj2} {noun2}.",
    "Buy {adj1} {noun1} without {adj2} {noun2}.",
    "Affordable {adj1} {noun1} with {adj2} {noun2}.",
    "Order {adj1} {noun1} for {adj2} {noun2}.",
    "Looking for {adj1} {noun1} beside {adj2} {noun2}.",
    "Purchase {adj1} {noun1} above {num}.",
    "Need {adj1} {noun1} without {adj2} {noun2}.",
    "Cheap {adj1} {noun1} for {adj2} {noun2}."
]

test_dataset = []

for template in templates:
    adj1 = random.choice(adjs)
    adj2 = random.choice(adjs)
    noun1 = random.choice(nouns)
    noun2 = random.choice(nouns)
    num = random.choice(nums)

    sentence = template.format(adj1=adj1, adj2=adj2, noun1=noun1, noun2=noun2, num=num)

    # Basic rule-based expected output logic
    expected = {}

    if " with " in template and " for " not in template:
        expected = {
            "Primary main noun": noun1,
            "Primary prev nouns": "",
            "Adj ref Primary noun": adj1,
            "ADP ref Primary noun": "",
            "Verb ref Primary noun": "",
            "Secondary main noun": noun2,
            "Secondary next nouns": "",
            "Adj ref Secondary noun": adj2,
            "ADP ref Secondary noun": "",
            "Verb ref Secondary noun": ""
        }
    elif " for " in template:
        expected = {
            "Primary main noun": noun2,
            "Primary prev nouns": "",
            "Adj ref Primary noun": adj2,
            "ADP ref Primary noun": "",
            "Verb ref Primary noun": "",
            "Secondary main noun": noun1,
            "Secondary next nouns": "",
            "Adj ref Secondary noun": adj1,
            "ADP ref Secondary noun": "",
            "Verb ref Secondary noun": ""
        }
    elif " under " in template or " above " in template:
        adp = "under" if " under " in template else "above"
        expected = {
            "Primary main noun": noun1,
            "Primary prev nouns": "",
            "Adj ref Primary noun": adj1,
            "ADP ref Primary noun": adp,
            "Verb ref Primary noun": "",
            "Secondary main noun": num,
            "Secondary next nouns": "",
            "Adj ref Secondary noun": "",
            "ADP ref Secondary noun": "",
            "Verb ref Secondary noun": ""
        }
    else:
        expected = {
            "Primary main noun": noun1,
            "Primary prev nouns": "",
            "Adj ref Primary noun": adj1,
            "ADP ref Primary noun": "",
            "Verb ref Primary noun": "",
            "Secondary main noun": noun2,
            "Secondary next nouns": "",
            "Adj ref Secondary noun": adj2,
            "ADP ref Secondary noun": "",
            "Verb ref Secondary noun": ""
        }

    test_dataset.append({
        "text": sentence,
        "expected": expected
    })

# Save as JSON
with open("generated_ground_truth_dataset.json", "w") as f:
    json.dump(test_dataset, f, indent=4)

print("Generated 50 ground truth examples saved to 'generated_ground_truth_dataset.json'")


Generated 50 ground truth examples saved to 'generated_ground_truth_dataset.json'


In [None]:
import spacy

nlp = spacy.load("en_core_web_md")

# Define connectors with grammar-based rules
left_primary = {"with", "for", "under","by","on"}
right_primary = { }

noun_pos_tags = {"NOUN", "PROPN", "NUM"}

def extract_info(text):
    doc = nlp(text)

    primary = {"main_noun": "", "prev_nouns": "", "adj": "", "adp": ""}
    secondary = {"main_noun": "", "next_nouns": "", "adj": "", "adp": ""}

    connector_token = None

    # Find connector
    for token in doc:
        if token.text.lower() in left_primary.union(right_primary) and token.pos_ == "ADP":
            connector_token = token
            break

    if connector_token:
        left_tokens = list(doc[:connector_token.i])
        right_tokens = list(doc[connector_token.i + 1:])

        left_main_noun, left_prev_nouns = "", ""
        right_main_noun, right_next_nouns = "", ""

        # Find left main noun & previous consecutive nouns
        for i in reversed(range(len(left_tokens))):
            token = left_tokens[i]
            if token.pos_ in noun_pos_tags:
                left_main_noun = token.text
                prev_nouns = []
                for j in reversed(range(i)):
                    if left_tokens[j].pos_ in noun_pos_tags:
                        prev_nouns.insert(0, left_tokens[j].text)
                    else:
                        break
                left_prev_nouns = " ".join(prev_nouns)
                break

        # Find right main noun & next consecutive nouns
        for i, token in enumerate(right_tokens):
            if token.pos_ in noun_pos_tags:
                right_main_noun = token.text
                next_nouns = []
                for j in range(i + 1, len(right_tokens)):
                    if right_tokens[j].pos_ in noun_pos_tags:
                        next_nouns.append(right_tokens[j].text)
                    else:
                        break
                right_next_nouns = " ".join(next_nouns)
                break

        connector = connector_token.text.lower()

        if connector in left_primary:
            primary["main_noun"] = left_main_noun
            primary["prev_nouns"] = left_prev_nouns
            secondary["main_noun"] = right_main_noun
            secondary["next_nouns"] = right_next_nouns
        elif connector in right_primary:
            primary["main_noun"] = right_main_noun
            primary["prev_nouns"] = right_next_nouns
            secondary["main_noun"] = left_main_noun
            secondary["next_nouns"] = left_prev_nouns

        # Find adjectives, adpositions, verbs referring to nouns
        for token in doc:
            if token.head.text == primary["main_noun"]:
                if token.pos_ == "ADJ":
                    primary["adj"] = token.text
                if token.pos_ == "ADP":
                    primary["adp"] = token.text

            if token.head.text == secondary["main_noun"]:
                if token.pos_ == "ADJ":
                    secondary["adj"] = token.text
                if token.pos_ == "ADP":
                    secondary["adp"] = token.text


    return {
        "Primary main noun": primary["main_noun"],
        "Primary prev nouns": primary["prev_nouns"],
        "Adj ref Primary noun": primary["adj"],
        "ADP ref Primary noun": primary["adp"],
        "Secondary main noun": secondary["main_noun"],
        "Secondary next nouns": secondary["next_nouns"],
        "Adj ref Secondary noun": secondary["adj"],
        "ADP ref Secondary noun": secondary["adp"],
    }

# Test Example
text = "deals on galaxy s24"
result = extract_info(text)

# Pretty print
for key, value in result.items():
    print(f"{key}: {value if value else '-'}")
