In [None]:
# Tool: https://spacy.io/usage
# Tutorial: https://realpython.com/natural-language-processing-spacy-python/

# Environment setup.
# Bare shell commands (`pip3 install ...`, `python -m spacy download ...`) are
# not valid Python and make this cell fail when run by the kernel. The same
# steps are therefore executed through the interpreter that backs this kernel
# (`sys.executable`), so packages land in the environment the notebook uses.
import importlib.util
import subprocess
import sys


def _ensure_installed(module_name, install_args):
    """Run ``<kernel python> -m <install_args...>`` unless `module_name` is importable.

    Args:
        module_name: Top-level module whose presence means "already installed".
        install_args: Arguments passed after ``python -m`` to perform the install.

    Raises:
        subprocess.CalledProcessError: If the install command exits non-zero.
    """
    if importlib.util.find_spec(module_name) is None:
        subprocess.check_call([sys.executable, "-m", *install_args])
        # Make the freshly installed package visible to this running kernel.
        importlib.invalidate_caches()


# 1. Install and Set Up spaCy
_ensure_installed("spacy", ["pip", "install", "spacy"])
# 2. Download a pre-trained English model
_ensure_installed("en_core_web_sm", ["spacy", "download", "en_core_web_sm"])

# more accurate model
_ensure_installed("en_core_web_trf", ["spacy", "download", "en_core_web_trf"])

In [None]:

import spacy

# 3. Load the pretrained English pipeline.
MODEL_NAME = "en_core_web_trf"
nlp = spacy.load(MODEL_NAME)

# The user's search query is run through the pipeline, which
#   - splits the text into tokens,
#   - assigns part-of-speech (POS) tags,
#   - and detects named entities (e.g. prices).

# Example user input (swap in one of the alternatives below to experiment)
user_query = "I'm looking for a red dress under 100 EUR"
# user_query = "Find me a jacket under $50"
# user_query ="Show me blue shoes for running"

#user_query = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# 4. Run the query through the spaCy pipeline to obtain the processed document.
doc = nlp(user_query)

# Tokenization
# 5. Keep only the content-bearing tokens: drop stop words, punctuation and
#    whitespace-only tokens. spaCy emits a separate whitespace token for runs
#    of extra spaces; those are neither stop words nor punctuation, so they
#    must be filtered explicitly.
tokens = [
    token.text
    for token in doc
    if not (token.is_stop or token.is_punct or token.is_space)
]
print(tokens)
# Expected output for the example query "... a red dress under 100 EUR":
# ['looking', 'red', 'dress', '100', 'EUR']

# Uncomment to inspect every token with its POS tag and dependency label:
# for token in doc:
#     # Get the token text, part-of-speech tag and dependency label
#     token_text = token.text
#     token_pos = token.pos_
#     token_dep = token.dep_
#     # This is for formatting only
#     print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")

# Named Entity Recognition (NER)
# 6. Extract key entities from the query using spaCy's pretrained NER component.
#    Of the things we care about (product type, color, price), only the price
#    maps onto a label the pretrained English pipelines provide (MONEY).

# Access the recognized named entities
for ent in doc.ents:
    print(ent.text, ent.label_)
# Illustrative output for the example query (exact spans may vary by model version):
# 100 EUR MONEY
#
# NOTE: the pretrained pipelines have no COLOR label, so "red" is not reported
# here; the PRODUCT label targets named products rather than generic nouns like
# "dress". Colors and product types need custom rules or a custom-trained NER model.

# Rule-based matching: custom rules for entities the pretrained NER does not cover
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

# Product nouns to recognize, compared against the lowercased token text.
# Covers the example queries in this notebook; extend the list as needed.
PRODUCT_TYPES = ["dress", "jacket", "shoes"]

# Single-token pattern: matches any token whose lowercase form is in PRODUCT_TYPES
pattern = [{"LOWER": {"IN": PRODUCT_TYPES}}]

matcher.add("PRODUCT_TYPE", [pattern])

# Apply the matcher to the doc; each match is a (match_id, start, end) triple
# where start/end are token indices into `doc`.
matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print("PRODUCT_TYPE: ",matched_span.text)

In [None]:
# TODO:
# Train a custom NER model.
# Create more custom entity rules.
# Add synonym handling:
#     For example, recognize that "cheap" means "low price", or handle brand names.
# Use dependency parsing to better understand relationships in the query,
#     e.g., that "under $100" refers to price.

# Lowercases the text
# Lemmatizes each token
# Removes punctuation symbols
# Removes stop words