## Cleaner Version of NLP_3
This notebook makes better use of NLP to capture more aroma characteristics. We started with 652 blanks, and many other entries had only a few descriptors. Aroma and flavor will be the two main NLP characteristics we use to pair wine recommendations.

In [1]:
import spacy
from spacy.matcher import Matcher, PhraseMatcher

# Load the spaCy "en_core_web_sm" pipeline; the matcher patterns in this
# notebook match on the part-of-speech tags (POS) of the tokens it produces.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Hard-coded flavor keyword pairing: each flavor family maps to the
# descriptor keywords associated with it.
FLAVOR_LEXICON = {
    "fruity": [
        "fruity", "fruit", "berry", "berry-like",
        "cherry", "strawberry", "blueberry", "raspberry", "blackberry",
        "citrus", "lemon", "lime", "grapefruit",
        "tropical", "apple", "pear",
    ],
    "spicy": [
        "spicy", "peppery", "cinnamon", "clove", "pepper",
        "nutmeg", "ginger", "cardamom", "anise",
    ],
    "earthy": ["earthy", "mushroom", "forest floor", "damp soil", "loamy"],
    "floral": [
        "floral", "violet", "rose", "lilac", "blossom", "aromatic", "perfumed",
    ],
    "fresh": ["fresh", "crisp", "lively", "zesty", "vibrant"],
    "oaky": ["oaky", "oak", "vanilla", "toast", "woody", "cedar"],
    "sweet": ["sweet", "honeyed", "candied", "sugary"],
    "tannic": ["tannic", "firm tannins", "grippy"],
    "delicate": ["delicate", "light", "airy", "elegant", "subtle"],
    "rich": ["rich", "full-bodied", "lush", "opulent"],
    "mineral": ["mineral", "flinty", "stony", "chalky", "slate", "gunflint"],
    "herbaceous": [
        "herbaceous", "herbal", "herb", "sage",
        "rosemary", "thyme", "basil", "mint",
    ],
    "acidic": ["acidic", "tart", "tangy", "sour", "sharp"],
    "buttery": ["buttery", "creamy", "lactic", "butter", "rich cream"],
    "savory": ["savory", "umami", "meaty", "brothy", "salty-savory"],
    "smoky": ["smoky", "smoke", "charred", "ash", "roasted", "burnt"],
    "salty": ["salty", "briny", "oceanic", "saline"],
}

In [5]:
# Vocabulary lists for the different parts of a tasting note (color, aroma,
# flavor, finish, mouthfeel).
# NOTE(review): none of these lists is referenced by the other cells visible
# in this notebook -- presumably carried over from an earlier notebook; confirm
# before relying on them. "palate" appears in both FLAVOR_TERMS and
# MOUTHFEEL_TERMS.
COLOR_TERMS = ["red", "white", "rosé", "ruby", "garnet", "straw", "gold", "purple", "deep", "bright"]
AROMA_TERMS = ["aroma", "aromas", "nose", "bouquet", "smell"]
FLAVOR_TERMS = ["flavor", "taste", "notes", "profile", "palate"]
FINISH_TERMS = ["finish", "length", "aftertaste"]
MOUTHFEEL_TERMS = ["mouthfeel", "texture", "tannin", "tannins", "acidity", "palate", "body"]

# Hand-written tasting note used to try the pipeline on a single example.
sample_description = (
    "Bright red color and initial aromas lemon dominated by a spicy note that, coupled "
    "with fruity notes of cherry and strawberry, creates a delightful bouquet. "
    "On the Lemon palate it is fresh and delicate, with medium tannins and a persistent finish."
)

# Parse the sample note with the spaCy pipeline; the matcher is run on `doc`
# in a later cell.
doc = nlp(sample_description)

In [6]:
# --------------------------------------------------------------------------------
# 1) CREATE A DICTIONARY OF WINE DESCRIPTIONS FROM OUR CSV
# --------------------------------------------------------------------------------

import pandas as pd

# Load Johnathon's CSV file into a DataFrame
df_aroma_wheel = pd.read_csv("../../Resources/wine_aroma_wheel_final.csv")

# Nested lookup built from the DataFrame:
#   AROMA_LEXICON[category][subcategory] -> list of outer-ring attributes
AROMA_LEXICON = {}

for _, row in df_aroma_wheel.iterrows():
    category = row["Category"]
    subcategory = row["Subcategory"]

    raw_attributes = row["Outer Ring Attributes"]
    # An empty CSV cell is read by pandas as NaN (a float), which has no
    # .split(); treat it as "no attributes" instead of raising AttributeError.
    if not isinstance(raw_attributes, str):
        raw_attributes = ""

    # Split on commas and trim every piece so "a, b" and "a,b" parse alike.
    # Blank pieces are dropped: a blank attribute would later be turned into
    # the regex r"\b\b" by get_aroma_category, which matches any text.
    attributes = [
        piece.strip() for piece in raw_attributes.split(",") if piece.strip()
    ]

    # A repeated (category, subcategory) pair overwrites the earlier row.
    AROMA_LEXICON.setdefault(category, {})[subcategory] = attributes

In [7]:
# --------------------------------------------------------------------------------
# 3) BUILD MATCH PATTERNS
# --------------------------------------------------------------------------------
# Each pattern is a list of per-token constraints on the part-of-speech tag.

# A) ADJ + NOUN (e.g., "spicy notes", "fresh acidity", "persistent finish")
pattern_adj_noun = [{"POS": "ADJ"}, {"POS": "NOUN"}]

# B) NOUN + NOUN (e.g., "berry aroma", "cherry nose", "fruit notes")
pattern_noun_noun = [{"POS": "NOUN"}, {"POS": "NOUN"}]

# C) ADJ + ADJ + NOUN: two adjectives followed by a noun.
# You could add more patterns, such as "ADJ + 'and' + ADJ + NOUN", etc.
pattern_adj_adj_noun = [{"POS": "ADJ"}, {"POS": "ADJ"}, {"POS": "NOUN"}]

# Register the patterns on a matcher that shares the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)
matcher.add("ADJ_NOUN", [pattern_adj_noun])
matcher.add("NOUN_NOUN", [pattern_noun_noun])
matcher.add("ADJ_ADJ_NOUN", [pattern_adj_adj_noun])

# Run the matcher over the sample document parsed in the previous cell.
matches = matcher(doc)

In [8]:
import re

# (Optional) Define a set of ignore words based on your frequency analysis.
# `{}` is an empty dict literal, not a set; set() makes this a real (empty)
# set so it can be filled with .add() / .update() later.
IGNORE_WORDS = set()
# Suggested starting list:
# IGNORE_WORDS = {"mouth", "notes", "end", "wine", "color", "well", "length"}

# Updated get_aroma_category using regex for whole-word matching.
def get_aroma_category(token_text):
    """Map a piece of text to the first aroma-wheel attribute it mentions.

    AROMA_LEXICON is scanned in insertion order (category, then subcategory,
    then attribute). An attribute is a hit when it occurs in ``token_text``
    as a whole word or phrase, ignoring case.

    Args:
        token_text: Text to inspect, e.g. a single token or a matched phrase.

    Returns:
        A ``(category, subcategory, attribute)`` tuple for the first hit
        (the attribute is returned with surrounding whitespace removed), or
        ``(None, None, None)`` when nothing in the lexicon matches.
    """
    token_text_lower = token_text.lower()
    for category, subcategories in AROMA_LEXICON.items():
        for subcategory, attributes in subcategories.items():
            for attribute in attributes:
                # Skip non-string entries (e.g. NaN from an empty CSV cell),
                # which would crash on .lower().
                if not isinstance(attribute, str):
                    continue
                cleaned = attribute.strip()
                # Skip blank entries: they would build the pattern r"\b\b",
                # which matches any text containing a word and would tag
                # every input with this category.
                if not cleaned:
                    continue
                # Construct a pattern that matches the attribute as a whole word.
                pattern = r'\b' + re.escape(cleaned.lower()) + r'\b'
                if re.search(pattern, token_text_lower):
                    return category, subcategory, cleaned
    return None, None, None

# Updated extract_aroma_profile function.
def extract_aroma_profile(review_notes):
    """Extract aroma descriptors and aroma-wheel labels from one review note.

    The note is parsed with the module-level spaCy pipeline ``nlp`` and
    scanned with ``matcher``. Every matched phrase is looked up with
    ``get_aroma_category``: a hit records the lexicon attribute, category and
    subcategory; a miss records the lowercased phrase text itself in
    ``aromas`` (unless it is in ``IGNORE_WORDS``). If that leaves ``aromas``
    empty, every alphabetic token is looked up individually instead.

    Args:
        review_notes: Review text. Non-string values (such as NaN for a
            missing cell) and blank strings yield an empty profile.

    Returns:
        A dict with the keys ``"aromas"``, ``"categories"``,
        ``"subcategories"`` and ``"outer_ring_attributes"``, each holding a
        list of unique strings in first-seen order.
    """
    # A missing review arrives from pandas as NaN (a float), which the spaCy
    # pipeline cannot parse; return an empty profile instead of raising.
    if not isinstance(review_notes, str) or not review_notes.strip():
        return {
            "aromas": [],
            "categories": [],
            "subcategories": [],
            "outer_ring_attributes": []
        }

    # Process the review note with spaCy.
    doc = nlp(review_notes)

    aromas = []
    categories = []
    subcategories = []
    outer_ring_attributes = []

    def record_hit(category, subcategory, attribute):
        # A lexicon hit contributes to all four result lists.
        aromas.append(attribute)
        categories.append(category)
        subcategories.append(subcategory)
        outer_ring_attributes.append(attribute)

    # First pass: use the spaCy matcher to find candidate phrases.
    for _, start, end in matcher(doc):
        span = doc[start:end]
        # Attempt to map the span to an aroma category via the lexicon.
        category, subcategory, attribute = get_aroma_category(span.text)
        if category is not None:
            record_hit(category, subcategory, attribute)
        else:
            # If no direct match is found, add the span text if it isn't an ignore word.
            span_text = span.text.lower().strip()
            if span_text not in IGNORE_WORDS:
                aromas.append(span_text)

    # Fallback: if the first pass added nothing, check token-by-token.
    # NOTE(review): unmatched phrases are also added to `aromas` above, so this
    # only runs when the first pass recorded nothing at all, not whenever the
    # lexicon found no category. Confirm whether `if not categories` was meant.
    if not aromas:
        for token in doc:
            token_text = token.text.lower()
            # Only consider alphabetic tokens and skip ignore words.
            if token.is_alpha and token_text not in IGNORE_WORDS:
                cat, subcat, attr = get_aroma_category(token_text)
                if cat is not None:
                    record_hit(cat, subcat, attr)

    # Remove duplicates while keeping first-seen order. list(set(...)) returned
    # the strings in an order that can differ between interpreter runs, which
    # made the exported CSV non-reproducible.
    return {
        "aromas": list(dict.fromkeys(aromas)),
        "categories": list(dict.fromkeys(categories)),
        "subcategories": list(dict.fromkeys(subcategories)),
        "outer_ring_attributes": list(dict.fromkeys(outer_ring_attributes))
    }

# Part 2
Update the aroma fields of our dataset, building on the data previously parsed in nlp_2.

In [9]:
# Now let's create a CSV of all the wines with their wine profiles included.
import pandas as pd

# Path to the translated review CSV.
starting_file = '../../Resources/BlogOsVinhosTranslated_With_WineProfile.csv'

# Read the CSV into a DataFrame and preview its first five rows.
df_start = pd.read_csv(starting_file)
df_start.head(5)

Unnamed: 0,Name,Color,Alcohol_Percentage,Judge_Rating,Review_Notes,Wine_Bottle_Label,URL,Price,color,aromas,flavors,finish,mouthfeel
0,.Beb 2007,Rosé,13.5,15.5,Bright red color and initial aromas dominated ...,It has an attractive color and an intense arom...,https://osvinhos.blogspot.com/2010/03/1232-beb...,5.75,"['red', 'Bright', 'rosé']","['initial aromas', 'red color', 'Bright red co...","['fruity', 'pleasant presence', 'interesting s...","['average end', 'persistent flavor']",[]
1,.Beb 2009,Red,14.0,16.0,"Reddish color and very aromatic nose, where th...",The careful choice of the best installments of...,https://osvinhos.blogspot.com/2013/08/2803-beb...,8.75,['red'],['aromatic nose'],"['fresh', 'delicate', 'beautiful dose', 'spice...","['median persistence', 'average length']","['seductive texture', 'round tannins']"
2,.Beb 2010,Red,14.0,16.0,It presents a pleasantly concentrated ruby ​​c...,The careful choice of the best installments of...,https://osvinhos.blogspot.com/2014/01/2988-beb...,8.75,"['red', 'ruby']","['floral', 'fruity']","['fruity', 'attractive wine', 'several spices'...",['long length'],['round tannins']
3,.Beb 2011,White,13.5,15.5,Bright yellow color and delicately marked nose...,We associated a very old wardrobe vineyard wit...,https://osvinhos.blogspot.com/2013/09/2820-beb...,8.75,"['Bright', 'white']",[],"['yellow color', 'medium structure', 'Bright y...",[],[]
4,.Beb 2012,White,13.5,16.0,It has a slightly pale yellow color and a nose...,We associated the structure and complexity of ...,https://osvinhos.blogspot.com/2015/06/3639-beb...,5.75,['white'],"['pale yellow color', 'yellow color', 'fruity'...","['oaky', 'fresh', 'elegant mineral', 'mineral ...",[],['well acidity']


In [10]:
# Run extract_aroma_profile on every review note; each row of the new
# 'aroma_profile' column holds the dict that the function returns.
df_start['aroma_profile'] = df_start['Review_Notes'].apply(extract_aroma_profile)

In [None]:
# Expand the aroma_profile dictionary into separate columns, one per dict key
# (aromas, categories, subcategories, outer_ring_attributes).
df_aroma_profile = df_start['aroma_profile'].apply(pd.Series)


In [None]:
# Join the new columns to the original DataFrame. On a name clash the new
# column gets the '_aroma' suffix (the extracted 'aromas' column becomes
# 'aromas_aroma' because df_start already has an 'aromas' column).
df_start = df_start.join(df_aroma_profile, rsuffix='_aroma')

# Preview the joined result. The original cell called df.head(), but no
# variable named `df` is defined in this notebook, so it raised NameError.
df_start.head()

In [22]:
# Preview the first rows of df_start.
df_start.head()

Unnamed: 0,Name,Color,Alcohol_Percentage,Judge_Rating,Review_Notes,Wine_Bottle_Label,URL,Price,color,aromas,flavors,finish,mouthfeel,aromas_aroma,categories,subcategories,outer_ring_attributes
0,.Beb 2007,Rosé,13.5,15.5,Bright red color and initial aromas dominated ...,It has an attractive color and an intense arom...,https://osvinhos.blogspot.com/2010/03/1232-beb...,5.75,"['red', 'Bright', 'rosé']","['initial aromas', 'red color', 'Bright red co...","['fruity', 'pleasant presence', 'interesting s...","['average end', 'persistent flavor']",[],"[persistent flavor, fruity notes, red color, i...",[],[],[]
1,.Beb 2009,Red,14.0,16.0,"Reddish color and very aromatic nose, where th...",The careful choice of the best installments of...,https://osvinhos.blogspot.com/2013/08/2803-beb...,8.75,['red'],['aromatic nose'],"['fresh', 'delicate', 'beautiful dose', 'spice...","['median persistence', 'average length']","['seductive texture', 'round tannins']","[fresh set, seductive texture, subtle barrel, ...",[],[],[]
2,.Beb 2010,Red,14.0,16.0,It presents a pleasantly concentrated ruby ​​c...,The careful choice of the best installments of...,https://osvinhos.blogspot.com/2014/01/2988-beb...,8.75,"['red', 'ruby']","['floral', 'fruity']","['fruity', 'attractive wine', 'several spices'...",['long length'],['round tannins'],"[Violet, delicate spicy notes, very fruity, sp...",[Floral],[Colored Flowers],[Violet]
3,.Beb 2011,White,13.5,15.5,Bright yellow color and delicately marked nose...,We associated a very old wardrobe vineyard wit...,https://osvinhos.blogspot.com/2013/09/2820-beb...,8.75,"['Bright', 'white']",[],"['yellow color', 'medium structure', 'Bright y...",[],[],"[bright yellow color, floral touch, tropical f...",[],[],[]
4,.Beb 2012,White,13.5,16.0,It has a slightly pale yellow color and a nose...,We associated the structure and complexity of ...,https://osvinhos.blogspot.com/2015/06/3639-beb...,5.75,['white'],"['pale yellow color', 'yellow color', 'fruity'...","['oaky', 'fresh', 'elegant mineral', 'mineral ...",[],['well acidity'],"[well acidity, suggestive vegetable, floral no...",[Maturation in Oak Barrel],[Spices],[Vanilla]


In [13]:
# Write df_start to CSV; index=False leaves the row index out of the file.
df_start.to_csv('../../Resources/nlp_3_aroma_missing_652.csv', index=False)

In [14]:
# Show the column labels of df_start.
df_start.columns

Index(['Name', 'Color', 'Alcohol_Percentage', 'Judge_Rating', 'Review_Notes',
       'Wine_Bottle_Label', 'URL', 'Price', 'color', 'aromas', 'flavors',
       'finish', 'mouthfeel', 'aroma_profile'],
      dtype='object')

In [15]:
import re

# (Optional) Define a set of ignore words based on your frequency analysis.
# `{}` is an empty dict literal, not a set; set() makes this a real (empty)
# set so it can be filled with .add() / .update() later.
IGNORE_WORDS = set()
# Suggested starting list:
# IGNORE_WORDS = {"mouth", "notes", "end", "wine", "color", "well", "length"}

# Updated get_aroma_category using regex for whole-word matching.
# NOTE(review): this cell re-defines a function that an earlier cell of the
# notebook also defines; the later definition is the one in effect afterwards.
def get_aroma_category(token_text):
    """Map a piece of text to the first aroma-wheel attribute it mentions.

    AROMA_LEXICON is scanned in insertion order (category, then subcategory,
    then attribute). An attribute is a hit when it occurs in ``token_text``
    as a whole word or phrase, ignoring case.

    Args:
        token_text: Text to inspect, e.g. a single token or a matched phrase.

    Returns:
        A ``(category, subcategory, attribute)`` tuple for the first hit
        (the attribute is returned with surrounding whitespace removed), or
        ``(None, None, None)`` when nothing in the lexicon matches.
    """
    token_text_lower = token_text.lower()
    for category, subcategories in AROMA_LEXICON.items():
        for subcategory, attributes in subcategories.items():
            for attribute in attributes:
                # Skip non-string entries (e.g. NaN from an empty CSV cell),
                # which would crash on .lower().
                if not isinstance(attribute, str):
                    continue
                cleaned = attribute.strip()
                # Skip blank entries: they would build the pattern r"\b\b",
                # which matches any text containing a word and would tag
                # every input with this category.
                if not cleaned:
                    continue
                # Construct a pattern that matches the attribute as a whole word.
                pattern = r'\b' + re.escape(cleaned.lower()) + r'\b'
                if re.search(pattern, token_text_lower):
                    return category, subcategory, cleaned
    return None, None, None

# Updated extract_aroma_profile function.
# NOTE(review): this cell re-defines a function that an earlier cell of the
# notebook also defines; the later definition is the one in effect afterwards.
def extract_aroma_profile(review_notes):
    """Extract aroma descriptors and aroma-wheel labels from one review note.

    The note is parsed with the module-level spaCy pipeline ``nlp`` and
    scanned with ``matcher``. Every matched phrase is looked up with
    ``get_aroma_category``: a hit records the lexicon attribute, category and
    subcategory; a miss records the lowercased phrase text itself in
    ``aromas`` (unless it is in ``IGNORE_WORDS``). If that leaves ``aromas``
    empty, every alphabetic token is looked up individually instead.

    Args:
        review_notes: Review text. Non-string values (such as NaN for a
            missing cell) and blank strings yield an empty profile.

    Returns:
        A dict with the keys ``"aromas"``, ``"categories"``,
        ``"subcategories"`` and ``"outer_ring_attributes"``, each holding a
        list of unique strings in first-seen order.
    """
    # A missing review arrives from pandas as NaN (a float), which the spaCy
    # pipeline cannot parse; return an empty profile instead of raising.
    if not isinstance(review_notes, str) or not review_notes.strip():
        return {
            "aromas": [],
            "categories": [],
            "subcategories": [],
            "outer_ring_attributes": []
        }

    # Process the review note with spaCy.
    doc = nlp(review_notes)

    aromas = []
    categories = []
    subcategories = []
    outer_ring_attributes = []

    def record_hit(category, subcategory, attribute):
        # A lexicon hit contributes to all four result lists.
        aromas.append(attribute)
        categories.append(category)
        subcategories.append(subcategory)
        outer_ring_attributes.append(attribute)

    # First pass: use the spaCy matcher to find candidate phrases.
    for _, start, end in matcher(doc):
        span = doc[start:end]
        # Attempt to map the span to an aroma category via the lexicon.
        category, subcategory, attribute = get_aroma_category(span.text)
        if category is not None:
            record_hit(category, subcategory, attribute)
        else:
            # If no direct match is found, add the span text if it isn't an ignore word.
            span_text = span.text.lower().strip()
            if span_text not in IGNORE_WORDS:
                aromas.append(span_text)

    # Fallback: if the first pass added nothing, check token-by-token.
    # NOTE(review): unmatched phrases are also added to `aromas` above, so this
    # only runs when the first pass recorded nothing at all, not whenever the
    # lexicon found no category. Confirm whether `if not categories` was meant.
    if not aromas:
        for token in doc:
            token_text = token.text.lower()
            # Only consider alphabetic tokens and skip ignore words.
            if token.is_alpha and token_text not in IGNORE_WORDS:
                cat, subcat, attr = get_aroma_category(token_text)
                if cat is not None:
                    record_hit(cat, subcat, attr)

    # Remove duplicates while keeping first-seen order. list(set(...)) returned
    # the strings in an order that can differ between interpreter runs, which
    # made the exported CSV non-reproducible.
    return {
        "aromas": list(dict.fromkeys(aromas)),
        "categories": list(dict.fromkeys(categories)),
        "subcategories": list(dict.fromkeys(subcategories)),
        "outer_ring_attributes": list(dict.fromkeys(outer_ring_attributes))
    }


In [16]:
# Quick check of extract_aroma_profile on a hand-written review.
# Example review note (adjust the text as needed)
sample_review = "The wine presents a vibrant nose with hints of ripe cherry, a touch of oak, and subtle spice. It also offers delicate floral and mineral nuances."

# Extract the aroma profile using the updated function; the result is a dict
# with the keys aromas, categories, subcategories and outer_ring_attributes.
output_profile = extract_aroma_profile(sample_review)

# Print the output (using pprint for nicer formatting)
from pprint import pprint
print("Extracted aroma profile:")
pprint(output_profile)


Extracted aroma profile:
{'aromas': ['vibrant nose', 'mineral nuances', 'Cherry', 'subtle spice'],
 'categories': ['Fruity Red Wine'],
 'outer_ring_attributes': ['Cherry'],
 'subcategories': ['Stone Fruits']}


In [17]:
# Now let's create a CSV of all the wines with their wine profiles included.
# This reloads the source CSV, so df_start starts again from the file contents.
import pandas as pd

# Path to the translated review CSV.
starting_file = '../../Resources/BlogOsVinhosTranslated_With_WineProfile.csv'

# Read the CSV into a DataFrame and preview its first five rows.
df_start = pd.read_csv(starting_file)
df_start.head(5)

Unnamed: 0,Name,Color,Alcohol_Percentage,Judge_Rating,Review_Notes,Wine_Bottle_Label,URL,Price,color,aromas,flavors,finish,mouthfeel
0,.Beb 2007,Rosé,13.5,15.5,Bright red color and initial aromas dominated ...,It has an attractive color and an intense arom...,https://osvinhos.blogspot.com/2010/03/1232-beb...,5.75,"['red', 'Bright', 'rosé']","['initial aromas', 'red color', 'Bright red co...","['fruity', 'pleasant presence', 'interesting s...","['average end', 'persistent flavor']",[]
1,.Beb 2009,Red,14.0,16.0,"Reddish color and very aromatic nose, where th...",The careful choice of the best installments of...,https://osvinhos.blogspot.com/2013/08/2803-beb...,8.75,['red'],['aromatic nose'],"['fresh', 'delicate', 'beautiful dose', 'spice...","['median persistence', 'average length']","['seductive texture', 'round tannins']"
2,.Beb 2010,Red,14.0,16.0,It presents a pleasantly concentrated ruby ​​c...,The careful choice of the best installments of...,https://osvinhos.blogspot.com/2014/01/2988-beb...,8.75,"['red', 'ruby']","['floral', 'fruity']","['fruity', 'attractive wine', 'several spices'...",['long length'],['round tannins']
3,.Beb 2011,White,13.5,15.5,Bright yellow color and delicately marked nose...,We associated a very old wardrobe vineyard wit...,https://osvinhos.blogspot.com/2013/09/2820-beb...,8.75,"['Bright', 'white']",[],"['yellow color', 'medium structure', 'Bright y...",[],[]
4,.Beb 2012,White,13.5,16.0,It has a slightly pale yellow color and a nose...,We associated the structure and complexity of ...,https://osvinhos.blogspot.com/2015/06/3639-beb...,5.75,['white'],"['pale yellow color', 'yellow color', 'fruity'...","['oaky', 'fresh', 'elegant mineral', 'mineral ...",[],['well acidity']


In [18]:
# Run extract_aroma_profile on every review note; each row of the new
# 'aroma_profile' column holds the dict that the function returns.
df_start['aroma_profile'] = df_start['Review_Notes'].apply(extract_aroma_profile)



In [19]:
# Turn every aroma_profile dict into its own row of columns (one per key).
df_aroma_profile = df_start['aroma_profile'].apply(pd.Series)

# Attach the expanded columns (clashing names get the '_aroma' suffix), then
# discard the now-redundant aroma_profile dict column.
df_start = (
    df_start
    .join(df_aroma_profile, rsuffix='_aroma')
    .drop(columns=['aroma_profile'])
)


In [20]:
# List of columns to check for empty lists (the cells hold lists, so "no
# aromas found" shows up as an empty list rather than a null value).
columns_to_check = ["aromas_aroma"]

# Function to count empty lists
def count_empty_lists(column):
    """Return how many entries of ``column`` have length zero."""
    def is_empty(entry):
        return len(entry) == 0

    return column.map(is_empty).sum()

# Count the empty lists in each checked column; DataFrame.apply calls
# count_empty_lists once per column.
empty_list_counts = df_start[columns_to_check].apply(count_empty_lists)

# Display the result
print(empty_list_counts)

# Show the column labels of df_start.
df_start.columns


aromas_aroma    7
dtype: int64


Index(['Name', 'Color', 'Alcohol_Percentage', 'Judge_Rating', 'Review_Notes',
       'Wine_Bottle_Label', 'URL', 'Price', 'color', 'aromas', 'flavors',
       'finish', 'mouthfeel', 'aromas_aroma', 'categories', 'subcategories',
       'outer_ring_attributes'],
      dtype='object')

In [21]:
# Write the final DataFrame to CSV; index=False leaves the row index out of
# the file.
df_start.to_csv('../../Resources/nlp_3.1_final.csv', index=False)