## more advanced version of nlp_1

In [71]:
import spacy
from spacy.matcher import Matcher, PhraseMatcher

nlp = spacy.load("en_core_web_sm")

In [72]:
sample_description = (
    "Bright red color and initial aromas lemon dominated by a spicy note that, coupled "
    "with fruity notes of cherry and strawberry, creates a delightful bouquet. "
    "On the Lemon palate it is fresh and delicate, with medium tannins and a persistent finish."
)

doc = nlp(sample_description)

In [73]:
FLAVOR_LEXICON = {
    "fruity": [
        "fruity", "fruit", "berry", "berry-like", "cherry", "strawberry", "blueberry",
        "raspberry", "blackberry", "citrus", "lemon", "lime", "grapefruit", "tropical",
        "apple", "pear"],
    "spicy": [
        "spicy", "peppery", "cinnamon", "clove", "pepper", "nutmeg", "ginger", "cardamom", "anise"],
    "earthy": [
        "earthy", "mushroom", "forest floor", "damp soil", "loamy"],
    "floral": [
        "floral", "violet", "rose", "lilac", "blossom", "aromatic", "perfumed"],
    "fresh": [
        "fresh", "crisp", "lively", "zesty", "vibrant"],
    "oaky": [
        "oaky", "oak", "vanilla", "toast", "woody", "cedar"],
    "sweet": [
        "sweet", "honeyed", "candied", "sugary"],
    "tannic": [
        "tannic", "firm tannins", "grippy"],
    "delicate": [
        "delicate", "light", "airy", "elegant", "subtle"],
    "rich": [
        "rich", "full-bodied", "lush", "opulent"],
    "mineral": [
        "mineral", "flinty", "stony", "chalky", "slate", "gunflint"],
    "herbaceous": [
        "herbaceous", "herbal", "herb", "sage", "rosemary", "thyme", "basil", "mint"],
    "acidic": [
        "acidic", "tart", "tangy", "sour", "sharp"],
    "buttery": [
        "buttery", "creamy", "lactic", "butter", "rich cream"],
    "savory": [
        "savory", "umami", "meaty", "brothy", "salty-savory"],
    "smoky": [
        "smoky", "smoke", "charred", "ash", "roasted", "burnt"],
    "salty": [
        "salty", "briny", "oceanic", "saline"]
}

In [74]:
COLOR_TERMS = ["red", "white", "rosé", "ruby", "garnet", "straw", "gold", "purple", "deep", "bright"]
AROMA_TERMS = ["aroma", "aromas", "nose", "bouquet", "smell"]
FLAVOR_TERMS = ["flavor", "taste", "notes", "profile", "palate"]
FINISH_TERMS = ["finish", "length", "aftertaste"]
MOUTHFEEL_TERMS = ["mouthfeel", "texture", "tannin", "tannins", "acidity", "palate", "body"]

In [75]:
# --------------------------------------------------------------------------------
# 1) CREATE A DICTIONARY OF WINE DESCRIPTIONS FROM OUR CSV
# --------------------------------------------------------------------------------

# Updated - use Johnathon's Wine Aroma Wheel
import pandas as pd

# Load the CSV file into a DataFrame
df_aroma_wheel = pd.read_csv("../../Resources/wine_aroma_wheel_final.csv")

# Create a nested dictionary from the DataFrame
AROMA_LEXICON = {}

for _, row in df_aroma_wheel.iterrows():
    category = row["Category"]
    subcategory = row["Subcategory"]
    attributes = row['Outer Ring Attributes'].split(", ")

    if category not in AROMA_LEXICON:
        AROMA_LEXICON[category] = {}
    AROMA_LEXICON[category][subcategory] = attributes

# --------------------------------------------------------------------------------
# 2) Remade FUNCTION: MAP A TOKEN OR PHRASE TO A KNOWN AROMA CATEGORY
# --------------------------------------------------------------------------------
def get_aroma_category(token_text):
    token_text_lower = token_text.lower()
    for category, subcategories in AROMA_LEXICON.items():
        for subcategory, attributes in subcategories.items():
            for attribute in attributes:
                if attribute.lower() in token_text_lower:
                    return category, subcategory, attribute
    return None, None, None

print(AROMA_LEXICON)

{'Fruity White Wine': {'Citrus': ['Lemon', 'Lime', 'Grapefruit'], 'White Berries': ['Gooseberry'], 'Pome Fruits': ['Pear', 'Apple', 'Green Apple'], 'Stone Fruits': ['Peach'], 'Tropical Fruits': ['Melon', 'Guava', 'Pineapple', 'Passion Fruit', 'Lychee'], 'Botrytized': ['Dried Apricot', 'Orange Peel']}, 'Fruity Red Wine': {'Tropical Fruits': ['Banana'], 'Red Berries': ['Raspberry', 'Blackcurrant', 'Strawberry', 'Blackberry'], 'Stone Fruits': ['Cherry', 'Plum'], 'Fortified': ['Prune']}, 'Floral': {'White Flowers': ['Honeysuckle', 'Hawthorn', 'Orange Blossom', 'Linden', 'Jasmine', 'Acacia'], 'Colored Flowers': ['Rose', 'Lavender', 'Violet']}, 'Vegetal': {'Vegetables': ['Capsicum', 'Fennel', 'Tomato', 'Green Bell Pepper'], 'Fresh Herbs': ['Cut Grass', 'Dill', 'Thyme', 'Fern', 'Mint', 'Rosemary', 'Sage', 'Basil'], 'Dried Herbs': ['Hay', 'Black Tea', 'Tobacco'], 'Leaves': ['Black Currant Leaf', 'Bay Leaf', 'Eucalyptus']}, 'Maturation in Oak Barrel': {'Woods': ['Pine', 'Cedar', 'Sandalwood', '

In [76]:
""" Example Output 
{'Fruity White Wine': 
{'Citrus': ['Lemon', 'Lime', 'Grapefruit'], 
'White Berries': ['Gooseberry'],
'Pome Fruits': ['Pear', 'Apple', 'Green Apple'], 
'Stone Fruits': ['Peach'], 
'Tropical Fruits': ['Melon', 'Guava', 'Pineapple', 'Passion Fruit', 'Lychee'], 
'Botrytized': ['Dried Apricot', 'Orange Peel']},

'Fruity Red Wine': 
{'Tropical Fruits': ['Banana'], 
'Red Berries': ['Raspberry', 'Blackcurrant', 'Strawberry', 'Blackberry'], 
'Stone Fruits': ['Cherry', 'Plum'], 'Fortified': ['Prune']}, 
'Floral': {'White Flowers': ['Honeysuckle', 'Hawthorn', 'Orange Blossom', 'Linden', 'Jasmine', 'Acacia'], 
'Colored Flowers': ['Rose', 'Lavender', 'Violet']}, 

'Vegetal': 
{'Vegetables': ['Capsicum', 'Fennel', 'Tomato', 'Green Bell Pepper'], 
'Fresh Herbs': ['Cut Grass', 'Dill', 'Thyme', 'Fern', 'Mint', 'Rosemary', 'Sage', 'Basil'],
 'Dried Herbs': ['Hay', 'Black Tea', 'Tobacco'], 
 'Leaves': ['Black Currant Leaf', 'Bay Leaf', 'Eucalyptus']}, 
 
 'Maturation in Oak Barrel': 
 {'Woods': ['Pine', 'Cedar', 'Sandalwood', 'Oak'], 
 'Nuts': ['Almond', 'Hazelnut', 'Coconut', 'Walnut'], 
 'Spices': ['Cloves', 'Nutmeg', 'Liquorice', 'Cinnamon', 'Pepper', 'Vanilla', 'Anise'], 
 'Toasted': ['Tar', 'Smoke', 'Bacon', 'Coffee', 'Toast', 'Chocolate', 'Caramel', 'Brioche', 'Butterscotch', 'Charred', 'Roasted']}, 
 
 'Wine Faults':
{'Brett': ['Horse Sweat'], 
'Sulfites': ['Sweet Corn', 'Onion', 'Rubber'], 
'Volatile Acidity': ['Nail Polish Remover'], 
'Heat': ['Madeira'], 'Oxygen': ['Sherry'], 
'Cork Taint': ['Corked']}, 

'Mineral': 
{'Stony': ['Flint', 'Wet Stone', 'Chalk', 'Slate', 'Gunflint', 'Crushed Rock']}, 

'Earthy': 
{'Earth': ['Mushroom', 'Forest Floor', 'Damp Earth', 'Truffle', 'Dusty', 'Loamy'],
'Animal': ['Barnyard', 'Leather']}, 

'Chemical': 
{'Reduction': ['Petrol', 'Cabbage'], 
'Chem': ['Solvent', 'Acetone']}}

"""

" Example Output \n{'Fruity White Wine': \n{'Citrus': ['Lemon', 'Lime', 'Grapefruit'], \n'White Berries': ['Gooseberry'],\n'Pome Fruits': ['Pear', 'Apple', 'Green Apple'], \n'Stone Fruits': ['Peach'], \n'Tropical Fruits': ['Melon', 'Guava', 'Pineapple', 'Passion Fruit', 'Lychee'], \n'Botrytized': ['Dried Apricot', 'Orange Peel']},\n\n'Fruity Red Wine': \n{'Tropical Fruits': ['Banana'], \n'Red Berries': ['Raspberry', 'Blackcurrant', 'Strawberry', 'Blackberry'], \n'Stone Fruits': ['Cherry', 'Plum'], 'Fortified': ['Prune']}, \n'Floral': {'White Flowers': ['Honeysuckle', 'Hawthorn', 'Orange Blossom', 'Linden', 'Jasmine', 'Acacia'], \n'Colored Flowers': ['Rose', 'Lavender', 'Violet']}, \n\n'Vegetal': \n{'Vegetables': ['Capsicum', 'Fennel', 'Tomato', 'Green Bell Pepper'], \n'Fresh Herbs': ['Cut Grass', 'Dill', 'Thyme', 'Fern', 'Mint', 'Rosemary', 'Sage', 'Basil'],\n 'Dried Herbs': ['Hay', 'Black Tea', 'Tobacco'], \n 'Leaves': ['Black Currant Leaf', 'Bay Leaf', 'Eucalyptus']}, \n \n 'Maturati

In [77]:
# --------------------------------------------------------------------------------
# 3) BUILD MATCH PATTERNS
# --------------------------------------------------------------------------------
matcher = Matcher(nlp.vocab)

# A) ADJ + NOUN (e.g., "spicy notes", "fresh acidity", "persistent finish")
pattern_adj_noun = [
    {"POS": "ADJ"},         # e.g., "spicy"
    {"POS": "NOUN"}         # e.g., "notes"
]
matcher.add("ADJ_NOUN", [pattern_adj_noun])

# B) NOUN + NOUN (e.g., "berry aroma", "cherry nose", "fruit notes")
pattern_noun_noun = [
    {"POS": "NOUN"},
    {"POS": "NOUN"}
]
matcher.add("NOUN_NOUN", [pattern_noun_noun])

# You could add more patterns, such as ADJ + ADJ + NOUN, or "ADJ + 'and' + ADJ + NOUN", etc.

pattern_adj_adj_noun = [
    {"POS": "ADJ"},
    {"POS": "ADJ"},
    {"POS": "NOUN"}
]

matcher.add("ADJ_ADJ_NOUN", [pattern_adj_adj_noun])

matches = matcher(doc)

In [27]:
# --------------------------------------------------------------------------------
# 4) COLLECT RESULTS
# --------------------------------------------------------------------------------

def extract_aroma_profile(review_notes):
    doc = nlp(review_notes)
    matches = matcher(doc)
    
    aromas = []
    categories = []
    subcategories = []
    outer_ring_attributes = []

    for match_id, start, end in matches:
        span = doc[start:end]
        phrase_text = span.text.lower().split()
        local_context_tokens = [t.text.lower() for t in doc[max(0, start-3):start]] + [t.text.lower() for t in span] + [t.text.lower() for t in doc[end:end+3]]
        
        if any(term in local_context_tokens for term in AROMA_TERMS):
            category, subcategory, attribute = get_aroma_category(span.text)
            if category is not None:
                aromas.append(attribute)
                categories.append(category)
                subcategories.append(subcategory)
                outer_ring_attributes.append(attribute)
            else:
                aromas.append(span.text)

    aromas = list(set(aromas))
    
    return {
        "aromas": aromas,
        "categories": list(set(categories)),
        "subcategories": list(set(subcategories)),
        "outer_ring_attributes": list(set(outer_ring_attributes))
    }


In [28]:
#check the function output
val = extract_aroma_profile(sample_description)
print(val)

{'aromas': ['delightful bouquet', 'Lemon', 'Bright red color', 'red color', 'initial aromas'], 'categories': ['Fruity White Wine'], 'subcategories': ['Citrus'], 'outer_ring_attributes': ['Lemon']}


In [29]:
#Initial Input
sample_description_input = (
    "Bright red color and initial aromas lemon dominated by a spicy note that, coupled "
    "with fruity notes of cherry and strawberry, creates a delightful bouquet. "
    "On the Lemon palate it is fresh and delicate, with medium tannins and a persistent finish."
)

#Sample Output
sample_output = {
 'color': ['red', 'Bright'], 
 'aromas': ['delicate', 'initial aromas', 'red color'], 
 'flavors': ['fruity', 'spicy'], 
 'finish': ['persistent finish'], 
 'mouthfeel': ['medium tannins']
 }

sample_output = {
'color': ['red', 'Bright'], 
'aromas': ['delightful bouquet', 'Lemon', 'Bright red color', 'red color', 'initial aromas'], 
'flavors': ['spicy note', 'fruity notes'], 
'finish': ['persistent finish'], 
'mouthfeel': ['medium tannins'], 
'aroma_categories': ['Fruity White Wine'], 
'aroma_subcategories': ['Citrus'], 
'aroma_outer_ring_attributes': ['Lemon']}

sample_aroma_output= {
'aromas': ['delightful bouquet', 'Lemon', 'Bright red color', 'red color', 'initial aromas'], 
'categories': ['Fruity White Wine'], 
'subcategories': ['Citrus'], 
'outer_ring_attributes': ['Lemon']}

## Part 2
Update the Aroma aspects of our dataset with our previously parsed data in nlp_2

In [78]:
#now lets create a csv of ALL THE WINES + WineProfiles included!! 
import pandas as pd

#Import the CSV (TRANSLATED!!)
starting_file = '../../Resources/BlogOsVinhosTranslated_With_WineProfile.csv'

df_start = pd.read_csv(starting_file)
df_start.head(5)

Unnamed: 0,Name,Color,Alcohol_Percentage,Judge_Rating,Review_Notes,Wine_Bottle_Label,URL,Price,color,aromas,flavors,finish,mouthfeel
0,.Beb 2007,Rosé,13.5,15.5,Bright red color and initial aromas dominated ...,It has an attractive color and an intense arom...,https://osvinhos.blogspot.com/2010/03/1232-beb...,5.75,"['red', 'Bright', 'rosé']","['initial aromas', 'red color', 'Bright red co...","['fruity', 'pleasant presence', 'interesting s...","['average end', 'persistent flavor']",[]
1,.Beb 2009,Red,14.0,16.0,"Reddish color and very aromatic nose, where th...",The careful choice of the best installments of...,https://osvinhos.blogspot.com/2013/08/2803-beb...,8.75,['red'],['aromatic nose'],"['fresh', 'delicate', 'beautiful dose', 'spice...","['median persistence', 'average length']","['seductive texture', 'round tannins']"
2,.Beb 2010,Red,14.0,16.0,It presents a pleasantly concentrated ruby ​​c...,The careful choice of the best installments of...,https://osvinhos.blogspot.com/2014/01/2988-beb...,8.75,"['red', 'ruby']","['floral', 'fruity']","['fruity', 'attractive wine', 'several spices'...",['long length'],['round tannins']
3,.Beb 2011,White,13.5,15.5,Bright yellow color and delicately marked nose...,We associated a very old wardrobe vineyard wit...,https://osvinhos.blogspot.com/2013/09/2820-beb...,8.75,"['Bright', 'white']",[],"['yellow color', 'medium structure', 'Bright y...",[],[]
4,.Beb 2012,White,13.5,16.0,It has a slightly pale yellow color and a nose...,We associated the structure and complexity of ...,https://osvinhos.blogspot.com/2015/06/3639-beb...,5.75,['white'],"['pale yellow color', 'yellow color', 'fruity'...","['oaky', 'fresh', 'elegant mineral', 'mineral ...",[],['well acidity']


In [None]:
# Apply the function to each row in the DataFrame
df_start['aroma_profile'] = df_start['Review_Notes'].apply(extract_aroma_profile)



In [33]:
# Expand the aroma_profile dictionary into separate columns
df_aroma_profile = df_start['aroma_profile'].apply(pd.Series)


In [34]:
# Join the new columns to the original DataFrame with suffixes to avoid overlap
df_start = df_start.join(df_aroma_profile, rsuffix='_aroma')


In [35]:
# Drop the original aroma_profile column
df_start.drop(columns=['aroma_profile'], inplace=True)


In [36]:
# List of columns to check for null values
columns_to_check = ["aromas_aroma"]

In [37]:
# Function to check for empty lists
def count_empty_lists(column):
    return column.apply(lambda x: len(x) == 0).sum()


In [38]:
# Sum the empty lists in each column
empty_list_counts = df_start[columns_to_check].apply(count_empty_lists)


In [39]:
# Display the result
print(empty_list_counts)

aromas_aroma    652
dtype: int64


In [43]:
df_start.head(5)

Unnamed: 0,Name,Color,Alcohol_Percentage,Judge_Rating,Review_Notes,Wine_Bottle_Label,URL,Price,color,aromas,flavors,finish,mouthfeel,aromas_aroma,categories,subcategories,outer_ring_attributes
0,.Beb 2007,Rosé,13.5,15.5,Bright red color and initial aromas dominated ...,It has an attractive color and an intense arom...,https://osvinhos.blogspot.com/2010/03/1232-beb...,5.75,"['red', 'Bright', 'rosé']","['initial aromas', 'red color', 'Bright red co...","['fruity', 'pleasant presence', 'interesting s...","['average end', 'persistent flavor']",[],"[red color, initial aromas, Bright red color]",[],[],[]
1,.Beb 2009,Red,14.0,16.0,"Reddish color and very aromatic nose, where th...",The careful choice of the best installments of...,https://osvinhos.blogspot.com/2013/08/2803-beb...,8.75,['red'],['aromatic nose'],"['fresh', 'delicate', 'beautiful dose', 'spice...","['median persistence', 'average length']","['seductive texture', 'round tannins']",[aromatic nose],[],[],[]
2,.Beb 2010,Red,14.0,16.0,It presents a pleasantly concentrated ruby ​​c...,The careful choice of the best installments of...,https://osvinhos.blogspot.com/2014/01/2988-beb...,8.75,"['red', 'ruby']","['floral', 'fruity']","['fruity', 'attractive wine', 'several spices'...",['long length'],['round tannins'],"[red fruit, floral aromas, Violet]",[Floral],[Colored Flowers],[Violet]
3,.Beb 2011,White,13.5,15.5,Bright yellow color and delicately marked nose...,We associated a very old wardrobe vineyard wit...,https://osvinhos.blogspot.com/2013/09/2820-beb...,8.75,"['Bright', 'white']",[],"['yellow color', 'medium structure', 'Bright y...",[],[],[],[],[],[]
4,.Beb 2012,White,13.5,16.0,It has a slightly pale yellow color and a nose...,We associated the structure and complexity of ...,https://osvinhos.blogspot.com/2015/06/3639-beb...,5.75,['white'],"['pale yellow color', 'yellow color', 'fruity'...","['oaky', 'fresh', 'elegant mineral', 'mineral ...",[],['well acidity'],"[yellow color, tropical fruit, mineral notes, ...",[],[],[]


In [42]:
df_start.head(5)

# Count nulls in df_start['categories', 'subcategories', 'outer_ring_attributes'], accounting for blank lists as null
null_counts = df_start[['categories', 'subcategories', 'outer_ring_attributes']].applymap(lambda x: len(x) == 0).sum()
print(null_counts)


categories               5900
subcategories            5900
outer_ring_attributes    5900
dtype: int64


  null_counts = df_start[['categories', 'subcategories', 'outer_ring_attributes']].applymap(lambda x: len(x) == 0).sum()


In [46]:
# List of columns to check for null values
columns_to_check = ["aromas"]

# Function to check for empty lists
def count_empty_lists(column):
    return column.apply(lambda x: len(x) == 0).sum()

# Sum the empty lists in each column
empty_list_counts = df_start[columns_to_check].apply(count_empty_lists)

# Display the result
print(empty_list_counts)

aromas    0
dtype: int64


In [48]:
# List of columns to check for empty lists
columns_to_check = ["aromas"]

# Function to check for empty lists
def has_empty_list(row):
    return any(len(row[col]) == 0 for col in columns_to_check)

# Apply the function to each row and count the rows with any empty lists
rows_with_empty_lists = df_start.apply(has_empty_list, axis=1).sum()

# Display the result
print(f"Number of rows with any empty lists in the specified columns: {rows_with_empty_lists}")

Number of rows with any empty lists in the specified columns: 0


In [50]:
df_start.to_csv('../../Resources/nlp_3_aroma_missing_652.csv', index=False)

In [52]:
df_start.columns

Index(['Name', 'Color', 'Alcohol_Percentage', 'Judge_Rating', 'Review_Notes',
       'Wine_Bottle_Label', 'URL', 'Price', 'color', 'aromas', 'flavors',
       'finish', 'mouthfeel', 'aromas_aroma', 'categories', 'subcategories',
       'outer_ring_attributes'],
      dtype='object')

# Ending of Notebook
Didn't really get a great bump from the aroma wheel. need to rethink how we parse for it


In [57]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from collections import Counter
import ast

# Ensure you've downloaded the necessary NLTK resources:
# nltk.download('punkt')
# nltk.download('stopwords')

# Function to determine if a cell is an empty list (or a string representation of one)
def is_empty_aroma(x):
    # If it's already a list
    if isinstance(x, list):
        return len(x) == 0
    # If it's a string, try converting it to a list
    try:
        lst = ast.literal_eval(x)
        return isinstance(lst, list) and len(lst) == 0
    except:
        return False

# Load your dataset (adjust the filename/path as needed)
df = pd.read_csv('../../Resources/nlp_3_aroma_missing_652.csv')

# Debug: Check the total number of rows in the dataset
print("Total rows in dataset:", len(df))

# Filter out the rows where 'aromas' is an empty list (or string representing an empty list)
blanks = df[df['aromas'].apply(is_empty_aroma)]['Review_Notes']

# Debug: Check the number of rows with empty aromas
print("Number of rows with empty aromas:", blanks.shape[0])

# Combine all blank text entries into one large text blob
text_data = ' '.join(blanks.dropna().tolist())
print("Combined text length:", len(text_data))

# Convert text to lowercase to standardize
text_data = text_data.lower()

# Remove punctuation using str.translate
translator = str.maketrans('', '', string.punctuation)
text_data = text_data.translate(translator)

# Tokenize the text into words
tokens = word_tokenize(text_data)

# Remove common English stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]

# Optionally, remove non-alphabetic tokens and very short words (less than 3 letters)
filtered_tokens = [word for word in filtered_tokens if word.isalpha() and len(word) > 2]

# Count the frequency of each word
word_freq = Counter(filtered_tokens)

# Display the 20 most common words in the blank review notes
print("Most common words in blank descriptions:")
for word, count in word_freq.most_common(20):
    print(f"{word}: {count}")


Total rows in dataset: 6348
Number of rows with empty aromas: 652
Combined text length: 206369
Most common words in blank descriptions:
mouth: 830
notes: 607
fruit: 562
end: 468
nose: 423
wine: 390
color: 383
well: 372
fruits: 287
taste: 280
tannins: 264
black: 253
floral: 249
delicate: 229
ruby: 221
length: 220
red: 214
ripe: 211
full: 193
balanced: 192


In [79]:
df.head()

Unnamed: 0,Name,Color,Alcohol_Percentage,Judge_Rating,Review_Notes,Wine_Bottle_Label,URL,Price,color,aromas,flavors,finish,mouthfeel,aromas_aroma,categories,subcategories,outer_ring_attributes
0,.Beb 2007,Rosé,13.5,15.5,Bright red color and initial aromas dominated ...,It has an attractive color and an intense arom...,https://osvinhos.blogspot.com/2010/03/1232-beb...,5.75,"['red', 'Bright', 'rosé']","['initial aromas', 'red color', 'Bright red co...","['fruity', 'pleasant presence', 'interesting s...","['average end', 'persistent flavor']",[],"['red color', 'initial aromas', 'Bright red co...",[],[],[]
1,.Beb 2009,Red,14.0,16.0,"Reddish color and very aromatic nose, where th...",The careful choice of the best installments of...,https://osvinhos.blogspot.com/2013/08/2803-beb...,8.75,['red'],['aromatic nose'],"['fresh', 'delicate', 'beautiful dose', 'spice...","['median persistence', 'average length']","['seductive texture', 'round tannins']",['aromatic nose'],[],[],[]
2,.Beb 2010,Red,14.0,16.0,It presents a pleasantly concentrated ruby ​​c...,The careful choice of the best installments of...,https://osvinhos.blogspot.com/2014/01/2988-beb...,8.75,"['red', 'ruby']","['floral', 'fruity']","['fruity', 'attractive wine', 'several spices'...",['long length'],['round tannins'],"['red fruit', 'floral aromas', 'Violet']",['Floral'],['Colored Flowers'],['Violet']
3,.Beb 2011,White,13.5,15.5,Bright yellow color and delicately marked nose...,We associated a very old wardrobe vineyard wit...,https://osvinhos.blogspot.com/2013/09/2820-beb...,8.75,"['Bright', 'white']",[],"['yellow color', 'medium structure', 'Bright y...",[],[],[],[],[],[]
4,.Beb 2012,White,13.5,16.0,It has a slightly pale yellow color and a nose...,We associated the structure and complexity of ...,https://osvinhos.blogspot.com/2015/06/3639-beb...,5.75,['white'],"['pale yellow color', 'yellow color', 'fruity'...","['oaky', 'fresh', 'elegant mineral', 'mineral ...",[],['well acidity'],"['yellow color', 'tropical fruit', 'mineral no...",[],[],[]


In [80]:
import re

# (Optional) Define a set of ignore words based on your frequency analysis.
IGNORE_WORDS = {}
#IGNORE_WORDS = {"mouth", "notes", "end", "wine", "color", "well", "length"}

# Updated get_aroma_category using regex for whole-word matching.
def get_aroma_category(token_text):
    token_text_lower = token_text.lower()
    for category, subcategories in AROMA_LEXICON.items():
        for subcategory, attributes in subcategories.items():
            for attribute in attributes:
                # Construct a pattern that matches the attribute as a whole word.
                pattern = r'\b' + re.escape(attribute.lower()) + r'\b'
                if re.search(pattern, token_text_lower):
                    return category, subcategory, attribute
    return None, None, None

# Updated extract_aroma_profile function.
def extract_aroma_profile(review_notes):
    # Process the review note with spaCy.
    doc = nlp(review_notes)
    matches = matcher(doc)
    
    aromas = []
    categories = []
    subcategories = []
    outer_ring_attributes = []

    # First pass: use the spaCy matcher to find candidate phrases.
    for match_id, start, end in matches:
        span = doc[start:end]
        # Create a local context: a few tokens before and after the span.
        local_context_tokens = ([t.text.lower() for t in doc[max(0, start-3):start]] +
                                [t.text.lower() for t in span] +
                                [t.text.lower() for t in doc[end:end+3]])
        # Filter out tokens that are in the ignore list.
        local_context_tokens = [token for token in local_context_tokens if token not in IGNORE_WORDS]

        # Attempt to map the span to an aroma category via the lexicon.
        category, subcategory, attribute = get_aroma_category(span.text)
        if category is not None:
            aromas.append(attribute)
            categories.append(category)
            subcategories.append(subcategory)
            outer_ring_attributes.append(attribute)
        else:
            # If no direct match is found, add the span text if it isn’t an ignore word.
            span_text = span.text.lower().strip()
            if span_text not in IGNORE_WORDS:
                aromas.append(span_text)

    # Fallback: If no aromas were detected by the matcher, check token-by-token.
    if not aromas:
        for token in doc:
            token_text = token.text.lower()
            # Only consider alphabetic tokens and skip common filler words.
            if token.is_alpha and token_text not in IGNORE_WORDS:
                cat, subcat, attr = get_aroma_category(token_text)
                if cat is not None:
                    aromas.append(attr)
                    categories.append(cat)
                    subcategories.append(subcat)
                    outer_ring_attributes.append(attr)
    
    # Remove duplicates before returning the results.
    aromas = list(set(aromas))
    categories = list(set(categories))
    subcategories = list(set(subcategories))
    outer_ring_attributes = list(set(outer_ring_attributes))
    
    return {
        "aromas": aromas,
        "categories": categories,
        "subcategories": subcategories,
        "outer_ring_attributes": outer_ring_attributes
    }


In [81]:
# Example review note (adjust the text as needed)
sample_review = "The wine presents a vibrant nose with hints of ripe cherry, a touch of oak, and subtle spice. It also offers delicate floral and mineral nuances."

# Extract the aroma profile using the updated function
output_profile = extract_aroma_profile(sample_review)

# Print the output (using pprint for nicer formatting)
from pprint import pprint
print("Extracted aroma profile:")
pprint(output_profile)


Extracted aroma profile:
{'aromas': ['mineral nuances', 'Cherry', 'subtle spice', 'vibrant nose'],
 'categories': ['Fruity Red Wine'],
 'outer_ring_attributes': ['Cherry'],
 'subcategories': ['Stone Fruits']}


## Restarting Analysis
See if we can bump those numbers UP

In [82]:
#now lets create a csv of ALL THE WINES + WineProfiles included!! 
import pandas as pd

#Import the CSV (TRANSLATED!!)
starting_file = '../../Resources/BlogOsVinhosTranslated_With_WineProfile.csv'

df_start = pd.read_csv(starting_file)
df_start.head(5)

Unnamed: 0,Name,Color,Alcohol_Percentage,Judge_Rating,Review_Notes,Wine_Bottle_Label,URL,Price,color,aromas,flavors,finish,mouthfeel
0,.Beb 2007,Rosé,13.5,15.5,Bright red color and initial aromas dominated ...,It has an attractive color and an intense arom...,https://osvinhos.blogspot.com/2010/03/1232-beb...,5.75,"['red', 'Bright', 'rosé']","['initial aromas', 'red color', 'Bright red co...","['fruity', 'pleasant presence', 'interesting s...","['average end', 'persistent flavor']",[]
1,.Beb 2009,Red,14.0,16.0,"Reddish color and very aromatic nose, where th...",The careful choice of the best installments of...,https://osvinhos.blogspot.com/2013/08/2803-beb...,8.75,['red'],['aromatic nose'],"['fresh', 'delicate', 'beautiful dose', 'spice...","['median persistence', 'average length']","['seductive texture', 'round tannins']"
2,.Beb 2010,Red,14.0,16.0,It presents a pleasantly concentrated ruby ​​c...,The careful choice of the best installments of...,https://osvinhos.blogspot.com/2014/01/2988-beb...,8.75,"['red', 'ruby']","['floral', 'fruity']","['fruity', 'attractive wine', 'several spices'...",['long length'],['round tannins']
3,.Beb 2011,White,13.5,15.5,Bright yellow color and delicately marked nose...,We associated a very old wardrobe vineyard wit...,https://osvinhos.blogspot.com/2013/09/2820-beb...,8.75,"['Bright', 'white']",[],"['yellow color', 'medium structure', 'Bright y...",[],[]
4,.Beb 2012,White,13.5,16.0,It has a slightly pale yellow color and a nose...,We associated the structure and complexity of ...,https://osvinhos.blogspot.com/2015/06/3639-beb...,5.75,['white'],"['pale yellow color', 'yellow color', 'fruity'...","['oaky', 'fresh', 'elegant mineral', 'mineral ...",[],['well acidity']


In [83]:
# Apply the function to each row in the DataFrame
df_start['aroma_profile'] = df_start['Review_Notes'].apply(extract_aroma_profile)



In [84]:
# Expand the aroma_profile dictionary into separate columns
df_aroma_profile = df_start['aroma_profile'].apply(pd.Series)


In [85]:
# Join the new columns to the original DataFrame with suffixes to avoid overlap
df_start = df_start.join(df_aroma_profile, rsuffix='_aroma')


In [86]:
# Drop the original aroma_profile column
df_start.drop(columns=['aroma_profile'], inplace=True)


In [87]:
# List of columns to check for null values
columns_to_check = ["aromas_aroma"]

# Function to check for empty lists
def count_empty_lists(column):
    return column.apply(lambda x: len(x) == 0).sum()

# Sum the empty lists in each column
empty_list_counts = df_start[columns_to_check].apply(count_empty_lists)

# Display the result
print(empty_list_counts)

df_start.columns


aromas_aroma    7
dtype: int64


Index(['Name', 'Color', 'Alcohol_Percentage', 'Judge_Rating', 'Review_Notes',
       'Wine_Bottle_Label', 'URL', 'Price', 'color', 'aromas', 'flavors',
       'finish', 'mouthfeel', 'aromas_aroma', 'categories', 'subcategories',
       'outer_ring_attributes'],
      dtype='object')

In [89]:
df_start.to_csv('../../Resources/nlp_3_final.csv', index=False)