# Keywords and phrases extraction
This notebook analyzes key words, phrases, and linguistic patterns from cleaned sensory comments

In [1]:
# Import required packages
import json
import random
import re
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
import spacy

print("Package versions:")
print(f"Python: {__import__('sys').version}")
print(f"pandas: {pd.__version__}")
print(f"nltk: {nltk.__version__}")
print(f"spacy: {spacy.__version__}")

# Download required NLTK data if not already present
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    print("Downloading NLTK stopwords...")
    nltk.download("stopwords", quiet=True)

try:
    nltk.data.find("corpora/wordnet")
except LookupError:
    print("Downloading NLTK wordnet...")
    nltk.download("wordnet", quiet=True)

# Load spacy model
try:
    nlp = spacy.load("en_core_web_sm")
    print(f"Loaded spaCy model: en_core_web_sm")
except OSError:
    print("Error: spaCy model 'en_core_web_sm' not found. Please install with:")
    print("python -m spacy download en_core_web_sm")

Package versions:
Python: 3.12.7 (tags/v3.12.7:0b05ead, Oct  1 2024, 03:06:41) [MSC v.1941 64 bit (AMD64)]
pandas: 2.2.3
nltk: 3.9.1
spacy: 3.8.4
Downloading NLTK wordnet...
Loaded spaCy model: en_core_web_sm


### Load data
Load the cleaned sensory comments data from the previous notebook

In [2]:
# Load cleaned comments data
with open("comments_cleaned.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(f"Loaded {len(data)} cleaned comments")

Loaded 268 cleaned comments


In [3]:
# Display a random sample of comments
print("Random sample of comments:")

sample_comments = random.sample(data, 10)
for i, comment in enumerate(sample_comments, 1):
    print(f"{comment['text']}")

Random sample of comments:
Crisp
Better as it warms up 
Dank, mango, bitter, citrus
Delightful citrus fruity tropical
Cowabunga! Dole pineapple juice and mango nectar. Thiol forward with healthy undertones of sweetened lime juice/Newmans Own limeade. Not incredibly complex, but it doesn’t need to be. Well balanced, excellent intensity.
Mild flavor overall. Sample is slightly sweet, trace tropical fruit, low to moderate bitterness, with a trace of resiny finish. 
Pleasant rose/rose hop flavor aroma with a juicy finish, pineapple, melon, cherry hint of an herbal/grassy finish but nice
Pleasant aroma
Touch of herb to make it natural and not artificial
Clove, banana


In [4]:
# Separate comments by sample type
hops_texts = [item["text"] for item in data if item["sample"] == "Dried Hops"]
beer_texts = [item["text"] for item in data if item["sample"] == "Beer"]

print(f"Sample distribution:")
print(
    f"  Dried hops comments: {len(hops_texts):3d} ({len(hops_texts)/len(data)*100:.1f}%)"
)
print(
    f"  Beer comments:       {len(beer_texts):3d} ({len(beer_texts)/len(data)*100:.1f}%)"
)
print(f"  Total comments:      {len(data):3d}")

Sample distribution:
  Dried hops comments: 181 (67.5%)
  Beer comments:        87 (32.5%)
  Total comments:      268


### N-gram frequency analysis
Analyze the most frequently occurring single words (unigrams), word pairs (bigrams), and three-word combinations (trigrams)

In [5]:
# Set up text preprocessing components
stop_words = set(stopwords.words("english"))

# Keep some negation and intensity words that are important for sensory analysis
important_words = {"not", "no", "very", "never"}
stop_words -= important_words

lemmatizer = WordNetLemmatizer()

In [6]:
# Process dried hops comments for n-gram extraction
hops_words = []
hops_bigrams = []
hops_trigrams = []

for text in hops_texts:
    # Clean and preprocess text
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove digits
    tokens = word_tokenize(text)

    # Filter and lemmatize tokens
    cleaned_tokens = [
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok not in stop_words and len(tok) > 2
    ]

    # Collect unigrams
    hops_words.extend(cleaned_tokens)

    # Generate bigrams
    for i in range(len(cleaned_tokens) - 1):
        hops_bigrams.append(f"{cleaned_tokens[i]} {cleaned_tokens[i+1]}")

    # Generate trigrams
    for i in range(len(cleaned_tokens) - 2):
        hops_trigrams.append(
            f"{cleaned_tokens[i]} {cleaned_tokens[i+1]} {cleaned_tokens[i+2]}"
        )

# Count n-grams
hops_unigram_counts = Counter(hops_words)
hops_bigram_counts = Counter(hops_bigrams)
hops_trigram_counts = Counter(hops_trigrams)

# Display results
print("\nDried hops n-gram analysis:")

print(f"\nTop 10 unigrams:")
for i, (word, count) in enumerate(hops_unigram_counts.most_common(10), 1):
    print(f"  {i:2d}. {word:<15} ({count:2d})")

print(f"\nTop 10 bigrams:")
for i, (bigram, count) in enumerate(hops_bigram_counts.most_common(10), 1):
    print(f"  {i:2d}. {bigram:<20} ({count:2d})")

# Filter trigrams that occur more than once
frequent_trigrams = [(tg, c) for tg, c in hops_trigram_counts.items() if c > 1]
frequent_trigrams.sort(key=lambda x: x[1], reverse=True)

print(f"\nTop trigrams:")
for i, (trigram, count) in enumerate(frequent_trigrams[:10], 1):
    print(f"  {i:2d}. {trigram:<25} ({count:2d})")


Dried hops n-gram analysis:

Top 10 unigrams:
   1. fruit           (29)
   2. tropical        (27)
   3. citrus          (25)
   4. peach           (16)
   5. garlic          (15)
   6. onion           (15)
   7. mango           (14)
   8. pineapple       (13)
   9. fruity          (11)
  10. bright          (11)

Top 10 bigrams:
   1. onion garlic         (11)
   2. stone fruit          ( 7)
   3. passion fruit        ( 4)
   4. tropical stone       ( 3)
   5. orange zest          ( 3)
   6. pale ale             ( 3)
   7. tropical fruit       ( 3)
   8. slight onion         ( 3)
   9. smell like           ( 3)
  10. very nice            ( 3)

Top trigrams:
   1. tropical stone fruit      ( 3)
   2. slight onion garlic       ( 3)
   3. mix floral citrus         ( 2)
   4. india pale ale            ( 2)
   5. melon stone fruit         ( 2)


In [7]:
# Process beer comments for n-gram extraction
beer_words = []
beer_bigrams = []
beer_trigrams = []

for text in beer_texts:
    # Clean and preprocess text
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove digits
    tokens = word_tokenize(text)

    # Filter and lemmatize tokens
    cleaned_tokens = [
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok not in stop_words and len(tok) > 2
    ]

    # Collect unigrams
    beer_words.extend(cleaned_tokens)

    # Generate bigrams
    for i in range(len(cleaned_tokens) - 1):
        beer_bigrams.append(f"{cleaned_tokens[i]} {cleaned_tokens[i+1]}")

    # Generate trigrams
    for i in range(len(cleaned_tokens) - 2):
        beer_trigrams.append(
            f"{cleaned_tokens[i]} {cleaned_tokens[i+1]} {cleaned_tokens[i+2]}"
        )

# Count n-grams
beer_unigram_counts = Counter(beer_words)
beer_bigram_counts = Counter(beer_bigrams)
beer_trigram_counts = Counter(beer_trigrams)

# Display results
print("Beer n-gram analysis:")

print(f"\nTop 10 unigrams:")
for i, (word, count) in enumerate(beer_unigram_counts.most_common(10), 1):
    print(f"  {i:2d}. {word:<15} ({count:2d})")

print(f"\nTop 10 bigrams:")
for i, (bigram, count) in enumerate(beer_bigram_counts.most_common(10), 1):
    print(f"  {i:2d}. {bigram:<20} ({count:2d})")

# Filter trigrams that occur more than once
frequent_trigrams = [(tg, c) for tg, c in beer_trigram_counts.items() if c > 1]
frequent_trigrams.sort(key=lambda x: x[1], reverse=True)

print(f"\nTop trigrams:")
for i, (trigram, count) in enumerate(frequent_trigrams[:10], 1):
    print(f"  {i:2d}. {trigram:<25} ({count:2d})")

Beer n-gram analysis:

Top 10 unigrams:
   1. fruit           (23)
   2. aroma           (21)
   3. flavor          (15)
   4. hop             (15)
   5. citrus          (11)
   6. not             (10)
   7. sweet           (10)
   8. pleasant        (10)
   9. bitterness      (10)
  10. tropical        ( 9)

Top 10 bigrams:
   1. stone fruit          ( 5)
   2. hop flavor           ( 5)
   3. sweet fruit          ( 4)
   4. pineapple melon      ( 3)
   5. onion garlic         ( 3)
   6. tropical fruit       ( 3)
   7. hop aroma            ( 3)
   8. passion fruit        ( 2)
   9. aroma citrus         ( 2)
  10. citrus stone         ( 2)

Top trigrams:
   1. aroma citrus stone        ( 2)
   2. citrus stone fruit        ( 2)
   3. not much hop              ( 2)
   4. overall hop aroma         ( 2)
   5. hop aroma intensity       ( 2)
   6. slight onion garlic       ( 2)


### Part-of-speech analysis
Extract and analyze adjectives and adverbs using part-of-speech tagging

In [8]:
# Demonstrate part-of-speech tagging with an example comment
example_idx = 90
example_comment = data[example_idx]["text"]
sample_type = data[example_idx]["sample"]

print(f"Example POS analysis:")
print(f'Comment #{example_idx} [{sample_type}]: "{example_comment}"')

# Process with spaCy
doc = nlp(example_comment)

# Extract adjectives
adjectives = [
    (token.text, token.pos_, token.dep_) for token in doc if token.pos_ == "ADJ"
]
adverbs = [(token.text, token.pos_, token.dep_) for token in doc if token.pos_ == "ADV"]

print(f"\nAdjectives found ({len(adjectives)}):")
if adjectives:
    for word, pos, dep in adjectives:
        print(f"  • {word:<12} (POS: {pos}, dependency: {dep})")
else:
    print("  (none found)")

print(f"\nAdverbs found ({len(adverbs)}):")
if adverbs:
    for word, pos, dep in adverbs:
        print(f"  • {word:<12} (POS: {pos}, dependency: {dep})")
else:
    print("  (none found)")

Example POS analysis:
Comment #90 [Dried Hops]: "floral, almost tea like delicate aromatics."

Adjectives found (2):
  • floral       (POS: ADJ, dependency: amod)
  • delicate     (POS: ADJ, dependency: amod)

Adverbs found (1):
  • almost       (POS: ADV, dependency: advmod)


In [9]:
# Extract adjectives and adverbs from dried hops comments
hops_adjectives = []
hops_adverbs = []

for text in hops_texts:
    doc = nlp(text)
    for token in doc:
        if token.pos_ == "ADJ":
            hops_adjectives.append(token.lemma_.lower())
        elif token.pos_ == "ADV":
            hops_adverbs.append(token.lemma_.lower())

hops_adj_counts = Counter(hops_adjectives)
hops_adv_counts = Counter(hops_adverbs)

# Display results
print("Dried hops POS analysis:")

print(f"\nTop 10 adjectives:")
for i, (adj, count) in enumerate(hops_adj_counts.most_common(10), 1):
    print(f"  {i:2d}. {adj:<15} ({count:2d})")

print(f"\nTop 10 adverbs:")
for i, (adv, count) in enumerate(hops_adv_counts.most_common(10), 1):
    print(f"  {i:2d}. {adv:<15} ({count:2d})")

Dried hops POS analysis:

Top 10 adjectives:
   1. tropical        (25)
   2. bright          (11)
   3. nice            (10)
   4. slight          (10)
   5. sweet           ( 9)
   6. ripe            ( 5)
   7. floral          ( 5)
   8. light           ( 5)
   9. clean           ( 4)
  10. good            ( 4)

Top 10 adverbs:
   1. very            (10)
   2. too             ( 4)
   3. well            ( 3)
   4. really          ( 3)
   5. so              ( 2)
   6. forward         ( 2)
   7. almost          ( 2)
   8. incredibly      ( 1)
   9. there           ( 1)
  10. as              ( 1)


In [10]:
# Extract adjectives and adverbs from beer comments
beer_adjectives = []
beer_adverbs = []

for text in beer_texts:
    doc = nlp(text)
    for token in doc:
        if token.pos_ == "ADJ":
            beer_adjectives.append(token.lemma_.lower())
        elif token.pos_ == "ADV":
            beer_adverbs.append(token.lemma_.lower())

beer_adj_counts = Counter(beer_adjectives)
beer_adv_counts = Counter(beer_adverbs)

# Display results
print("Beer POS analysis:")

print(f"\nTop 10 adjectives:")
for i, (adj, count) in enumerate(beer_adj_counts.most_common(10), 1):
    print(f"  {i:2d}. {adj:<15} ({count:2d})")

print(f"\nTop 10 adverbs:")
for i, (adv, count) in enumerate(beer_adv_counts.most_common(10), 1):
    print(f"  {i:2d}. {adv:<15} ({count:2d})")

Beer POS analysis:

Top 10 adjectives:
   1. sweet           (10)
   2. pleasant        (10)
   3. tropical        ( 9)
   4. slight          ( 9)
   5. clean           ( 8)
   6. mild            ( 7)
   7. low             ( 7)
   8. nice            ( 7)
   9. strong          ( 5)
  10. catty           ( 4)

Top 10 adverbs:
   1. slightly        ( 7)
   2. forward         ( 5)
   3. very            ( 4)
   4. well            ( 3)
   5. really          ( 3)
   6. kind            ( 2)
   7. of              ( 2)
   8. quite           ( 2)
   9. too             ( 2)
  10. maybe           ( 2)


### Onion garlic descriptor analysis
Analyze the frequency of onion/garlic mentions, a common off-flavor

In [11]:
# Count onion/garlic mentions
onion_garlic_pattern = r"\b(onion|garlic)\b"

hops_onion_garlic = sum(
    1 for txt in hops_texts if re.search(onion_garlic_pattern, txt, flags=re.I)
)

beer_onion_garlic = sum(
    1 for txt in beer_texts if re.search(onion_garlic_pattern, txt, flags=re.I)
)

# Display results
print("OG descriptor analysis:")

print(
    f"\nDried hops comments with onion/garlic: {hops_onion_garlic}/{len(hops_texts)} ({hops_onion_garlic / len(hops_texts) * 100:.1f}%)"
)
print(
    f"Beer comments with onion/garlic: {beer_onion_garlic}/{len(beer_texts)} ({beer_onion_garlic / len(beer_texts) * 100:.1f}%)"
)

OG descriptor analysis:

Dried hops comments with onion/garlic: 18/181 (9.9%)
Beer comments with onion/garlic: 5/87 (5.7%)


### Categorical descriptor analysis
Define different descriptor categories (tropical fruits, citrus fruits, etc.) and analyze their prevalence across all comments

In [12]:
# Define descriptor categories with regex patterns
descriptor_categories = {
    "tropical fruits": r"\b(tropical|pineapple|mango|passion|papaya|guava|lychee)\b",
    "citrus fruits": r"\b(citrus|orange|grapefruit|lemon|lime|tangerine|mandarin|yuzu|tangelo)\b",
    "stone fruits": r"\b(stone fruit|peach|nectarine|apricot|plum)\b",
    "fruit (general)": r"\b(fruit|fruity)\b",
    "onion garlic": r"\b(onion|garlic)\b",
    "melons": r"\b(melon|watermelon|cantaloupe|honeydew)\b",
    "berries": r"\b(berry|strawberry|blackberry|blueberry|raspberry)\b",
}

# Calculate prevalence for dried hops
hops_total = len(hops_texts)
hops_rows = []
for label, pattern in descriptor_categories.items():
    n = sum(bool(re.search(pattern, text, flags=re.I)) for text in hops_texts)
    hops_rows.append(
        {
            "category": label,
            "n_comments": n,
            "pct_comments": n / hops_total,
            "n_total": hops_total,
        }
    )
hops_prevalence = pd.DataFrame(hops_rows).sort_values("pct_comments", ascending=False)

# Calculate prevalence for beer
beer_total = len(beer_texts)
beer_rows = []
for label, pattern in descriptor_categories.items():
    n = sum(bool(re.search(pattern, text, flags=re.I)) for text in beer_texts)
    beer_rows.append(
        {
            "category": label,
            "n_comments": n,
            "pct_comments": n / beer_total,
            "n_total": beer_total,
        }
    )
beer_prevalence = pd.DataFrame(beer_rows).sort_values("pct_comments", ascending=False)

# Display results
print("Categorical descriptor analysis:")

print(f"\nDried hops")
print(f"{'Rank':<4} {'Category':<18} {'Count':<7} {'Percentage':<10}")
print("-" * 45)
for i, row in hops_prevalence.iterrows():
    rank = hops_prevalence.index.get_loc(i) + 1
    print(
        f"{rank:<4} {row['category']:<18} {row['n_comments']:<7} {row['pct_comments']*100:>7.1f}%"
    )

print(f"\nBeer")
print(f"{'Rank':<4} {'Category':<18} {'Count':<7} {'Percentage':<10}")
print("-" * 45)
for i, row in beer_prevalence.iterrows():
    rank = beer_prevalence.index.get_loc(i) + 1
    print(
        f"{rank:<4} {row['category']:<18} {row['n_comments']:<7} {row['pct_comments']*100:>7.1f}%"
    )

Categorical descriptor analysis:

Dried hops
Rank Category           Count   Percentage
---------------------------------------------
1    tropical fruits    48         26.5%
2    citrus fruits      45         24.9%
3    fruit (general)    39         21.5%
4    stone fruits       23         12.7%
5    onion garlic       18          9.9%
6    melons             8           4.4%
7    berries            7           3.9%

Beer
Rank Category           Count   Percentage
---------------------------------------------
1    tropical fruits    21         24.1%
2    fruit (general)    21         24.1%
3    citrus fruits      18         20.7%
4    stone fruits       9          10.3%
5    onion garlic       5           5.7%
6    melons             5           5.7%
7    berries            0           0.0%
