# Data cleaning and preprocessing
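This notebook cleans and preprocesses free-text sensory comments from hop and beer evaluations: it expands domain-specific acronyms, reshapes the data into records, summarizes comment length, and saves the result as JSON.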

In [55]:
# Import required packages
import json
import os
import re
import sys

import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd

# Record the environment so the outputs below can be reproduced
print("Package versions:")
print(f"Python: {sys.version}")
print(f"pandas: {pd.__version__}")
print(f"numpy: {np.__version__}")
print(f"nltk: {nltk.__version__}")

# Download required NLTK data if not already present.
# word_tokenize in NLTK >= 3.8.2 (including the 3.9.x line used here) loads
# the "punkt_tab" tables, while older releases load the legacy "punkt"
# models. Checking both keeps the notebook runnable on a fresh environment
# regardless of the installed NLTK version.
for resource in ("punkt_tab", "punkt"):
    try:
        nltk.data.find(f"tokenizers/{resource}")
    except LookupError:
        print(f"Downloading NLTK {resource} tokenizer...")
        nltk.download(resource, quiet=True)

Package versions:
Python: 3.12.7 (tags/v3.12.7:0b05ead, Oct  1 2024, 03:06:41) [MSC v.1941 64 bit (AMD64)]
pandas: 2.2.3
numpy: 1.26.4
nltk: 3.9.1


### Load data
Load the raw sensory comments data from the CSV file containing evaluations from multiple tasting events

In [56]:
# Read the raw sensory comments from disk
data = pd.read_csv("Vera Sensory Comments.csv")

# Summarize the size and column layout of the dataset
column_names = list(data.columns)
print(f"Dataset shape: {data.shape}")
print(f"Columns: {column_names}")

# Count how many comments belong to each sample type
sample_counts = data["Sample"].value_counts()
print(f"\nSample type distribution:")
print(sample_counts)

# Preview the leading rows to check the data structure
preview = data.head()
print("\nFirst 5 rows:")
print(preview)

Dataset shape: (268, 3)
Columns: ['Event', 'Sample', 'Comments']

Sample type distribution:
Sample
Dried Hops    181
Beer           87
Name: count, dtype: int64

First 5 rows:
                         Event      Sample           Comments
0  Hopsource 2022 - Washington  Dried Hops  Complex and clean
1  Hopsource 2022 - Washington  Dried Hops       Vinous grape
2  Hopsource 2022 - Washington  Dried Hops        red flowers
3  Hopsource 2022 - Washington  Dried Hops              Muted
4  Hopsource 2022 - Washington  Dried Hops        Some fruity


### Expand acronyms
Identify acronyms in the comments with a regular expression, then expand them to their full forms.

In [57]:
# Regex for candidate acronyms: runs of capitals, optionally slash-separated
# (e.g. "O/G"), or capitals followed by digits and optional capitals (e.g. "T2N")
pattern = r"\b(?:[A-Z]+(?:/[A-Z]+)*|[A-Z]+\d+[A-Z]*)\b"

# Collect the distinct acronyms found across all non-missing comments.
# Single capital letters are skipped unless they contain a slash or a digit.
acronyms = {
    token
    for comment in data["Comments"].dropna()
    for token in re.findall(pattern, str(comment))
    if len(token) > 1 or "/" in token or re.search(r"\d", token)
}

print(f"Identified {len(acronyms)} unique acronyms:")
print(sorted(acronyms))

Identified 8 unique acronyms:
['DMTS', 'GO', 'IPA', 'IPL', 'O/G', 'ONG', 'T2N', 'WOW']


In [58]:
# Define mappings for domain-specific acronyms to their full forms
# WOW is the word 'wow' in all caps, so it is deliberately not mapped
acronym_map = {
    "GO": "onion garlic",
    "O/G": "onion garlic",
    "ONG": "onion garlic",
    "DMTS": "dimethyl trisulfide",
    "T2N": "trans-2-nonenal",
    "H2S": "hydrogen sulfide",
    "IPA": "india pale ale",
    "IPL": "india pale lager",
}

# Other shorthand expanded the same way
# 'ohai' (Overall hop aroma intensity)
shorthand_map = {"ohai": "overall hop aroma intensity"}

# Acronyms that double as everyday words when lower-cased. They are only
# expanded when written exactly as in the mapping, so "would go well" is not
# rewritten to "would onion garlic well".
case_sensitive_acronyms = {"GO"}

# Acronyms that can appear with a plural "s" (e.g. "IPAs"). A plain \bIPA\b
# never matches "IPAs" because there is no word boundary between "A" and "s",
# so the plural suffix has to be matched explicitly and carried over.
pluralizable_acronyms = {"IPA", "IPL"}


def expand_acronyms(comments, mapping, case_sensitive=(), pluralizable=()):
    """Expand whole-word abbreviations in a Series of comments.

    Parameters
    ----------
    comments : pd.Series
        Free-text comments. The input Series is not modified.
    mapping : dict[str, str]
        Abbreviation -> expansion, applied in insertion order.
    case_sensitive : collection of str, optional
        Abbreviations matched only as written in ``mapping``. All other
        abbreviations are matched case-insensitively.
    pluralizable : collection of str, optional
        Abbreviations that may be followed by a plural "s"; the suffix is
        appended to the expansion as it was written in the comment.

    Returns
    -------
    pd.Series
        A new Series with every whole-word occurrence expanded.
    """
    expanded = comments
    for abbr, full in mapping.items():
        acronym_pattern = rf"\b{re.escape(abbr)}"
        replacement = full
        if abbr in pluralizable:
            # Capture the optional plural suffix and re-emit it after the expansion
            acronym_pattern += "(s?)"
            replacement += r"\1"
        acronym_pattern += r"\b"
        expanded = expanded.str.replace(
            acronym_pattern,
            replacement,
            regex=True,
            case=abbr in case_sensitive,
        )
    return expanded


# Apply every replacement: domain acronyms first, then the extra shorthand
data["Comments"] = expand_acronyms(
    data["Comments"],
    {**acronym_map, **shorthand_map},
    case_sensitive=case_sensitive_acronyms,
    pluralizable=pluralizable_acronyms,
)

### Reshape data
Transform the tabular data into a structured JSON format suitable for downstream text analysis

In [59]:
# Keep only the sample type and the comment text, under lowercase field names
column_renames = {"Sample": "sample", "Comments": "text"}
selected = data[list(column_renames)].rename(columns=column_renames)

# One dict per comment, e.g. {"sample": ..., "text": ...}
records = selected.to_dict(orient="records")

print(f"\nFirst 5 records:")
print(json.dumps(records[:5], indent=2, ensure_ascii=False))


First 5 records:
[
  {
    "sample": "Dried Hops",
    "text": "Complex and clean"
  },
  {
    "sample": "Dried Hops",
    "text": "Vinous grape"
  },
  {
    "sample": "Dried Hops",
    "text": "red flowers"
  },
  {
    "sample": "Dried Hops",
    "text": "Muted"
  },
  {
    "sample": "Dried Hops",
    "text": "Some fruity"
  }
]


### Count words
Look at summary statistics to understand the distribution of text length across different sample types.

In [60]:
# Split the comment texts into one list per sample type
hops_texts = []
beer_texts = []
for record in records:
    sample_type = record["sample"]
    if sample_type == "Dried Hops":
        hops_texts.append(record["text"])
    elif sample_type == "Beer":
        beer_texts.append(record["text"])

print(f"Count of hop comments: {len(hops_texts)}")
print(f"Count of beer comments: {len(beer_texts)}")

# Preview the first three comments of each group
print(f"\nExample hop comments:")
for position, comment in enumerate(hops_texts[:3], start=1):
    print(f'  {position}. "{comment}"')

print(f"\nExample beer comments:")
for position, comment in enumerate(beer_texts[:3], start=1):
    print(f'  {position}. "{comment}"')

Count of hop comments: 181
Count of beer comments: 87

Example hop comments:
  1. "Complex and clean"
  2. "Vinous grape"
  3. "red flowers"

Example beer comments:
  1. "Good"
  2. "Woah. Interesting. Solvent notes up front - not acetone like Sabro, but not toluene or methanol either. Orange terpenes/concentrate, green onion, and non-specific tropical esters"
  3. "Weird aroma"


In [61]:
# Word counting helper built on the NLTK tokenizer
def word_count(text):
    """Return the number of tokens ``word_tokenize`` yields for ``text``.

    Missing values (anything ``pd.isna`` reports as missing) count as 0.
    Any other value is converted to ``str`` before tokenizing.
    """
    return 0 if pd.isna(text) else len(word_tokenize(str(text)))


# Calculate word counts for each sample type
hops_word_counts = [word_count(text) for text in hops_texts]
beer_word_counts = [word_count(text) for text in beer_texts]


def print_word_count_summary(label, counts):
    """Print distribution statistics for per-comment word counts.

    Parameters
    ----------
    label : str
        Heading for the group, e.g. "Dried hops comments".
    counts : sequence of int
        One word count per comment in the group.
    """
    # Percentiles describe the spread better than min/max for skewed lengths
    p10, p25, p75, p90 = np.percentile(counts, [10, 25, 75, 90])

    print(f"\n{label} (n={len(counts)}):")
    print(f"  Mean:           {np.mean(counts):.2f} words")
    print(f"  Median:         {np.median(counts):.1f} words")
    # np.std is called with its defaults, i.e. the population standard deviation
    print(f"  Standard dev:   {np.std(counts):.2f} words")
    print(f"  Min-max:        {np.min(counts)}-{np.max(counts)} words")
    print(f"  25th-75th %ile: {p25:.0f}-{p75:.0f} words")
    print(f"  10th-90th %ile: {p10:.0f}-{p90:.0f} words")


# Same report for both groups, produced by the shared helper
print_word_count_summary("Dried hops comments", hops_word_counts)
print_word_count_summary("Beer comments", beer_word_counts)


Dried hops comments (n=181):
  Mean:           5.76 words
  Median:         4.0 words
  Standard dev:   5.68 words
  Min-max:        1-41 words
  25th-75th %ile: 2-7 words
  10th-90th %ile: 1-13 words

Beer comments (n=87):
  Mean:           10.74 words
  Median:         8.0 words
  Standard dev:   10.30 words
  Min-max:        1-43 words
  25th-75th %ile: 3-14 words
  10th-90th %ile: 1-27 words


### Save cleaned data
Export the cleaned and standardized comments to a JSON file.

In [62]:
# Save the cleaned and reshaped data to JSON
output_filename = "comments_cleaned.json"

# ensure_ascii=False writes non-ASCII characters as-is into the UTF-8 file
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(records, f, indent=4, ensure_ascii=False)

print(f"Data successfully saved to '{output_filename}'")

# Verify the saved file by reporting its size on disk
# (os is already imported in the notebook's import cell, so no re-import here)
file_size = os.path.getsize(output_filename)
print(f"File size: {file_size:,} bytes ({file_size/1024:.1f} KB)")

Data successfully saved to 'comments_cleaned.json'
File size: 28,011 bytes (27.4 KB)
