## Hugging Face Model

In [2]:
import pandas as pd
import numpy as np
import time

from IPython.display import Markdown, display

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

**Load the data set**

See [Datasets](https://huggingface.co/docs/datasets/index) documentation on HuggingFace

In [6]:
# Load the wine reviews from a gzip-compressed parquet file into a DataFrame.
df = pd.read_parquet('files/wine_review.parquet.gzip')
# Report (rows, columns) before previewing the first few rows.
print(df.shape)
df.head()

(100538, 16)


Unnamed: 0,country,description,points,price,taster_name,title,variety,winery,year,wine_style,type,quality,classification,location,band,preprocessed_description
0,Italy,"Aromas include tropical fruit, broom, brimston...",87,,Kerin O’Keefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013,light_white,white,medium,Old World,Etna,,aroma include tropical fruit broom brimstone...
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011,medium_red,red,medium,Old World,Douro,popular,do ripe fruity wine smooth structure firm tann...
2,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013,light_white,white,medium,New World,Oregon,popular,rainstorm tart snappy flavor lime flesh rind d...
3,US,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013,aromatic_white,white,medium,New World,Michigan,popular,pineapple rind lemon pith orange blossom start...
4,US,"Much like the regular bottling from 2012, this...",87,65.0,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012,light_red,red,medium,New World,Oregon,luxury,vintner like regular bottling come rough tanni...


In [7]:
# Draw a fixed-size random subset of the reviews; random_state is pinned so the
# same rows are selected on every run. Later cells read both `n`-sized `s` and `df`.
n = 1000
s = df.sample(n, random_state=42)
s.shape

(1000, 16)

In [9]:
from datasets import Dataset
# Wrap the sampled DataFrame in a Hugging Face Dataset. With preserve_index=True the
# DataFrame index is carried along as the extra '__index_level_0__' feature.
wine_dataset = Dataset.from_pandas(s, preserve_index=True)
wine_dataset

Dataset({
    features: ['country', 'description', 'points', 'price', 'taster_name', 'title', 'variety', 'winery', 'year', 'wine_style', 'type', 'quality', 'classification', 'location', 'band', 'preprocessed_description', '__index_level_0__'],
    num_rows: 1000
})

**Create HuggingFace Model**

In [10]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Load the tokenizer and model weights for the chosen checkpoint from the HuggingFace Hub
model_ckpt = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

# Report which accelerators (if any) are visible to PyTorch
if torch.cuda.is_available():
    num_devices = torch.cuda.device_count()
    print(f"Number of CUDA devices: {num_devices}")
    for i in range(num_devices):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA is not available. Using CPU.")

# Use GPU to speed up embedding process when one is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Final expression of the cell: moves the model to `device` and displays its architecture.
# (A bare `model` statement in the middle of the cell displayed nothing and was removed.)
model.to(device)

CUDA is not available. Using CPU.
Using device: cpu


MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

**Helper Functions to create Text Embeddings using the HuggingFace Model**

In [11]:
# Pool token embeddings 

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, counting only unmasked tokens.

    Args:
        model_output: indexable model result whose first element holds the
            per-token embeddings.
        attention_mask: mask tensor; it is broadcast over the embedding axis and
            used as a 0/1 weight for every token.

    Returns:
        The mask-weighted sum over axis 1 divided by the mask sum over axis 1.
    """
    token_embeddings = model_output[0]
    # Expand the mask to the embedding shape so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = (token_embeddings * weights).sum(dim=1)
    # Clamp the denominator so a fully masked sequence does not divide by zero.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts


def get_embeddings(text_input):
    """Embed wine description text with the loaded tokenizer and model.

    The input is tokenized (padded and truncated), moved to `device`, run through
    `model` without gradient tracking, mean-pooled with the attention mask, and
    finally L2-normalized along dim 1.

    Args:
        text_input: text passed straight to the tokenizer.

    Returns:
        The normalized pooled embeddings as a torch tensor.
    """
    batch = tokenizer(text_input, padding=True, truncation=True, return_tensors='pt')
    # Move every tokenizer output onto the same device as the model.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    pooled = mean_pooling(outputs, batch['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)

**Select a review to try out the Embedding Logic**

In [12]:
# Pick one review and show its raw description next to its pre-processed form.
review = wine_dataset[2]
description = review['description']
tokens = review['preprocessed_description']
# Each line is rendered as "(character count, space-separated word count): text".
# The inner string literals use double quotes: reusing the enclosing quote character
# inside an f-string replacement field is a SyntaxError before Python 3.12.
display(Markdown(f'({len(description)}, {len(description.split(" "))}): ' + description))
display(Markdown(f'({len(tokens)}, {len(tokens.split(" "))}): ' + tokens))

(198, 32): This uber-informal wine has faint aromas that recall pressed wildflower and a whiff of pear. The simple, diluted palate shows hints of apple skin and a bitter medicinal note alongside brisk acidity.

(168, 25): lechthaler di uber informal wine faint aroma recall press wildflower whiff pear simple diluted palate show hint apple skin bitter medicinal note alongside brisk acidity

**Try the Embedding Logic**

In [13]:
# Embed the raw description and its pre-processed form, then compare array shapes.
description_vector = get_embeddings(description).detach().cpu().numpy()
tokens_vector = get_embeddings(tokens).detach().cpu().numpy()
print(description_vector.shape, tokens_vector.shape)

(1, 768) (1, 768)


**Apply Embeddings on Vector of Descriptions**

In [14]:
import time

def _embed_description(row):
    """Return the 'embeddings' value for one dataset row as a NumPy array."""
    vectors = get_embeddings(row['description'])
    return {'embeddings': vectors.detach().cpu().numpy()}


# Time the embedding pass over the sample so the cost for all reviews can be projected.
start = time.perf_counter()
# Now create new embeddings column for entire dataset
embeddings_dataset = wine_dataset.map(_embed_description)
elapsed = time.perf_counter() - start
display(Markdown(f'It took {elapsed/60:.0f} minutes to compute embeddings for {wine_dataset.num_rows:,d} samples.  It will take {df.shape[0]/s.shape[0]*elapsed/60:.0f} minutes to compute embeddings for {df.shape[0]:,d} reviews.'))
embeddings_dataset


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

It took 1 minutes to compute embeddings for 1,000 samples.  It will take 150 minutes to compute embeddings for 100,538 reviews.

In [15]:
# Display the dataset summary; 'embeddings' is now listed among the features.
embeddings_dataset

Dataset({
    features: ['country', 'description', 'points', 'price', 'taster_name', 'title', 'variety', 'winery', 'year', 'wine_style', 'type', 'quality', 'classification', 'location', 'band', 'preprocessed_description', '__index_level_0__', 'embeddings'],
    num_rows: 1000
})

In [17]:
# with_format("np") switches the dataset's output format to NumPy, so indexing yields
# numpy arrays (which is what makes the .shape access below work).
embeddings_dataset = embeddings_dataset.with_format("np")
# Each stored embedding still carries a leading axis of size 1 at this point.
embeddings_dataset[1]['embeddings'].shape

(1, 768)

In [18]:
def process_embeddings(example):
    """Reformat one example's embedding so it can work with FAISS.

    Size-1 axes are squeezed out of example['embeddings'] and the result is cast
    to float32. The example is updated in place and returned.
    """
    squeezed = np.squeeze(example['embeddings'])
    example['embeddings'] = squeezed.astype(np.float32)
    return example

# Apply the reformatting to every example in the dataset.
embeddings_dataset = embeddings_dataset.map(process_embeddings)

# Confirm the size-1 leading axis is gone.
embeddings_dataset[1]['embeddings'].shape

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

(768,)

In [20]:
# Drop the '__index_level_0__' feature (the preserved pandas index) from the dataset.
columns_to_remove = ['__index_level_0__']
embeddings_dataset = embeddings_dataset.remove_columns(columns_to_remove)
embeddings_dataset

Dataset({
    features: ['country', 'description', 'points', 'price', 'taster_name', 'title', 'variety', 'winery', 'year', 'wine_style', 'type', 'quality', 'classification', 'location', 'band', 'preprocessed_description', 'embeddings'],
    num_rows: 1000
})

In [22]:
# Persist the dataset (including the 'embeddings' column) to disk for later reuse.
# NOTE(review): the two commented-out lines below look like leftovers from an earlier
# run (dropping an attached index, and a previous output path) — confirm and delete.
# embeddings_dataset = embeddings_dataset.drop_index("embeddings")
# ds_path = "files/wine_embeddings.hf"
new_ds_path = "files/wine_ds.hf"
embeddings_dataset.save_to_disk(new_ds_path)

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

**Similarity Search with FAISS**

In [23]:
# Read the column once, then report the shape of a single sentence embedding
# followed by the number of sentence embeddings in the "embeddings" column.
embedding_column = embeddings_dataset['embeddings']
print(embedding_column[0].shape)
print(len(embedding_column))

(768,)
1000


In [24]:
# Build a FAISS index over the 'embeddings' column so nearest-neighbour queries can be run.
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['country', 'description', 'points', 'price', 'taster_name', 'title', 'variety', 'winery', 'year', 'wine_style', 'type', 'quality', 'classification', 'location', 'band', 'preprocessed_description', 'embeddings'],
    num_rows: 1000
})

In [25]:
# Use one of the indexed rows as the query and fetch its closest matches.
query_row = 132
num_neighbors = 10
test_embedding = embeddings_dataset['embeddings'][query_row]

scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings",
    test_embedding,
    k=num_neighbors,
)

In [26]:
# Titles of the retrieved examples, in the order returned by the index.
samples['title']

array(['Seifried 2007 Sauvignon Blanc (Nelson)',
       'Cascina Adelaide 2013 Pernanno  (Barolo)',
       'Errazuriz 2009 Estate Carmenère (Central Valley)',
       'Cave de Kientzheim-Kaysersberg 2010 The Furst Pinot Blanc (Alsace)',
       'Tenuta Carretta 2008 Bordino  (Barbaresco)',
       'Henri de Villamont 2009 Clos St-Jean Premier Cru  (Chassagne-Montrachet)',
       'Galadino 2010 Riserva  (Chianti Classico)',
       'Jardin 2007 Syrah (Stellenbosch)',
       'Marchesi Antinori 2012 Riserva  (Chianti Classico)',
       'Kerloo 2013 Les Collines Limited Release Syrah (Walla Walla Valley (WA))'],
      dtype='<U72')

In [27]:
# Print the full description of every retrieved example, one per line.
for d in samples['description']:
    print(d)

Intensely perfumed, this is herbal but ripe, too, with fig and melon fruit aromas. It's boldly flavored in the same vein, with the only quibble being that it finishes a little short. A solid effort.
This has subdued aromas suggesting aromatic herb, forest floor, truffle and a whiff of leather. It's assertive and rather lean in terms of fruit richness, displaying sour cherry, pomegranate, espresso, clove and a note of powdered sage. Tight, grainy tannins leave a firm finish.
Herbal, brambly aromas mix with dark berry, mocha and vanilla aromas, while the palate is creamy and soft, with herbal, almost vegetal flavors and not much defined fruit or clarity. Finishes as herbal as it starts, with coffee and green tobacco flavors.
This is fruity, fragrant and fresh. It has creamed pear and tangy orange flavors that are laced with bright acidity. The crisp lime note gives a final refreshing burst.
Simple and linear, this has a mixed berry aroma, with notes of licorice, leather and dried tobacco

In [28]:
# Scores returned alongside the retrieved examples. The query vector was taken from the
# indexed dataset itself, which is why the first score is 0.
# NOTE(review): presumably these are distances (lower = closer) — confirm the index metric.
scores

array([0.        , 0.40508103, 0.4902803 , 0.49871844, 0.51951927,
       0.5293214 , 0.537715  , 0.5437931 , 0.5476136 , 0.5484359 ],
      dtype=float32)

In [43]:
# Collect the retrieved examples into a DataFrame, attach their scores and order by score.
# Built as one chained expression instead of mutating with inplace=True, so re-running the
# cell always rebuilds the frame from `samples` and `scores` rather than from leftover state.
# NOTE(review): if the scores are distances, descending order lists the farthest
# neighbour first — confirm this is the intended order.
samples_df = (
    pd.DataFrame({column: list(samples[column]) for column in samples})
    .assign(scores=scores)
    .sort_values('scores', ascending=False)
)
samples_df

Unnamed: 0,country,description,points,price,taster_name,title,variety,winery,year,wine_style,type,quality,classification,location,band,preprocessed_description,embeddings,scores
9,US,The dried herb and violet aromas are high tone...,91,48.0,Sean P. Sullivan,Kerloo 2013 Les Collines Limited Release Syrah...,Syrah,Kerloo,2013,full_red,red,high,New World,Washington,ultra,dry herb violet aroma high toned lock floral s...,"[0.034957066, 0.049313437, -0.031035056, -0.00...",0.548436
8,Italy,A blend of 90% Sangiovese and 10% Cabernet Sau...,92,45.0,Kerin O’Keefe,Marchesi Antinori 2012 Riserva (Chianti Class...,Red Blend,Marchesi Antinori,2012,medium_red,red,high,Old World,Tuscany,ultra,blend % % enticing scent fragrant blue flowe...,"[0.017769974, 0.019162761, -0.033497773, 0.001...",0.547614
7,South Africa,"A good amount of earthy spice, tea leaves and ...",89,19.0,Lauren Buzzeo,Jardin 2007 Syrah (Stellenbosch),Syrah,Jardin,2007,full_red,red,medium,New World,Stellenbosch,premium,good earthy spice tea leave lead way nose blac...,"[0.055925604, 0.045839585, -0.031539824, -0.03...",0.543793
6,Italy,This opens with an unusual aroma that recalls ...,83,7.0,Kerin O’Keefe,Galadino 2010 Riserva (Chianti Classico),Red Blend,Galadino,2010,medium_red,red,medium,Old World,Tuscany,value,open unusual aroma recall floral air freshen...,"[0.0673587, 0.04662612, -0.023410415, -0.02665...",0.537715
5,France,"This is aromatic, with ripeness that balances ...",91,65.0,Roger Voss,Henri de Villamont 2009 Clos St-Jean Premier C...,Chardonnay,Henri de Villamont,2009,full_white,white,high,Old World,Burgundy,luxury,aromatic ripeness balance bright green white...,"[0.031131092, 0.053116564, -0.010044775, 0.001...",0.529321
4,Italy,"Simple and linear, this has a mixed berry arom...",87,45.0,Unknown,Tenuta Carretta 2008 Bordino (Barbaresco),Nebbiolo,Tenuta Carretta,2008,full_red,red,medium,Old World,Piedmont,ultra,simple linear mixed berry aroma note licoric...,"[0.049561225, 0.03669533, -0.01353773, 0.01468...",0.519519
3,France,"This is fruity, fragrant and fresh. It has cre...",85,,Roger Voss,Cave de Kientzheim-Kaysersberg 2010 The Furst ...,Pinot Blanc,Cave de Kientzheim-Kaysersberg,2010,light_white,white,medium,Old World,Alsace,,de fruity fragrant fresh cream pear tangy oran...,"[0.03936654, 0.025942875, -0.014210853, -0.030...",0.498718
2,Chile,"Herbal, brambly aromas mix with dark berry, mo...",84,12.0,Michael Schachner,Errazuriz 2009 Estate Carmenère (Central Valley),Carmenère,Errazuriz,2009,medium_red,red,medium,New World,Central Valley,value,brambly aroma mix dark berry mocha vanilla aro...,"[0.052927148, 0.029348643, -0.02238213, -0.022...",0.49028
1,Italy,This has subdued aromas suggesting aromatic he...,88,70.0,Kerin O’Keefe,Cascina Adelaide 2013 Pernanno (Barolo),Nebbiolo,Cascina Adelaide,2013,full_red,red,medium,Old World,Piedmont,luxury,subdue aroma suggest aromatic herb forest fl...,"[0.048809845, 0.06395473, -0.019677524, 0.0070...",0.405081
0,New Zealand,"Intensely perfumed, this is herbal but ripe, t...",87,18.0,Joe Czerwinski,Seifried 2007 Sauvignon Blanc (Nelson),Sauvignon Blanc,Seifried,2007,light_white,white,medium,New World,Nelson,premium,intensely perfumed herbal ripe fig melon fruit...,"[0.059339624, 0.0740464, -0.016037049, 0.00760...",0.0


In [29]:
# Write the FAISS index built on the 'embeddings' column to its own file.
faiss_index_path = "files/wine_faiss_index.faiss" 
embeddings_dataset.save_faiss_index('embeddings', faiss_index_path)

In [31]:
# When reloading (e.g. in a fresh kernel): both paths are defined here so this cell does
# not depend on variables created by earlier cells.
from datasets import load_dataset, load_from_disk
ds_path = "files/wine_ds.hf"  # must match the path passed to save_to_disk above
faiss_index_path = "files/wine_faiss_index.faiss"

# Load the saved dataset, then re-attach the FAISS index that was saved separately.
ds = load_from_disk(ds_path)
ds.load_faiss_index('embeddings', faiss_index_path)