## Hugging Face Model

In [None]:
import pandas as pd
import numpy as np

from IPython.display import Markdown, display

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

**Load the data set**

See [Datasets](https://huggingface.co/docs/datasets/index) documentation on HuggingFace

In [None]:
# Read the wine-review table from its Parquet file, report its size, and preview the first rows.
parquet_path = 'files/wine_review.parquet.gzip'
df = pd.read_parquet(parquet_path)
print(df.shape)
df.head()

In [None]:
# Draw a random subset of n reviews to experiment on before touching the full table.
n = 1000
s = df.sample(n, random_state=42)  # fixed random_state so the same rows are drawn on every run
s.shape

In [None]:
import time

def timeit(purpose, func, count, items='documents'):
  """Call ``func()`` and report how long it took as a rendered Markdown line.

  Args:
    purpose: Phrase describing the work, inserted after "to" in the message.
    func: Zero-argument callable to execute and time.
    count: Integer number of processed items shown in the message.
    items: Noun used for the processed items in the message.

  Returns:
    Whatever ``func()`` returns. The timing line is displayed even when
    ``func`` raises; the exception then propagates to the caller.
  """
  began = time.perf_counter()
  try:
    result = func()
  finally:
    # Runs on success and on failure, so the duration is always reported.
    elapsed = time.perf_counter() - began
    display(Markdown(f'It took ${elapsed/60:.1f}$ minutes to {purpose} for ${count:,d}$ {items}.'))
  return result

In [None]:
from sentence_transformers import SentenceTransformer

def compute_embeddings(corpus, model_name='all-mpnet-base-v2'):
  """Encode ``corpus`` with a pre-trained SentenceTransformer and report the duration.

  Args:
    corpus: Sized collection of texts handed to ``model.encode``.
    model_name: Name of the sentence-transformers model to load.

  Returns:
    The result of ``model.encode`` (called with ``normalize_embeddings=True``
    and ``device='cpu'``).
  """
  # Load the pre-trained sentence transformer model
  model = SentenceTransformer(model_name)

  def encode_corpus():
    return model.encode(corpus, normalize_embeddings=True, show_progress_bar=True, device='cpu')

  # Only the encoding step is timed; loading the model happens before the clock starts.
  return timeit('compute embeddings', encode_corpus, len(corpus), 'reviews')


In [None]:
from sklearn.manifold import TSNE

def visualize_embeddings(embeddings):
  """Reduce ``embeddings`` with t-SNE and draw the result as a scatter plot.

  Args:
    embeddings: Array-like with a ``shape`` attribute; ``shape[0]`` is reported
      as the number of reviews in the timing message.
  """
  reducer = TSNE(perplexity=10, random_state=42)

  def reduce_dimensions():
    return reducer.fit_transform(embeddings)

  # Time only the t-SNE fit, which dominates the cost of this function.
  tsne_result = timeit('perform t-SNE dimension reduction on embeddings', reduce_dimensions, embeddings.shape[0], 'reviews')

  # Plot the first two t-SNE components with very small markers and no axes.
  x_coords = tsne_result[:, 0]
  y_coords = tsne_result[:, 1]
  plt.scatter(x_coords, y_coords, s=0.005)
  plt.title('Wine t-SNE')
  plt.axis('off')
  plt.show()

In [None]:
# Encode every review description with the default model (all-mpnet-base-v2).
# Note this runs on the full `df`, not on the sample `s` drawn above.
mpnet_embeddings = compute_embeddings(df.description.to_list())
mpnet_embeddings.shape

In [None]:
# Project the mpnet embeddings with t-SNE and plot them.
visualize_embeddings(mpnet_embeddings)

In [None]:
# Barrier cell: presumably placed here to stop "Run All" before the experimental cells below.
# `NotImplemented` is a comparison sentinel, not an exception, and calling it raises TypeError;
# NotImplementedError is the exception intended for this purpose.
raise NotImplementedError('Stop here: the cells below are meant to be run manually.')

In [None]:
from datasets import Dataset
# Wrap the sampled DataFrame `s` in a Hugging Face Dataset, keeping the pandas index
# (preserve_index=True); a later cell removes the resulting '__index_level_0__' column.
wine_dataset = Dataset.from_pandas(s, preserve_index=True)
wine_dataset

**Create HuggingFace Model**

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Fetch the tokenizer and encoder weights for the checkpoint from the Hugging Face Hub.
model_ckpt = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

# Report the available accelerators, then place the model on the GPU when there is one.
cuda_ready = torch.cuda.is_available()
if not cuda_ready:
    print("CUDA is not available. Using CPU.")
else:
    num_devices = torch.cuda.device_count()
    print(f"Number of CUDA devices: {num_devices}")
    for i in range(num_devices):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")

device = torch.device("cuda" if cuda_ready else "cpu")
print(f"Using device: {device}")
model.to(device)

**Helper Functions to create Text Embeddings using the HuggingFace Model**

In [None]:
# Pool token embeddings 

def mean_pooling(model_output, attention_mask):
  """Average token embeddings per sequence, weighting each token by its attention mask.

  Args:
    model_output: Model output whose first element holds the per-token embeddings.
    attention_mask: Mask tensor that is expanded over the embedding dimension and
      multiplied into the token embeddings, so zero entries contribute nothing.

  Returns:
    One pooled vector per sequence: the mask-weighted sum of the token
    embeddings divided by the summed mask (clamped to at least 1e-9 so an
    all-zero mask does not divide by zero).
  """
  token_states = model_output[0]
  weights = attention_mask.unsqueeze(-1).expand(token_states.size()).float()
  weighted_total = (token_states * weights).sum(dim=1)
  token_counts = weights.sum(dim=1).clamp(min=1e-9)
  return weighted_total / token_counts


# Function to get the embeddings from wine description input
def get_embeddings(text_input):
  """Embed ``text_input`` with the notebook-level ``tokenizer`` and ``model``.

  The text is tokenized (padded and truncated), moved to ``device``, run
  through the model without gradient tracking, mean-pooled over tokens using
  the attention mask, and L2-normalized along dimension 1.

  Args:
    text_input: Text passed straight to the tokenizer.

  Returns:
    Tensor of normalized embeddings, located on ``device``.
  """
  batch = tokenizer(text_input, padding=True, truncation=True, return_tensors='pt')
  # The inputs must live on the same device as the model.
  batch = {name: tensor.to(device) for name, tensor in batch.items()}
  with torch.no_grad():
    outputs = model(**batch)
  pooled = mean_pooling(outputs, batch['attention_mask'])
  return F.normalize(pooled, p=2, dim=1)

**Select a review to try out the Embedding Logic**

In [None]:
# Pick one review and show its raw and preprocessed text, each prefixed with
# (character count, number of space-separated words).
review = wine_dataset[2]
description = review['description']
tokens = review['preprocessed_description']
# The outer f-strings use double quotes: reusing the same quote character inside a
# replacement field is a SyntaxError on Python versions before 3.12.
display(Markdown(f"({len(description)}, {len(description.split(' '))}): " + description))
display(Markdown(f"({len(tokens)}, {len(tokens.split(' '))}): " + tokens))

**Try the Embedding Logic**

In [None]:
# Embed the raw and the preprocessed text and compare the resulting array shapes.
description_shape = get_embeddings(description).detach().cpu().numpy().shape
tokens_shape = get_embeddings(tokens).detach().cpu().numpy().shape
print(description_shape, tokens_shape)

**Apply Embeddings on Vector of Descriptions**

In [None]:
import time

def _embed_description(row):
    """Return the embedding of one row's description as a NumPy array under 'embeddings'."""
    return {'embeddings': get_embeddings(row['description']).detach().cpu().numpy()}

# Add an 'embeddings' column to the sampled dataset, one row at a time, and time it.
start = time.perf_counter()
embeddings_dataset = wine_dataset.map(_embed_description)
elapsed = time.perf_counter() - start
# The second sentence scales the measured time by the ratio of full-table rows to sampled rows.
display(Markdown(f'It took {elapsed/60:.0f} minutes to compute embeddings for {wine_dataset.num_rows:,d} samples.  It will take {df.shape[0]/s.shape[0]*elapsed/60:.0f} minutes to compute embeddings for {df.shape[0]:,d} reviews.'))
embeddings_dataset


In [None]:
# Display the dataset summary again (the previous cell ends with the same expression).
embeddings_dataset

In [None]:
# with_format("np") returns a new Dataset object whose rows and columns come back as
# NumPy arrays when indexed, which is why the next line can read `.shape` on an embedding.
embeddings_dataset = embeddings_dataset.with_format("np")
embeddings_dataset[1]['embeddings'].shape

In [None]:
# Reformat so the embeddings can be indexed with FAISS
def process_embeddings(example):
    """Squeeze the row's embedding and cast it to float32, in place.

    Args:
        example: Mapping with an 'embeddings' entry that NumPy can squeeze.

    Returns:
        The same ``example`` object, with 'embeddings' replaced by the squeezed
        float32 array.
    """
    squeezed = np.squeeze(example['embeddings'])  # drops all length-1 axes
    example['embeddings'] = squeezed.astype(np.float32)
    return example

# Apply the squeeze/float32 conversion to every row of the dataset.
embeddings_dataset = embeddings_dataset.map(process_embeddings)

# Inspect the shape of one embedding after the conversion.
embeddings_dataset[1]['embeddings'].shape

In [None]:
# Drop the pandas index column that Dataset.from_pandas(..., preserve_index=True) added.
# Because the result is assigned back to the same name, re-running this cell operates on a
# dataset that no longer has the column.
columns_to_remove = ['__index_level_0__']
embeddings_dataset = embeddings_dataset.remove_columns(columns_to_remove)
embeddings_dataset

In [None]:
# Persist the sampled dataset (with embeddings) to disk.
# The two commented lines below are earlier variants kept for reference.
# embeddings_dataset = embeddings_dataset.drop_index("embeddings")
# ds_path = "files/wine_embeddings.hf"
# NOTE: the full-dataset cell further down saves to this same "files/wine_ds.hf" path.
new_ds_path = "files/wine_ds.hf"
embeddings_dataset.save_to_disk(new_ds_path)

**Similarity Search with FAISS**

In [None]:
# Read the embeddings column once, then report one vector's shape and the number of vectors.
embedding_column = embeddings_dataset['embeddings']
print(embedding_column[0].shape)  # shape of an individual sentence embedding
print(len(embedding_column))  # number of sentence embeddings in the "embeddings" column

In [None]:
# Attach a FAISS index built over the 'embeddings' column so nearest-neighbour queries can run.
embeddings_dataset.add_faiss_index(column="embeddings")

In [None]:
# Use the embedding stored at position 132 as the query and fetch its 10 nearest rows.
query_position = 132
test_embedding = embeddings_dataset['embeddings'][query_position]

scores, samples = embeddings_dataset.get_nearest_examples("embeddings", test_embedding, k=10)

In [None]:
# Titles of the retrieved neighbours.
samples['title']

In [None]:
# Print each retrieved description on its own line for side-by-side reading.
for d in samples['description']:
    print(d)

In [None]:
# Scores returned by the FAISS query, one per retrieved neighbour.
scores

In [None]:
# Tabulate the retrieved rows with their scores, ordered from highest to lowest score.
# NOTE(review): if the index uses a distance metric, the highest score is the least similar
# row — confirm the metric before reading the top of this table as "best match".
samples_df = (
    pd.DataFrame({column: list(samples[column]) for column in samples})
    .assign(scores=scores)
    .sort_values('scores', ascending=False)
)
samples_df

In [None]:
# Write the FAISS index attached to the 'embeddings' column to its own file.
faiss_index_path = "files/wine_faiss_index.faiss" 
embeddings_dataset.save_faiss_index('embeddings', faiss_index_path)

In [None]:
# When reloading
from datasets import load_dataset, load_from_disk
# Paths written by the save cells above. `ds_path` previously pointed at
# "files/wine_embeddings.hf", which no cell writes, and the load relied on `new_ds_path`
# from an earlier cell; defining the real path here lets this cell run on a fresh kernel.
ds_path = "files/wine_ds.hf"
faiss_index_path = "files/wine_faiss_index.faiss"

# Reload the dataset and re-attach its FAISS index under the name 'embeddings'.
ds = load_from_disk(ds_path)
ds.load_faiss_index('embeddings', faiss_index_path)

**Compute Embeddings on Entire Dataset**

In [None]:
# Summarize the columns, dtypes, and non-null counts of the full review table.
df.info()

In [None]:
def _embed_description(row):
    """Return the embedding of one row's description as a NumPy array under 'embeddings'."""
    return {'embeddings': get_embeddings(row['description']).detach().cpu().numpy()}

# Build a Hugging Face Dataset from the complete review table.
wine_dataset = Dataset.from_pandas(df)

# Embed every description, one row at a time, and report the wall-clock time.
start = time.perf_counter()
embeddings_dataset = wine_dataset.map(_embed_description)
elapsed = time.perf_counter() - start
display(Markdown(f'It took ${elapsed/60:.0f}$ minutes to compute embeddings for ${df.shape[0]:,d}$ reviews.'))

# Return NumPy arrays on access, then squeeze/cast each embedding for the FAISS index.
embeddings_dataset = embeddings_dataset.with_format("np").map(process_embeddings)

# Save the dataset before an index is attached to it.
ds_path = "files/wine_ds.hf"
embeddings_dataset.save_to_disk(ds_path)

# Build the similarity index over the embeddings and store it in its own file.
faiss_index_path = "files/wine_faiss_index.faiss"
embeddings_dataset.add_faiss_index(column="embeddings")
embeddings_dataset.save_faiss_index('embeddings', faiss_index_path)


**Visualize the Results**

In [None]:
# Load the dataset (with embeddings) that was saved to disk earlier.
ds_path = "files/wine_ds.hf"
embeddings_dataset = load_from_disk(ds_path)
embeddings_dataset

In [None]:
# Read the embeddings column once, then report one vector's shape and the number of vectors.
embedding_column = embeddings_dataset['embeddings']
print(embedding_column[0].shape)  # shape of an individual sentence embedding
print(len(embedding_column))  # number of sentence embeddings in the "embeddings" column

In [None]:
# Switching to pandas 
# set_format changes the dataset in place, so every later access returns pandas objects
# until reset_format() is called.
embeddings_dataset.set_format("pandas")
# NOTE: this rebinds `df`, replacing the table loaded from the Parquet file at the top.
df = embeddings_dataset[:]
print(f"df shape with duplicates: {df.shape}")
# Keep the first occurrence of each distinct description.
df = df.drop_duplicates(subset='description', keep="first")
print(f"df shape without duplicates: {df.shape}")

**Similarity Search with FAISS**

In [None]:
# Attach a FAISS index built over the 'embeddings' column of the reloaded dataset.
embeddings_dataset.add_faiss_index(column="embeddings")

In [None]:
# Test query: use the embedding stored at position 132 and fetch its 6 nearest rows.
query_position = 132
test_embedding = embeddings_dataset['embeddings'][query_position]
scores, samples = embeddings_dataset.get_nearest_examples("embeddings", test_embedding, k=6)
samples['title']

In [None]:
# Remove the bulky / bookkeeping columns from the query result before displaying it.
# Each column is checked first: an unconditional pop raises KeyError when the cell is
# re-run, or when the dataset was built without a stored '__index_level_0__' column.
for column in ('embeddings', '__index_level_0__'):
    if column in samples:
        samples.pop(column)
samples.keys()

In [None]:
# Show what is left of the query result after the columns above were removed.
samples

In [None]:
# Switching to pandas 
# Same conversion as the earlier "Switching to pandas" cell: set_format changes the dataset
# in place, and `df` is rebound to the de-duplicated pandas view of it.
embeddings_dataset.set_format("pandas")
df = embeddings_dataset[:]
print(f"df shape with duplicates: {df.shape}")
# Keep the first occurrence of each distinct description.
df = df.drop_duplicates(subset='description', keep="first")
print(f"df shape without duplicates: {df.shape}")

In [None]:
# Keep only the reviews whose country is 'US'.
is_us = df['country'] == 'US'
subset_df = df.loc[is_us, :]
subset_df.shape

In [None]:
# Convert the pandas subset back into a Hugging Face Dataset.
# The stored index column is dropped first; errors='ignore' keeps the cell working when
# that column is not present (a plain drop would raise KeyError).
subset = Dataset.from_pandas(subset_df.drop(columns='__index_level_0__', errors='ignore'))

In [None]:
# Try a nearest-neighbour query on the new subset.
subset.reset_format()
# Return NumPy arrays on access so the embedding can be used directly as the query vector.
subset = subset.with_format("np")
subset.add_faiss_index(column="embeddings")
# Position 132 is hard-coded, so the subset must contain at least 133 rows.
test_embedding = subset['embeddings'][132]
print(test_embedding.shape)
scores, samples = subset.get_nearest_examples(
    "embeddings", test_embedding, k=6)
samples['title']

In [None]:
# Print each retrieved description on its own line.
for d in samples['description']:
    print(d)

In [None]:
# Undo the earlier set_format("pandas") so the dataset returns its default (Arrow-backed) output again.
embeddings_dataset.reset_format()

**Varieties**

In [None]:
# Split the unique varieties into at most N_CHUNKS lists of near-equal size for manual review.
#
# The previous version sliced with a step of len(varieties) // 8, which (a) is 0 for fewer
# than 8 varieties and makes range() raise ValueError, and (b) yields more than 8 chunks
# whenever the length is not a multiple of 8; its "remaining elements" branch could never
# add anything.
N_CHUNKS = 8
varieties = list(df.variety.unique())

# divmod gives the base chunk size and how many chunks need one extra element.
chunk_size, extra = divmod(len(varieties), N_CHUNKS)
chunks = []
offset = 0
for chunk_no in range(N_CHUNKS):
    size = chunk_size + (1 if chunk_no < extra else 0)
    if size:  # skip empty chunks when there are fewer varieties than N_CHUNKS
        chunks.append(varieties[offset:offset + size])
    offset += size

# Print one chunk per line.
for chunk in chunks:
    print(chunk)

In [None]:
# Varieties treated as white wines when the variety -> type lookup is built further down.
# NOTE(review): some names repeat within this list (e.g. 'Prié Blanc', 'Bical', 'Timorasso'),
# which has no effect once the list is turned into dict keys. 'Antão Vaz' is also listed
# under `red` and 'White Port' under `fortified`, so their final type depends on the order
# in which the per-type dicts are merged.
white = ['White Blend', 'Pinot Gris', 'Riesling', 'Chardonnay', 'Chenin Blanc', 'Sauvignon Blanc', 
         'Viognier-Chardonnay', 'Catarratto', 'Inzolia', 'Bordeaux-style White Blend', 'Grillo', 
         'Albariño', 'Petit Manseng', 'Vernaccia', 'Grüner Veltliner', 'Viognier', 'Vermentino', 
         'Grenache Blanc', 'Pinot Blanc', 'Alsace white blend', 'Portuguese White', 'Verdejo', 
         'Fumé Blanc', 'Pinot Bianco', 'Ugni Blanc-Colombard', 'Friulano', 'Assyrtico', 'Vignoles', 
         'Muscat', 'Muscadelle', 'Garganega', 'Pinot Grigio','Cortese', 'Melon', 'Vidal', 'Verdelho', 
         'Marsanne', 'Vilana', 'Viura', 'Verduzzo', 'Verdicchio', 'Colombard', 'Sylvaner', 'Sémillon', 
         'Antão Vaz', 'Verdejo-Viura', 'Chenin Blanc-Chardonnay', 'Insolia', 'Ribolla Gialla', 
         'Weissburgunder', 'Traminer', 'Prié Blanc', 'Müller-Thurgau', 'Pansa Blanca', 'Muskat Ottonel',
        'Sauvignon Blanc-Semillon', 'Semillon-Sauvignon Blanc', 'Bical', 'Viura-Chardonnay', 'Malvasia Bianca',
         'Rhône-style White Blend', 'Scheurebe', 'Kerner', 'Carricante', 'Fiano', 'Früburgunder', 'Roussanne', 
         'Avesso', 'Chinuri', 'Muscat Blanc à Petits Grains', 'Xarel-lo', 'Greco', 'Trebbiano', 'Prié Blanc',
        'Falanghina', 'Bical', 'Gelber Muskateller', 'Turbiana', 'Refosco', 'Alvarinho', 'Manzoni', 'Assyrtiko', 
        'Welschriesling', 'Rieslaner', 'Traminette', 'Marsanne-Viognier', 'Gewürztraminer-Riesling', 
        'Austrian white blend', 'Tocai', 'Chardonnay-Viognier', 'Fernão Pires', 'Seyval Blanc', 'Muscat Canelli', 
        'Arinto', 'Arneis', 'Malvasia', 'Altesse', 'Blanc du Bois', 'Provence white blend', 'Nosiola', 
        'Roussanne-Viognier', 'Godello', 'Auxerrois', 'Albana', 'Muskat',  'Grechetto', 'Encruzado', 
        'Garnacha Blanca', 'Pallagrello', 'Morava', 'Aleatico', 'Nascetta', 'Siria', 'Asprinio', 'Feteascǎ Regalǎ', 
        'Tocai Friulano', 'Schiava', 'Chardonnay-Semillon', 'Palomino', 'Norton', 
        'Loureiro-Arinto', 'Symphony', 'Edelzwicker', 'Madeira Blend', 'Gros and Petit Manseng', 'Jacquère', 
        'Chenin Blanc-Sauvignon Blanc', 'Marzemino', 'Chardonnay-Sauvignon Blanc', 'Trebbiano Spoletino',
        'Chasselas', 'Hárslevelü', 'Siegerrebe','Colombard-Sauvignon Blanc', 'Diamond',
        'Gros Manseng', 'Muskateller', 'Aligoté', 'Muscat Blanc', 'Viognier-Roussanne', 'Pallagrello Bianco', 
        'Veltliner', 'Chardonnay-Sauvignon', 'Chenin Blanc-Viognier', 'Vitovska', 'Grauburgunder', 'Macabeo', 
        'Verdil', 'Treixadura', 'Coda di Volpe', 'Viura-Verdejo', 'Bombino Bianco', 'Pinot-Chardonnay', 
        "Muscat d'Alexandrie", 'Chardonnay-Pinot Gris', 'Chardonnay-Pinot Blanc','Piquepoul Blanc', 'Orange Muscat',
        'Ugni Blanc', 'Semillon-Chardonnay', 'Irsai Oliver', 'Greco Bianco', 'Viognier-Grenache Blanc', 'Pignoletto', 
        'Muscatel', 'White Riesling', 'Hondarrabi Zuri', 'Nuragus', 'Xynisteri', 'Sauvignon Musqué', 'Roussanne-Marsanne', 
        'Incrocio Manzoni', 'Terrantez', 'Bual', 'Verdejo-Sauvignon Blanc', 'Malvasia-Viura', 'Savatiano', 
        'Macabeo-Chardonnay', 'Tamjanika', 'Macabeo-Moscatel', 'Códega do Larinho','Pinot Gris-Gewürztraminer',
         'Viosinho', 'Paralleda', 'Malvar', 'Airen', 'Erbaluce', 'Verdosilla', 'Aidani', 'Vinhão', 'Rolle', 'Orangetraube', 
         'Žilavka', 'Portuguiser', 'Gouveio', 'Bombino Nero', 'Malagouzia-Chardonnay', 'Elbling', 'Gragnano', 
         'Pinot Blanc-Chardonnay', 'Petit Meslier', 'Chardonnay Weissburgunder', 'Robola', 'Folle Blanche', 'Malagouzia', 
         'Rabigato', 'Sauvignonasse', 'Meseguera', 'Alvarinho-Chardonnay', 'Pinot Blanc-Viognier', 'Biancu Gentile', 
         'Xinisteri','Moschofilero-Chardonnay','Sauvignon Blanc-Sauvignon Gris', 'Trebbiano di Lugana', 'Verdeca', 
         'Chardonel', 'Silvaner-Traminer', 'Uvalino', 'Merseguera-Sauvignon Blanc', 'Cayuga', 
         'Nasco', 'Vital', 'Apple', 'Pinot Grigio-Sauvignon Blanc', 'Valvin Muscat', 'Malvasia Fina', 
         'Roditis-Moschofilero', 'Premsal', 'Jampal', 'Tokay Pinot Gris', 'Trajadura', 'Roscetto', 'Torontel', 
         'Viognier-Valdiguié',
         'Zierfandler', 'Marsanne-Roussanne', 'Pinot Meunier', 'Muskat Ottonel', 'Moscatel', 'Moschofilero', 'White Port', 
         'Kisi', 'Kangoun', 'Posip', 'Uva di Troia', 'Zierfandler-Rotgipfler', 'Mauzac', 'Pinot Auxerrois', 'Neuburger', 
         'Sämling', 'Rkatsiteli', 'Trousseau Gris', 'Malvasia Istriana', 'Morillon', 'Tokay', 'Gros Plant', 'Muscat Hamburg', 
         'Emir', 'Tsolikouri', 'Narince', 'Grecanico', 'Madeleine Angevine', 'Doña Blanca', 'Graševina', 'Thrapsathiri', 
         'Cococciola', 'Plyto', 'Azal', 'Moscatel Graúdo', 'Malvasia di Candia', 'Maria Gomes', 'Muscat of Alexandria', 
         'Moscatel de Alejandría', 'Misket', 'Tamianka', 'Morio Muskat', 'Sauvignonasse', 
         'Viognier-Marsanne', 'Ryzlink Rýnský', 'Muscadel', 'Roussanne-Grenache Blanc', 'Chancellor', 'Picapoll', 
         'Blauburger', 'Athiri', 'Ondenc','Gewürztraminer', 'Torrontés', 'Furmint', 'Savagnin', 'Glera', 
         'Roter Veltliner', 'Silvaner', 'Ruché', 'Pecorino', 'Sauvignon Gris', 'Vidal Blanc', 'Albanello', 
         'Loureiro', 'Clairette', 'Verduzzo Friulano ', "Loin de l'Oeil", 'Timorasso', 'Pigato', 'Viognier-Gewürztraminer', 
         'Sauvignon Blanc-Chenin Blanc', 'Colombard-Ugni Blanc', 'Mtsvane', 'Rivaner', 'Vespaiolo', 'Biancolella', 
         'Riesling-Chardonnay', 'Maria Gomes-Bical', 'Gelber Traminer', 'Sercial', 'Grenache Gris', 'Chardonnay-Albariño',
          'Roditis', 'Papaskarasi', 'Zibibbo', 'Malagousia', 'Rotgipfler', 'Durella', 'Cercial', 'Johannisberg Riesling', 
          'Teran', 'Mantonico', 'Timorasso', 'Zlahtina', 'Shiraz-Roussanne', 'Tămâioasă Românească', 'Ansonica', 'Feteasca',
        'Catalanesca', 'Moscato di Noto', 'Moscato Giallo','Sauvignon Blanc-Chardonnay', 'Sauvignon-Sémillon', "Cesanese d'Affile", 
        'Sauvignon Blanc-Verdejo', 'Chardonnay-Riesling', 'Sauvignon Blanc-Assyrtiko','Zelen', 'Tempranillo Blanco', 'Roter Traminer'
]
# Varieties treated as red wines when the variety -> type lookup is built further down.
# NOTE(review): 'Tintilia' is listed more than once, including a variant with a trailing
# space ('Tintilia '), and 'Tinta Francisca' is repeated. The adjacent entries
# 'Tannat-Cabernet', 'Sauvignon' may be a single name split in two — confirm against the data.
red = ['Portuguese Red', 'Pinot Noir', 'Tempranillo-Merlot', 'Frappato', 'Cabernet Sauvignon',
        'Nerello Mascalese', 'Malbec', 'Tempranillo Blend', 'Meritage', 'Red Blend', 'Merlot', 
        "Nero d'Avola", 'Gamay', 'Primitivo', 'Sangiovese', 'Cabernet Franc', 'Bordeaux-style Red Blend', 
        'Aglianico', 'Petite Sirah', 'Touriga Nacional', 'Carmenère', 'Rosso', 'Shiraz-Cabernet Sauvignon', 
        'Barbera', 'Rhône-style Red Blend', 'Graciano', 'Tannat-Cabernet', 'Sauvignon', 'Sangiovese Grosso', 
        'Bonarda', 'Shiraz', 'Montepulciano', 'Grenache', 'Syrah', 'Nebbiolo', 'Blaufränkisch', 'Carignan-Grenache', 
        'Sagrantino', 'Cabernet Sauvignon-Syrah', 'Tempranillo','Mencía', 'Zweigelt', 'Cannonau', 'Dolcetto', 
        'Garnacha Tintorera', 'Pinot Nero', 'Pinotage', 'Syrah-Grenache', 'Antão Vaz', 'Cabernet Sauvignon-Carmenère', 
        'Tinta Miúda', 'Monastrell', 'Merlot-Malbec', 'Cabernet Sauvignon-Merlot', 'Merlot-Argaman', 'Garnacha', 
        'Negroamaro', 'Mourvèdre', 'Syrah-Cabernet', 'Tannat', 'Cabernet Sauvignon-Sangiovese', 'Austrian Red Blend', 
        'Teroldego', 'Baga','Pinot Noir-Gamay', 'Cinsault', 'Corvina, Rondinella, Molinara', 'Tannat-Syrah', 'Charbono', 
        'Provence red blend', 'Claret','Malbec-Merlot', 'Monastrell-Syrah', 'Malbec-Tannat', 'Malbec-Cabernet Franc', 
        'Tinta de Toro', 'Cabernet Moravia', 'Chambourcin', 'Nero di Troia', 'Cesanese', 'Lagrein', 'Tinta Fina', 'St. Laurent', 
        'Cabernet Sauvignon-Shiraz', 'Syrah-Cabernet Sauvignon', 'Pugnitello', 'Touriga Nacional Blend', 'Tinta Roriz', 
        'Cabernet Franc-Cabernet Sauvignon', 'Grenache-Syrah', 'Tempranillo-Cabernet Sauvignon', 'Merlot-Cabernet Franc', 
        'Syrah-Petite Sirah', 'Cabernet Blend', 'Maturana', 'Magliocco', 'Gamay Noir', 'Spätburgunder', 'Plavac Mali',
        'Lemberger', 'Saperavi', 'Dornfelder', 'Ojaleshi', 'Mondeuse', 'Perricone', 'Syrah-Merlot', 'Cabernet Sauvignon-Malbec',
        'Tinto Fino', 'Malbec-Cabernet Sauvignon','Picpoul','Carignano', 'Cabernet Franc-Merlot', 
        'Syrah-Petit Verdot', 'Syrah-Mourvèdre', 'Shiraz-Grenache', 'Grenache-Carignan', 'Malbec-Syrah', 
        'Cabernet Sauvignon-Tempranillo', 'Carignan', 'Cabernet-Syrah', 'Merlot-Cabernet Sauvignon', 
        'Mourvèdre-Syrah', 'Negrette', 'Tinta Barroca', 'Merlot-Tannat','Castelão', 
         'Grenache Blend', 'Sangiovese Cabernet', 'Touriga Nacional-Cabernet Sauvignon', 'Cabernet Sauvignon-Cabernet Franc', 
         'Baco Noir', 'Tempranillo-Tannat', 'Touriga Franca', 'Barbera-Nebbiolo', 'Prieto Picudo', 'Gaglioppo', 'Carignane', 
         'Tannat-Merlot', 'Nerello Cappuccio', 'Counoise', 'Mazuelo', 'Tinta del Pais', 'Vranec', 'Mavrud', 'Cabernet', 
         'Grenache-Mourvèdre', 'Forcallà', 'Syrah-Tempranillo', 'Cabernet Sauvignon-Barbera', 'Merlot-Cabernet', 'Jaen', 
         'Tinta del Toro', 'Prunelard', 'Garnacha-Syrah', 'Rufete', 'Tempranillo-Shiraz','Mansois',
         'Mataro', 'Tinta Cao', 'Blauer Portugieser', 'Groppello', 'Poulsard', 'Grenache-Shiraz', 'Baga-Touriga Nacional', 
         'Carineña', 'Ciliegiolo', 'Cabernet Sauvignon-Merlot-Shiraz', 'Sciaccerellu', 'Alicante', 'Rosenmuskateller', 
         'Malbec-Cabernet', 'Touriga', 'Carmenère-Syrah', 'Mavroudi', 'Pinot Blanc-Pinot Noir', 'Tinto Velasco', 'Kadarka', 
         'Sangiovese-Syrah', 'Tannat-Cabernet Franc', 'Fer Servadou', 'Mission', 'Kekfrankos', 'Blauburgunder', 'Marquette', 
         'Romorantin', 'Braucol', 'Cabernet Franc-Malbec', 'Pallagrello Nero', 'Rebula', 'Vespolina', 'Shiraz-Malbec', 
         'Rebo', 'Tempranillo-Malbec', 'Trousseau', 'Bacchus', 'Syrah-Malbec', 'Syrah-Cabernet Franc', 'Cariñena-Garnacha', 
         'Sideritis','Rara Neagra', 'Molinara', 'Abouriou', 'Nielluciu', 'Malbec-Bonarda', 'Garnacha-Monastrell', 'Souzao', 
         'Tinta Francisca', 'Malvasia Nera', 'Listán Negro', 'Pinotage-Merlot', 'Jacquez', 'Carignan-Syrah', 'Mavrotragano', 
         'Bovale', 'Frankovka', 'Garnacha Blend', 'Merlot-Shiraz', 'Malbec Blend', 'Merlot-Syrah', 'Babić', 'Yapincak', 
         'Mandilaria', 'Saperavi-Merlot', 'Teroldego Rotaliano', 'Garnacha-Tempranillo','Vermentino Nero',
          'Albarossa', 'Cabernet Sauvignon Grenache', 'Black Monukka', 'Merlot-Grenache', 'Vranac', 'Tempranillo-Syrah', 
          'Boğazkere', 'Tinta Amarela', 'Tinta Negra Mole', 'Chelois', 'Shiraz-Tempranillo', 'Biancale', 'Syrah-Bonarda', 
          'Durif', 'Franconia', 'Malbec-Tempranillo', 'Monastrell-Petit Verdot', 'Sirica', 'Espadeiro', 'Blatina', 'Karalahna', 
          'Garnacha-Cabernet', 'Garnacha-Cariñena', 'Cabernet Franc-Lemberger', 'Shiraz-Mourvèdre', 'Mavrokalavryta', 'Favorita', 
          'Babosa Negro', 'Dafni', 'Petit Courbu', 'Kotsifali', 'Parraleta', 'Otskhanuri Sapere', 'Trollinger', 
          'Tsapournakos', 'Francisa', 'Kuntra', 'Pignolo', 'Schwartzriesling','Sousão', 'Feteasca Neagra', 'Kinali Yapincak',
          'Kalecik Karasi', 'Karasakiz', 'Raboso', 'Trepat', 'Freisa', 'Trincadeira', 'Melnik', 'Argaman', 'Piedirosso', 
          'Marawi', 'Çalkarası', 'Tinta Francisca', 'Vidadillo', 'Other', 'Cabernet Pfeffer', 'Roviello', 'Colorino', 
          'Tinta Madeira', 'Centesimino', 'Ramisco', 'Gamza', 'Bobal-Cabernet Sauvignon',
          'Petit Verdot', 'Zinfandel', 'G-S-M', 'Monica', 'Cabernet Merlot', 'Cabernet Franc-Carmenère', 
          'Grenache Noir', 'Xinomavro', 'Petite Verdot', 'Tempranillo-Garnacha', 'Carmenère-Cabernet Sauvignon', 
          'Sangiovese-Cabernet Sauvignon', 'Shiraz-Cabernet', 'Syrah-Grenache-Viognier', 'Cabernet-Shiraz', 'Syrah-Carignan', 
          'Cabernet-Malbec', 'Merlot-Petite Verdot', 'Duras', 'Aragonês', 'Agiorgitiko', 'Aragonez', 'Alfrocheiro', 'Corvina', 
          'Alicante Bouschet', 'Tinto del Pais', 'Bobal', 'Susumaniello', 'Grolleau', 'Canaiolo', 'Bastardo', 'Tintilia', 
          'St. Vincent', 'Caprettone','Black Muscat','Muscadine','Syrah-Viognier', 'Shiraz-Viognier', 'Carcajolu', 
          'Marselan', 'Malbec-Petit Verdot', 'Grignolino', 'Pinot Noir-Syrah', 'Malbec-Carménère','País', 'Alvarelhão', 
          'Okuzgozu', 'Tintilia','Mavrodaphne','Tintilia ', 
] 
# Varieties treated as rosé wines.
rose = ['Rosé', 'Rosato', 'Rosado','Portuguese Rosé', 'Prugnolo Gentile'] 
# Varieties treated as sparkling wines.
sparkling = ['Champagne Blend', 'Prosecco', 'Sparkling Blend','Portuguese Sparkling',
             'Cerceal', 'Lambrusco','Lambrusco di Sorbara','Lambrusco Grasparossa',
              'Torbato', 'Moscadello', 'Passerina', 'Brachetto', 'Ekigaïna', 'Picolit', 
              'Sacy', 'Moscatel Roxo', 'Debit','Moscato', 'Valdiguié', 'Casavecchia', 
              'Lambrusco Salamino', 'Moscato Rosa'] 
# Varieties treated as fortified wines ('White Port' also appears in the `white` list).
fortified = ['Sherry', 'Pedro Ximénez', 'White Port', 'Tokaji','Port']

In [None]:
# One lookup per wine type, mapping every variety in the list to the type label.
red_dict = dict.fromkeys(red, 'red')
white_dict = dict.fromkeys(white, 'white')
rose_dict = dict.fromkeys(rose, 'rose')
sparkling_dict = dict.fromkeys(sparkling, 'sparkling')
fortified_dict = dict.fromkeys(fortified, 'fortified')

# Merge in a fixed order; a variety listed under several types keeps the label of the
# last lookup that contains it.
wine_dict = {}
for type_lookup in (red_dict, white_dict, rose_dict, sparkling_dict, fortified_dict):
    wine_dict.update(type_lookup)

In [None]:
# Label each review with the wine type looked up from its variety; varieties missing from
# wine_dict become NaN.
df['type_faiss'] = df['variety'].map(wine_dict)

In [None]:
# List the varieties of the rows whose `type` value is missing.
# NOTE(review): this inspects the `type` column, not the `type_faiss` column created from
# wine_dict — confirm which one was intended.
print(list(df.loc[df.type.isnull(), 'variety'].unique()))

In [None]:
# Number of reviews per value of the `type` column.
df.type.value_counts()

In [None]:
# Number of reviews per value of the `type_faiss` column derived from wine_dict.
df.type_faiss.value_counts()

**Create TSNE Plot**

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Stack the per-row embeddings stored in the DataFrame into a single NumPy array.
embeddings = np.array(df['embeddings'].tolist())

# Fit t-SNE in two dimensions. fit_transform is one blocking call, so the progress bar
# only moves from 0 to the total once the fit has finished.
tsne = TSNE(n_components=2, random_state=42)
n_points = len(embeddings)
with tqdm(total=n_points, desc="Running t-SNE") as pbar:
    tsne_results = tsne.fit_transform(embeddings)
    pbar.update(n_points)

In [None]:
# Scatter the two t-SNE components; all points share the default colour.
x_coords = tsne_results[:, 0]
y_coords = tsne_results[:, 1]
plt.scatter(x_coords, y_coords, s=0.005)
plt.axis("off")
plt.title('Wine t-SNE')
plt.show()

**Using Sentence Transformers**

In [None]:
from sentence_transformers import SentenceTransformer

# Encode all descriptions with the all-MiniLM-L6-v2 SentenceTransformer and time the run.
# NOTE: this rebinds the global `model`, which get_embeddings() also reads.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
start = time.perf_counter()
embeddings = model.encode(df.description.to_list(), show_progress_bar=True)
elapsed = time.perf_counter() - start
display(Markdown(f'It took ${elapsed/60:.0f}$ minutes to compute embeddings for ${df.shape[0]:,d}$ reviews.'))
# Shape of a single embedding vector.
embeddings[0].shape

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, weighting each token by its attention mask.

    Args:
        model_output: Model output whose first element holds the per-token embeddings.
        attention_mask: Mask tensor that is expanded over the embedding dimension and
            multiplied into the token embeddings, so zero entries contribute nothing.

    Returns:
        One pooled vector per sequence: the mask-weighted sum of the token
        embeddings divided by the summed mask (clamped to at least 1e-9 so an
        all-zero mask does not divide by zero).
    """
    token_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(token_states.size()).float()
    weighted_total = (token_states * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / token_counts


# Sentences we want sentence embeddings for
sentences = df.description.to_list()

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Tokenizing and running the model on every review in one call needs memory proportional
# to the whole corpus, so the work is done in fixed-size batches. Time is accumulated per
# stage so the same four timing lines can still be reported.
# After the loop, `encoded_input` and `model_output` hold the last batch only.
batch_size = 64
tokenize_seconds = 0.0
forward_seconds = 0.0
pooling_seconds = 0.0
pooled_batches = []
for batch_start in range(0, len(sentences), batch_size):
    batch = sentences[batch_start:batch_start + batch_size]

    # Tokenize sentences (padding is applied within the batch)
    start = time.perf_counter()
    encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
    tokenize_seconds += time.perf_counter() - start

    # Compute token embeddings
    with torch.no_grad():
        start = time.perf_counter()
        model_output = model(**encoded_input)
        forward_seconds += time.perf_counter() - start

    # Perform pooling; the attention mask keeps padded positions out of the average
    start = time.perf_counter()
    pooled_batches.append(mean_pooling(model_output, encoded_input['attention_mask']))
    pooling_seconds += time.perf_counter() - start

display(Markdown(f'It took ${tokenize_seconds:.0f}$ seconds to tokenize for ${df.shape[0]:,d}$ reviews.'))
display(Markdown(f'It took ${forward_seconds/60:.0f}$ minutes to compute embeddings for ${df.shape[0]:,d}$ reviews.'))
display(Markdown(f'It took ${pooling_seconds/60:.0f}$ minutes to perform pooling of embeddings for ${df.shape[0]:,d}$ reviews.'))

# Stack the per-batch results back into one tensor with a row per sentence
sentence_embeddings = torch.cat(pooled_batches)

# Normalize embeddings
start = time.perf_counter()
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
elapsed = time.perf_counter() - start
display(Markdown(f'It took ${elapsed/60:.0f}$ minutes to normalize embeddings for ${df.shape[0]:,d}$ reviews.'))

print("Sentence embeddings:")
sentence_embeddings[0].shape

In [None]:
# Reduce the SentenceTransformer embeddings to two dimensions with t-SNE and time the fit.
tsne = TSNE(n_components=2, random_state=42)
start = time.perf_counter()
tsne_results = tsne.fit_transform(embeddings)
# Measure this fit; without this line the message reported the duration left over from the
# previous cell.
elapsed = time.perf_counter() - start
display(Markdown(f'It took ${elapsed/60:.0f}$ minutes to compute t-SNE dimension reductions for ${df.shape[0]:,d}$ reviews.'))
tsne_results[:10]

In [None]:
# Scatter the two t-SNE components; all points share the default colour.
x_coords = tsne_results[:, 0]
y_coords = tsne_results[:, 1]
plt.scatter(x_coords, y_coords, s=0.005)
plt.axis("off")
plt.title('Wine t-SNE')
plt.show()