In [None]:
!pip install transformers
!pip install transformers[sentencepiece]
!pip install -U sentence-transformers
!pip install datasets
!pip install faiss-cpu faiss-gpu

In [None]:
pip install pandas numpy streamlit transformers transformers[sentencepiece] -U sentence-transformers datasets faiss-cpu faiss-gpu

In [9]:
import transformers
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from datasets import load_dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Read in data

In [4]:
folder_path = Path('/content/drive/MyDrive/wine_recommender/')
raw_data_path = folder_path / 'winemag-data-130k-v2.csv'

In [5]:
df = pd.read_csv(raw_data_path)

In [6]:
df.isnull().sum()

Unnamed: 0                   0
country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

In [7]:
df.description[2]

'Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented.'

In [13]:
os.listdir("/content/drive/MyDrive/wine_recommender/")

['winemag-data-130k-v2.csv', 'wine_embeddings.csv', 'wine_embeddings.hf']

In [139]:
# Load dataset with embeddings if already created
from datasets import load_dataset, load_from_disk
ds_path = "/content/drive/MyDrive/wine_recommender/wine_embeddings.hf"
new_ds_path = "/content/drive/MyDrive/wine_recommender/wine_ds.hf"
embeddings_dataset = load_from_disk(new_ds_path)
embeddings_dataset

csv_path = "/content/drive/MyDrive/wine_recommender/wine_embeddings.csv"
ds = load_dataset('csv', data_files=csv_path)
ds['train']

#ds_path = "/content/drive/MyDrive/wine_recommender/wine_embeddings.hf"
#df = pd.read_csv(csv_path)
# ds['train'].save_to_disk(ds_path)



  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['Unnamed: 0', 'country', 'description', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery', 'embeddings'],
    num_rows: 129971
})

In [140]:
# Remove unecessary columns 
columns_to_remove = ['Unnamed: 0', 'region_2', 'taster_name', 'taster_twitter_handle']
embeddings_dataset = embeddings_dataset.remove_columns(columns_to_remove)

## Create embeddings with HF

In [72]:
from datasets import load_dataset
csv_path = '/content/drive/MyDrive/wine_recommender/winemag-data-130k-v2.csv'
wine_dataset = load_dataset("csv", data_files=csv_path)
wine_dataset = wine_dataset['train']
wine_dataset



  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['Unnamed: 0', 'country', 'description', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery'],
    num_rows: 129971
})

In [73]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Load model from HuggingFace Hub
model_ckpt = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

# Use GPU to speed up embedding process
device = torch.device("cuda")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [74]:
# Pool token embeddings 

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Function to get the embeddings from wine description input 
def get_embeddings(text_input):

    # Tokenize sentences
    encoded_input = tokenizer(text_input, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    text_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    text_embeddings = F.normalize(text_embeddings, p=2, dim=1)

    return text_embeddings 


In [75]:
test_input = df.description[0]
test_input

"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."

In [76]:
get_embeddings(test_input).detach().cpu().numpy().shape

(1, 768)

In [77]:
# Now create new embeddings column for entire dataset
embeddings_dataset = wine_dataset.map(
    lambda x: {'embeddings': get_embeddings(x['description'])
    .detach().cpu().numpy()})

Map:   0%|          | 0/129971 [00:00<?, ? examples/s]

In [100]:
embeddings_dataset

Dataset({
    features: ['Unnamed: 0', 'country', 'description', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery', 'embeddings'],
    num_rows: 129971
})

In [91]:
embeddings_dataset = embeddings_dataset.with_format("np")

In [101]:
embeddings_dataset[1]['embeddings'].shape

(1, 768)

In [106]:
# Reformat so can work with FAISS
def process_embeddings(example):
    example['embeddings'] = np.squeeze(example['embeddings']).astype(np.float32)
    return example

embeddings_dataset = embeddings_dataset.map(process_embeddings)

embeddings_dataset[1]['embeddings'].shape

Map:   0%|          | 0/129971 [00:00<?, ? examples/s]

(768,)

In [138]:
columns_to_remove = ['Unnamed: 0', 'region_2', 'taster_name', 'taster_twitter_handle']
embeddings_dataset = embeddings_dataset.remove_columns(columns_to_remove)
embeddings_dataset

AttributeError: ignored

In [137]:
embeddings_dataset = embeddings_dataset.drop_index("embeddings")
ds_path = "/content/drive/MyDrive/wine_recommender/wine_embeddings.hf"
new_ds_path = "/content/drive/MyDrive/wine_recommender/wine_ds.hf"
embeddings_dataset.save_to_disk(new_ds_path)

AttributeError: ignored

## Similarity search with FAISS

In [143]:
print(embeddings_dataset['embeddings'][0].shape)  # shape of an individual sentence embedding
print(len(embeddings_dataset['embeddings']))  # number of sentence embeddings in the "embeddings" column

(768,)
129971


In [144]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/130 [00:00<?, ?it/s]

Dataset({
    features: ['country', 'description', 'designation', 'points', 'price', 'province', 'region_1', 'title', 'variety', 'winery', 'embeddings'],
    num_rows: 129971
})

In [145]:

test_embedding = embeddings_dataset['embeddings'][132]

scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", test_embedding, k=10)

In [146]:
samples['title']

array(['Delheim 2001 Grand Reserve Cabernet Sauvignon (Simonsberg-Stellenbosch)',
       'Château Margaux 2007 Pavillon Rouge de Château Margaux  (Margaux)',
       'Harrison 1997 Reserve Cabernet Sauvignon (Napa Valley)',
       'Rosedale Wines 2009 Cat Amongst the Pigeons Cat Walk Cabernet Sauvignon (Barossa)',
       'Rosedale Wines 2009 Cat Amongst the Pigeons Cat Walk Cabernet Sauvignon (Barossa)',
       'Doubleback 2013 Cabernet Sauvignon (Walla Walla Valley (WA))',
       'Paschal 2000 Quartet Red Wine Red (Applegate Valley)',
       'Township 7 2013 Reserve 7 Red (Okanagan Valley)',
       'Soos Creek 2009 Ciel du Cheval Vineyard Red (Red Mountain)',
       'Hightower 2010 Red Mountain Reserve Red (Red Mountain)'],
      dtype='<U81')

In [147]:
for d in samples['description']:
    print(d)

A Cabernet-dominated (98%) wine, which boasts a dense, concentrated tannic structure. Flavors of cassis underly this, while the wood spices contribute their ribbon of flavor. This is an impressive wine, the flagship red from this estate founded by Spatz Sperling over 30 years ago.
A wine that has richness, with acidity, layers of spice and black currant fruit flavors. The dominant cassis gives elegance, and already a forward balance of fruit and wood.
Simply a spectacular wine; a Napa prototype for fabulous Cabernet. Cassis, tobacco and cedar commingle on the nose, followed by luscious cassis fruit that's backed up by exemplary barrel notes—both in flavor and in texture. So smooth and deft, it's the epitome of class. Give it about five years in the cellar and then indulge.
A fruit-driven Cabernet, with cheerful, bright cherry-berry flavors. Tannins are firm but ripe, accenting the cassis-tinged close with a slightly dusty texture.
A fruit-driven Cabernet, with cheerful, bright cherry-b

In [148]:
scores

array([0.        , 0.39104724, 0.4409023 , 0.44146395, 0.44146395,
       0.4596101 , 0.48264754, 0.48468435, 0.48696655, 0.48761922],
      dtype=float32)

In [149]:
faiss_index_path = "/content/drive/MyDrive/wine_recommender/wine_faiss_index.faiss" 
embeddings_dataset.save_faiss_index('embeddings', faiss_index_path)

In [None]:
# When reloading
from datasets import load_dataset, load_from_disk
ds_path = "/content/drive/MyDrive/wine_recommender/wine_embeddings.hf"
faiss_index_path = "/content/drive/MyDrive/wine_recommender/wine_faiss_index.faiss" 

ds = load_from_disk(ds_path)
ds.load_faiss_index('embeddings', faiss_index_path)

In [114]:
#samples_df = pd.DataFrame.from_dict(samples)
#samples_df["scores"] = scores
#samples_df.sort_values("scores", ascending=False, inplace=True)

ValueError: ignored