## Mean Pooling

In [None]:
import pandas as pd
import numpy as np

from IPython.display import Markdown, display

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

### Load Data

In [None]:
# Load the wine-review data from its Parquet file into a DataFrame.
df = pd.read_parquet('files/wine_review.parquet.gzip')
# Preview the first rows as the cell output.
df.head()

**Define the Corpus to Transform**

In [None]:
# The sentences to encode: one list entry per row of the `description` column.
sentences = df.description.to_list()
# Preview the first five entries as the cell output.
sentences[:5]

### Define Helper Functions

**Helper to Time Various Operations**

In [None]:
import time
from collections import namedtuple

# One timing record: the label of what was timed and its duration in seconds.
PERF = namedtuple('PERF', ['algo', 'duration'])

# Notebook-wide log of timing records, appended to by `timeit`.
timings = []

def timeit(algo, purpose, func, count, items='documents'):
  """Run `func()`, log how long it took, and display a one-line Markdown report.

  Args:
    algo: label stored in the `PERF` record appended to `timings`.
    purpose: phrase describing the work, inserted into the displayed message.
    func: zero-argument callable to execute and time.
    count: integer number of items processed (rendered with thousands separators).
    items: noun used for the items in the displayed message.

  Returns:
    Whatever `func()` returned.
  """
  began = time.perf_counter()
  outcome = func()
  elapsed = time.perf_counter() - began

  # the log keeps seconds; the displayed message reports minutes
  timings.append(PERF(algo=algo, duration=elapsed))
  report = Markdown(f'It took ${elapsed/60:.1f}$ minutes to {purpose} for ${count:,d}$ {items}.')
  display(report)
  return outcome

**Helper to Compute Embeddings for a Corpus Given the Name of a Pretrained Model**

In [None]:
from sentence_transformers import SentenceTransformer

def compute_embeddings(corpus, model_name='all-MiniLM-L6-v2'):
  """Encode `corpus` with a pretrained sentence-transformer and time the encoding.

  Args:
    corpus: sized collection of texts; it is handed to `model.encode` unchanged and
      its `len()` is reported in the timing message.
    model_name: name passed to `SentenceTransformer`; also the label of the timing record.

  Returns:
    Whatever `encode` returns for the corpus (requested with
    `normalize_embeddings=True`, a progress bar, and `device='cpu'`).
  """
  # load the pretrained model before the clock starts, so only encoding is timed
  encoder = SentenceTransformer(model_name)

  def encode():
    return encoder.encode(corpus, normalize_embeddings=True, show_progress_bar=True, device='cpu')

  return timeit(model_name, 'compute embeddings', encode, len(corpus), 'reviews')

**Helper to Compute the t-SNE Dimensionality Reduction of the Embeddings**

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# from joblib import Parallel, delayed, parallel_config

def tsne(embeddings, perplexity=10):
  """Reduce embeddings with PCA followed by t-SNE, timing the whole reduction.

  Args:
    embeddings: array-like with a `.shape`; `shape[0]` is reported as the number of reviews.
    perplexity: forwarded to `TSNE`.

  Returns:
    The output of `TSNE.fit_transform` applied to the 50-component PCA projection
    (t-SNE's own output dimensionality is left at the library default).
  """
  # both reducers use a fixed random_state of 42
  compressor = PCA(n_components=50, random_state=42)
  projector = TSNE(perplexity=perplexity, random_state=42, n_jobs=1)

  def reduce():
    # PCA first, then t-SNE on the PCA output
    return projector.fit_transform(compressor.fit_transform(embeddings))

  return timeit('tsne', 'perform t-SNE dimension reduction on embeddings', reduce, embeddings.shape[0], 'reviews')


**Helper to Plot the Result of the Dimensional Reduction**

In [None]:
# Fixed plotting color per wine category; used as the seaborn palette when a hue is supplied.
color_map = {
  'sparkling': 'forestgreen',
  'white': 'gold',
  'rose': 'deeppink',
  'red': 'darkred',
  'dessert': 'dodgerblue',
}

def visualize_embeddings(tsne_result, df=df, model_name='all-MiniLM-L6-v2', hue=None):
  """Scatter-plot a 2-D embedding projection, optionally colored by `hue`.

  Args:
    tsne_result: 2-D array indexed as `tsne_result[:, 0]` (x) and `tsne_result[:, 1]` (y).
    df: DataFrame whose `type` column determines the number of legend columns.
      The default is the notebook-level `df` as it existed when this function was
      defined (default arguments are evaluated once, at definition time).
    model_name: model label shown in the plot title.
    hue: optional per-point category values. When given, points are colored with
      `color_map` and a legend is drawn; when None, no palette or legend is used.
  """
  # color by category only when hue values are supplied
  palette = None if hue is None else color_map
  sns.scatterplot(x=tsne_result[:, 0], y=tsne_result[:, 1], s=0.35, hue=hue, palette=palette)
  plt.title(f'2D t-SNE Plot of {model_name} Embeddings')
  if hue is not None:
    # One legend column per distinct category, so the entries sit on a single row.
    # The row count of the frame (previously passed here) is not a column count;
    # max(1, ...) keeps the value valid if the column has no non-null values.
    legend_columns = max(1, df.type.nunique())
    plt.legend(title=None, loc='lower center', mode='expand', ncol=legend_columns, frameon=False, fancybox=False, markerscale=10,
               fontsize='small', bbox_to_anchor=(.0,-0.05,0.9,1), title_fontsize='medium', handletextpad=.45)
  plt.axis("off")
  plt.show()

### Without Mean Pooling

**`all-mpnet-base-v2` Model with 768 dimensions**

In [None]:
# Encode every review with the pretrained `all-mpnet-base-v2` sentence-transformer
# (compute_embeddings also records and displays how long the encoding took).
mnpet_embeddings = compute_embeddings(sentences, model_name='all-mpnet-base-v2')
# Show the shape of the result as the cell output.
mnpet_embeddings.shape

In [None]:
import warnings

# Ignore RuntimeWarnings while the reduction runs; the previous warning filters
# are restored when the `with` block exits.
with warnings.catch_warnings():
  warnings.filterwarnings(action='ignore', category=RuntimeWarning)
  mnpet_tsne = tsne(mnpet_embeddings)

# Show the shape of the reduced embeddings as the cell output.
mnpet_tsne.shape

In [None]:
# Plot the projection; `hue` is left at its default (None), so points are not colored by category.
visualize_embeddings(mnpet_tsne, model_name='all-mpnet-base-v2')

### With Mean Pooling

**Helper Function to Compute Embeddings**

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

def mean_pooling(model_output, attention_mask):
  """Average token embeddings along dimension 1, counting only unmasked positions.

  Args:
    model_output: indexable model result; element 0 is used as the token embeddings
      (assumed to be laid out as (batch, tokens, features) - TODO confirm for other models).
    attention_mask: tensor with one entry per token; it is expanded to the shape of
      the token embeddings and used as the averaging weights.

  Returns:
    Tensor of the mask-weighted sum over dimension 1 divided by the mask total.
  """
  token_embeddings = model_output[0]
  input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
  weighted_sum = torch.sum(token_embeddings * input_mask_expanded, 1)
  # clamp keeps the divisor non-zero when a row's mask is all zeros
  mask_total = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
  return weighted_sum / mask_total


def get_embeddings(corpus, model_name='all-MiniLM-L6-v2', tokenizer=None, model=None):
  """Embed `corpus` with a transformer model, mean pooling and L2 normalization.

  Args:
    corpus: the texts to embed, passed to the tokenizer as a single batch.
    model_name: identifier given to `AutoTokenizer` / `AutoModel.from_pretrained`
      when no tokenizer / model is supplied.
      NOTE(review): `from_pretrained` may need the full Hub id (for example with a
      `sentence-transformers/` prefix) rather than the bare name - confirm.
    tokenizer: optional ready-made tokenizer; loaded from `model_name` when None.
    model: optional ready-made model; loaded from `model_name` when None.
      The model is moved to the CPU in place.

  Returns:
    Tensor with one L2-normalized, mean-pooled embedding per input text.

  Note:
    The whole corpus is tokenized and run through the model in one forward pass,
    so memory use grows with the corpus size.
  """
  # Load tokenizer and model unless the caller supplied them
  tokenizer__ = AutoTokenizer.from_pretrained(model_name) if tokenizer is None else tokenizer
  model__ = AutoModel.from_pretrained(model_name) if model is None else model
  device = torch.device('cpu')
  model__.to(device)

  # Tokenize the corpus that was passed in (previously the notebook-level
  # `sentences` list was used here and the `corpus` argument was ignored)
  encoded_input = tokenizer__(corpus, padding=True, truncation=True, return_tensors='pt', return_attention_mask=True)

  # Get token embeddings without tracking gradients
  with torch.no_grad():
    model_output = model__(**encoded_input)

  # Apply mean pooling to get sentence embeddings
  sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

  # Normalize each embedding to unit L2 norm
  return F.normalize(sentence_embeddings, p=2, dim=1)


**`all-mpnet-base-v2` Model with 768 dimensions**

In [None]:
# Embed the corpus with the transformer model plus explicit mean pooling.
# NOTE(review): get_embeddings loads via AutoTokenizer/AutoModel.from_pretrained, which may
# need the full Hub id ('sentence-transformers/all-mpnet-base-v2') - confirm.
sentence_embeddings=get_embeddings(sentences, model_name='all-mpnet-base-v2')
# Show the shape of the result as the cell output.
sentence_embeddings.shape

In [None]:
# Reduce the mean-pooled embeddings with the same PCA -> t-SNE helper.
# NOTE(review): get_embeddings returns the result of F.normalize (a torch tensor);
# confirm the scikit-learn estimators accept it or convert to NumPy first.
mean_pooled_mnpet_tsne = tsne(sentence_embeddings)
# Show the shape of the reduced embeddings as the cell output.
mean_pooled_mnpet_tsne.shape

In [None]:
# Plot the projection of the mean-pooled embeddings (`hue` is not passed, so points are uncolored).
visualize_embeddings(mean_pooled_mnpet_tsne, model_name='all-mpnet-base-v2')