# NLP Feature Extraction

### Load Subset of Clean Wine Reviews

See [data preparation](wine_review-data_preparation.ipynb) for details on the prepared dataset.

Libraries

In [41]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [11]:
# read the prepared reviews and keep only the column holding the cleaned token strings
tokens = pd.read_parquet('data/wine/wine_review-tokens.parquet.gzip')['tokens']
tokens.info()
tokens.head()

<class 'pandas.core.series.Series'>
Index: 100538 entries, 0 to 129970
Series name: tokens
Non-Null Count   Dtype 
--------------   ----- 
100538 non-null  object
dtypes: object(1)
memory usage: 1.5+ MB


0    aroma include tropical fruit broom brimstone d...
1    do ripe fruity wine smooth structure firm tann...
2    rainstorm tart snappy flavor lime flesh rind d...
3    pineapple rind lemon pith orange blossom start...
4    vintner like regular bottling come rough tanni...
Name: tokens, dtype: object

### Feature Extraction

In [12]:
# show the full token string of the first review
tokens.iloc[0]

'aroma include tropical fruit broom brimstone dry herb palate overly expressive offer unripened apple citrus dry sage alongside brisk acidity'

**Bag-of-words** counts how many times each word occurs in each document.

In [34]:
# fit the vocabulary on the reviews and count term occurrences per review
cvect = CountVectorizer(min_df=.003, max_df=.80)
dtm = cvect.fit_transform(tokens)

# densify the document-term matrix into a labelled frame:
# rows keep the review index, columns are the vocabulary terms
bow = pd.DataFrame(
  data=dtm.toarray(),
  index=tokens.index,
  columns=cvect.get_feature_names_out(),
)
bow.info()
bow.head()

<class 'pandas.core.frame.DataFrame'>
Index: 100538 entries, 0 to 129970
Columns: 942 entries, accent to zesty
dtypes: int64(942)
memory usage: 723.3 MB


Unnamed: 0,accent,accessible,accompany,acid,acidic,acidity,acre,add,addition,additional,...,wrap,year,yeast,yeasty,yellow,yield,young,youthful,zest,zesty
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**TF-IDF** weights each token by its importance within a document relative to the whole corpus.

In [33]:
# fit the vocabulary on the reviews and compute TF-IDF weights per review
# NOTE: this rebinds `cvect` and `dtm` used by the bag-of-words cell
cvect = TfidfVectorizer(min_df=.003, max_df=.80)
dtm = cvect.fit_transform(tokens)

# densify the document-term matrix into a labelled frame:
# rows keep the review index, columns are the vocabulary terms
tfidf = pd.DataFrame(
  data=dtm.toarray(),
  index=tokens.index,
  columns=cvect.get_feature_names_out(),
)
tfidf.info()
tfidf.head()

<class 'pandas.core.frame.DataFrame'>
Index: 100538 entries, 0 to 129970
Columns: 942 entries, accent to zesty
dtypes: float64(942)
memory usage: 723.3 MB


Unnamed: 0,accent,accessible,accompany,acid,acidic,acidity,acre,add,addition,additional,...,wrap,year,yeast,yeasty,yellow,yield,young,youthful,zest,zesty
0,0.0,0.0,0.0,0.0,0.0,0.136429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.154443,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.128762,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Visualize the Feature Matrices

**Helper to Time Various Operations**

In [38]:
import time
from collections import namedtuple

# one timing record: which algorithm ran and how long it took (seconds)
PERF = namedtuple('PERF', 'algo duration')

# timing records accumulated by the timing helper, in call order
timings = list()

def timeit(algo, purpose, func, count, items='documents'):
  """Run ``func()`` once, record its wall-clock duration, and report it.

  Parameters
  ----------
  algo : label stored with the timing record.
  purpose : phrase inserted into the report ("It took ... minutes to {purpose} ...").
  func : zero-argument callable to execute and time.
  count : integer number of items processed (formatted with thousands separators).
  items : noun for the processed items, used in the report.

  Returns
  -------
  Whatever ``func()`` returned.

  Side effects: appends ``PERF(algo, seconds)`` to the module-level ``timings``
  list and displays a Markdown line with the duration in minutes.
  """
  t0 = time.perf_counter()
  outcome = func()
  seconds = time.perf_counter() - t0

  timings.append(PERF(algo=algo, duration=seconds))

  message = f'It took ${seconds/60:.1f}$ minutes to {purpose} for ${count:,d}$ {items}.'
  display(Markdown(message))
  return outcome

**Helper to Compute the t-SNE Dimension Reduction to 2D**

In [39]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# from joblib import Parallel, delayed, parallel_config

def tsne(features, perplexity=10, n_components=50):
  """Reduce a feature matrix with PCA, then project it with t-SNE, timing the run.

  Parameters
  ----------
  features : 2D feature matrix exposing ``shape`` (one row per review).
  perplexity : forwarded to ``TSNE``.
  n_components : number of PCA components kept before t-SNE. Capped at the
      smaller dimension of ``features`` so small matrices do not make PCA fail.

  Returns
  -------
  The result of ``TSNE.fit_transform`` on the PCA-reduced features
  (one projected point per input row).

  Side effects: the run is timed through ``timeit`` under the label 'tsne',
  which records the duration and displays it.
  """
  # PCA cannot keep more components than min(n_samples, n_features)
  n_components = min(n_components, *features.shape)

  # PCA first to shrink the input, then t-SNE on the reduced matrix
  pca = PCA(n_components=n_components, random_state=42)
  # named `reducer` so the local does not shadow this function's own name
  reducer = TSNE(perplexity=perplexity, random_state=42, n_jobs=1)

  def project():
    return reducer.fit_transform(pca.fit_transform(features))

  return timeit('tsne', 'perform t-SNE dimension reduction on feature matrix', project, features.shape[0], 'reviews')


**Helper to Plot the t-SNE Projections**

In [None]:
# fixed colour per wine type, used as the seaborn palette when points are coloured by type
color_map = {
  'sparkling': 'forestgreen',
  'white': 'gold',
  'rose': 'deeppink',
  'red': 'darkred',
  'dessert': 'dodgerblue',
}

def plot_tsne(models, embeddings, tsne_projections, hue=None):
  """Draw one scatter panel per 2D projection, side by side in a single row.

  Parameters
  ----------
  models : sequence of labels; ``models[i]`` is used in the title of panel ``i``.
  embeddings : sequence of feature matrices; only ``embeddings[i].shape[1]``
      is read, to show the original dimensionality in the panel title.
  tsne_projections : iterable of 2-column projections. Each may be a pandas
      DataFrame or a plain array (e.g. the ndarray returned by
      ``TSNE.fit_transform``); the first column is x and the second is y.
  hue : optional grouping passed to seaborn. When given, points are coloured
      with ``color_map`` and a single shared legend is drawn below the panels.
  """
  projections = list(tsne_projections)
  # one column per projection; keep at least one so an empty input still renders
  n_cols = max(len(projections), 1)
  fig, axes = plt.subplots(1, n_cols, figsize=(5*n_cols, 5), squeeze=False)
  axes = axes.ravel()  # squeeze=False always gives a 2D grid; flatten for simple indexing

  for i, tsne_data in enumerate(projections):
    __ax = axes[i]
    if hasattr(tsne_data, 'iloc'):
      xs, ys = tsne_data.iloc[:, 0], tsne_data.iloc[:, 1]
    else:
      # plain arrays have no .iloc; slice columns positionally instead
      points = np.asarray(tsne_data)
      xs, ys = points[:, 0], points[:, 1]
    sns.scatterplot(x=xs, y=ys, s=0.35, ax=__ax, hue=hue, palette=None if hue is None else color_map)
    __ax.set_title(f'{models[i]}: {embeddings[i].shape[1]:,d} dimensions', fontsize='x-large')
    __ax.axis('off')
    if hue is not None:
      # hide the per-panel legend; one shared legend is added to the figure below
      panel_legend = __ax.get_legend()
      if panel_legend is not None:
        panel_legend.set_visible(False)

  # blank out any panel that received no data
  for unused in axes[len(projections):]:
    unused.axis('off')

  if hue is not None:
    # take handles from the first panel, which is populated whenever any projection was plotted
    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(handles, labels, title=None, loc='lower center', mode='expand', ncol=len(color_map), frameon=False, fancybox=False, markerscale=20, 
                  fontsize='x-large', bbox_to_anchor=(.3,-0.05,0.4,1), title_fontsize='x-large', handletextpad=.45)
  fig.suptitle('2D t-SNE Projections of Embeddings from Multiple Sentence Transformers on Same Corpus' + ('' if hue is None else ' with Color Indicating Type of Wine'), fontsize='xx-large')
  plt.tight_layout()
  plt.show()

**Perform Dimensionality Reduction to Visualize the Feature Matrices**

In [42]:
# project the bag-of-words matrix with the t-SNE helper, then check the result's dimensions
bow_tsne = tsne(features=bow)
bow_tsne.shape

It took $5.8$ minutes to perform t-SNE dimension reduction on feature matrix for $100,538$ reviews.

(100538, 2)

In [43]:
# project the TF-IDF matrix with the t-SNE helper, then check the result's dimensions
tfidf_tsne = tsne(features=tfidf)
tfidf_tsne.shape

It took $4.7$ minutes to perform t-SNE dimension reduction on feature matrix for $100,538$ reviews.

(100538, 2)

### Save the Feature Matrices

In [35]:
## helper function to save file
def save(fname, df):
  """Write ``df`` to ``data/wine/wine_review-{fname}.parquet.gzip`` using gzip compression.

  Parameters
  ----------
  fname : short name inserted into the output file name (e.g. 'bow').
  df : object exposing ``to_parquet`` (here, a pandas DataFrame).

  Returns
  -------
  Whatever ``df.to_parquet`` returns.
  """
  return df.to_parquet(f'data/wine/wine_review-{fname}.parquet.gzip', compression='gzip')

In [36]:
# persist the bag-of-words feature matrix as data/wine/wine_review-bow.parquet.gzip
save(fname='bow', df=bow)

In [37]:
# persist the TF-IDF feature matrix as data/wine/wine_review-tfidf.parquet.gzip
save(fname='tfidf', df=tfidf)

# Next
- [train initial model](wine_review-baseline_model.ipynb)