In [1]:
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

In [2]:
import sys
sys.path.append('../')

In [3]:
from shared.utils.re_utils import remove_parentheses, remove_apostrophes

from shared.utils.processing.embedding.text import TextGroupProcessing

## Loading Data

In [4]:
# Load the pre-processed products sample from the gzip-compressed CSV.
df = pd.read_csv(
    filepath_or_buffer='../shared/data/amz_products_small_pre_processed.csv.gz',
    compression='gzip',
)

## Text Exploration

Let's explore what the text fields (description, title & features) look like after processing.

### Description

Let's look at some descriptions and see how they can be converted into tokens.

In [5]:
# Raw description strings, one entry per row of the products frame.
description_values = df['description'].values

In [6]:
# Number of leading rows to scan. Slicing (instead of indexing a fixed
# range) keeps the cell from raising IndexError on datasets with fewer rows.
N_PREVIEW = 150

for idx, raw_description in enumerate(description_values[:N_PREVIEW]):
    cleaned = remove_apostrophes(remove_parentheses(raw_description))
    # '.,' presumably marks the boundary between consecutive description
    # chunks once quotes/parentheses are stripped -- TODO confirm.
    chunks = [chunk.strip() for chunk in cleaned.split('.,')]
    # Report rows whose description yields a non-empty second chunk.
    if len(chunks) > 1 and chunks[1]:
        print(idx)

17
30
35
51
53
55
59
74
75
77
86
95
122
131
142


In [7]:
# Pick one of the multi-chunk descriptions reported by the scan above
# (index 142) and display its raw string.
description_value = description_values[142]
description_value

'(\'The Walker Mega-Clamp combines state-of-the-art Walker engineering with quality manufacturing. Available in a wide range of styles, sizes, and materials, including stainless steel and aluminized steel. Walker has a Mega-Clamp to meet your heavy-duty needs.\', \'<a class="a-link-normal" target="_blank" rel="noopener" href="https://images-na.ssl-images-amazon.com/images/G/01/aplusautomation/vendorimages/6c516948-ce04-4753-87cc-5a028481ec69.jpg._CB311883954_.jpg">\\n            <img alt="DynoMax Performance Exhaust, DynoMax Performance Mufflers, DynoMax Super Turbo" src="https://images-na.ssl-images-amazon.com/images/G/01/aplusautomation/vendorimages/6c516948-ce04-4753-87cc-5a028481ec69.jpg._CB311883954__SL300__.jpg" class="a-spacing-mini">\\n        </a>\\n    \\n    \\n\\n\\n                            <br/>\\n                            \\n\\n\\n\\n\\n\\n\\n\\n\\n    \\n    \\n        \\n    \\n\\n\\n\\n\\n\\n    \\n        <div class="a-text-center">\\n            <a class="a-link

In [8]:
# Run the raw description through the shared text pipeline; judging by the
# output below, the result is a list of cleaned, lower-cased strings.
description_processor = TextGroupProcessing(description_value)
description_processed = description_processor.get_text_group_processed()
description_processed

['walker megaclamp combines stateoftheart walker engineering quality manufacturing available wide range styles sizes materials including stainless steel aluminized steel walker megaclamp meet heavyduty needs',
 'view larger view larger view larger dynomax super turbo performance mufflers exclusive patented flow director design channels exhaust flow eliminates turbulence large internal flow tubes improve exhaust flow reduce backpressure muffler uses fiberglass matting technology absorb unwanted interior resonance maintaining mellow performance tone',
 'pure unadulterated power dynomax super turbo mufflers backed limited lifetime warranty exclusive performance sound guarantee',
 'dynomax mufflers offer revolutionary highflow straightthrough stainless steel performance muffler controls irritating drone ensuring maximum performance',
 'provide dronefree performance dynomax valve technology mufflers feature exclusive precisely calibrated internal valve redirects exhaust flow cruising condit

### Title

In [9]:
# Raw product titles as a NumPy array; the last line displays it.
title_values = df['title'].values
title_values

array(['NSI - A Day Without Sunshine is Like, Well, Night! - Bumper Sticker',
       'Genuine GM Parts 10341533 Rear Bumper Valance Panel',
       ' JLM HID Conversion Kit H13 (9008) Dual Tube BI-XENON 6000K 4 SLIM Ballasts (Diamond White)',
       ..., 'MFE INTERNET SECURITY 1PC 2014',
       'YBS Nuance Dragon Naturally Speaking Premium 13.0 Upgrade from Premium 11 and 12 - Upgrade Only',
       '1YR KIS 2016 3DT COM LICS+LTD'], dtype=object)

In [10]:
# Use the second title as a worked example and display it.
title_value = title_values[1]
title_value

'Genuine GM Parts 10341533 Rear Bumper Valance Panel'

In [11]:
# Apply the same text pipeline to a single title. In the output below the
# text is lower-cased and the tokens "GM" and "10341533" are dropped.
TextGroupProcessing(title_value).get_text_group_processed()

['genuine parts rear bumper valance panel']

### Features

We can apply the same text processing used for the descriptions to the features.

In [12]:
# Raw feature strings, one entry per row of the products frame.
feature_values = df['feature'].values

In [13]:
# Peek at the raw feature strings (NumPy abbreviates the repr of long arrays).
feature_values

array(["('Official Licensed Die-Cut Sticker Designed by NSI', 'Premium Decal is Great for All Surfaces; Skateboards, Windows, Walls, Desks, Doors, Automobiles, Motorcycles, Bumpers, Computers, Laptops, Tablets, Phones, Guitars & Etc.', 'Extra Durable, Remarkable Adhesive Long Lasting Vinyl Die-Cut Screen Print Sticker, Can Withstand Any Indoor/Outdoor Weather, Water Protected, UV Coated', 'Unique Gift Idea Suitable for Fans & Others', 'Individual Protective Packaging.')",
       "('This is the official Genuine General Motors Parts replacement part for your vehicle.',)",
       "('Will run for approx 2500 hours', 'Produces 2 to 3 times as much light as a halogen lamp', 'German advanced technology')",
       ...,
       "('A quality product by MCAFEE, INC.', 'A quality product by MCAFEE, INC.', 'A quality product by MCAFEE, INC.', 'A quality product by MCAFEE, INC.', 'A quality product by MCAFEE, INC.')",
       "('Nuance Dragon Naturally Speaking Premium 13.0 Upgrade from Premium 11 and

In [14]:
# Use the first row's features as a worked example and display the raw string.
feature_value = feature_values[0]
feature_value

"('Official Licensed Die-Cut Sticker Designed by NSI', 'Premium Decal is Great for All Surfaces; Skateboards, Windows, Walls, Desks, Doors, Automobiles, Motorcycles, Bumpers, Computers, Laptops, Tablets, Phones, Guitars & Etc.', 'Extra Durable, Remarkable Adhesive Long Lasting Vinyl Die-Cut Screen Print Sticker, Can Withstand Any Indoor/Outdoor Weather, Water Protected, UV Coated', 'Unique Gift Idea Suitable for Fans & Others', 'Individual Protective Packaging.')"

In [15]:
# Same text pipeline as for descriptions, applied to the raw feature string;
# the resulting list of cleaned strings is displayed.
feature_processor = TextGroupProcessing(feature_value)
feature_processed = feature_processor.get_text_group_processed()
feature_processed

['official licensed diecut sticker designed nsi premium decal great surfaces skateboards windows walls desks doors automobiles motorcycles bumpers computers laptops tablets phones guitars etc',
 'extra durable remarkable adhesive long lasting vinyl diecut screen print sticker withstand indooroutdoor weather water protected coated unique gift idea suitable fans others individual protective packaging']

## Text Embeddings

To make effective use of these word tokens, we are going to use pre-trained models from HuggingFace to get the embeddings.

We will use these different models:
- [msmarco-distilbert-base-tas-b](https://huggingface.co/sentence-transformers/msmarco-distilbert-base-tas-b): for the **description**. It produces a 768-dimensional dense vector (about 200M in size).
- [paraphrase-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2): for the **title** & **features**. It produces a 384-dimensional dense vector (about 90M in size).

In [16]:
from shared.model.features.embedding.text import DescriptionEmbedding, FeatureEmbedding

How are we going to handle items that have multiple descriptions?

We are going to apply an **average pooling layer**.

In [17]:
# Re-display the raw description that is embedded in the next cell.
description_value

'(\'The Walker Mega-Clamp combines state-of-the-art Walker engineering with quality manufacturing. Available in a wide range of styles, sizes, and materials, including stainless steel and aluminized steel. Walker has a Mega-Clamp to meet your heavy-duty needs.\', \'<a class="a-link-normal" target="_blank" rel="noopener" href="https://images-na.ssl-images-amazon.com/images/G/01/aplusautomation/vendorimages/6c516948-ce04-4753-87cc-5a028481ec69.jpg._CB311883954_.jpg">\\n            <img alt="DynoMax Performance Exhaust, DynoMax Performance Mufflers, DynoMax Super Turbo" src="https://images-na.ssl-images-amazon.com/images/G/01/aplusautomation/vendorimages/6c516948-ce04-4753-87cc-5a028481ec69.jpg._CB311883954__SL300__.jpg" class="a-spacing-mini">\\n        </a>\\n    \\n    \\n\\n\\n                            <br/>\\n                            \\n\\n\\n\\n\\n\\n\\n\\n\\n    \\n    \\n        \\n    \\n\\n\\n\\n\\n\\n    \\n        <div class="a-text-center">\\n            <a class="a-link

In [18]:
# Build the description embedder and embed the whole raw description string.
# NOTE(review): per the notes above this presumably loads the
# msmarco-distilbert-base-tas-b model and average-pools the per-chunk
# vectors into a single 768-dim tensor -- confirm against the class.
description_embedding = DescriptionEmbedding()
texts_embedding = description_embedding.get_text_group_embedding(description_value)

In [22]:
# Show the embedding tensor computed for the description.
texts_embedding

tensor([-3.6675e-02, -9.6275e-02,  2.4945e-01,  9.3478e-02,  2.4204e-01,
         7.2724e-02, -2.4135e-01, -2.8296e-01, -2.5687e-01,  2.3430e-01,
         3.2905e-02,  2.7275e-01, -1.0877e-01, -1.5055e-01,  3.1442e-01,
         9.6190e-02,  2.1163e-01,  1.0992e-01, -3.4900e-01, -2.6917e-01,
         3.9599e-01, -6.8156e-01,  8.3151e-02,  3.3678e-01, -5.9381e-02,
         5.7098e-01,  2.7844e-01, -8.4458e-02, -1.1022e-01,  3.2266e-01,
         1.8216e-01,  1.5903e-01,  1.0519e-01, -1.1145e-01,  1.3594e-01,
         2.2130e-01,  1.8322e-01,  8.2998e-02,  1.9304e-01,  1.8475e-01,
        -3.6797e-01, -2.4001e-01, -1.4896e-01,  9.8006e-02, -1.2300e-01,
         7.9132e-02, -3.5379e-01,  2.6480e-01,  1.1052e-01,  1.2949e-01,
        -1.8417e-01,  1.7968e-02, -3.1764e-01,  2.9894e-01, -1.7790e-02,
         1.2218e-01,  1.5186e-01,  1.8122e-02,  8.3302e-02, -1.2054e-01,
        -3.8600e-01, -2.3516e-01, -1.2830e-01,  1.5319e-01, -1.4141e-01,
         7.2345e-02, -2.4167e-01, -1.0738e-02, -3.0

We can do the same with the feature values.

In [23]:
# Build the feature embedder and embed the raw feature string.
# NOTE(review): this rebinds `texts_embedding`, so the description embedding
# computed earlier is no longer reachable under that name.
feature_embedding = FeatureEmbedding()
texts_embedding = feature_embedding.get_text_group_embedding(feature_value)

In [24]:
# Show the embedding tensor computed for the features.
texts_embedding

tensor([ 7.7606e-02,  1.8783e-01, -6.4747e-02, -2.1794e-01,  1.1396e-01,
         1.4824e-01, -9.0422e-02,  9.8832e-02, -1.7791e-01, -1.0582e-01,
        -2.3069e-02, -2.2108e-01,  1.6722e-02, -1.2601e-01,  3.1168e-02,
        -7.1709e-02,  1.7809e-02, -1.1976e-01, -4.8943e-02,  2.1181e-01,
         2.7077e-01,  2.7647e-02,  8.1773e-02, -8.3486e-02, -4.7658e-01,
         2.3219e-01, -1.4663e-01,  4.2658e-03, -2.0219e-01, -1.0072e+00,
         1.4415e-01, -1.8735e-01,  2.9242e-01,  2.6916e-01,  4.0289e-01,
        -4.0014e-01, -1.9896e-01, -6.2011e-02, -2.1049e-01, -5.0244e-01,
         4.3351e-02,  2.4406e-01, -1.2962e-01,  3.8697e-01,  2.0313e-01,
         4.8679e-02, -2.0616e-01, -2.3819e-01,  3.2379e-02, -2.5363e-01,
         2.9413e-01, -1.9683e-02,  1.7526e-01, -1.5257e-01, -8.4521e-02,
         4.6180e-02,  3.0868e-03,  6.4979e-02,  1.3617e-01, -1.0620e-02,
        -2.8229e-01, -4.8800e-01, -5.9306e-01,  3.1397e-02,  1.2631e-02,
         2.7645e-01,  9.7091e-03,  1.7793e-01, -1.8