In [None]:
!pip install -U sentence-transformers
import nltk
nltk.download('punkt')

In [None]:
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from torch.utils.data import DataLoader

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import seaborn as sns
import altair as alt

In [None]:
# Load the product catalogue. Malformed rows are reported with a warning and
# skipped instead of aborting the load. `on_bad_lines='warn'` is the supported
# replacement for `error_bad_lines=False` (deprecated in pandas 1.3, removed in 2.0).
products = pd.read_csv("/content/Produktkatalog.csv", on_bad_lines='warn')

In [None]:
# Flag each product with 'Y' when its keyword list contains the literal
# substring 'LED' (case-sensitive), otherwise 'N'. Missing keyword lists are
# treated as 'N' rather than raising TypeError, which the per-row
# `'LED' in x` test did for NaN cells.
has_led = products['KEYWORD_LIST'].str.contains('LED', regex=False, na=False)
products['KEYWORD'] = has_led.map({True: 'Y', False: 'N'})

In [None]:
# Preview the first rows of the catalogue, including the new KEYWORD column.
products.head()

In [None]:
# Show the (rows, columns) size of the loaded catalogue.
products.shape

In [None]:
# Define a list with sentences (1k - 100k sentences).
# The column is materialised as a plain Python list of strings. Missing
# descriptions become '' (instead of NaN, which is a float and cannot be
# tokenised or encoded), and no row is dropped, so position i in this list
# still corresponds to row i of `products`.
train_sentences = products['DESCRIPTTION_LONG'].fillna('').astype(str).tolist()

In [None]:
# Wrap the training sentences in the denoising dataset, which produces the
# noisy version of each text when an item is requested.
train_dataset = datasets.DenoisingAutoEncoderDataset(sentences=train_sentences)

In [None]:
# Serve the dataset in shuffled mini-batches of 32 examples.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)

In [None]:
# Assemble the sentence encoder from two modules: the transformer loaded from
# the 'bert-base-german-cased' checkpoint, followed by CLS pooling.
model_name = 'bert-base-german-cased'
word_embedding_model = models.Transformer(model_name)
# The pooling layer must be sized to the transformer's token-embedding width.
embedding_dimension = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dimension, pooling_mode='cls')
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
)

In [None]:
# Denoising auto-encoder objective: the decoder is initialised from the same
# checkpoint as the encoder and its weights are tied to the encoder's.
train_loss = losses.DenoisingAutoEncoderLoss(
    model,
    decoder_name_or_path=model_name,
    tie_encoder_decoder=True,
)

In [None]:
# Train for one epoch on the single (dataloader, loss) objective, using a
# constant learning rate of 3e-5 and no weight decay.
tsdae_objective = (train_dataloader, train_loss)
model.fit(
    train_objectives=[tsdae_objective],
    epochs=1,
    weight_decay=0,
    scheduler='constantlr',
    optimizer_params={'lr': 3e-5},
    show_progress_bar=True,
)

In [None]:
# Display the model object (its repr) to inspect the configured modules.
model

In [None]:
# Persist the trained model under a path relative to the working directory.
model.save('output/productcat-tsdae-model')

In [None]:
# Encode a single description (the entry at index 42) and display its embedding.
model.encode(train_sentences[42])

In [None]:
# Encode every training sentence; the result is reused by the PCA and t-SNE
# cells further down. Presumably one embedding vector per sentence — TODO confirm.
embeddings = model.encode(train_sentences)

In [None]:
# Display the computed embeddings.
embeddings

In [None]:
# Project the embeddings onto their first two principal components.
# `random_state` is fixed because scikit-learn may select a randomized SVD
# solver for this input, which would otherwise make the projection vary
# between runs.
pca = PCA(n_components=2, random_state=42)
pca.fit(embeddings)
embeddings_pcs = pca.transform(embeddings)

In [None]:
# Display the 2-component PCA projection.
embeddings_pcs

In [None]:
# Wrap the PCA projection in a DataFrame; no column names are given, so the
# two components are addressed as columns 0 and 1.
embeddings_pcs_df = pd.DataFrame(embeddings_pcs)

In [None]:
# Attach both PCA coordinates to the catalogue as columns PC1 and PC2.
# NOTE(review): the assignment matches rows by index label — this assumes
# `products` still carries its default 0..n-1 index; confirm.
products['PC1'] = embeddings_pcs_df.loc[:, 0]
products['PC2'] = embeddings_pcs_df.loc[:, 1]

In [None]:
# Compute a 2-D t-SNE layout of the embeddings (PCA initialisation,
# perplexity 30, all CPU cores). t-SNE is stochastic, so `random_state` is
# fixed to make the layout reproducible across runs.
embeddings_tsne = TSNE(
    n_components=2,
    perplexity=30,
    init='pca',
    n_jobs=-1,
    random_state=42,
).fit_transform(embeddings)

In [None]:
# Display the 2-D t-SNE coordinates.
embeddings_tsne

In [None]:
# Wrap the t-SNE coordinates in a DataFrame; no column names are given, so the
# two dimensions are addressed as columns 0 and 1.
embeddings_tsne_df = pd.DataFrame(embeddings_tsne)

In [None]:
# Attach both t-SNE coordinates to the catalogue as columns TSNE1 and TSNE2.
# NOTE(review): the assignment matches rows by index label — this assumes
# `products` still carries its default 0..n-1 index; confirm.
products['TSNE1'] = embeddings_tsne_df.loc[:, 0]
products['TSNE2'] = embeddings_tsne_df.loc[:, 1]

In [None]:
# Preview the catalogue again to check the newly added projection columns.
products.head()

In [None]:
# Interactive scatter plot of the PCA projection, coloured by the KEYWORD flag,
# with the long description shown as a tooltip.
# Plot at most 5000 rows: the sample size is capped at the frame length so
# catalogues smaller than 5000 rows no longer raise ValueError, and the seed
# is fixed so the same points are drawn on every run.
plot_sample = products.sample(n=min(5000, len(products)), random_state=42)
alt.Chart(plot_sample).mark_circle(size=60).encode(
    x='PC1',
    y='PC2',
    color='KEYWORD',
    tooltip=['DESCRIPTTION_LONG']
).configure_mark(
    opacity=0.3,
    color='red'
).interactive()