In [1]:
%env TOKENIZERS_PARALLELISM=true
! pip install sentence-transformers

env: TOKENIZERS_PARALLELISM=true
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=61f5231c470cb9a73bc8008a23f842d92e25ef9c2395e92b89d3b184dc3f1e16
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2


In [2]:
import pandas as pd
# Fetch the stop-word gist; it is read with no header row, so the words end up
# in the single positional column 0.
stop_words_df = pd.read_csv(
    'https://gist.githubusercontent.com/sebleier/554280/raw/7e0e4a1ce04c2bb7bd41089c9821dbcf6d0c786c/NLTK\'s%2520list%2520of%2520english%2520stopwords',
    header=None,
)
# Plain Python list of the words in column 0.
stopwords = stop_words_df[0].tolist()

In [3]:
import pandas as pd
# Load the raw review export. The file is decoded as ISO-8859-1; the preview
# shows sequences such as 'ï¿½' in the text columns, which the cleaning step
# further down replaces with spaces.
df = pd.read_csv(
    '/kaggle/input/shoppersentiments/TeePublic_review.csv',
    encoding='ISO-8859-1',
)
df.head()

Unnamed: 0,reviewer_id,store_location,latitude,longitude,date,month,year,title,review,review-label
0,0.0,US,37.09024,-95.712891,2023,6,2015 00:00:00,Great help with lost order,I had an order that was lost in transit. When ...,5
1,1.0,US,37.09024,-95.712891,2023,6,2024 00:00:00,I ordered the wrong size tee and hadï¿½ï¿½ï¿½,I ordered the wrong size tee and had difficult...,5
2,2.0,US,37.09024,-95.712891,2023,6,2017 00:00:00,These guys offer the best customerï¿½ï¿½ï¿½,These guys offer the best customer service in ...,5
3,3.0,US,37.09024,-95.712891,2023,6,2024 00:00:00,Good Stuff,Looked for an obscure phrase on a shirt. Teepu...,5
4,4.0,CA,56.130366,-106.346771,2023,6,2023 00:00:00,My order arrived in a good timelyï¿½ï¿½ï¿½,My order arrived in a good timely fashion & th...,4


In [4]:
# Single characters that are treated as noise and turned into spaces.
chars = ['ï', '¿', '½', 'ý', '_', '~', '=']


def clean(arg: str) -> str:
    """Replace every character listed in ``chars`` with a space and collapse whitespace.

    Args:
        arg: the raw text to clean.

    Returns:
        The text with noise characters blanked out, runs of whitespace reduced
        to a single space, and leading/trailing whitespace removed.
    """
    # Built on each call so the function always reflects the current `chars`.
    blank_out = str.maketrans({symbol: ' ' for symbol in chars})
    return ' '.join(arg.translate(blank_out).split())

# Keep only the review text and its label, drop duplicate and incomplete rows,
# then draw a fixed-seed sample of 5000 rows to keep the later steps fast.
review_df = (
    df[['review', 'review-label']]
    .copy()
    .drop_duplicates(ignore_index=True)
    .dropna()
    .sample(n=5000, random_state=2023)
)
# Cleaned review texts, in the same row order as review_df.
sentences = [clean(text) for text in review_df['review'].values.tolist()]

review_df.shape

(5000, 2)

In [5]:
from collections import Counter
# ASCII digits and letters are expected; list every other character that still
# occurs in the cleaned sentences (in order of first appearance).
character_keep = set('1234567890qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM')
character_count = Counter(' '.join(sentences))
characters = [symbol for symbol in character_count if symbol not in character_keep]
print(characters)

[' ', '.', "'", '%', '!', ',', '-', '"', '?', '/', '$', '+', '&', ';', '(', ')', ':', '<', '*', '[', ']', '@', '#']


In [6]:
from plotly.express import pie
# Share of reviews per rating. The column names are set explicitly because the
# names produced by value_counts().reset_index() differ between pandas
# versions; only newer releases yield a 'count' column by default.
pie(
    data_frame=review_df['review-label'].value_counts().rename_axis('review-label').reset_index(name='count'),
    names='review-label',
    values='count',
)

Three quarters of our reviews are positive.

In [7]:
from plotly.express import histogram
# Number of whitespace-separated tokens in each (uncleaned) review text.
review_df['token count'] = [len(text.split()) for text in review_df['review']]
# Log-scaled y axis so the sparse long-review bins stay visible.
histogram(review_df, x='token count', log_y=True)

In [8]:
from arrow import now
from sentence_transformers import SentenceTransformer
from numpy.linalg import norm

embedding_start = now()
model_name = 'distilbert-base-nli-mean-tokens'
model = SentenceTransformer(model_name_or_path=model_name)
# Raise the truncation limit so longer reviews are not cut off as early.
model.max_seq_length = 256
print('{}: max sequence length: {}'.format(now().time(), model.max_seq_length))
print('{}: loaded model: {}'.format(now().time(), model_name))

# Let encode() L2-normalise the embeddings itself. The result is a torch tensor
# (convert_to_tensor=True); normalising it with numpy.linalg.norm needs a
# conversion to a host array, which cannot work for a tensor on a CUDA device.
embedding = model.encode(
    sentences=sentences,
    batch_size=64,
    show_progress_bar=True,
    convert_to_tensor=True,
    normalize_embeddings=True,
)

print('{}: got all embeddings'.format(now().time()))
print('embedding time: {}'.format(now() - embedding_start))

.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

18:49:14.740582: max sequence length: 256
18:49:14.740864: loaded model: distilbert-base-nli-mean-tokens


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

18:51:50.640636: got all embeddings
embedding time: 0:02:47.670224


In [9]:
from arrow import now
from sentence_transformers.util import community_detection
from sklearn.cluster import AgglomerativeClustering

cluster_start = now()
clustering_choices = ['agglomerative', 'community']
clustering = clustering_choices[1]

# Either way `clusters` ends up as a list of clusters, each one a list of row
# indices into `embedding`, largest cluster first.
if clustering == clustering_choices[0]:
    clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
    clustering_model.fit(X=embedding)
    # labels_ holds one cluster id per row; regroup it into per-cluster member
    # lists so this branch yields the same structure as community_detection.
    members_by_label = {}
    for row, label in enumerate(clustering_model.labels_):
        members_by_label.setdefault(int(label), []).append(row)
    clusters = sorted(members_by_label.values(), key=len, reverse=True)
else:
    clusters = community_detection(embeddings=embedding, min_community_size=25, threshold=0.75)

print('cluster time: {}'.format(now() - cluster_start))

cluster time: 0:00:13.913327


In [10]:
from pandas import DataFrame
from plotly.express import scatter
from umap import UMAP
umap_start = now()
# Initialisation strategy for UMAP; change the index to try another option.
init = ['pca', 'random', 'spectral'][0]
umap_model = UMAP(n_components=2, random_state=2023, verbose=1, init=init, n_jobs=1)
# Project the embeddings to two dimensions for plotting.
projection = umap_model.fit_transform(X=embedding)
umap_df = DataFrame(data=projection, columns=['u0', 'u1'])
umap_df['text'] = sentences
umap_df['rating'] = review_df['review-label'].values.tolist()
# Cluster ids are 1-based in the order the clusters are listed; any row that
# appears in no cluster keeps the placeholder id 0.
cluster = [0] * len(review_df)
for cluster_id, members in enumerate(clusters, start=1):
    for row in members:
        cluster[row] = cluster_id
umap_df['cluster'] = cluster
print('UMAP time: {}'.format(now() - umap_start))

UMAP(init='pca', n_jobs=1, random_state=2023, verbose=1)
Mon Dec 11 18:52:32 2023 Construct fuzzy simplicial set
Mon Dec 11 18:52:32 2023 Finding Nearest Neighbors
Mon Dec 11 18:52:32 2023 Building RP forest with 9 trees
Mon Dec 11 18:52:37 2023 NN descent for 12 iterations
	 1  /  12
	 2  /  12
	 3  /  12
	 4  /  12
	 5  /  12
	Stopping threshold met -- exiting after 5 iterations
Mon Dec 11 18:52:54 2023 Finished Nearest Neighbor Search
Mon Dec 11 18:52:57 2023 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Mon Dec 11 18:53:07 2023 Finished embedding
UMAP time: 0:00:35.317247


In [11]:
from plotly.express import histogram
# Rows per cluster id (0 = not assigned to any cluster), on a log-scaled y axis.
histogram(umap_df, x='cluster', log_y=True)

In [12]:
# UMAP projection restricted to rows that were assigned to a cluster (id > 0).
scatter(
    data_frame=umap_df.loc[umap_df['cluster'].gt(0)],
    x='u0',
    y='u1',
    color='cluster',
    hover_name='text',
    height=900,
    marginal_x='histogram',
    marginal_y='histogram',
    title='Cluster > 0',
)

In [13]:
# UMAP projection of every sampled review, coloured by its rating.
scatter(
    data_frame=umap_df,
    x='u0',
    y='u1',
    color='rating',
    hover_name='text',
    height=900,
    marginal_x='histogram',
    marginal_y='histogram',
    title='Rating cluster',
)

It looks like the model can separate the 5-star reviews from the others.

In [14]:
# Same projection with the rating-5 rows removed, to inspect the remaining ratings.
scatter(
    data_frame=umap_df.loc[umap_df['rating'].ne(5)],
    x='u0',
    y='u1',
    color='rating',
    hover_name='text',
    height=900,
    marginal_x='histogram',
    marginal_y='histogram',
    title='Rating < 5',
)

In [15]:
from plotly.express import scatter
from plotly.express.colors import qualitative

from pandas import DataFrame
def plot_all_terms(input_df: DataFrame, terms: list):
    """Show the UMAP scatter coloured by which of ``terms`` each text contains.

    Matching is a case-insensitive substring test against the ``text`` column.
    Matched terms are joined with '/' in the order they appear in ``terms``;
    rows with no resulting label are shown as 'none'.

    Args:
        input_df: frame with 'u0', 'u1', 'text' and 'rating' columns; it is not modified.
        terms: the search terms to look for in each text.

    Returns:
        None. The figure is displayed via ``.show()``.
    """
    def label_for(text: str) -> str:
        haystack = text.lower()
        joined = '/'.join([term for term in terms if term.lower() in haystack])
        return joined if joined != '' else 'none'

    # assign() returns a new frame, so the caller's frame is left untouched.
    work_df = input_df.assign(found=input_df['text'].map(label_for))
    figure = scatter(
        data_frame=work_df,
        x='u0',
        y='u1',
        color='found',
        hover_name='text',
        hover_data=['rating'],
        color_discrete_sequence=qualitative.Alphabet,
    )
    figure.show()

# Highlight the reviews whose text mentions 'shirt'.
plot_all_terms(input_df=umap_df, terms=['shirt'])