In [1]:
"""
This is a simple application for sentence embeddings: clustering
Sentences are mapped to sentence embeddings and then k-mean clustering is applied.
"""
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import process_text, torch
import py_vncorenlp, os, json, pandas as pd
import constants.constants as default
model_sbert = SentenceTransformer(default.model_sbert_path)

In [2]:


# Choose device
try:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.environ["CUDA_VISIBLE_DEVICES"]='2, 3'
    torch.cuda.empty_cache()
except:
    device = 'cpu'


print('Your device:', device)

print('Load sbert model successfully')
model_vncore = py_vncorenlp.VnCoreNLP( save_dir=default.model_vncorenlp_path,
                                       annotators=["wseg", "pos", "ner", "parse"],
                                       max_heap_size=default.max_heap_size)

os.chdir(default.root_path)
print('Load vncorenlp model successfully')


Your device: cpu
Load sbert model successfully
2023-07-06 22:55:12 INFO  WordSegmenter:24 - Loading Word Segmentation model
Load vncorenlp model successfully
2023-07-06 22:55:12 INFO  PosTagger:23 - Loading POS Tagging model
2023-07-06 22:55:14 INFO  NerRecognizer:34 - Loading NER model
2023-07-06 22:55:24 INFO  DependencyParser:32 - Loading Dependency Parsing model


In [3]:
### process text support
#LOAD TEENCODE
file = open(default.teencodes_path, 'r', encoding="utf8")
teen_lst = file.read().split('\n')
teen_dict = {}
for line in teen_lst:
    key, value = line.split('\t')
    teen_dict[key] = str(value)
file.close()

#LOAD STOPWORDS
# file = open(default.stopwords_path, 'r', encoding="utf8")
# stopwords_lst = file.read().split('\n')
# file.close()

file = open(default.map_languages, "r")
map_langs = json.loads(file.read())
file.close()


In [4]:
df = pd.read_csv('data.csv')
df_content = df['content']
pt = process_text.ProcessText()
df_content.apply(pt.clean_text)
st = process_text.SentenceTransformer_Process(sentences="", device=device, model=model_sbert)
#df_content.apply(st.sentence_transformers_embedding_text)


0      [tensor(-0.0498), tensor(-0.0800), tensor(0.06...
1      [tensor(-0.0521), tensor(0.0733), tensor(0.017...
2      [tensor(-0.0524), tensor(0.0163), tensor(0.015...
3      [tensor(-0.0296), tensor(0.0070), tensor(0.014...
4      [tensor(0.0475), tensor(0.0299), tensor(0.0253...
                             ...                        
469    [tensor(-0.0292), tensor(0.0262), tensor(-0.03...
470    [tensor(-0.0156), tensor(0.0122), tensor(0.032...
471    [tensor(0.0289), tensor(0.0366), tensor(0.0512...
472    [tensor(-0.0121), tensor(-0.0132), tensor(-0.0...
473    [tensor(-0.0098), tensor(0.0235), tensor(0.013...
Name: content, Length: 474, dtype: object

In [5]:
tensors = df_content.apply(st.sentence_transformers_embedding_text)

In [6]:

embbs = [t.numpy() for t in tensors]
# print(embbs)
# Perform kmean clustering
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embbs)
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(embbs[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

  super()._check_params_vs_input(X, default_n_init=10)


Cluster  1
[array([ 3.15566920e-02,  3.41106318e-02,  1.39279282e-02,  2.02600434e-02,
        4.50557135e-02,  2.92418748e-02, -4.11881171e-02,  3.27831395e-02,
       -1.36541770e-04,  6.87017618e-03,  3.96547653e-02, -2.04004422e-02,
        6.77511096e-02,  5.12621785e-03,  1.20294373e-02, -1.81520116e-02,
       -6.88785268e-03,  2.93687992e-02, -3.51219848e-02,  6.85001072e-03,
       -3.27257812e-02,  2.03253981e-02,  1.92463154e-03, -2.92990617e-02,
        6.02046102e-02,  1.49303414e-02,  4.12535444e-02, -2.36262730e-03,
       -6.54964000e-02, -7.77504826e-03,  3.04483585e-02, -3.13376896e-02,
        3.03399041e-02, -7.35771433e-02,  1.17736906e-01,  2.34746523e-02,
        1.42373424e-02,  7.99368918e-02,  1.21297976e-02,  2.76363641e-02,
       -3.38197649e-02,  2.37934645e-02,  2.82578748e-02,  7.31434673e-02,
       -2.03651004e-02,  3.27300206e-02,  7.09373504e-02, -3.35285394e-03,
       -3.87891605e-02,  1.51474671e-02, -2.85384115e-02, -8.87934677e-03,
        4.528

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

