# Text clustering with hot terms and transfer notes

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import regex as re
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.express as px
import underthesea

In [None]:
# Read the transactions file and pull its "Description" column out as a
# plain Python list of transfer notes.
transactions_csv = pd.read_csv(filepath_or_buffer="transactions.csv")
notes_data = transactions_csv.loc[:, 'Description'].tolist()
notes_data

In [None]:
# Wrap the list of notes in a one-column DataFrame so it can be cleaned.
notes_csv = pd.DataFrame({"original_note": notes_data})
notes_csv

## Preprocess bank transfer notes

In [None]:
# Install the packages imported by the sentence-embedding cell below:
# sentence-transformers (upgraded to the newest release) and pyvi (quiet mode).
!pip install -U sentence-transformers
!pip install -q pyvi

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import regex as re
from spacy.lang.vi import Vietnamese
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.express as px


In [None]:
# Function to clean the notes
def preprocess_text(text: str) -> str:
    """Normalise one transfer note into lower-case, space-separated tokens.

    Every run of characters that is not a letter (digits, punctuation,
    symbols, whitespace) is replaced by a single space, the result is
    word-segmented with ``underthesea.word_tokenize`` and the tokens are
    joined back together with single spaces.

    Args:
        text: The raw note. Anything that is not a ``str`` (for example the
            NaN pandas produces for an empty CSV cell, or ``None``) is
            treated as an empty note.

    Returns:
        The cleaned, lower-cased note, or ``""`` when nothing is left.
    """
    # Missing descriptions are not strings; re.sub would raise TypeError on
    # them, so map them to the empty note that callers already filter out.
    if not isinstance(text, str):
        return ""

    # Remove special chars and numbers, but keep every Unicode letter and
    # combining mark so accented Vietnamese words are not cut into pieces
    # (an ASCII-only class would turn "chuyển" into "chuy n").
    # \p{..} is supported because `re` is the third-party `regex` module here.
    text = re.sub(r"[^\p{L}\p{M}]+", " ", text)

    # tokenize
    note_tokens = underthesea.word_tokenize(text)
    return " ".join(note_tokens).lower().strip()

In [None]:
# Clean every original note, then keep only the rows whose cleaned text
# is not the empty string.
notes_csv['cleaned_note'] = notes_csv['original_note'].apply(preprocess_text)
notes_csv = notes_csv.loc[notes_csv['cleaned_note'].ne('')]
notes_csv

In [None]:
# Collect the word tokens of every original note into one flat list.
tokens = []
for note in notes_csv['original_note'].tolist():
    # Remove special chars and numbers while keeping all Unicode letters and
    # combining marks, so accented Vietnamese words stay intact.
    # \p{..} works because `re` is the third-party `regex` module in this file.
    note = re.sub(r"[^\p{L}\p{M}]+", " ", note)
    tokens.extend(underthesea.word_tokenize(note))

In [None]:
# Count the occurrences of each token and turn the counts back into a
# two-column frame (token, count).
tokens_freq = pd.Series(tokens, name="Token").value_counts().reset_index()
tokens_freq

In [None]:
# Bar chart of the first 20 rows of tokens_freq (token vs. count).
plt.figure(figsize=(10, 5))

# creating the bar plot
plt.title('Top 20 các từ thông dụng trong note chuyển khoản cho MTTQ 1/9/2024 - 2/9/2024', fontsize=16, pad=20)
sns.barplot(x=tokens_freq['Token'].head(20), y=tokens_freq['count'].head(20), palette='Blues_d')
plt.xlabel("Token")
plt.ylabel("count")
plt.xticks(rotation=45, ha='right')
# tight_layout sizes the axes from the artists that exist when it is called,
# so it has to run after the tick labels are rotated to leave room for them.
plt.tight_layout()
plt.show()

In [None]:
# Tokenize every cleaned note and gather the words into one flat list.
words = []
# The column created by the cleaning cell is 'cleaned_note' (singular), and
# the tokenizer is only available through the imported `underthesea` module.
for note in notes_csv['cleaned_note'].tolist():
    words.extend(underthesea.word_tokenize(note))

In [None]:
# One-column frame holding every word collected above.
words_csv = pd.DataFrame({"word": words})
words_csv

In [None]:
# Frequency table of the words, ordered from most to least frequent.
word_freq = (
    words_csv.loc[:, 'word']
    .value_counts()
    .sort_values(ascending=False)
    .reset_index()
)
word_freq

## Convert to embeddings

In [None]:
# Reload the saved token-frequency table; the first CSV column is the index.
notes_csv = pd.read_csv("/content/drive/MyDrive/Coding projects/Analyze MTTQ bank transfer/tokens_frequency.csv",index_col=0)
# dropna() returns a new frame instead of modifying in place, so the result
# must be assigned back for the rows with missing values to be removed.
# The index is renumbered afterwards so that label and positional lookups
# still agree for code that indexes the 'Token' column by integer.
notes_csv = notes_csv.dropna().reset_index(drop=True)
notes_csv

In [None]:
# TF-IDF representation of every token string (sub-linear term-frequency
# scaling), converted from the sparse result to a dense array.
vectorizer = TfidfVectorizer(sublinear_tf=True)
X_tfidf = vectorizer.fit_transform(notes_csv['Token'].astype(str))
X_tfidf = X_tfidf.toarray()

In [None]:
# Sentence embeddings from the dangvantuan/vietnamese-embedding model
from sentence_transformers import SentenceTransformer
from pyvi.ViTokenizer import tokenize

model = SentenceTransformer('dangvantuan/vietnamese-embedding')

# NOTE(review): the model's published usage example word-segments each text
# with pyvi's `tokenize` before encoding; `tokenize` was imported here but
# never applied, so it is applied now. Confirm against the model card.
# A plain list is passed so the encoder never indexes a pandas Series.
segmented_tokens = [tokenize(text) for text in notes_csv['Token'].astype('str')]
X_dangvantuan = model.encode(segmented_tokens)


## Visualize with PCA

In [None]:
def dimension_reduction(text, embedding, n_components):
    """Project an embedding matrix onto its leading principal components.

    Args:
        text: Labels, one per row of ``embedding``; stored in the ``word``
            column of the result.
        embedding: Matrix handed to ``PCA.fit_transform``.
        n_components: Number of principal components to keep.

    Returns:
        A DataFrame with a ``word`` column followed by one column per
        component, named ``x0``, ``x1``, ... in component order.
    """
    reducer = PCA(n_components=n_components, random_state=42)
    projected = reducer.fit_transform(embedding)

    # Build all columns up front: the labels first, then one per component.
    columns = {'word': text}
    columns.update({f'x{k}': projected[:, k] for k in range(n_components)})
    return pd.DataFrame(columns)

In [None]:
def plot_pca_3d(result_df, method):
    """Show an interactive 3D scatter of the first three PCA columns.

    Args:
        result_df: Frame with ``word``, ``x0``, ``x1`` and ``x2`` columns.
        method: Name of the embedding method, inserted into the plot title.
    """
    def padded_range(column):
        # Axis limits: the column's extent widened by one unit on each side.
        values = result_df[column]
        return [values.min() - 1, values.max() + 1]

    figure = px.scatter_3d(
        result_df,
        x='x0',
        y='x1',
        z='x2',
        hover_name='word',
        range_x=padded_range('x0'),
        range_y=padded_range('x1'),
        range_z=padded_range('x2'),
        title=f"Relationship between words using {method}",
    )

    # Hover shows only the word, in bold.
    figure.update_traces(hovertemplate='<b>%{hovertext}</b>')
    figure.show()

In [None]:
# Reduce the TF-IDF vectors to three principal components, labelled by token.
tfidf_res_df = dimension_reduction(
    text=notes_csv['Token'].astype('str').tolist(),
    embedding=X_tfidf,
    n_components=3,
)
tfidf_res_df

In [None]:
# 3D scatter of the PCA-reduced TF-IDF vectors.
plot_pca_3d(result_df=tfidf_res_df, method="TFIDF")

In [None]:
# Reduce the sentence-transformer embeddings to three principal components.
vn_result_df = dimension_reduction(
    text=notes_csv['Token'].astype('str').tolist(),
    embedding=X_dangvantuan,
    n_components=3,
)

In [None]:
# 3D scatter of the PCA-reduced sentence-transformer embeddings.
plot_pca_3d(result_df=vn_result_df, method="Dang Van Tuan Vietnamese Embeddings")

## Find k clusters

In [None]:
# K-means with three clusters and a fixed seed, fitted on the TF-IDF matrix.
kmeans = KMeans(n_clusters=3, random_state=42)

# fit() returns the estimator itself, so the labels can be read right away.
clusters = kmeans.fit(X_tfidf).labels_

# Name of the dataframe column that will receive the cluster labels.
clusters_result_name = 'cluster_tfidf'


In [None]:
# Store the K-means labels as a new column of the PCA result frame, under
# the column name held in clusters_result_name.
tfidf_res_df[clusters_result_name] = clusters


In [None]:
# 3D scatter of the PCA-reduced TF-IDF vectors, coloured by K-means cluster.
# The title previously read "TFDIF and KNN"; the features are TF-IDF and the
# labels in 'cluster_tfidf' come from KMeans, not k-nearest-neighbours.
fig = px.scatter_3d(tfidf_res_df, x='x0', y='x1', z='x2', hover_name='word',
              range_x = [tfidf_res_df.x0.min()-1, tfidf_res_df.x0.max()+1],
              range_y = [tfidf_res_df.x1.min()-1, tfidf_res_df.x1.max()+1],
              range_z = [tfidf_res_df.x2.min()-1, tfidf_res_df.x2.max()+1],
                        title="Relationship between words using TFIDF and KMeans",
                    color='cluster_tfidf')

# Hover shows only the word, in bold.
fig.update_traces(hovertemplate= '<b>%{hovertext}</b>')
fig.show()