### Work with ML data

In [1]:
import plotly.express as px
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.manifold import TSNE

In [2]:
# –°–±—Ä–æ—Å –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–∏–π –Ω–∞ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –≤—ã–≤–æ–¥–∏–º—ã—Ö —Ä—è–¥–æ–≤
pd.set_option('display.max_rows', None)
 
# –°–±—Ä–æ—Å –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–∏–π –Ω–∞ —á–∏—Å–ª–æ —Å—Ç–æ–ª–±—Ü–æ–≤
pd.set_option('display.max_columns', None)
 
# –°–±—Ä–æ—Å –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–∏–π –Ω–∞ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å–∏–º–≤–æ–ª–æ–≤ –≤ –∑–∞–ø–∏—Å–∏
pd.set_option('display.max_colwidth', None)

### Step 1: Data Preprocessing

In [3]:
df = pd.read_csv('ml_data.csv')  
text_messages = df['text']  

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

stop_words = set(stopwords.words('russian'))

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words and word not in string.punctuation]
    # tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() and word not in string.punctuation]
    return ' '.join(tokens)

df['cleaned_text'] = df['text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mchomak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mchomak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mchomak\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Mchomak\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


### Step 2: Feature Extraction

In [4]:
stop_words=list(stop_words)

tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words=stop_words) 
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

### Step 3: Train DBSCAN model


In [5]:
for eps in range(1, 110, 5):  # Adjust the range as needed
    eps=eps/100
    dbscan = DBSCAN(eps=eps, min_samples=5)
    cluster_labels = dbscan.fit_predict(tfidf_matrix)
    if len(np.unique(cluster_labels)) > 1:  # silhouette_score requires at least 2 clusters
        silhouette_avg = silhouette_score(tfidf_matrix, cluster_labels)
        print(f"Epsilon: {eps}, Silhouette Score: {silhouette_avg}")
    else:
        print(f"Epsilon: {eps}, Silhouette Score: N/A (Single Cluster)")

Epsilon: 0.01, Silhouette Score: -0.16521308480305497
Epsilon: 0.06, Silhouette Score: -0.16521308480305497
Epsilon: 0.11, Silhouette Score: -0.16521308480305497
Epsilon: 0.16, Silhouette Score: -0.16521308480305497
Epsilon: 0.21, Silhouette Score: -0.16521308480305497
Epsilon: 0.26, Silhouette Score: -0.16521308480305497
Epsilon: 0.31, Silhouette Score: -0.16521308480305497
Epsilon: 0.36, Silhouette Score: -0.16521308480305497
Epsilon: 0.41, Silhouette Score: -0.16521308480305497
Epsilon: 0.46, Silhouette Score: -0.16521308480305497
Epsilon: 0.51, Silhouette Score: -0.16521308480305497
Epsilon: 0.56, Silhouette Score: -0.16521308480305497
Epsilon: 0.61, Silhouette Score: -0.16521308480305497
Epsilon: 0.66, Silhouette Score: -0.16521308480305497
Epsilon: 0.71, Silhouette Score: -0.16521308480305497
Epsilon: 0.76, Silhouette Score: -0.15562996742108248
Epsilon: 0.81, Silhouette Score: -0.1621166126850712
Epsilon: 0.86, Silhouette Score: -0.16343128592699774
Epsilon: 0.91, Silhouette Sco

In [6]:
eps = 0.97
min_samples = 5 

dbscan = DBSCAN(eps=eps, min_samples=min_samples)
df['cluster'] = dbscan.fit_predict(tfidf_matrix)

### Step 4: Dimensionality Reduction with t-SNE

In [7]:
tsne = TSNE(n_components=2, random_state=42)
reduced_features_tsne = tsne.fit_transform(tfidf_matrix.toarray())

### Step 5: visualization with plotly

In [8]:
df['tsne1'] = reduced_features_tsne[:, 0]
df['tsne2'] = reduced_features_tsne[:, 1]

# Create a scatter plot with hover text
fig = px.scatter(df, x='tsne1', y='tsne2', color='cluster',
                 title='Text Clustering Visualization (DBSCAN + t-SNE)',
                 labels={'tsne1': 't-SNE Component 1', 'tsne2': 't-SNE Component 2'},
                 template='plotly', color_continuous_scale='viridis',
                 hover_name=df['text'], hover_data=['cluster'],
                width=1000, height=1000)

# Customize the hover text
fig.update_traces(textposition='top center')

# Show the interactive plot
fig.show()

In [10]:
import plotly.graph_objects as go

# Assuming 'data' DataFrame with 'cluster' column

# Create a scatter plot with hover text and legend using go.Figure
fig = go.Figure()

for cluster in df['cluster'].unique():
    if cluster!=-1:
        cluster_data = df[df['cluster'] == cluster]
        fig.add_trace(go.Scatter(
            x=cluster_data['tsne1'],
            y=cluster_data['tsne2'],
            mode='markers',
            marker=dict(color=cluster, size=10, line=dict(color='black', width=2)),  # Customize marker size and line properties
            text=cluster_data['text'],
            name=f'Cluster {cluster}'
        ))

# Customize layout
fig.update_layout(
    # title='Text Clustering Visualization (DBSCAN + t-SNE)',
    xaxis_title='t-SNE Component 1',
    yaxis_title='t-SNE Component 2',
    legend=dict(title='Clusters', orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0.01),
    hoverlabel=dict(bgcolor="white", font_size=12, namelength=-1),
    width=900, height=900
)

# Show the interactive plot
fig.show()


### Step 6: Watch all clasters text

In [None]:
df = df.sort_values(by='cluster')

In [None]:
df['cluster'].unique()

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
      dtype=int64)

In [None]:
# Print text messages in each cluster
for cluster_id in df['cluster'].unique():
    if cluster_id!=-1:
        cluster_data = df[df['cluster'] == cluster_id]['text']
        print(f"\nCluster {cluster_id}:\n")
        for text in cluster_data:
            print(text)
        print("="*50)


Cluster 0:

—Ç–æ–≥–¥–∞ –µ—â–µ –ª–∞–¥–Ω–æ
—Ç–æ–≥–¥–∞ –±–±
–õ–∞–¥–Ω–æ
–ª–∞–¥–Ω–æ —É –Ω–µ–≥–æ —Å–∞–º —Å–ø—Ä–æ—à—É
–ª–∞–¥–Ω–æ –±–±
–í—Å–µ –±–±
–õ–∞–¥–Ω–æ
–õ–∞–¥–Ω–æ
–õ–∞–¥–Ω–æ —è —Ç–æ—á–Ω–æ —Ç–µ–ø–µ—Ä—å —Å–ø–∞—Ç—å –±–±

Cluster 1:

+
–∏ –∫–∞–∫ ?
–ö–∞–∫
(((((
.
–ê –Ω–µ
–ù—É –Ω–∏—á–µ–≥–æ
—è x1/5
–ê
–¢—ã –≥–¥–µ ?
–¥–∞ —É–∂
–î–∞
–¥–∞
–î–∞ –≤—Å–µ –Ω–µ –Ω–∞–¥–æ
–î–∞ —É–∂
–Ω—É –µ—â–µ –±—ã
–Ω—É –¥–∞ +-
–∏–ª–∏ –∫–∞–∫ ?
???
–ß—Ç–æ —Ç–æ
–¢—ã –æ —á–µ–º
–≤–æ
–¢–∞–∫ —á—Ç–æ –º–æ–∂–Ω–æ
–ù—É –≤—Å–µ
–£ –º–µ–Ω—è —Ç–∞–∫ —Ç–µ–ø–µ—Ä—å
–í—Å–µ —Ç–æ–≥–¥–∞
—è –≤ 11
–¢–∞–∫
üò¨
–¥–∞
–î–∞
–¥–∞
–î–∞
–¢—ã —Å–∞–º –≥–¥–µ —Å–µ–π—á–∞—Å?
1000
–£ –º–µ–Ω—è —Ç–∞–∫
–∞
–£ –º–µ–Ω—è —Ç–æ–∂–µ –≤—Å–µ
–ò–ª–∏ –∫–∞–∫
12–∫
–∫–æ–Ω–µ—á–Ω–æ
...
–ê —Ç—ã –≥–¥–µ —Å–µ–π—á–∞—Å
—è –æ–± —ç—Ç–æ–º –∂–µ
—Ç–∞–∫ –Ω–µ
–∏ –µ—â–µ –ª—É—á—à–µ –±—É–¥–µ—Ç
+
–ù–µ
–ß—Ç–æ —Ç–æ–∂–µ —Ç–∞–∫ —Å–µ–±–µ
–ù–æ –≤—Å–µ –ø–æ—Ç–æ–º –¥–∞ –ø–æ—Ç–æ–º
–î–∞
–ê –±—ã–ª–∞ 4 –ø—Ä–æ
–ê
–º–æ–∂–Ω–æ
–•–æ—Ä–æ—à–æ
–ò —á—Ç–æ —Ç–∞–º?
–æ
–¥–∞

Cluster 2:

–ê–≥–∞ —Ö–∞—Ö–∞
–ê–≥–∞
–∞–≥–∞
—Ö–∞—