In [3]:
import pandas as pd
import numpy as np

In [6]:
import os
import random
import re
import string

import nltk

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

In [7]:
# Single seed shared by every random number generator used in this notebook.
SEED = 42

# Seed NumPy's global generator and Python's built-in generator.
np.random.seed(SEED)
random.seed(SEED)

# NOTE(review): hash randomization is presumably fixed when the interpreter
# starts, so exporting PYTHONHASHSEED from inside a running kernel may only
# affect child processes — confirm, and set it in the kernel environment if
# bit-for-bit reproducibility is required.
os.environ["PYTHONHASHSEED"] = str(SEED)

In [8]:
# Load the dataset; the first CSV column is used as the DataFrame index.
df = pd.read_csv('mixed_user.csv',index_col=[0])

In [9]:
# Preview the raw text column. Note that the index labels shown below run up
# to 18243 while the Series length is 18139, so labels are not contiguous
# positions.
df['text']

0        Russian  Trading Continues Even With Tightenin...
1        Polygon includes $23 billion sports betting co...
2        FTX Is Coming to Europe\n\nSentiment: Positive...
3        Coinbase blocks 25,000  wallets tied to Russia...
4        Crypto Exchange Binance To Launch New Payments...
                               ...                        
18239    Although BTC Outflow is used as a bullish sign...
18240              We conducted research on Plus Token! \n
18241    We conduct research on Plus Token! They aggres...
18242      Hey, we also did some research! It seems tha...
18243                         Just setting up my Twitter. 
Name: text, Length: 18139, dtype: object

In [10]:
# Fetch the NLTK data used below: the stopword lists, and the punkt tokenizer
# data (presumably what word_tokenize relies on — confirm for the installed
# NLTK version).
nltk.download("stopwords")
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\neelj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\neelj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
def clean_text(text, tokenizer, stopwords):
    """Normalize a raw document and split it into filtered tokens.

    The text is lowercased; ``[...]`` spans are removed; runs of whitespace
    are collapsed; words cut off by a "…" character (and stray "…"
    characters) are removed; hyphens between word characters become spaces;
    ASCII punctuation is stripped. The result is tokenized, then stopwords,
    purely numeric tokens and tokens shorter than two characters are dropped.

    Args:
        text: Raw document. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as an empty document.
        tokenizer: Callable taking a string and returning an iterable of
            string tokens.
        stopwords: Collection of (lowercase) tokens to discard.

    Returns:
        List of cleaned tokens, in document order.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" (NaN is the only float that is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower()
    text = re.sub(r"\[(.*?)\]", "", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\w+…|…", "", text)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)

    # Hash-based membership instead of scanning a list once per token.
    if isinstance(stopwords, (set, frozenset)):
        stop_set = stopwords
    else:
        stop_set = frozenset(stopwords)

    return [
        token
        for token in tokenizer(text)
        if token not in stop_set and not token.isdigit() and len(token) > 1
    ]

In [12]:
# Build the stopword collection once instead of re-evaluating
# stopwords.words("english") for every row inside the lambda; a set also gives
# constant-time membership checks in clean_text.
english_stopwords = set(stopwords.words("english"))
df['tokens'] = df['text'].map(lambda x: clean_text(x, word_tokenize, english_stopwords))

In [13]:
# Plain Python list with one token list per row of df; used both to train the
# embedding model and to build the document vectors.
tokenized_lists = df['tokens'].tolist()

In [14]:
# Train 100-dimensional word embeddings on the tokenized corpus using a single
# worker and the notebook-wide seed.
# NOTE(review): workers=1 together with seed=SEED is presumably meant to make
# training repeatable — confirm against the gensim documentation.
model = Word2Vec(sentences=tokenized_lists, vector_size=100, workers=1, seed=SEED)

In [15]:
def vectorize(tokenized_lists, model):
    """Represent each tokenized document by the mean of its word vectors.

    Args:
        tokenized_lists: Iterable of token lists, one per document.
        model: Object exposing ``vector_size`` and a ``wv`` mapping that
            supports ``token in model.wv`` and ``model.wv[token]``.

    Returns:
        List with one vector per document, in input order. A document with
        no token present in ``model.wv`` (including an empty document) gets a
        zero vector of length ``model.vector_size``.
    """
    features = []
    for tokens in tokenized_lists:
        # The membership test already guarantees the lookup succeeds, so no
        # KeyError handling is needed around it.
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        if vectors:
            features.append(np.asarray(vectors).mean(axis=0))
        else:
            # Allocate the fallback only when needed, and fresh each time so
            # rows never alias one another.
            features.append(np.zeros(model.vector_size))
    return features

In [16]:
# Build one feature vector per document, then show how many vectors were
# produced and the length of the first one as a quick sanity check.
vectorized_lists = vectorize(tokenized_lists, model)
len(vectorized_lists), len(vectorized_lists[0])

(18139, 100)

In [17]:
def mbkmeans_clusters(X, k, mb, print_silhouette_values, random_state=None):
    """Fit MiniBatchKMeans on ``X`` and print clustering quality metrics.

    Prints the number of clusters, the overall silhouette coefficient and the
    inertia. Optionally prints a per-cluster silhouette summary ordered from
    the highest to the lowest average.

    Args:
        X: Feature vectors to cluster (one row per sample).
        k: Number of clusters.
        mb: Mini-batch size passed to ``MiniBatchKMeans``.
        print_silhouette_values: When true, also print size and
            average/min/max silhouette value for every cluster.
        random_state: Optional seed forwarded to ``MiniBatchKMeans`` so a
            re-run of the same call yields the same clusters. The default
            ``None`` keeps the estimator's own default behaviour.

    Returns:
        Tuple ``(km, km.labels_)``: the fitted estimator and its labels.
    """
    km = MiniBatchKMeans(
        n_clusters=k, batch_size=mb, random_state=random_state
    ).fit(X)
    labels = km.labels_

    # The overall coefficient is the mean of the per-sample values, so when
    # the per-cluster report is requested the expensive silhouette
    # computation only has to run once.
    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, labels)
        silhouette_avg = sample_silhouette_values.mean()
    else:
        sample_silhouette_values = None
        silhouette_avg = silhouette_score(X, labels)

    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_avg:0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        print("Silhouette values:")
        silhouette_values = []
        empty_clusters = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[labels == i]
            # A cluster can end up with no samples; min()/max() of an empty
            # array would raise, so report it separately.
            if cluster_silhouette_values.size == 0:
                empty_clusters.append(i)
                continue
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        # Best-separated clusters (highest average silhouette) first.
        silhouette_values.sort(key=lambda tup: tup[2], reverse=True)
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
        for i in empty_clusters:
            print(f"    Cluster {i}: Size:0")
    return km, labels

In [18]:
# Cluster the document vectors into 10 groups (mini-batches of 500) and print
# the per-cluster silhouette report.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_lists,
    k=10,
    mb=500,
    print_silhouette_values=True,
)

# Put the original text, the space-joined cleaned tokens and the assigned
# cluster side by side for inspection.
df_clusters = pd.DataFrame(
    {
        "text": df['text'],
        "tokens": [" ".join(doc_tokens) for doc_tokens in tokenized_lists],
        "cluster": cluster_labels,
    }
)

For n_clusters = 10
Silhouette coefficient: 0.14
Inertia:9575.272598783751
Silhouette values:
    Cluster 7: Size:415 | Avg:0.45 | Min:0.00 | Max: 0.63
    Cluster 9: Size:289 | Avg:0.22 | Min:-0.11 | Max: 0.48
    Cluster 4: Size:4416 | Avg:0.20 | Min:-0.04 | Max: 0.43
    Cluster 3: Size:2754 | Avg:0.17 | Min:-0.08 | Max: 0.40
    Cluster 1: Size:1568 | Avg:0.16 | Min:-0.04 | Max: 0.37
    Cluster 2: Size:1175 | Avg:0.14 | Min:-0.11 | Max: 0.38
    Cluster 8: Size:876 | Avg:0.11 | Min:-0.15 | Max: 0.34
    Cluster 5: Size:1834 | Avg:0.10 | Min:-0.16 | Max: 0.35
    Cluster 0: Size:3292 | Avg:0.09 | Min:-0.19 | Max: 0.32
    Cluster 6: Size:1520 | Avg:0.01 | Min:-0.25 | Max: 0.29


In [19]:
# Inspect each document's original text, cleaned tokens and assigned cluster.
df_clusters

Unnamed: 0,text,tokens,cluster
0,Russian Trading Continues Even With Tightenin...,russian trading continues even tightening sanc...,4
1,Polygon includes $23 billion sports betting co...,polygon includes billion sports betting compan...,3
2,FTX Is Coming to Europe\n\nSentiment: Positive...,ftx coming europe sentiment positive cryptocur...,6
3,"Coinbase blocks 25,000 wallets tied to Russia...",coinbase blocks wallets tied russians suspecte...,4
4,Crypto Exchange Binance To Launch New Payments...,crypto exchange binance launch new payments te...,3
...,...,...,...
18239,Although BTC Outflow is used as a bullish sign...,although btc outflow used bullish signal look ...,0
18240,We conducted research on Plus Token! \n,conducted research plus token,3
18241,We conduct research on Plus Token! They aggres...,conduct research plus token aggressively used ...,3
18242,"Hey, we also did some research! It seems tha...",hey also research seems 3lnmrygaq8hdsfjxvpmsna...,0


In [20]:
print("Most representative terms per cluster (based on centroids):")
# Iterate over the fitted centroids rather than a hard-coded range(10), so the
# report stays correct if the number of clusters is changed.
for i, centroid in enumerate(clustering.cluster_centers_):
    # The five vocabulary terms whose embeddings are closest to the centroid.
    most_representative = model.wv.most_similar(positive=[centroid], topn=5)
    # Every term is followed by one space, matching the existing output format.
    tokens_per_cluster = "".join(f"{match[0]} " for match in most_representative)
    print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 0: morning gained doubled broader climbed 
Cluster 1: washington authorities legislation nations spain 
Cluster 2: mainnet harmony chainlink successfully protocols 
Cluster 3: space organization version ticketing esports 
Cluster 4: millennials citing reports wealthy debt 
Cluster 5: dropped drops increased hour surged 
Cluster 6: boss dimon director citadel facebook 
Cluster 7: chivo salvadoran volcanic tourism adopt 
Cluster 8: nigeria americas switzerland cbdc switzerlands 
Cluster 9: cofounder creator alexis ohanian markus 
