In [None]:
!pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6


Imports and Auxiliary Functions for Vectorization

In [None]:
import re
from unidecode import unidecode
from sklearn.datasets import fetch_20newsgroups as fetch_data
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

# Matches every character that is neither a word character, whitespace nor a digit.
_PUNCTUATION_PATTERN = re.compile(r"[^\w\s\d]")


def remove_punctuation(text):
  """Return `text` with every character matched by the punctuation pattern removed.

  Word characters (letters, digits, underscore) and whitespace are kept
  untouched; everything else is deleted.
  """
  return _PUNCTUATION_PATTERN.sub('', text)



Data loading and preprocessing

In [None]:
# Normalise every document returned by fetch_data(): transliterate to ASCII,
# lower-case, then strip punctuation. The loop variable is deliberately not
# called `input`, which would shadow the Python builtin for the rest of the
# notebook session.
data = [
  remove_punctuation(unidecode(document).lower())
  for document in fetch_data().data
]

TF-IDF

In [None]:
# Upper bound on the vocabulary size. The matrix is materialised densely below,
# so an unbounded vocabulary over the whole corpus would not fit in memory.
# Tune as needed.
TF_IDF_MAX_FEATURES = 1000

# Fit ONE vectorizer on the whole corpus so that every document is encoded
# against the same vocabulary: the result has one row per document and one
# column per term. Re-fitting the vectorizer on the lines of each document
# separately gives each document its own vocabulary and a differently shaped
# matrix, which cannot be stacked into a single feature matrix.
tf_idf = TfidfVectorizer(smooth_idf=False, max_features=TF_IDF_MAX_FEATURES)
tf_idf_data = tf_idf.fit_transform(data).toarray()

In [None]:
import numpy as np
# Wrap the TF-IDF encodings in a NumPy array; the clustering estimators in the
# following cells are fitted on this variable.
# NOTE(review): np.array only yields a regular 2-D numeric matrix when every
# encoding has the same shape -- confirm what the previous cell produces.
tf_idf_data = np.array(tf_idf_data)

Word2Vec

In [None]:
# Collect every non-empty line of every document as one tokenised sentence.
sentences = [
    line.split()
    for document in data
    for line in document.split('\n')
    if line.strip()
]

# Train a single model on the whole corpus. Constructing a new Word2Vec inside
# the per-document loop throws away the previous model on every iteration, so
# only the embeddings learned from the last document would survive.
# `model` stays None when there is nothing to train on.
# NOTE(review): with min_count=1 the vocabulary contains every token of the
# corpus; raise min_count if the embedding table becomes too large.
model = None
if sentences:
    model = Word2Vec(sentences=sentences,
                     vector_size=100,
                     window=5,
                     min_count=1,
                     workers=4)

In [None]:
def document_vector(document, keyed_vectors):
    """Return the mean embedding of the tokens of `document`.

    Tokens are obtained with `str.split()`; tokens that are not in the
    vocabulary of `keyed_vectors` are ignored. A zero vector is returned when
    no token of the document is known.
    """
    known_tokens = [
        token for token in document.split()
        if token in keyed_vectors.key_to_index
    ]
    if not known_tokens:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return keyed_vectors[known_tokens].mean(axis=0)


# One row per document (same order as `data`), so the Word2Vec features describe
# the same objects as the per-document entries in `data`. `model.wv.vectors` on
# its own holds one row per vocabulary word, not per document.
word2vec_data = np.vstack([document_vector(document, model.wv) for document in data])

K-Means (K=4)

In [31]:
from sklearn.cluster import KMeans

# Both K-Means estimators share one configuration: 4 clusters, fixed seed.
kmeans_options = {"n_clusters": 4, "random_state": 0, "n_init": "auto"}
kmeans_tf_idf = KMeans(**kmeans_options).fit(tf_idf_data)
kmeans_word2vec = KMeans(**kmeans_options).fit(word2vec_data)

Spectral Clustering (K=6)

In [32]:
from sklearn.cluster import SpectralClustering

# Fit an independent 6-cluster spectral clustering (fixed seed) on each feature
# matrix, TF-IDF first and Word2Vec second.
spectral_tf_idf, spectral_word2vec = (
    SpectralClustering(n_clusters=6, random_state=0).fit(features)
    for features in (tf_idf_data, word2vec_data)
)

Gaussian Mixture (2 components)

In [33]:
from sklearn.mixture import GaussianMixture

# Two-component Gaussian mixtures with a fixed seed, one per feature matrix.
gaussian_options = {"n_components": 2, "random_state": 0}
gaussian_tf_idf = GaussianMixture(**gaussian_options).fit(tf_idf_data)
gaussian_word2vec = GaussianMixture(**gaussian_options).fit(word2vec_data)

Agglomerative Clustering

In [34]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering with the estimator's default settings, fitted
# separately on the TF-IDF features and then on the Word2Vec features.
agglomerative_tf_idf, agglomerative_word2vec = (
    AgglomerativeClustering().fit(features)
    for features in (tf_idf_data, word2vec_data)
)

DBSCAN

In [35]:
from sklearn.cluster import DBSCAN

# Same DBSCAN configuration for both feature matrices.
# NOTE(review): eps=3 looks large if the TF-IDF rows are unit-length
# (TfidfVectorizer normalises rows by default) -- confirm the chosen radius.
dbscan_options = {"eps": 3, "min_samples": 2}
dbscan_tf_idf = DBSCAN(**dbscan_options).fit(tf_idf_data)
dbscan_word2vec = DBSCAN(**dbscan_options).fit(word2vec_data)

HDBSCAN

In [36]:
!pip install hdbscan

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hdbscan
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (pyproject.toml) ... [?25l[?25hdone
  Created wheel for hdbscan: filename=hdbscan-0.8.29-cp39-cp39-linux_x86_64.whl size=3580448 sha256=3e2da52bf9f149cc8833ab2b2d988c647f4687ebc650ce8ff1b4ee3501d49c7e
  Stored in directory: /root/.cache/pip/wheels/05/6f/88/1a4c04276b98306f00217a1e300e6ba0252c6aa4f7616067ae
Successfully built hdbscan
Installing collected packages: hdbscan
Successfully installed hdbscan-0.8.29


In [37]:
import hdbscan

# HDBSCAN with default settings, fitted separately on each feature matrix.
hdbscan_tf_idf = hdbscan.HDBSCAN().fit(tf_idf_data)
hdbscan_word2vec = hdbscan.HDBSCAN().fit(word2vec_data)

# Backward-compatible alias: this result was originally stored under the
# misspelt name `hbscan_word2vec`; keep it so existing references still work.
hbscan_word2vec = hdbscan_word2vec