In [None]:
! pip install scikit-learn
! pip install prince
! pip install sentence_transformers

In [24]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.manifold import TSNE
import pandas as pd
from sklearn.cluster import KMeans

'''
Variables: 
---------

corpus : list of documents
embeddings : documents embeddings of size NxM (N : number of documents, M : embedding dimension) 
red_emd : reduced embeddings matrix using dimentionality reduction
k : number of clusters
labels : documents labels
pred : list of clustering predicted clusters 

'''

'\nVariables: \n---------\n\ncorpus : list of documents\nembeddings : documents embeddings of size NxM (N : number of documents, M : embedding dimension) \nred_emd : reduced embeddings matrix using dimentionality reduction\nk : number of clusters\nlabels : documents labels\npred : list of clustering predicted clusters \n\n'

In [39]:
def dim_red(mat, p):
    '''
    Perform dimensionality reduction

    Input:
    -----
        mat : NxM list 
        p : number of dimensions to keep 
    Output:
    ------
        red_mat : NxP list such that p<<m
    '''
    
    # Initialisation de l'analyse des correspondances
    ca = TSNE(
        n_components = p,  
        learning_rate='auto',      
        init='random', 
        perplexity=3
    ).fit_transform(mat)
    
    # ca = TSNE(n_components=p, random_state=0).fit_transform(mat)

    return ca

In [15]:
# import data
ng20 = fetch_20newsgroups(subset='test')
corpus = ng20.data[:2000]
labels = ng20.target[:2000]
k = len(set(labels))

# embedding
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embeddings = model.encode(corpus)


In [16]:
embeddings

array([[-0.57739985, -0.03035784, -0.10226065, ..., -0.37560168,
        -0.09159009, -0.05665841],
       [-0.54764974, -0.1471617 , -0.10938013, ..., -0.18822178,
        -0.07229097, -0.15016393],
       [ 0.00645877,  0.16189736, -0.22414432, ..., -0.38403475,
         0.30487227,  0.17581558],
       ...,
       [-0.23235753,  0.01727558, -0.21409535, ..., -0.24929887,
        -0.13749793,  0.03162361],
       [-0.356682  ,  0.09248027, -0.26561555, ..., -0.08004162,
        -0.3139933 ,  0.0114701 ],
       [-0.39642942, -0.14386131, -0.07510002, ..., -0.00914432,
        -0.09982094, -0.12229361]], dtype=float32)

In [40]:
# perform dimentionality reduction
red_emb = dim_red(embeddings, 3)

In [41]:
def clust(mat, k):
    '''
    Perform clustering

    Input:
    -----
        mat : input list 
        k : number of cluster
    Output:
    ------
        pred : list of predicted labels
    '''
        
    # Création de l'objet KMeans
    kmeans = KMeans(n_clusters=k)

    # Ajustement du modèle et prédiction des clusters
    pred = kmeans.fit_predict(mat)
    
    return pred
    

# perform clustering
pred = clust(red_emb, k)


  super()._check_params_vs_input(X, default_n_init=10)


In [42]:
pred

array([13,  2,  1, ..., 17, 19, 10], dtype=int32)

In [45]:
# evaluate clustering results
nmi_score = normalized_mutual_info_score(pred,labels)
ari_score = adjusted_rand_score(pred,labels)

print(f'NMI: {nmi_score:.2f} \nARI: {ari_score:.2f}')

NMI: 0.35 
ARI: 0.17
