In [None]:
!pip install sentence_transformers

In [5]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
import os
import csv
from sentence_transformers import SentenceTransformer, util

In [8]:
url = "https://raw.githubusercontent.com/PolyAI-LDN/task-specific-datasets/master/banking_data/test.csv"
dataset_path = "test.csv"
max_corpus_size = 50000  # Read at most the first 50k rows of the CSV


# Download the dataset only when no local copy exists yet
if not os.path.exists(dataset_path):
    print("Download dataset")
    util.http_get(url, dataset_path)

# Read sentences and their category labels in file order. Rows are NOT
# de-duplicated, so corpus_sentences[i] and category[i] always describe the
# same CSV row.
corpus_sentences = []
category = []
# newline='' hands line-ending handling to the csv module, so quoted fields
# that contain embedded newlines are parsed as a single field.
with open(dataset_path, encoding='utf8', newline='') as fIn:
    reader = csv.DictReader(fIn, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        corpus_sentences.append(row['text'])
        category.append(row['category'])
        # Stop as soon as the size cap is reached
        if len(corpus_sentences) >= max_corpus_size:
            break
print("Encode the corpus. This might take a while")

# Embed every sentence with the pretrained model; one embedding per sentence
embedder = SentenceTransformer('all-MiniLM-L6-v2')
corpus_embeddings = embedder.encode(corpus_sentences)

Encode the corpus. This might take a while


Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [10]:
# Display the dimensions of the embedding matrix as a quick sanity check
corpus_embeddings.shape

(3080, 384)

In [9]:
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer, LoggingHandler, util, evaluation, models, InputExample

In [12]:
new_dimension = 128
# Fit PCA on the corpus embeddings and project them down to `new_dimension`
# components. random_state is fixed because scikit-learn may select a
# randomized SVD solver for this matrix size, which would otherwise make the
# projection (and every downstream clustering score) vary between runs.
pca = PCA(n_components=new_dimension, random_state=42)
reduced_emb = pca.fit_transform(corpus_embeddings)

In [29]:
%%time
from sklearn.cluster import KMeans
# Perform kmean clustering
num_clusters = len(set(category))
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(reduced_emb)
km_result = clustering_model.labels_

CPU times: user 6.02 s, sys: 3.69 s, total: 9.71 s
Wall time: 5.05 s


In [34]:
# Adjusted Rand index of the k-means clusters against the ground-truth categories.
# Keyword arguments follow scikit-learn's (labels_true, labels_pred) signature;
# the score is symmetric, so its value does not depend on the order.
adjusted_rand_score(labels_true=category, labels_pred=km_result)

0.49696239063457176

In [36]:
# Normalized mutual information of the k-means clusters against the
# ground-truth categories, passed by keyword to match scikit-learn's
# (labels_true, labels_pred) signature.
normalized_mutual_info_score(labels_true=category, labels_pred=km_result)

0.7882765311198084

In [31]:
%%time
from sklearn.cluster import AgglomerativeClustering
num_clusters = len(set(category))
clustering_model = AgglomerativeClustering(n_clusters=num_clusters, affinity='cosine', linkage='average') #, affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(reduced_emb)
ag_result = clustering_model.labels_

CPU times: user 985 ms, sys: 6.57 ms, total: 991 ms
Wall time: 1.22 s


In [35]:
# Adjusted Rand index of the agglomerative clusters against the ground-truth
# categories. Keyword arguments follow scikit-learn's (labels_true, labels_pred)
# signature; the score is symmetric, so its value does not depend on the order.
adjusted_rand_score(labels_true=category, labels_pred=ag_result)

0.3405396121349602

In [37]:
# Normalized mutual information of the agglomerative clusters against the
# ground-truth categories, passed by keyword to match scikit-learn's
# (labels_true, labels_pred) signature.
normalized_mutual_info_score(labels_true=category, labels_pred=ag_result)

0.763132026149143