In [1]:
!pip install sentence-transformers -q
!pip install --upgrade gensim -q
!pip install bertopic -q

In [2]:
import gensim
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import tqdm
import nltk
import re
from operator import itemgetter
import time
import multiprocessing
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

EPOCHS = 265
TOPICS = 4
CHUNK_SIZE = 1000
WORKERS = 7
EVAL_PERIOD = 10

In [3]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def preprocess_text(sentence):
    # Lowercase
    sentence = sentence.lower()
    
    # Remove all non-alphabets (punctuation, numbers, new-line characters and extra-spaces)
    sentence = re.sub(r'[^a-zA-Z]+', ' ', sentence)
    sentence = sentence.replace('\n', '')
    # sentence = re.sub('\s\s+', ' ', sentence)
    
    # Tokenize & remove stop-words
    word_list = nltk.word_tokenize(sentence)    
    stopwords_list = set(nltk.corpus.stopwords.words('english'))
    word_list = [word for word in word_list if word not in stopwords_list]
    
    # Remove very small words, length < 3, they don't contribute any useful information
    word_list = [word for word in word_list if len(word) > 3]
        
    # Stem & Lemmatize
    porter_stemmer = nltk.stem.PorterStemmer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    word_list = [porter_stemmer.stem(word) for word in word_list]
    word_list = [lemmatizer.lemmatize(word) for word in word_list]
    
    sentence = ' '.join(word_list)
    
    return sentence

## Load Data

In [5]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [6]:
for index, row in train_df.iterrows():
    if row['class'] == 1:
        train_df.at[index, 'topic'] = 'World'
    elif row['class'] == 2:
        train_df.at[index, 'topic'] = 'Sports'
    elif row['class'] == 3:
        train_df.at[index, 'topic'] = 'Business'
    else:
        train_df.at[index, 'topic'] = 'Sci/Tech'

for index, row in test_df.iterrows():
    if row['class'] == 1:
        test_df.at[index, 'topic'] = 'World'
    elif row['class'] == 2:
        test_df.at[index, 'topic'] = 'Sports'
    elif row['class'] == 3:
        test_df.at[index, 'topic'] = 'Business'
    else:
        test_df.at[index, 'topic'] = 'Sci/Tech'

In [7]:
# Checking the average length of news article for each class
train_avg_len_by_class = (train_df.groupby('topic')['description'].apply(lambda x: np.mean(x.str.len())).reset_index(name='avg_news_len'))
test_avg_len_by_class = (test_df.groupby('topic')['description'].apply(lambda x: np.mean(x.str.len())).reset_index(name='avg_news_len'))

In [8]:
train_avg_len_by_class

Unnamed: 0,topic,avg_news_len
0,Business,198.690267
1,Sci/Tech,193.852533
2,Sports,185.171367
3,World,195.8939


In [9]:
test_avg_len_by_class

Unnamed: 0,topic,avg_news_len
0,Business,197.831053
1,Sci/Tech,193.951579
2,Sports,183.838947
3,World,194.010526


In [10]:
# Preprocess the news description
tqdm.tqdm.pandas()
train_df['news_tokenized'] = train_df['description'].progress_apply(lambda x: preprocess_text(str(x)))
test_df['news_tokenized'] = test_df['description'].progress_apply(lambda x: preprocess_text(str(x)))

100%|██████████| 120000/120000 [02:20<00:00, 853.61it/s]
100%|██████████| 7600/7600 [00:08<00:00, 865.04it/s]


### Sentence BERT

In [11]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')
sbert_train_embeddings = embedder.encode(train_df['news_tokenized'])
sbert_test_embeddings = embedder.encode(test_df['news_tokenized'])

In [12]:
performance_metrics = pd.DataFrame(columns=['feature-extraction','clustering-algo', 'run#', 'state', 'AMI','ARI','NMI','time'])

### k-means

In [13]:
for run, state in zip(range(1, 21, 1), range(2, 42, 2)):
    print('Run #', run)
    
    start = time.time()
    k_means = KMeans(n_clusters=TOPICS, init='k-means++', max_iter=EPOCHS, random_state=state)
    k_means.fit(sbert_train_embeddings)
    
    pred_labels = k_means.fit_predict(sbert_test_embeddings)
        
    ami = metrics.adjusted_mutual_info_score(test_df['class'], pred_labels)
    ari = metrics.adjusted_rand_score(test_df['class'], pred_labels)
    nmi = metrics.normalized_mutual_info_score(test_df['class'], pred_labels)
    stop = time.time()
    
    performance_metrics = performance_metrics.append({'feature-extraction':'Sentence BERT', 'clustering-algo':'k-means', 'run#':run, 'state':state, 'AMI':ami, 
                                                      'ARI': ari, 'NMI':nmi, 'time':(stop-start)}, ignore_index=True)

Run # 1
Run # 2
Run # 3
Run # 4
Run # 5
Run # 6
Run # 7
Run # 8
Run # 9
Run # 10
Run # 11
Run # 12
Run # 13
Run # 14
Run # 15
Run # 16
Run # 17
Run # 18
Run # 19
Run # 20


In [14]:
performance_metrics

Unnamed: 0,feature-extraction,clustering-algo,run#,state,AMI,ARI,NMI,time
0,Sentence BERT,k-means,1,2,0.498843,0.520087,0.499059,37.054173
1,Sentence BERT,k-means,2,4,0.499652,0.518302,0.499867,47.199358
2,Sentence BERT,k-means,3,6,0.499513,0.519848,0.499728,40.775629
3,Sentence BERT,k-means,4,8,0.499652,0.518302,0.499867,36.421963
4,Sentence BERT,k-means,5,10,0.498689,0.517889,0.498904,36.470073
5,Sentence BERT,k-means,6,12,0.49934,0.521579,0.499555,48.355836
6,Sentence BERT,k-means,7,14,0.49779,0.517151,0.498006,58.114293
7,Sentence BERT,k-means,8,16,0.499607,0.518352,0.499822,36.345726
8,Sentence BERT,k-means,9,18,0.499549,0.517998,0.499765,36.389335
9,Sentence BERT,k-means,10,20,0.498844,0.520394,0.499059,45.455332


### BERTopic

In [15]:
for run, state in zip(range(1, 21, 1), range(2, 42, 2)):
    print('Run #', run)
    
    start = time.time()
    topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2", verbose=True, nr_topics=4).fit(train_df['news_tokenized'], y=train_df['class'])
    
    pred_labels = topic_model.transform(test_df['news_tokenized'])
        
    ami = metrics.adjusted_mutual_info_score(test_df['class'], pred_labels[0])
    ari = metrics.adjusted_rand_score(test_df['class'], pred_labels[0])
    nmi = metrics.normalized_mutual_info_score(test_df['class'], pred_labels[0])
    stop = time.time()
    
    performance_metrics = performance_metrics.append({'feature-extraction':'Sentence BERT', 'clustering-algo':'BERTopic', 'run#':run, 'state':state, 'AMI':ami, 
                                                      'ARI': ari, 'NMI':nmi, 'time':(stop-start)}, ignore_index=True)
    # break

Run # 1


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 17:19:08,702 - BERTopic - Transformed documents to Embeddings
2021-12-13 17:23:15,940 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 17:23:30,734 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 17:24:45,673 - BERTopic - Reduced number of topics from 1333 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 2


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 17:27:48,364 - BERTopic - Transformed documents to Embeddings
2021-12-13 17:31:43,974 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 17:31:58,421 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 17:33:13,838 - BERTopic - Reduced number of topics from 1344 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 3


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 17:36:03,983 - BERTopic - Transformed documents to Embeddings
2021-12-13 17:39:57,038 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 17:40:11,188 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 17:41:26,800 - BERTopic - Reduced number of topics from 1340 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 4


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 17:44:17,103 - BERTopic - Transformed documents to Embeddings
2021-12-13 17:48:11,465 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 17:48:25,539 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 17:49:41,620 - BERTopic - Reduced number of topics from 1369 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 5


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 17:52:31,597 - BERTopic - Transformed documents to Embeddings
2021-12-13 17:56:26,514 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 17:56:40,922 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 17:57:59,080 - BERTopic - Reduced number of topics from 1368 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 6


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 18:00:49,562 - BERTopic - Transformed documents to Embeddings
2021-12-13 18:04:44,226 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 18:04:58,613 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 18:06:15,855 - BERTopic - Reduced number of topics from 1352 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 7


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 18:09:06,432 - BERTopic - Transformed documents to Embeddings
2021-12-13 18:13:00,479 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 18:13:14,896 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 18:14:31,767 - BERTopic - Reduced number of topics from 1367 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 8


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 18:17:22,471 - BERTopic - Transformed documents to Embeddings
2021-12-13 18:21:17,287 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 18:21:31,672 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 18:22:49,392 - BERTopic - Reduced number of topics from 1313 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 9


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 18:25:40,839 - BERTopic - Transformed documents to Embeddings
2021-12-13 18:29:35,714 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 18:29:50,291 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 18:31:06,149 - BERTopic - Reduced number of topics from 1351 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 10


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 18:33:57,093 - BERTopic - Transformed documents to Embeddings
2021-12-13 18:37:54,536 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 18:38:08,960 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 18:39:25,605 - BERTopic - Reduced number of topics from 1374 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 11


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 18:42:16,706 - BERTopic - Transformed documents to Embeddings
2021-12-13 18:46:09,506 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 18:46:23,825 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 18:47:41,049 - BERTopic - Reduced number of topics from 1355 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 12


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 18:50:32,056 - BERTopic - Transformed documents to Embeddings
2021-12-13 18:54:26,293 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 18:54:40,668 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 18:55:56,813 - BERTopic - Reduced number of topics from 1351 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 13


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 18:58:47,748 - BERTopic - Transformed documents to Embeddings
2021-12-13 19:02:40,235 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 19:02:54,121 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 19:04:12,200 - BERTopic - Reduced number of topics from 1364 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 14


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 19:07:03,182 - BERTopic - Transformed documents to Embeddings
2021-12-13 19:10:57,645 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 19:11:11,750 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 19:12:27,980 - BERTopic - Reduced number of topics from 1345 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 15


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 19:15:19,618 - BERTopic - Transformed documents to Embeddings
2021-12-13 19:19:15,559 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 19:19:29,592 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 19:20:50,263 - BERTopic - Reduced number of topics from 1367 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 16


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 19:23:40,833 - BERTopic - Transformed documents to Embeddings
2021-12-13 19:27:36,301 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 19:27:50,782 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 19:29:08,849 - BERTopic - Reduced number of topics from 1359 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 17


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 19:32:00,716 - BERTopic - Transformed documents to Embeddings
2021-12-13 19:35:54,485 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 19:36:08,845 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 19:37:25,374 - BERTopic - Reduced number of topics from 1344 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 18


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 19:40:16,485 - BERTopic - Transformed documents to Embeddings
2021-12-13 19:44:07,884 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 19:44:21,868 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 19:45:36,533 - BERTopic - Reduced number of topics from 1324 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 19


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 19:48:26,305 - BERTopic - Transformed documents to Embeddings
2021-12-13 19:52:19,730 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 19:52:33,662 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 19:53:49,512 - BERTopic - Reduced number of topics from 1351 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


Run # 20


Batches:   0%|          | 0/3750 [00:00<?, ?it/s]

2021-12-13 19:56:40,261 - BERTopic - Transformed documents to Embeddings
2021-12-13 20:00:34,007 - BERTopic - Reduced dimensionality with UMAP
2021-12-13 20:00:47,877 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-13 20:02:02,595 - BERTopic - Reduced number of topics from 1325 to 5


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

  self._set_arrayXarray(i, j, x)


In [17]:
performance_metrics.to_csv('evaluating_performance-sbert.csv', index=False)