In [None]:
!pip install sentence-transformers -q

In [None]:
import gensim
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import tqdm
import nltk
import re
import multiprocessing
from sentence_transformers import SentenceTransformer

In [None]:
# Fetch the NLTK resources that the text preprocessing depends on:
#   punkt / punkt_tab -> nltk.word_tokenize (newer NLTK releases look up punkt_tab
#                        instead of the pickled punkt model)
#   wordnet / omw-1.4 -> WordNetLemmatizer (some NLTK releases load omw-1.4 with wordnet)
#   stopwords         -> the English stop-word list
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(nltk_resource)

In [None]:
# Lazily-built NLTK helpers shared by every preprocess_text call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Build the stop-word set, stemmer and lemmatizer once and return them as a tuple."""
    if not _PREPROCESS_RESOURCES:
        # Build everything first and publish in one step, so a failed corpus lookup
        # never leaves a half-filled cache behind.
        _PREPROCESS_RESOURCES.update({
            'stopwords': frozenset(nltk.corpus.stopwords.words('english')),
            'stemmer': nltk.stem.PorterStemmer(),
            'lemmatizer': nltk.stem.WordNetLemmatizer(),
        })
    return (
        _PREPROCESS_RESOURCES['stopwords'],
        _PREPROCESS_RESOURCES['stemmer'],
        _PREPROCESS_RESOURCES['lemmatizer'],
    )


def preprocess_text(sentence):
    """Normalize a piece of text into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lowercase the text;
      2. replace every run of non-alphabetic characters (punctuation, digits,
         newlines, repeated spaces) with a single space;
      3. tokenize with ``nltk.word_tokenize``;
      4. drop English stop-words and every token of 3 characters or fewer;
      5. Porter-stem each remaining token, then WordNet-lemmatize the stem.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' when nothing survives).
    """
    stopwords_set, porter_stemmer, lemmatizer = _get_preprocess_resources()

    # After this substitution the text contains only lowercase letters and single
    # spaces, so no separate newline or whitespace clean-up is needed.
    sentence = re.sub(r'[^a-zA-Z]+', ' ', sentence.lower())

    # Keep only non-stop-words longer than 3 characters.
    word_list = [
        word
        for word in nltk.word_tokenize(sentence)
        if word not in stopwords_set and len(word) > 3
    ]

    # Stem first, then lemmatize the stem.
    return ' '.join(lemmatizer.lemmatize(porter_stemmer.stem(word)) for word in word_list)

### Load Data

In [None]:
# Read the train and test CSV files of the AG News dataset into two DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/ag-news/train.csv', '../input/ag-news/test.csv')
)

In [None]:
# Human-readable topic for each numeric `class` label.
CLASS_TO_TOPIC = {1: 'World', 2: 'Sports', 3: 'Business'}
# Any label other than 1-3 falls back to this topic.
DEFAULT_TOPIC = 'Sci/Tech'

# Add a `topic` column to both splits in one column-wise pass each,
# instead of writing it cell by cell inside an iterrows() loop.
for news_df in (train_df, test_df):
    news_df['topic'] = news_df['class'].map(
        lambda class_id: CLASS_TO_TOPIC.get(class_id, DEFAULT_TOPIC)
    )

In [None]:
# Register `progress_apply` on pandas objects so the cleaning pass reports progress.
tqdm.tqdm.pandas()

# Clean the `description` text of each split into a new `news_tokenized` column;
# every cell is passed through str() before being preprocessed.
for split_df in (train_df, test_df):
    split_df['news_tokenized'] = split_df['description'].progress_apply(
        lambda raw_text: preprocess_text(str(raw_text))
    )

In [None]:
# Stack the train and test splits into one frame with a fresh 0..n-1 index.
ag_news_df = pd.concat(
    objs=[train_df, test_df],
    ignore_index=True,
)

### Sentence BERT (SBERT)

In [None]:
# Load the pretrained sentence-embedding model named 'all-MiniLM-L6-v2'.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# encode() is documented for a string or a list of strings; pass a plain list so the
# call never depends on how a pandas Series resolves integer lookups against its index.
sbert_embeddings = embedder.encode(ag_news_df['news_tokenized'].tolist())

### Find optimal epochs (`max_iter`) for k-means

In [None]:
# Empty results table holding one row per k-means fit: the experiment settings
# followed by the three clustering scores.
performance_metrics = pd.DataFrame(
    columns=[
        'feature-extraction',
        'run#',
        'epoch',
        'state',
        'AMI',
        'ARI',
        'NMI',
    ]
)

In [None]:
# Sweep k-means over paired (max_iter, random_state) settings, repeated for 10 runs,
# and score every clustering against the true `class` labels.
# NOTE(review): random_state depends only on the epoch, not on `run`, so the 10 runs
# look like repeats of the same seeded fit - confirm whether the seed should vary per run.
true_labels = ag_news_df['class']
run_records = []

for run in range(1, 11):
    print('Run #', run)
    for epoch, state in zip(range(25, 300, 15), range(2, 40, 2)):
        k_means = KMeans(n_clusters=4, init='k-means++', max_iter=epoch, random_state=state)
        k_means.fit(sbert_embeddings)

        predicted_labels = k_means.labels_
        run_records.append({
            'feature-extraction': 'SBERT',
            'run#': run,
            'epoch': epoch,
            'state': state,
            # Scores stay numeric (not formatted strings) so they can be averaged later.
            'AMI': metrics.adjusted_mutual_info_score(true_labels, predicted_labels),
            'ARI': metrics.adjusted_rand_score(true_labels, predicted_labels),
            'NMI': metrics.normalized_mutual_info_score(true_labels, predicted_labels),
        })

# DataFrame.append was removed in pandas 2.0 and copies the whole frame on every call;
# build the table once from the collected rows. Rebuilding it here also keeps the
# cell idempotent: re-running it does not duplicate rows.
performance_metrics = pd.DataFrame(
    run_records,
    columns=['feature-extraction', 'run#', 'epoch', 'state', 'AMI', 'ARI', 'NMI'],
)

In [None]:
# Write the per-fit results table to CSV, leaving out the row index.
performance_metrics.to_csv(
    path_or_buf='performance_sbert_kmeans.csv',
    index=False,
)

In [None]:
# Average each clustering score per epoch.
# The score columns are coerced to numeric first so the mean is well defined even
# when they were stored as formatted strings.
score_columns = ['AMI', 'ARI', 'NMI']
numeric_performance = performance_metrics.assign(
    **{column: pd.to_numeric(performance_metrics[column]) for column in score_columns}
)
mean_performance = numeric_performance.groupby('epoch', as_index=False)[score_columns].mean()

In [None]:
# Write the per-epoch mean scores to CSV. The file name says "sbert" because these
# results come from the SBERT embeddings, matching 'performance_sbert_kmeans.csv'.
mean_performance.to_csv('mean_performance-sbert_kmeans.csv', index=False)