In [1]:
# !pip install sentence-transformers -q
# !pip install --upgrade gensim -q

In [2]:
import gensim
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import tqdm
import nltk
import re
from operator import itemgetter
import time
import multiprocessing
from sentence_transformers import SentenceTransformer

# --- Experiment configuration ---------------------------------------------
# Iteration budget shared by all models in this notebook
# (KMeans `max_iter`, gensim `passes`).
EPOCHS = 265
# Number of clusters / topics to fit.
TOPICS = 4
# Number of documents per training chunk for the gensim models.
CHUNK_SIZE = 1000
# Worker processes for LdaMulticore. Derived from the host instead of being
# hard-coded, so the notebook does not oversubscribe machines with fewer cores;
# one core is left free for the parent process (yields 7 on an 8-core machine).
WORKERS = max(1, multiprocessing.cpu_count() - 1)
# Passed to gensim as `eval_every`.
EVAL_PERIOD = 10

In [3]:
# Fetch the NLTK resources used by preprocess_text():
#   - 'punkt' / 'punkt_tab': tokenizer models behind nltk.word_tokenize
#     (recent NLTK releases load 'punkt_tab' instead of the pickled 'punkt'),
#   - 'wordnet' / 'omw-1.4': data for WordNetLemmatizer
#     (some NLTK releases refuse to load wordnet without 'omw-1.4'),
#   - 'stopwords': the English stop-word list.
# nltk.download() is a no-op for packages that are already up to date.
for _nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(_nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mbaxi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mbaxi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mbaxi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Cache for the NLTK helpers so they are built once instead of once per document.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached (stop-word set, stemmer, lemmatizer) triple, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stopwords'] = frozenset(nltk.corpus.stopwords.words('english'))
        _TEXT_RESOURCES['stemmer'] = nltk.stem.PorterStemmer()
        _TEXT_RESOURCES['lemmatizer'] = nltk.stem.WordNetLemmatizer()
    return (_TEXT_RESOURCES['stopwords'],
            _TEXT_RESOURCES['stemmer'],
            _TEXT_RESOURCES['lemmatizer'])


def preprocess_text(sentence, min_word_len=4):
    """Normalise one document into a space-separated string of stemmed tokens.

    Steps: lowercase -> replace every run of non-letters with a space ->
    tokenize -> drop English stop-words -> drop short words ->
    Porter-stem -> WordNet-lemmatize -> join with single spaces.

    Parameters
    ----------
    sentence : str
        Raw text of the document.
    min_word_len : int, default 4
        Minimum number of characters a token needs to be kept. The default
        reproduces the historical behaviour of this function, which discards
        words of three characters or fewer (the old inline comment said
        "length < 3", but the code always required more than three).

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    stopwords_set, stemmer, lemmatizer = _get_text_resources()

    # Lowercase, then turn punctuation, digits, newlines and repeated
    # whitespace into single spaces (so no separate newline removal is needed).
    sentence = re.sub(r'[^a-zA-Z]+', ' ', sentence.lower())

    # Tokenize, then remove stop-words and very short words in one pass.
    kept_words = [
        word for word in nltk.word_tokenize(sentence)
        if word not in stopwords_set and len(word) >= min_word_len
    ]

    # Stem first, then lemmatize the stem (same order as before).
    normalised = [lemmatizer.lemmatize(stemmer.stem(word)) for word in kept_words]

    return ' '.join(normalised)

## Load Data

In [5]:
# Read the train and test splits from their CSV files (paths are relative to
# the notebook's working directory).
_train_csv_path = '../../data/ag_news/train.csv'
_test_csv_path = '../../data/ag_news/test.csv'
train_df, test_df = (pd.read_csv(_csv_path) for _csv_path in (_train_csv_path, _test_csv_path))

In [6]:
# Translate the numeric `class` label into a readable `topic` name for both
# splits. A vectorised lookup replaces the former row-by-row iterrows() loops.
# As before, every value other than 1, 2 or 3 falls through to 'Sci/Tech'.
_CLASS_TO_TOPIC = {1: 'World', 2: 'Sports', 3: 'Business'}
for _split_df in (train_df, test_df):
    _split_df['topic'] = _split_df['class'].map(_CLASS_TO_TOPIC).fillna('Sci/Tech')

In [7]:
def _avg_description_length_by_topic(frame):
    """Return a (topic, avg_news_len) frame: mean character count of `description` per topic."""
    descriptions_by_topic = frame.groupby('topic')['description']
    mean_lengths = descriptions_by_topic.apply(lambda texts: np.mean(texts.str.len()))
    return mean_lengths.reset_index(name='avg_news_len')


# Compare the average article length across topics, separately for each split.
train_avg_len_by_class = _avg_description_length_by_topic(train_df)
test_avg_len_by_class = _avg_description_length_by_topic(test_df)

In [8]:
# Show the per-topic average description length (in characters) for the training split.
train_avg_len_by_class

Unnamed: 0,topic,avg_news_len
0,Business,198.690267
1,Sci/Tech,193.852533
2,Sports,185.171367
3,World,195.8939


In [9]:
# Show the per-topic average description length (in characters) for the test split.
test_avg_len_by_class

Unnamed: 0,topic,avg_news_len
0,Business,197.831053
1,Sci/Tech,193.951579
2,Sports,183.838947
3,World,194.010526


In [10]:
# Register `progress_apply` on pandas objects so the (slow) preprocessing shows a progress bar.
tqdm.tqdm.pandas()

# Run preprocess_text over every description; values are coerced with str()
# first so non-string cells do not break the text pipeline.
for _split_df in (train_df, test_df):
    _split_df['news_tokenized'] = _split_df['description'].progress_apply(
        lambda raw_text: preprocess_text(str(raw_text))
    )

100%|██████████| 120000/120000 [02:21<00:00, 847.06it/s] 
100%|██████████| 7600/7600 [00:06<00:00, 1205.63it/s]


### Sentence BERT

In [11]:
# Encode every preprocessed article into a dense sentence embedding.
# `encode` is documented to take a string or a list of strings; a pandas Series
# only works by accident (label-based integer indexing on a default RangeIndex),
# so plain Python lists are passed instead.
# NOTE(review): the model receives the stemmed/stop-word-stripped text, not the
# raw `description`; presumably raw text would suit a sentence encoder better —
# confirm before changing, since it alters the experiment.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
sbert_train_embeddings = embedder.encode(train_df['news_tokenized'].tolist())
sbert_test_embeddings = embedder.encode(test_df['news_tokenized'].tolist())

Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 581kB/s]
Downloading: 100%|██████████| 10.2k/10.2k [00:00<00:00, 649kB/s]
Downloading: 100%|██████████| 612/612 [00:00<?, ?B/s] 
Downloading: 100%|██████████| 116/116 [00:00<?, ?B/s] 
Downloading: 100%|██████████| 39.3k/39.3k [00:00<00:00, 1.61MB/s]
Downloading: 100%|██████████| 349/349 [00:00<?, ?B/s] 
Downloading: 100%|██████████| 90.9M/90.9M [00:04<00:00, 20.7MB/s]
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 26.5kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 115kB/s]
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 2.48MB/s]
Downloading: 100%|██████████| 350/350 [00:00<00:00, 361kB/s]
Downloading: 100%|██████████| 13.2k/13.2k [00:00<?, ?B/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 1.41MB/s]
Downloading: 100%|██████████| 190/190 [00:00<?, ?B/s] 


In [None]:
# Empty results table; each experiment loop below adds one row per run.
_metric_columns = [
    'feature-extraction',
    'clustering-algo',
    'run#',
    'state',
    'AMI',
    'ARI',
    'NMI',
    'time',
]
performance_metrics = pd.DataFrame(columns=_metric_columns)

### k-means

In [None]:
# k-means on the SBERT embeddings: fit on the training split, assign the test
# split to the learned centroids, and score the assignment against the true classes.
for run, state in zip(range(1, 21, 1), range(2, 42, 2)):
    print('Run #', run)

    start = time.time()
    # n_init is pinned to 10 (the long-standing default) so results do not
    # change with the scikit-learn version.
    k_means = KMeans(n_clusters=TOPICS, init='k-means++', n_init=10, max_iter=EPOCHS, random_state=state)
    k_means.fit(sbert_train_embeddings)

    # predict(), not fit_predict(): fit_predict would re-fit the model on the
    # test embeddings and throw away the centroids learned from the training data.
    pred_labels = k_means.predict(sbert_test_embeddings)

    ami = metrics.adjusted_mutual_info_score(test_df['class'], pred_labels)
    ari = metrics.adjusted_rand_score(test_df['class'], pred_labels)
    nmi = metrics.normalized_mutual_info_score(test_df['class'], pred_labels)
    stop = time.time()

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # The features here are SBERT embeddings, so the row is labelled accordingly
    # (it was previously mislabelled 'tf-idf').
    run_record = pd.DataFrame([{'feature-extraction': 'sbert', 'clustering-algo': 'k-means', 'run#': run,
                                'state': state, 'AMI': ami, 'ARI': ari, 'NMI': nmi, 'time': (stop - start)}])
    performance_metrics = pd.concat([performance_metrics, run_record], ignore_index=True)
    # NOTE(review): this break stops after the first run, so only run #1 of the
    # 20 configured runs executes; looks like a debugging leftover — remove it
    # to perform the full sweep.
    break

Run # 1


### NMF

In [15]:
# Split the preprocessed text back into token lists (whitespace-separated).
train_documents = train_df['news_tokenized'].str.split()
test_documents = test_df['news_tokenized'].str.split()

# Build the vocabulary from the training split only, then prune it in place:
# no_below=5, no_above=0.5, keep_n=20000.
dictionary = gensim.corpora.Dictionary(train_documents)
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=20000)

# Convert every document of both splits to bag-of-words form using that vocabulary.
train_corpus = list(map(dictionary.doc2bow, train_documents))
test_corpus = list(map(dictionary.doc2bow, test_documents))

In [32]:
# NMF topic model: train on the bag-of-words training corpus, label each test
# document with its highest-weighted topic, and score against the true classes.
for run, state in zip(range(1, 21, 1), range(2, 42, 2)):
    print('Run #', run)

    start = time.time()
    gensim_nmf = gensim.models.Nmf(corpus=train_corpus, num_topics=TOPICS, id2word=dictionary, chunksize=CHUNK_SIZE,
                                   passes=EPOCHS, eval_every=EVAL_PERIOD, minimum_probability=0,
                                   random_state=state, kappa=1)

    # gensim_nmf[doc] yields (topic_id, weight) pairs; keep the id of the heaviest topic.
    pred_labels = [max(gensim_nmf[test_doc], key=itemgetter(1))[0] for test_doc in test_corpus]

    ami = metrics.adjusted_mutual_info_score(test_df['class'], pred_labels)
    ari = metrics.adjusted_rand_score(test_df['class'], pred_labels)
    nmi = metrics.normalized_mutual_info_score(test_df['class'], pred_labels)
    stop = time.time()

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # The corpus is raw doc2bow counts (no TF-IDF weighting is applied anywhere),
    # so the row is labelled 'bow' instead of the previous, incorrect 'tf-idf'.
    run_record = pd.DataFrame([{'feature-extraction': 'bow', 'clustering-algo': 'NMF', 'run#': run,
                                'state': state, 'AMI': ami, 'ARI': ari, 'NMI': nmi, 'time': (stop - start)}])
    performance_metrics = pd.concat([performance_metrics, run_record], ignore_index=True)
    # NOTE(review): this break stops after the first run, so only run #1 of the
    # 20 configured runs executes; looks like a debugging leftover — remove it
    # to perform the full sweep.
    break

Run # 1


### LDA

In [None]:
# LDA topic model: train on the bag-of-words training corpus, label each test
# document with its most probable topic, and score against the true classes.
for run, state in zip(range(1, 21, 1), range(2, 42, 2)):
    print('Run #', run)

    start = time.time()
    gensim_lda = gensim.models.ldamulticore.LdaMulticore(corpus=train_corpus, num_topics=TOPICS, id2word=dictionary,
                                                         chunksize=CHUNK_SIZE, workers=WORKERS, passes=EPOCHS,
                                                         eval_every=EVAL_PERIOD, per_word_topics=True,
                                                         random_state=state)

    # With per_word_topics=True, gensim_lda[doc] returns a tuple whose first
    # element is the list of (topic_id, probability) pairs for the document.
    pred_labels = [max(gensim_lda[test_doc][0], key=itemgetter(1))[0] for test_doc in test_corpus]

    ami = metrics.adjusted_mutual_info_score(test_df['class'], pred_labels)
    ari = metrics.adjusted_rand_score(test_df['class'], pred_labels)
    nmi = metrics.normalized_mutual_info_score(test_df['class'], pred_labels)
    stop = time.time()

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # The corpus is raw doc2bow counts (no TF-IDF weighting is applied anywhere),
    # so the row is labelled 'bow' instead of the previous, incorrect 'tf-idf'.
    run_record = pd.DataFrame([{'feature-extraction': 'bow', 'clustering-algo': 'LDA', 'run#': run,
                                'state': state, 'AMI': ami, 'ARI': ari, 'NMI': nmi, 'time': (stop - start)}])
    performance_metrics = pd.concat([performance_metrics, run_record], ignore_index=True)
    # NOTE(review): this break stops after the first run, so only run #1 of the
    # 20 configured runs executes; looks like a debugging leftover — remove it
    # to perform the full sweep.
    break

### LSI

In [None]:
# LSI model: train on the bag-of-words training corpus, label each test
# document with the latent dimension of largest projection, and score against
# the true classes.
for run, state in zip(range(1, 21, 1), range(2, 42, 2)):
    print('Run #', run)

    start = time.time()
    # `state` was previously recorded in the results but never handed to the
    # model; random_seed makes the stochastic SVD reproducible per run
    # (parameter available in recent gensim 4.x releases).
    gensim_lsi = gensim.models.LsiModel(corpus=train_corpus, num_topics=TOPICS, id2word=dictionary,
                                        chunksize=CHUNK_SIZE, random_seed=state)

    pred_labels = []
    for test_doc in test_corpus:
        # A document with no in-vocabulary tokens projects to an empty list,
        # which made max() raise ValueError; such documents get the label -1.
        # NOTE(review): LSI projections are signed, so picking the largest raw
        # value (rather than the largest magnitude) may not be the intended
        # assignment rule — confirm.
        pred_label = max(gensim_lsi[test_doc], key=itemgetter(1), default=(-1, 0.0))[0]
        pred_labels.append(pred_label)

    ami = metrics.adjusted_mutual_info_score(test_df['class'], pred_labels)
    ari = metrics.adjusted_rand_score(test_df['class'], pred_labels)
    nmi = metrics.normalized_mutual_info_score(test_df['class'], pred_labels)
    stop = time.time()

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # The corpus is raw doc2bow counts (no TF-IDF weighting is applied anywhere),
    # so the row is labelled 'bow' instead of the previous, incorrect 'tf-idf'.
    run_record = pd.DataFrame([{'feature-extraction': 'bow', 'clustering-algo': 'LSI', 'run#': run,
                                'state': state, 'AMI': ami, 'ARI': ari, 'NMI': nmi, 'time': (stop - start)}])
    performance_metrics = pd.concat([performance_metrics, run_record], ignore_index=True)
    # NOTE(review): this break stops after the first run, so only run #1 of the
    # 20 configured runs executes; looks like a debugging leftover — remove it
    # to perform the full sweep.
    break

In [None]:
# Show the collected metrics (AMI / ARI / NMI / wall-clock time), one row per recorded run.
performance_metrics