In [None]:
# Optional one-time setup: uncomment to install/upgrade gensim in the kernel's environment.
# %pip install --upgrade -q gensim

In [None]:
import gensim
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics
import tqdm
import nltk
import re
import time

# Notebook-wide configuration.
EPOCHS = 40        # passed as Doc2Vec training epochs and as KMeans max_iter
TOPICS = 4         # number of k-means clusters
CHUNK_SIZE = 1000  # NOTE(review): not referenced in the cells visible here - confirm it is still needed
WORKERS = 7        # worker threads passed to Doc2Vec
EVAL_PERIOD = 10   # NOTE(review): not referenced in the cells visible here - confirm it is still needed

In [None]:
# NLTK data used by preprocess_text: tokenizer models, WordNet (lemmatizer) and
# the English stop-word list. Recent NLTK releases load the tokenizer from
# 'punkt_tab' rather than 'punkt', so both are requested to work on old and new
# installations alike.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

In [None]:
# Lazily-built NLTK helpers shared by every preprocess_text call.
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return the cached stop-word set, stemmer and lemmatizer, building them on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stopwords'] = frozenset(nltk.corpus.stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = nltk.stem.PorterStemmer()
        _PREPROCESS_RESOURCES['lemmatizer'] = nltk.stem.WordNetLemmatizer()
    return _PREPROCESS_RESOURCES


def preprocess_text(sentence):
    """Normalise one piece of text into a space-separated string of cleaned tokens.

    Steps, in order: lowercase; replace every run of non-letters (digits,
    punctuation, newlines, repeated spaces) with a single space; tokenize;
    drop English stop-words; drop words of 3 characters or fewer; Porter-stem;
    WordNet-lemmatize.

    Args:
        sentence: The raw text as a ``str``.

    Returns:
        The surviving tokens joined by single spaces (``''`` if none survive).
    """
    resources = _preprocess_resources()
    stopwords_list = resources['stopwords']
    porter_stemmer = resources['stemmer']
    lemmatizer = resources['lemmatizer']

    # Lowercase, then collapse everything that is not a letter into one space.
    # Newlines are non-letters, so they are already handled by this substitution.
    sentence = re.sub(r'[^a-zA-Z]+', ' ', sentence.lower())

    # Tokenize, then remove stop-words and very short words (only words longer
    # than 3 characters are kept).
    word_list = [
        word
        for word in nltk.word_tokenize(sentence)
        if word not in stopwords_list and len(word) > 3
    ]

    # Stem first, then lemmatize the stemmed form.
    word_list = [lemmatizer.lemmatize(porter_stemmer.stem(word)) for word in word_list]

    return ' '.join(word_list)

## Load Data

In [None]:
# Read the AG News train and test splits from their CSV files.
train_csv_path = '../../data/ag_news/train.csv'
test_csv_path = '../../data/ag_news/test.csv'
train_df, test_df = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

In [None]:
# Integer class id -> human-readable topic name.
CLASS_TO_TOPIC = {1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}


def add_topic_column(df):
    """Add a 'topic' column to ``df`` derived from its integer 'class' column.

    The lookup is vectorised instead of iterating row by row. Any class value
    that is not 1, 2 or 3 is labelled 'Sci/Tech', matching the previous
    catch-all ``else`` branch.

    Args:
        df: DataFrame with a 'class' column; it is modified in place.

    Returns:
        The same DataFrame, for convenience.
    """
    df['topic'] = df['class'].map(CLASS_TO_TOPIC).fillna('Sci/Tech')
    return df


train_df = add_topic_column(train_df)
test_df = add_topic_column(test_df)

In [None]:
def _mean_char_length(descriptions):
    """Mean number of characters across a Series of description strings."""
    return np.mean(descriptions.str.len())


def _avg_description_len_by_topic(df):
    """Return one row per topic with the mean description length in 'avg_news_len'."""
    per_topic = df.groupby('topic')['description'].apply(_mean_char_length)
    return per_topic.reset_index(name='avg_news_len')


# Average news-description length (in characters) for each topic, per split.
train_avg_len_by_class = _avg_description_len_by_topic(train_df)
test_avg_len_by_class = _avg_description_len_by_topic(test_df)

In [None]:
# Display the per-topic average description length for the training split.
train_avg_len_by_class

In [None]:
# Display the per-topic average description length for the test split.
test_avg_len_by_class

In [None]:
def _clean_description(raw_description):
    """Coerce a cell value to ``str`` and run it through preprocess_text."""
    return preprocess_text(str(raw_description))


# Register pandas' progress_apply, then clean every description in both splits.
tqdm.tqdm.pandas()
train_df['news_tokenized'] = train_df['description'].progress_apply(_clean_description)
test_df['news_tokenized'] = test_df['description'].progress_apply(_clean_description)

### Doc2Vec

In [None]:
# Doc2Vec expects each document's `words` to be a list of tokens. The
# preprocessed text is stored as one space-joined string, so split it back into
# tokens; handing over the raw string would make every character a "word".
train_tg_documents = [
    gensim.models.doc2vec.TaggedDocument(words=doc.split(), tags=[i])
    for i, doc in enumerate(train_df['news_tokenized'])
]

In [None]:
num_features = 100    # Dimensionality of the hidden layer representation
min_word_count = 40   # Minimum word count to keep a word in the vocabulary

# Configure the model without a corpus so that training happens exactly once,
# below. The learning rate decays from `alpha` to `min_alpha` over EPOCHS
# passes and never becomes negative.
doc2vec_model = gensim.models.doc2vec.Doc2Vec(
    vector_size=num_features,
    min_count=min_word_count,
    dm=1,
    alpha=0.025,
    min_alpha=0.001,
    epochs=EPOCHS,
    workers=WORKERS,
)
doc2vec_model.build_vocab(train_tg_documents)
doc2vec_model.train(
    train_tg_documents,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)

# One learned vector per training document, in tag order.
x_train_doc2vec = doc2vec_model.dv.vectors

In [None]:
# Infer an embedding for every test document with the trained model; the stored
# text is split back into tokens before inference.
x_test_doc2vec = [
    doc2vec_model.infer_vector(test_df['news_tokenized'][row_id].split())
    for row_id in range(len(test_df))
]

In [None]:
# Empty results table: one row per clustering run, holding its identifiers,
# the three agreement scores and the elapsed time.
performance_metrics = pd.DataFrame(
    columns=[
        'feature-extraction',
        'clustering-algo',
        'run#',
        'state',
        'AMI',
        'ARI',
        'NMI',
        'time',
    ]
)

### k-means

In [None]:
# 20 runs with random_state = 2, 4, ..., 40. Rows are collected in a list and
# added to the results table once at the end.
kmeans_records = []

for run, state in enumerate(range(2, 42, 2), start=1):
    print('Run #', run)

    start = time.time()
    k_means = KMeans(n_clusters=TOPICS, init='k-means++', max_iter=EPOCHS, random_state=state)
    k_means.fit(x_train_doc2vec)

    # Assign test documents to the centroids learned on the training vectors.
    # predict() keeps the fitted model; fit_predict() would refit on the test set.
    pred_labels = k_means.predict(x_test_doc2vec)

    ami = metrics.adjusted_mutual_info_score(test_df['class'], pred_labels)
    ari = metrics.adjusted_rand_score(test_df['class'], pred_labels)
    nmi = metrics.normalized_mutual_info_score(test_df['class'], pred_labels)
    stop = time.time()

    kmeans_records.append({'feature-extraction': 'Doc2Vec', 'clustering-algo': 'k-means', 'run#': run,
                           'state': state, 'AMI': ami, 'ARI': ari, 'NMI': nmi, 'time': (stop - start)})

# DataFrame.append no longer exists in pandas 2.x; build the new rows in one go
# and concatenate only when the table already holds earlier results.
kmeans_metrics = pd.DataFrame(kmeans_records, columns=performance_metrics.columns)
if performance_metrics.empty:
    performance_metrics = kmeans_metrics
else:
    performance_metrics = pd.concat([performance_metrics, kmeans_metrics], ignore_index=True)

In [None]:
# Display the metrics collected for every clustering run.
performance_metrics