In [1]:
import gensim
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import tqdm
import nltk
import re
import plotly.graph_objects as go

In [2]:
# Fetch the NLTK data packages this notebook relies on: the 'punkt' tokenizer
# models, the 'wordnet' corpus and the 'stopwords' corpus. Each call returns a
# status flag; the value of the last call is what the cell displays.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Lazily-filled cache of the NLTK helpers used by preprocess_text. Loading the
# stop-word corpus and constructing the stemmer/lemmatizer is loop-invariant
# work, so it is done once rather than on every call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return (stop-word set, Porter stemmer, WordNet lemmatizer), creating them on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build everything first so a failure (e.g. corpus not downloaded yet)
        # never leaves a half-populated cache behind.
        resources = {
            'stopwords': frozenset(nltk.corpus.stopwords.words('english')),
            'stemmer': nltk.stem.PorterStemmer(),
            'lemmatizer': nltk.stem.WordNetLemmatizer(),
        }
        _PREPROCESS_RESOURCES.update(resources)
    return (
        _PREPROCESS_RESOURCES['stopwords'],
        _PREPROCESS_RESOURCES['stemmer'],
        _PREPROCESS_RESOURCES['lemmatizer'],
    )


def preprocess_text(sentence):
    """Clean a raw text string and return it as space-separated normalised tokens.

    Processing steps, in order:
      1. lowercase the text;
      2. replace every run of non-letter characters (punctuation, digits,
         new-lines, repeated whitespace) with a single space;
      3. tokenize with ``nltk.word_tokenize``;
      4. drop English stop-words and tokens of three characters or fewer;
      5. Porter-stem each token, then WordNet-lemmatize the stem.

    Parameters
    ----------
    sentence : str
        Raw text. Must be a ``str``; callers holding other values should
        convert them first.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string when no
        token survives the filters).
    """
    stopwords_set, porter_stemmer, lemmatizer = _get_preprocess_resources()

    # After this substitution the text holds only ASCII letters and single
    # spaces, so no separate new-line or whitespace clean-up is needed.
    sentence = re.sub(r'[^a-zA-Z]+', ' ', sentence.lower())

    # Keep only informative tokens: not a stop-word and longer than 3 characters.
    kept_words = [
        word
        for word in nltk.word_tokenize(sentence)
        if word not in stopwords_set and len(word) > 3
    ]

    # Stem first, then lemmatize the resulting stem.
    normalised_words = [
        lemmatizer.lemmatize(porter_stemmer.stem(word)) for word in kept_words
    ]

    return ' '.join(normalised_words)

## Load Data

In [4]:
# Load the training data from the working directory. Later cells read the
# 'class' and 'description' columns of this frame.
# NOTE(review): execution counts jump from In [4] to In [6]; confirm the CSV
# already carries those column names and no intermediate cell was removed.
train_df = pd.read_csv('train.csv')

In [6]:
# Translate the numeric class label into a readable topic name in a single
# vectorized pass instead of visiting the frame row by row.
# Labels 1-3 have explicit names; every other value (including a missing
# label) falls back to 'Sci/Tech'.
topic_by_class = {1: 'World', 2: 'Sports', 3: 'Business'}
train_df['topic'] = train_df['class'].map(topic_by_class).fillna('Sci/Tech')

In [7]:
# Mean character length of the news description within each topic
train_avg_len_by_class = (
    train_df['description']
    .str.len()
    .groupby(train_df['topic'])
    .mean()
    .reset_index(name='avg_news_len')
)

In [8]:
# Display the per-topic average description length (columns: topic, avg_news_len)
train_avg_len_by_class

Unnamed: 0,topic,avg_news_len
0,Business,198.690267
1,Sci/Tech,193.852533
2,Sports,185.171367
3,World,195.8939


In [9]:
# Clean every news description; tqdm's pandas integration adds `progress_apply`
# so this slow pass over the whole frame reports its progress.
tqdm.tqdm.pandas()
train_df['news_tokenized'] = train_df['description'].progress_apply(
    # str() makes non-string cells safe to pass to preprocess_text
    lambda raw_text: preprocess_text(str(raw_text))
)

100%|██████████| 120000/120000 [02:09<00:00, 925.38it/s]


### Generate TF-IDF feature vectors and find the optimal number of epochs (`max_iter`) for k-means

In [10]:
# Turn the cleaned descriptions into a TF-IDF document-term matrix.
# min_df=2 discards terms that occur in fewer than two documents.
tf_idfvectorizer = TfidfVectorizer(
    min_df=2,
    stop_words='english',
    strip_accents='unicode',
)
x_tfidf = tf_idfvectorizer.fit_transform(train_df['news_tokenized'])

In [11]:
# Empty frame that fixes the column layout of the clustering results table
performance_metrics = pd.DataFrame(
    columns=[
        'feature-extraction',
        'run#',
        'epoch',
        'state',
        'AMI',
        'ARI',
        'NMI',
    ]
)

In [13]:
# Sweep the k-means iteration cap (max_iter, recorded as 'epoch') and repeat the
# sweep over several runs, scoring each clustering against the true classes.
#
# Each run uses its own random_state (the run number). With a fixed
# random_state KMeans is deterministic, so a seed that does not change between
# runs would make every run an exact repeat. Using the same seed for all epoch
# values inside a run also keeps the epoch comparison paired: differences
# between epochs are not mixed up with differences between seeds.
metric_records = []
true_labels = train_df['class']

for run in range(1, 21):
    print('Run #', run)
    state = run
    for epoch in range(25, 300, 15):
        k_means = KMeans(n_clusters=4, init='k-means++', max_iter=epoch, random_state=state)
        k_means.fit(x_tfidf)

        ami = metrics.adjusted_mutual_info_score(true_labels, k_means.labels_)
        ari = metrics.adjusted_rand_score(true_labels, k_means.labels_)
        nmi = metrics.normalized_mutual_info_score(true_labels, k_means.labels_)

        metric_records.append({
            'feature-extraction': 'tf-idf',
            'run#': run,
            'epoch': epoch,
            'state': state,
            'AMI': ami,
            'ARI': ari,
            'NMI': nmi,
        })

# Build the results table once from the collected rows: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row copies it every time.
# Rebuilding from scratch also makes re-running this cell idempotent.
performance_metrics = pd.DataFrame(
    metric_records,
    columns=['feature-extraction', 'run#', 'epoch', 'state', 'AMI', 'ARI', 'NMI'],
)

Run # 1
Run # 2
Run # 3
Run # 4
Run # 5
Run # 6
Run # 7
Run # 8
Run # 9
Run # 10
Run # 11
Run # 12
Run # 13
Run # 14
Run # 15
Run # 16
Run # 17
Run # 18
Run # 19
Run # 20


In [15]:
# Persist the per-fit scores (one row per run/epoch combination) without the index column
performance_metrics.to_csv('performance-tfidf_kmeans.csv', index=False)

In [17]:
# Average each clustering score over all rows that share the same epoch value
metric_columns = ['AMI', 'ARI', 'NMI']
mean_performance = (
    performance_metrics
    .groupby('epoch', as_index=False)[metric_columns]
    .mean()
)

In [18]:
# Persist the per-epoch mean scores without the index column
mean_performance.to_csv('mean_performance-tfidf_kmeans.csv', index=False)