In [1]:
import gensim
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import tqdm
from utils import *

## Load Data

In [2]:
# Read the train and test splits from their CSV files (paths are relative
# to the notebook's working directory).
train_csv_path = '../data/ag_news/train.csv'
test_csv_path = '../data/ag_news/test.csv'

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [3]:
# Show the column labels of the training frame.
train_df.columns

Index(['class', 'title', 'description'], dtype='object')

In [4]:
# Add a readable 'topic' column derived from the numeric 'class' id.
# This is a single vectorised lookup over the whole column rather than a
# Python-level loop over every row.
topic_by_class = {1: 'World', 2: 'Sports', 3: 'Business'}

# Ids that are not 1, 2 or 3 (and missing values) are unmatched by the
# lookup, come back as NaN, and are filled with 'Sci/Tech'.
train_df['topic'] = train_df['class'].map(topic_by_class).fillna('Sci/Tech')

In [5]:
# Add a readable 'topic' column derived from the numeric 'class' id.
# This is a single vectorised lookup over the whole column rather than a
# Python-level loop over every row.
topic_by_class = {1: 'World', 2: 'Sports', 3: 'Business'}

# Ids that are not 1, 2 or 3 (and missing values) are unmatched by the
# lookup, come back as NaN, and are filled with 'Sci/Tech'.
test_df['topic'] = test_df['class'].map(topic_by_class).fillna('Sci/Tech')

In [6]:
# Bar charts of the number of articles per topic, one figure per split.
# Each figure is shown inline and also exported as a PDF via kaleido.
axis_labels = {
    'x':'Topic', 'y':'Number of News Articles','color':'Topic'
}

# Training split: count articles per topic once and reuse the result.
train_topic_counts = train_df['topic'].value_counts()
fig_train = px.bar(
    train_df,
    x=train_topic_counts.index,
    y=train_topic_counts.values,
    color=train_topic_counts.index,
    title='Distribution of topics in training dataset',
    width=1100,
    height=500,
    labels=dict(axis_labels),
)
fig_train.show()
fig_train.write_image('../results/EDA/topic-dist-train.pdf', engine='kaleido')

# Testing split: same chart built from the test frame.
test_topic_counts = test_df['topic'].value_counts()
fig_test = px.bar(
    test_df,
    x=test_topic_counts.index,
    y=test_topic_counts.values,
    color=test_topic_counts.index,
    title='Distribution of topics in testing dataset',
    width=1100,
    height=500,
    labels=dict(axis_labels),
)
fig_test.show()
fig_test.write_image('../results/EDA/topic-dist-test.pdf', engine='kaleido')

In [7]:
# Mean character length of the 'description' text within each topic,
# computed separately for the train and test splits.
def _mean_description_length(descriptions):
    """Return the mean of the per-entry string lengths of a Series."""
    return np.mean(descriptions.str.len())


train_avg_len_by_class = (
    train_df.groupby('topic')['description']
    .apply(_mean_description_length)
    .reset_index(name='avg_news_len')
)
test_avg_len_by_class = (
    test_df.groupby('topic')['description']
    .apply(_mean_description_length)
    .reset_index(name='avg_news_len')
)

In [8]:
# Display the per-topic mean description length for the training split.
train_avg_len_by_class

Unnamed: 0,topic,avg_news_len
0,Business,198.690267
1,Sci/Tech,193.852533
2,Sports,185.171367
3,World,195.8939


In [9]:
# Display the per-topic mean description length for the test split.
test_avg_len_by_class

Unnamed: 0,topic,avg_news_len
0,Business,197.831053
1,Sci/Tech,193.951579
2,Sports,183.838947
3,World,194.010526


In [11]:
# Run every description through preprocess_text and store the result in a
# new 'news_tokenized' column on both splits.
# NOTE(review): preprocess_text comes from the star-import of utils; its
# exact output format is not visible here - confirm before relying on it.
tqdm.tqdm.pandas()  # registers progress_apply on pandas objects


def _preprocess_description(raw_value):
    """Coerce the cell value to str and pass it to preprocess_text."""
    return preprocess_text(str(raw_value))


# Train first, then test, so the progress bars appear in the same order.
for split_df in (train_df, test_df):
    split_df['news_tokenized'] = split_df['description'].progress_apply(_preprocess_description)

100%|██████████| 120000/120000 [01:42<00:00, 1169.74it/s]
100%|██████████| 7600/7600 [00:06<00:00, 1168.30it/s]


## Feature Representations

### TF-IDF

In [12]:
# Stack the train and test splits into one frame with a fresh 0..n-1 index.
# The TF-IDF features and the clustering scores in this section are computed
# on this combined frame.
ag_news_df = pd.concat([train_df, test_df], ignore_index=True)

In [13]:
# Fit a TF-IDF vocabulary on the preprocessed text of the combined frame and
# transform it into a document-term matrix in one step.
tf_idfvectorizer = TfidfVectorizer(
    min_df=2,                 # drop terms that occur in fewer than 2 documents
    stop_words='english',     # remove scikit-learn's built-in English stop words
    strip_accents='unicode',  # normalise accented characters
)
x_tfidf = tf_idfvectorizer.fit_transform(ag_news_df['news_tokenized'])

In [14]:
# Empty results table: one row per (feature-extraction method, max_iter
# setting) with the three clustering-agreement scores.
performance_metrics = pd.DataFrame(columns=['feature-extraction','epoch','AMI','ARI','NMI'])

In [17]:
# Cluster the TF-IDF matrix into 4 groups (one per topic) under increasing
# iteration caps, and score each clustering against the 'class' labels.
# 'epoch' is the value passed to KMeans(max_iter=...), i.e. an upper bound on
# the iterations of a single k-means run; the column name is kept as 'epoch'.
true_classes = ag_news_df['class']  # loop-invariant, looked up once

tfidf_rows = []
for epoch in range(25, 300, 25):
    # A fixed seed makes the sweep reproducible and gives every max_iter
    # setting the same initialisation, so score differences reflect the
    # iteration cap rather than a different random start.
    k_means = KMeans(n_clusters=4, init='k-means++', max_iter=epoch, random_state=42)
    k_means.fit(x_tfidf)

    tfidf_rows.append({
        'feature-extraction': 'tf-idf',
        'epoch': epoch,
        'AMI': metrics.adjusted_mutual_info_score(true_classes, k_means.labels_),
        'ARI': metrics.adjusted_rand_score(true_classes, k_means.labels_),
        'NMI': metrics.normalized_mutual_info_score(true_classes, k_means.labels_),
    })

# Build the new rows in one go (DataFrame.append no longer exists in
# pandas >= 2.0) and add them to whatever is already in the results table.
tfidf_metrics = pd.DataFrame(tfidf_rows, columns=performance_metrics.columns)
if performance_metrics.empty:
    # Skip concatenating with an empty frame, which would only contribute
    # object-dtype columns and triggers a FutureWarning in recent pandas.
    performance_metrics = tfidf_metrics
else:
    performance_metrics = pd.concat([performance_metrics, tfidf_metrics], ignore_index=True)

In [18]:
# Display the accumulated clustering scores.
performance_metrics

Unnamed: 0,feature-extraction,epoch,AMI,ARI,NMI
0,tf-idf,25,0.231659,0.166495,0.231683
