In [57]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in file_names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [58]:
# Both competition CSVs live in the same Kaggle input folder.
data_dir = "../input/learn-ai-bbc/"
train_path, test_path = (
    os.path.join(data_dir, csv_name)
    for csv_name in ("BBC News Train.csv", "BBC News Test.csv")
)

In [59]:
# Load the train and test CSV files into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

Let's take a look at what the train and test datasets look like:

In [60]:
# Preview the first five rows of the training data.
train_df.head(n=5)

In [61]:
# Preview the first five rows of the test data.
test_df.head(n=5)

In [62]:
# Summarise the training frame: column names, dtypes and non-null counts.
train_df.info()

In [63]:
# Show the raw text of the article stored under index label 0.
train_df.loc[0, "Text"]

You can see that there are potential problems in the text. Each cell of text is a single long string. There is a lot of extra spacing between words because much of the punctuation was removed. Not all punctuation was removed, though, and things like dollar signs remain. TfidfVectorizer will do a lot of the work for us with stop words, so I think it'll likely be fine.

In [64]:
# Number of training articles per category.
train_df["Category"].value_counts()

In [65]:
# Share of the training set held by each category (counts divided by their total).
category_counts = train_df["Category"].value_counts()
category_counts / category_counts.sum()

There's a good split across the categories, with sport and business having the most data, but it is not terribly lopsided: no single category falls below 5% or anything like that. There are nearly 1500 rows, so every category should have a good size, and we can see below that there aren't any missing labels either.

In [66]:
# Count training rows whose category label is missing.
train_df["Category"].isna().sum()

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF


We're gonna set the minimum document frequency (`min_df`) to 20. Previously I tried 5, which gave more accurate results on the training data, but it had more features than what was provided in the test set. I think this is because there are words that ended up being used as features in train but did not appear often enough in test, which creates a dimensionality error. (Reusing the vectorizer fitted on train to `transform` the test text, as done below, keeps the train and test feature dimensions identical.) For NMF's `n_components` we have to pick 5 because we know there are 5 labels.

In [127]:
# TF-IDF settings: drop English stop words, require a document frequency of at
# least 20, and enable sublinear term-frequency scaling.
tfidf_options = dict(stop_words='english', min_df=20, sublinear_tf=True)
vectorized = TfidfVectorizer(**tfidf_options)
X = vectorized.fit_transform(train_df["Text"])

# Five components (one per expected category); the seed is fixed so the
# factorisation is repeatable from run to run.
model = NMF(n_components=5, random_state=1234)

In [128]:
# Fit first, then project with `transform` (deliberately not `fit_transform`) so the
# training rows go through the same `model.transform` call the test rows use later.
pred = model.fit(X).transform(X)

In [129]:
# Peek at the component weights of the first six articles.
pred[:6]

Index of highest value in each row is the predicted label category.

In [130]:
# The column (component) with the largest weight in each row is the predicted class.
preds = np.argmax(pred, axis=1)
preds[:6]

Find highest word values in each component to determine label from integer class.

In [131]:
# Build a component-by-term table: one row per NMF component, one column per
# TF-IDF vocabulary term, so the strongest terms of each component can be read off.
#
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `get_feature_names_out()` and fall back only on older installations.
if hasattr(vectorized, "get_feature_names_out"):
    feature_names = vectorized.get_feature_names_out()
else:
    feature_names = vectorized.get_feature_names()

components_df = pd.DataFrame(model.components_, columns=feature_names)
components_df

In [132]:
# For every component (row of components_df), print its six highest-weighted terms.
for cat in range(len(components_df)):
    top_terms = components_df.iloc[cat].nlargest(6)
    print(f'For category {cat} the words with the highest value are:')
    print(top_terms)
    print('\n')

In [133]:
# Component index -> category name.
# NOTE(review): this mapping appears to be read by hand from the top-word listing
# and so presumably depends on the NMF seed/ordering; re-check it if the model changes.
lab_dict = {
    0: 'tech',
    1: 'sport',
    2: 'politics',
    3: 'entertainment',
    4: 'business',
}
to_label = np.vectorize(lab_dict.get)
labels = to_label(preds)
labels[:6]

In [134]:
# Training accuracy: fraction of articles whose NMF-derived label matches the true category.
acc_NMF = (labels == train_df["Category"]).mean()
acc_NMF

In [135]:
# Vectorise the test articles with the already-fitted TF-IDF vocabulary, project
# them onto the fitted NMF components and take the dominant component per article.
X_test = vectorized.transform(test_df["Text"])
nmf_pred_test = model.transform(X_test)
nmf_preds_test = np.argmax(nmf_pred_test, axis=1)
nmf_labels_test = np.vectorize(lab_dict.get)(nmf_preds_test)

# Write the submission file.
nmf_submission = pd.DataFrame({'ArticleId': test_df["ArticleId"], 'Category': nmf_labels_test})
nmf_submission.to_csv('submission.csv', index=False)

NMF Test Score: 0.93877

Now we're gonna compare the result to a default sklearn KNN using KNeighborsClassifier. We'll have to reverse the labels into integers and then fit to vectorized data.

In [136]:
# Invert lab_dict so category names map back to their integer codes.
reverse_dict = dict(zip(lab_dict.values(), lab_dict.keys()))
reverse_dict

In [137]:
# Encode each training category name as its integer code for the classifier.
encode_label = np.vectorize(reverse_dict.get)
y = encode_label(train_df["Category"])
y

In [138]:
from sklearn.neighbors import KNeighborsClassifier
# Default-configured KNN trained on the same TF-IDF matrix used for NMF.
# `fit` returns the estimator itself, so it can be assigned and displayed directly.
knn_model = KNeighborsClassifier().fit(X, y)
knn_model

In [139]:
# Predict on the training matrix and turn the integer codes back into category names.
knn_preds = knn_model.predict(X)
decode_label = np.vectorize(lab_dict.get)
knn_labels = decode_label(knn_preds)

In [140]:
# Training accuracy of KNN: fraction of predicted names that equal the true category.
acc_knn = (knn_labels == train_df["Category"]).mean()
acc_knn

In [141]:
# Vectorise the test articles with the fitted TF-IDF vocabulary, classify them
# with KNN, and map the integer predictions back to category names.
X_test = vectorized.transform(test_df["Text"])
knn_preds_test = knn_model.predict(X_test)
code_to_name = np.vectorize(lab_dict.get)
knn_labels_test = code_to_name(knn_preds_test)

In [143]:
# Write the KNN predictions to a second submission file.
knn_submission = pd.DataFrame({'ArticleId': test_df["ArticleId"], 'Category': knn_labels_test})
knn_submission.to_csv('submission2.csv', index=False)

KNN Test Score: 0.96190

| model | train_accuracy | test_accuracy |
|----|-----|------|
|NMF|0.9369|0.93877|
|KNN|0.9758|0.96190|

Assuming this is done right, on data already seen NMF actually does worse than the default KNN. Since the default KNN uses 5 neighbors, it is very likely to be overfitting. It really seems that TfidfVectorizer is doing all the heavy lifting. On unseen data with unseen words, the NMF score slightly improved while the KNN score slightly dipped, but KNN still seems reasonably better than NMF. Perhaps a larger K for neighbors would increase accuracy, since it may be overfitting. There are submission limitations, and the test data provided does not have category labels, so it would be difficult to do a parameter search without splitting up the train set further to allow for validation.

Citation: https://predictivehacks.com/topic-modelling-with-nmf-in-python/