In [1]:
# Imports
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

First, I split each article's text into tokens and reduce every token to its stem with the English Snowball stemmer.

In [2]:
# Stemmed Tokenization
# Location of the BBC articles CSV.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this exact file exists.
BBC_CSV_PATH = r"C:\Users\m.al-zadid\Downloads\bbc.csv"

bbc = pd.read_csv(BBC_CSV_PATH)

# A single shared stemmer instance, used by tokenize_stem below.
stemmer = SnowballStemmer(language="english")

def tokenize_stem(text):
    """Tokenize ``text`` and return its tokens as a list of Snowball stems.

    ``text`` is passed through ``str()`` first, so non-string values are
    tokenized via their string representation.
    """
    return [stemmer.stem(token) for token in word_tokenize(str(text))]

# Tokenize and stem every article; each entry of the new column is a list of stems.
bbc["tokens"] = bbc["text"].map(tokenize_stem)
bbc["tokens"].head()

0    [ad, sale, boost, time, warner, profit, quarte...
1    [dollar, gain, on, greenspan, speech, the, dol...
2    [yuko, unit, buyer, face, loan, claim, the, ow...
3    [high, fuel, price, hit, ba, 's, profit, briti...
4    [pernod, takeov, talk, lift, domecq, share, in...
Name: tokens, dtype: object

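Then I compute each term's document frequency (the number of articles it appears in), drop the terms whose frequency is at or below the $15\%$ quantile, and keep the rest. Because many terms share the same low frequency, all ties at the threshold are dropped as well, so the retained share can be smaller than $85\%$.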

In [3]:
# Dropping lowest frequency terms
from collections import Counter 

# Document frequency of every stem: set() collapses repeats inside one
# article, so each article contributes at most 1 to a term's count.
bbc_counter = Counter(
    term
    for article_tokens in bbc["tokens"]
    for term in set(article_tokens)
)

term_freq = pd.Series(bbc_counter)

# Threshold = 15% quantile of the document frequencies. The comparison is
# strict, so every term tied with the threshold is dropped too.
least_freq = term_freq.quantile(0.15)
keep_terms = set(term_freq.loc[term_freq.gt(least_freq)].index)


def _keep_frequent(article_tokens):
    """Return the article's tokens that are in ``keep_terms``, order preserved."""
    return [t for t in article_tokens if t in keep_terms]


bbc["tokens_filtered"] = bbc["tokens"].apply(_keep_frequent)

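Here I create the document-term matrix (DTM): one row per article, one column per retained term, with the term's count in that article as the value.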

In [4]:
# Document Term Matrix
from sklearn.feature_extraction import DictVectorizer

# One {term: count} mapping per article, built from the filtered tokens.
docs_counts = list(map(Counter, bbc["tokens_filtered"]))

# DictVectorizer turns the per-article mappings into a sparse matrix with
# one column per distinct term.
vec = DictVectorizer(sparse=True)

# X is the document-term matrix (rows = articles, columns = terms).
X = vec.fit_transform(docs_counts)

Next, I print the terms (with their corresponding counts) that appear 4 or more times in the 2121st article of the dataset. Because Python uses 0-based indexing, the row index is 2120.

In [5]:
# Create feature vector
# Dense 1-D vector of term counts for the 2121st article (0-based row 2120).
row = X[2120].toarray().ravel()

# Boolean mask over the vocabulary: True where the term occurs >= 4 times.
condition = row >= 4

words = vec.get_feature_names_out()

# Pair every selected term with its count; masking keeps vocabulary order.
result = list(zip(words[condition], row[condition]))

print(result)

[('$', 4.0), ("''", 6.0), (',', 20.0), ('-', 5.0), ('.', 21.0), ('``', 7.0), ('a', 15.0), ('and', 10.0), ('appl', 6.0), ('as', 5.0), ('by', 5.0), ('comput', 6.0), ('for', 8.0), ('had', 4.0), ('in', 7.0), ('ipod', 8.0), ('it', 7.0), ('job', 7.0), ('mac', 8.0), ('mini', 4.0), ('mr', 4.0), ('new', 8.0), ('of', 9.0), ('said', 6.0), ('the', 26.0), ('to', 6.0), ('was', 4.0), ('will', 7.0), ('with', 4.0)]


Now I apply Chi-Square feature selection to the DTM to keep the top 5000 terms that show the strongest relationship with the article categories.

In [6]:
# feature selection
# chi-square

from sklearn.feature_selection import SelectKBest, chi2

# Number of highest-scoring terms retained by the chi-square selector.
N_TOP_TERMS = 5000

# NOTE(review): the selector is fit on every row of X together with its
# label, i.e. before the train/test split performed later in the notebook.
# The held-out labels therefore influence which terms are kept; fitting the
# selector on the training rows only would avoid that leakage.
chi2_selector = SelectKBest(score_func=chi2, k=N_TOP_TERMS)
X_chi2 = chi2_selector.fit_transform(X, bbc["category"])

print(X_chi2.shape)

(2225, 5000)


Here I split the data into training and test sets using three different ratios ($80/20$, $75/25$, $70/30$). I then fit Naive Bayes and Logistic Regression models on the training data and use them to predict the article categories of the test data.

In [7]:
# train-test, naive bayes, logistic regression

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

split = [0.2, 0.25, 0.3] #80-20, 75-25, 70-30
y = bbc["category"]

# Fitted models and predictions for every split, keyed by test-set fraction,
# so each ratio can be inspected after the loop.
split_results = {}

for ts in split:
    # Stratify on the labels so both parts keep the category proportions;
    # the fixed random_state makes the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X_chi2, y, test_size = ts, stratify = y, random_state = 42
    )

    # Fitting and predicting happen INSIDE the loop. Previously these
    # statements sat after the loop, so only the last split (70/30) was
    # ever trained and evaluated.
    nb = MultinomialNB().fit(X_train, y_train)
    y_pred_nb = nb.predict(X_test)

    # `multi_class` is deprecated in recent scikit-learn releases; with the
    # default solver and more than two classes the model is already
    # multinomial, so the argument is omitted.
    lr = LogisticRegression(max_iter = 1000).fit(X_train, y_train)
    y_pred_lr = lr.predict(X_test)

    split_results[ts] = {
        "y_test": y_test,
        "nb": nb,
        "y_pred_nb": y_pred_nb,
        "lr": lr,
        "y_pred_lr": y_pred_lr,
    }

    y_true = y_test.to_numpy()
    print(f"test_size={ts:.2f}: "
          f"Naive Bayes accuracy={np.mean(y_pred_nb == y_true):.4f}, "
          f"Logistic Regression accuracy={np.mean(y_pred_lr == y_true):.4f}")

# After the loop, X_train/X_test/y_train/y_test, nb, lr, y_pred_nb and
# y_pred_lr refer to the last split (70/30), which the cells below evaluate.




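Here I create the confusion matrices for both models, evaluated on the test set of the last split above ($70/30$). Rows are the true categories and columns are the predicted categories.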

In [8]:
# confusion matrix

def _labelled_confusion(model, y_true, y_pred):
    """Return the confusion matrix as a DataFrame labelled with ``model.classes_``.

    Rows are the true classes and columns the predicted classes, both in the
    order given by ``model.classes_``.
    """
    classes = model.classes_
    counts = confusion_matrix(y_true, y_pred, labels = classes)
    return pd.DataFrame(counts, index = classes, columns = classes)


conf_matrix_nb = _labelled_confusion(nb, y_test, y_pred_nb)
conf_matrix_lr = _labelled_confusion(lr, y_test, y_pred_lr)

print('\nConfusion Matrix for Naive Bayes:', conf_matrix_nb, sep = '\n')
print('\nConfusion Matrix for Logistic Regression:', conf_matrix_lr, sep = '\n')


Confusion Matrix for Naive Bayes:
               business  entertainment  politics  sport  tech
business            147              0         3      0     3
entertainment         1            113         0      0     2
politics              4              1       119      0     1
sport                 0              1         2    151     0
tech                  1              0         0      0   119

Confusion Matrix for Logistic Regression:
               business  entertainment  politics  sport  tech
business            151              0         2      0     0
entertainment         1            113         1      0     1
politics              5              3       116      1     0
sport                 1              0         0    153     0
tech                  2              0         0      0   118


Finally, I calculate the per-class precision and recall scores for both models.

In [9]:
# precision & recall scores for Naive Bayes
# precision_recall_fscore_support returns (precision, recall, fscore, support),
# one entry per class in `labels`; only the first two are needed here.
prec, rec = precision_recall_fscore_support(
    y_test, y_pred_nb, labels = nb.classes_, zero_division = 0
)[:2]

# One row per category, in the order of nb.classes_.
scores_nb = pd.DataFrame(dict(precision = prec, recall = rec),
                         index = nb.classes_)

print('\nPrecision and Recall per Class by Naive Bayes:')
print(scores_nb)


Precision and Recall per Class by Naive Bayes:
               precision    recall
business        0.960784  0.960784
entertainment   0.982609  0.974138
politics        0.959677  0.952000
sport           1.000000  0.980519
tech            0.952000  0.991667


In [10]:
# precision & recall scores for Logistic Regression
# precision_recall_fscore_support returns (precision, recall, fscore, support),
# one entry per class in `labels`; only the first two are needed here.
prec, rec = precision_recall_fscore_support(
    y_test, y_pred_lr, labels = lr.classes_, zero_division = 0
)[:2]

# One row per category, in the order of lr.classes_.
scores_lr = pd.DataFrame(dict(precision = prec, recall = rec),
                         index = lr.classes_)

print('\nPrecision and Recall per Class by Logistic Regression:')
print(scores_lr)



Precision and Recall per Class by Logistic Regression:
               precision    recall
business        0.943750  0.986928
entertainment   0.974138  0.974138
politics        0.974790  0.928000
sport           0.993506  0.993506
tech            0.991597  0.983333
