In [1]:
import pandas as pd

In [5]:
bbc=pd.read_csv("./BBC News Train.csv")

In [6]:
bbc.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
tfidf =TfidfVectorizer(max_df=0.95,min_df=2,stop_words='english')

In [9]:
dtm=tfidf.fit_transform(bbc['Text'])

In [10]:
dtm

<1490x14066 sparse matrix of type '<class 'numpy.float64'>'
	with 203476 stored elements in Compressed Sparse Row format>

In [11]:
from sklearn.decomposition import NMF

In [12]:
nmf_model=NMF(n_components=5,random_state=42)

In [13]:
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=5, random_state=42, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [14]:
len(tfidf.get_feature_names())

14066

In [27]:
for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{topic_names[index]}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #sport
['world', 'match', 'chelsea', 'half', 'france', 'players', 'play', 'team', 'said', 'cup', 'ireland', 'wales', 'win', 'game', 'england']


THE TOP 15 WORDS FOR TOPIC #politics
['tories', 'tory', 'chancellor', 'tax', 'prime', 'howard', 'minister', 'government', 'said', 'party', 'brown', 'blair', 'election', 'labour', 'mr']


THE TOP 15 WORDS FOR TOPIC #tech
['service', 'use', 'net', 'software', 'broadband', 'microsoft', 'users', 'phones', 'technology', 'digital', 'phone', 'said', 'music', 'people', 'mobile']


THE TOP 15 WORDS FOR TOPIC #entertainment
['nominated', 'star', 'comedy', 'aviator', 'festival', 'films', 'won', 'director', 'oscar', 'actress', 'actor', 'award', 'awards', 'best', 'film']


THE TOP 15 WORDS FOR TOPIC #business
['rate', 'prices', 'rates', 'china', 'dollar', '2004', 'oil', 'market', 'bank', 'year', 'economic', 'sales', 'economy', 'said', 'growth']




In [26]:
topic_names=["sport","politics","tech","entertainment","business"]

In [18]:
dtm

<1490x14066 sparse matrix of type '<class 'numpy.float64'>'
	with 203476 stored elements in Compressed Sparse Row format>

In [19]:
topic_results=nmf_model.transform(dtm)

In [20]:
topic_results.shape

(1490, 5)

In [21]:
topic_results[0]

array([0.00071379, 0.03940298, 0.00873215, 0.00357328, 0.04990868])

In [22]:
topic_results[0].argmax()

4

In [23]:
bbc.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [28]:
bbc['predicted_category']=[topic_names[i] for i in topic_results.argmax(axis=1)]

In [29]:
bbc.head()

Unnamed: 0,ArticleId,Text,Category,predicted_category
0,1833,worldcom ex-boss launches defence lawyers defe...,business,business
1,154,german business confidence slides german busin...,business,business
2,1101,bbc poll indicates economic gloom citizens in ...,business,business
3,1976,lifestyle governs mobile choice faster bett...,tech,tech
4,917,enron bosses in $168m payout eighteen former e...,business,business


In [30]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [31]:
accuracy_score(bbc['predicted_category'],bbc['Category'])

0.9194630872483222

In [35]:
print(confusion_matrix(bbc['predicted_category'],bbc['Category']))

[[314   5  21   1   3]
 [  1 220   0   2   4]
 [ 12   6 244   0   1]
 [  0   4   2 343   4]
 [  9  38   7   0 249]]


In [37]:
print(classification_report(bbc['predicted_category'],bbc['Category']))

               precision    recall  f1-score   support

     business       0.93      0.91      0.92       344
entertainment       0.81      0.97      0.88       227
     politics       0.89      0.93      0.91       263
        sport       0.99      0.97      0.98       353
         tech       0.95      0.82      0.88       303

    micro avg       0.92      0.92      0.92      1490
    macro avg       0.92      0.92      0.92      1490
 weighted avg       0.92      0.92      0.92      1490



In [73]:
# General Purpose code 

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def topic_categorization(data, text_column, topic_names, max_df=0.95, min_df=2, stop_words='english', n_components=5, random_state=42):
   
    # Initialize TF-IDF vectorizer
    tfidf = TfidfVectorizer(max_df=max_df, min_df=min_df, stop_words=stop_words)
    
    # Fit and transform the document-term matrix
    dtm = tfidf.fit_transform(data[text_column])
    
    # Initialize NMF model
    nmf_model = NMF(n_components=n_components, random_state=random_state)
    
    # Fit NMF model
    nmf_model.fit(dtm)
    
    # Extract top words for each topic
    for index, topic in enumerate(nmf_model.components_):
        print(f'The top 15 words for topic #{topic_names[index]}:')
        print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
        print('\n')
    
    # Perform topic categorization
    topic_results = nmf_model.transform(dtm)
    
    # Assign predicted categories to the data
    bbct['predicted_category'] = [topic_names[i] for i in topic_results.argmax(axis=1)]
 
 # Load the data
bbct = pd.read_csv("./BBC News Test.csv")

# Define topic names
topic_names = ["sport", "politics", "business", "tech", "entertainment"]

# Perform topic categorization
topic_categorization(bbc, 'Text', topic_names)

#print the newly formatted table
bbct.head()

The top 15 words for topic #sport:
['coach', 'ireland', 'injury', 'said', 'club', 'team', 'cup', 'players', 'match', 'wales', 'play', 'win', 'rugby', 'england', 'game']


The top 15 words for topic #politics:
['tories', 'tax', 'prime', 'tory', 'minister', 'government', 'chancellor', 'howard', 'party', 'said', 'brown', 'election', 'blair', 'labour', 'mr']


The top 15 words for topic #business:
['company', 'yukos', 'rate', 'market', 'china', '2004', 'oil', 'economic', 'sales', 'prices', 'year', 'bank', 'said', 'growth', 'economy']


The top 15 words for topic #tech:
['video', 'apple', 'tv', 'net', 'computer', 'phone', 'games', 'broadband', 'users', 'said', 'digital', 'technology', 'mobile', 'music', 'people']


The top 15 words for topic #entertainment:
['album', 'comedy', 'star', 'music', 'won', 'actress', 'films', 'year', 'prize', 'oscar', 'festival', 'awards', 'award', 'best', 'film']




Unnamed: 0,ArticleId,Text,predicted_category
0,1018,qpr keeper day heads for preston queens park r...,sport
1,1319,software watching while you work software that...,tech
2,1138,d arcy injury adds to ireland woe gordon d arc...,sport
3,459,india s reliance family feud heats up the ongo...,business
4,1020,boro suffer morrison injury blow middlesbrough...,sport
