In [1]:
import re
import pandas as pd
from typing import Set
from itertools import islice

from nltk.corpus import stopwords, gutenberg
from nltk.tokenize import wordpunct_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

from gensim.models import Phrases, Nmf
from gensim.models.phrases import Phraser
from gensim.corpora import Dictionary

### Loading and displaying basic information of the BBC News dataset

In [2]:
df = pd.read_csv('data/bbc-news-data.csv', delimiter='\t')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   filename  2225 non-null   object
 2   title     2225 non-null   object
 3   content   2225 non-null   object
dtypes: object(4)
memory usage: 69.7+ KB


In [3]:
df.head(10)

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
5,business,006.txt,Japan narrowly escapes recession,Japan's economy teetered on the brink of a te...
6,business,007.txt,Jobs growth still slow in the US,The US created fewer jobs than expected in Ja...
7,business,008.txt,India calls for fair trade rules,"India, which attends the G7 meeting of seven ..."
8,business,009.txt,Ethiopia's crop production up 24%,Ethiopia produced 14.27 million tonnes of cro...
9,business,010.txt,Court rejects $280bn tobacco case,A US government claim accusing the country's ...


In [4]:
df['category'].value_counts()

category
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64

### Preprocessing text data and displaying the first 10 rows of the processed dataset

In [5]:
def preprocess_text(text: str, stop_words: Set[str]) -> str:
    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.IGNORECASE).lower().strip()
    tokens = wordpunct_tokenize(text)
    filtered_tokens = [t for t in tokens if t not in stop_words]
    return filtered_tokens

stoplist = set(stopwords.words('english'))
df['content'] = df['content'].apply(lambda doc: preprocess_text(doc, stoplist))
df.head(10)

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,"[quarterly, profits, us, media, giant, timewar..."
1,business,002.txt,Dollar gains on Greenspan speech,"[dollar, hit, highest, level, euro, almost, th..."
2,business,003.txt,Yukos unit buyer faces loan claim,"[owners, embattled, russian, oil, giant, yukos..."
3,business,004.txt,High fuel prices hit BA's profits,"[british, airways, blamed, high, fuel, prices,..."
4,business,005.txt,Pernod takeover talk lifts Domecq,"[shares, uk, drinks, food, firm, allied, domec..."
5,business,006.txt,Japan narrowly escapes recession,"[japans, economy, teetered, brink, technical, ..."
6,business,007.txt,Jobs growth still slow in the US,"[us, created, fewer, jobs, expected, january, ..."
7,business,008.txt,India calls for fair trade rules,"[india, attends, g, meeting, seven, leading, i..."
8,business,009.txt,Ethiopia's crop production up 24%,"[ethiopia, produced, million, tonnes, crops, h..."
9,business,010.txt,Court rejects $280bn tobacco case,"[us, government, claim, accusing, countrys, bi..."


### Applying phrase detection to the preprocessed text data

In [6]:
phrases = Phrases(df['content'], min_count=20, threshold=20.0)
phraser = Phraser(phrases)

df['content'] = df['content'].apply(lambda tokens: phraser[tokens])
df.head(10)

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,"[quarterly, profits, us, media, giant, timewar..."
1,business,002.txt,Dollar gains on Greenspan speech,"[dollar, hit, highest, level, euro, almost, th..."
2,business,003.txt,Yukos unit buyer faces loan claim,"[owners, embattled, russian, oil, giant, yukos..."
3,business,004.txt,High fuel prices hit BA's profits,"[british, airways, blamed, high, fuel, prices,..."
4,business,005.txt,Pernod takeover talk lifts Domecq,"[shares, uk, drinks, food, firm, allied, domec..."
5,business,006.txt,Japan narrowly escapes recession,"[japans, economy, teetered, brink, technical, ..."
6,business,007.txt,Jobs growth still slow in the US,"[us, created, fewer, jobs, expected, january, ..."
7,business,008.txt,India calls for fair trade rules,"[india, attends, g, meeting, seven, leading, i..."
8,business,009.txt,Ethiopia's crop production up 24%,"[ethiopia, produced, million, tonnes, crops, h..."
9,business,010.txt,Court rejects $280bn tobacco case,"[us, government, claim, accusing, countrys, bi..."


### Creating a corpora.Dictionary from the text data and filtering extremes

In [7]:
dictionary = Dictionary(df['content'])
print(f'Dictionary size before: {len(dictionary)}')
dictionary.filter_extremes(no_below=20, no_above=0.6)
print(f'Dictionary size after: {len(dictionary)}\n')

for key, value in list(islice(dictionary.items(), 15)):
    print(f"Key: {key}, Value: {value}")

Dictionary size before: 31470
Dictionary size after: 3361

Key: 0, Value: accounts
Key: 1, Value: advertising
Key: 2, Value: alexander
Key: 3, Value: already
Key: 4, Value: also
Key: 5, Value: amount
Key: 6, Value: analysts
Key: 7, Value: around
Key: 8, Value: aside
Key: 9, Value: back
Key: 10, Value: better
Key: 11, Value: biggest
Key: 12, Value: bn
Key: 13, Value: bn_bn
Key: 14, Value: book


### Converting text data into Bag-of-Words format

In [8]:
bow_corpus = [dictionary.doc2bow(t) for t in df['content']]

### Applying Non-negative Matrix Factorization (NMF) for topic modeling and displaying Top 5 topics

In [9]:
num_topics = 5
nmf = Nmf(bow_corpus, id2word=dictionary, num_topics=num_topics, random_state=0)

for topic in nmf.print_topics(num_topics=num_topics, num_words=5):
    print(topic)

(0, '0.013*"would" + 0.008*"game" + 0.007*"good" + 0.007*"get" + 0.006*"time"')
(1, '0.017*"mr" + 0.013*"us" + 0.011*"could" + 0.007*"one" + 0.006*"government"')
(2, '0.015*"would" + 0.009*"mr" + 0.008*"also" + 0.007*"bn" + 0.007*"new"')
(3, '0.012*"people" + 0.009*"new" + 0.008*"one" + 0.008*"would" + 0.007*"technology"')
(4, '0.022*"music" + 0.015*"people" + 0.010*"also" + 0.008*"world" + 0.007*"year"')


### Calculating topic contributions for each document and displaying the document with the highest contribution for each topic

In [10]:
contributions = {i: [] for i in range(num_topics)}

for doc_id, doc in enumerate(bow_corpus):
    for topic_id, contribution in nmf.get_document_topics(doc):
        contributions[topic_id].append((contribution, doc_id))

for topic_id, docs in contributions.items():
    doc_id = max(docs)[1]
    print(f"Topic #{topic_id}, Document #{doc_id}, Contribution: {max(docs)[0]}")

Topic #0, Document #1719, Contribution: 1.0
Topic #1, Document #2122, Contribution: 0.9969749802568253
Topic #2, Document #268, Contribution: 0.9353737872916402
Topic #3, Document #2201, Contribution: 0.9467166547706678
Topic #4, Document #2006, Contribution: 1.0


### Applying topic modeling to new documents and displaying topic probabilities

In [11]:
new_docs = [
    "Python is a high-level, interpreted programming language for general-purpose programming. Created by Guido van Rossum and first released in 1991, Python has a design philosophy which emphasizes code readability, notably using significant whitespace.",
    "The 2020 United States presidential election was the 59th quadrennial presidential election, held on Tuesday, November 3, 2020. The Democratic ticket of former vice president Joe Biden and the junior U.S. senator from California Kamala Harris defeated the incumbent president Donald Trump and vice president Mike Pence.",
    "Football is a family of team sports that involve, to varying degrees, kicking a ball to score a goal. Unqualified, the word football normally means the form of football that is the most popular where the word is used."
]

preprocessed_docs = [preprocess_text(doc, stoplist) for doc in new_docs]
bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]

for doc_id, doc in enumerate(bow_corpus):
    print(f"Document #{doc_id}")
    for topic_id, prob in nmf.get_document_topics(doc):
        print(f"Topic #{topic_id}, Probability: {prob}")
    print()

Document #0
Topic #0, Probability: 0.022886172091115767
Topic #1, Probability: 0.5078263616972731
Topic #4, Probability: 0.4692874662116112

Document #1
Topic #0, Probability: 0.08239070129444707
Topic #1, Probability: 0.3359500474462987
Topic #2, Probability: 0.44542199104031266
Topic #4, Probability: 0.13623726021894159

Document #2
Topic #0, Probability: 0.7759016673880168
Topic #4, Probability: 0.21759781630175257



### Extracting top 10 bigrams from 'Chesterton-Brown' text using Likelihood Ratio measure

In [12]:
text = gutenberg.raw('chesterton-brown.txt')
tokens = preprocess_text(text, stoplist)

finder = BigramCollocationFinder.from_words(tokens)
bigrams = finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)

bigrams

[('father', 'brown'),
 ('said', 'father'),
 ('mr', 'glass'),
 ('dr', 'hood'),
 ('sir', 'claude'),
 ('sir', 'wilson'),
 ('mr', 'todhunter'),
 ('dont', 'know'),
 ('pilgrims', 'pond'),
 ('wilson', 'seymour')]