Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing)

Chapter 8 Discovering Underlying Topics in the Newsgroups Dataset with Clustering and Topic Modeling

Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)

# Clustering newsgroups dataset

## Clustering newsgroups data using k-means 

In [1]:
from sklearn.datasets import fetch_20newsgroups

categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

groups = fetch_20newsgroups(subset='all', categories=categories)

labels = groups.target
label_names = groups.target_names


In [2]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import names
all_names = set(names.words())
lemmatizer = WordNetLemmatizer()

def get_cleaned_data(groups, lemmatizer, remove_words):
    data_cleaned = []

    for doc in groups.data:
        doc = doc.lower()
        doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if word.isalpha() and word not in remove_words)
        data_cleaned.append(doc_cleaned)
        
    return data_cleaned

data_cleaned = get_cleaned_data(groups, lemmatizer, all_names)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer(stop_words="english", max_features=None, max_df=0.5, min_df=2)
data_cv = count_vector.fit_transform(data_cleaned)


In [4]:
from sklearn.cluster import KMeans
k = 4
kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42)

kmeans.fit(data_cv)


In [5]:
clusters = kmeans.labels_

from collections import Counter
print(Counter(clusters))

Counter({3: 3360, 0: 17, 1: 7, 2: 3})


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vector = TfidfVectorizer(stop_words='english', max_features=None, max_df=0.5, min_df=2)


In [7]:
data_tv = tfidf_vector.fit_transform(data_cleaned)
kmeans.fit(data_tv)
clusters = kmeans.labels_
print(Counter(clusters))

Counter({1: 1478, 2: 797, 0: 601, 3: 511})


In [8]:
import numpy as np
cluster_label = {i: labels[np.where(clusters == i)] for i in range(k)}

terms = tfidf_vector.get_feature_names_out()
centroids = kmeans.cluster_centers_
for cluster, index_list in cluster_label.items():
    counter = Counter(cluster_label[cluster])
    print(f'cluster_{cluster}: {len(index_list)} samples')
    for label_index, count in sorted(counter.items(), key=lambda x: x[1], reverse=True):
        print(f'- {label_names[label_index]}: {count} samples')
    print('Top 10 terms:')
    for ind in centroids[cluster].argsort()[-10:]:
        print('%s ' % terms[ind], end="")
    print('\n')


cluster_0: 601 samples
- sci.space: 598 samples
- alt.atheism: 1 samples
- talk.religion.misc: 1 samples
- comp.graphics: 1 samples
Top 10 terms:
just orbit moon hst nasa mission launch wa shuttle space 

cluster_1: 1478 samples
- alt.atheism: 522 samples
- talk.religion.misc: 387 samples
- sci.space: 338 samples
- comp.graphics: 231 samples
Top 10 terms:
say people know like think ha just university wa article 

cluster_2: 797 samples
- comp.graphics: 740 samples
- sci.space: 49 samples
- talk.religion.misc: 5 samples
- alt.atheism: 3 samples
Top 10 terms:
computer need know looking thanks university program file graphic image 

cluster_3: 511 samples
- alt.atheism: 273 samples
- talk.religion.misc: 235 samples
- sci.space: 2 samples
- comp.graphics: 1 samples
Top 10 terms:
doe bible think believe say people christian jesus wa god 



## Describing the clusters using GPT 

In [9]:
keywords = ' '.join(terms[ind] for ind in centroids[0].argsort()[-100:])  

In [10]:
print(keywords)

big power vehicle using alaska look mass money marketing company loss pluto russian scheduled office express probably research software funding billboard online pat access doe telescope april jet usa digest light want prize forwarded way large mar project sci center command technology air government commercial good work servicing know going comet world propulsion people idea design data university day international use orbital long science need time sky program thing make spencer new year earth spacecraft flight henry billion rocket think ha station lunar solar like cost satellite article toronto zoology just orbit moon hst nasa mission launch wa shuttle space


In [11]:
import openai

In [12]:
# openai.api_key = '<YOUR API KEY>'

In [13]:
def get_completion(prompt, model="text-davinci-003"):
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )
    return response.choices[0].message["content"]

In [14]:
# response = get_completion(f"Describe a common topic based on the following keywords: {keywords}")
# print(response)

# Discovering underlying topics in newsgroups 

## Topic modeling using NMF 

In [15]:
from sklearn.decomposition import NMF

t = 20
nmf = NMF(n_components=t, random_state=42)

In [16]:
nmf.fit(data_cv)

print(nmf.components_)

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 1.82524532e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  7.77697392e-04 3.85995474e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 2.71332203e-02
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 4.31048632e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]


In [17]:
terms_cv = count_vector.get_feature_names_out()
for topic_idx, topic in enumerate(nmf.components_):
        print("Topic {}:" .format(topic_idx))
        print(" ".join([terms_cv[i] for i in topic.argsort()[-10:]]))

Topic 0:
available quality program free color version gif file image jpeg
Topic 1:
ha article make know doe say like just people think
Topic 2:
include available analysis user software ha processing data tool image
Topic 3:
atmosphere kilometer surface ha earth wa planet moon spacecraft solar
Topic 4:
communication technology venture service market ha commercial space satellite launch
Topic 5:
verse wa jesus father mormon shall unto mcconkie lord god
Topic 6:
format message server object image mail file ray send graphic
Topic 7:
christian people doe atheism believe religion belief religious god atheist
Topic 8:
file graphic grass program ha package ftp available image data
Topic 9:
speed material unified star larson book universe theory physicist physical
Topic 10:
planetary station program group astronaut center mission shuttle nasa space
Topic 11:
infrared high astronomical center acronym observatory satellite national telescope space
Topic 12:
used occurs true form ha ad premise con

## Topic modeling using LDA 

In [18]:
from sklearn.decomposition import LatentDirichletAllocation

t = 20
lda = LatentDirichletAllocation(n_components=t, learning_method='batch',random_state=42)

In [19]:
lda.fit(data_cv)

print(lda.components_)

[[0.05      2.05      2.05      ... 0.05      0.05      0.05     ]
 [0.05      0.05      0.05      ... 0.05      0.05      0.05     ]
 [0.05      0.05      0.05      ... 4.0336285 0.05      0.05     ]
 ...
 [0.05      0.05      0.05      ... 0.05      0.05      0.05     ]
 [0.05      0.05      0.05      ... 0.05      0.05      0.05     ]
 [0.05      0.05      0.05      ... 0.05      0.05      3.05     ]]


In [20]:
for topic_idx, topic in enumerate(lda.components_):
        print("Topic {}:" .format(topic_idx))
        print(" ".join([terms_cv[i] for i in topic.argsort()[-10:]]))

Topic 0:
atheist doe ha believe say jesus people christian wa god
Topic 1:
moment just adobe want know ha wa hacker article radius
Topic 2:
center point ha wa available research computer data graphic hst
Topic 3:
objective argument just thing doe people wa think say article
Topic 4:
time like brian ha good life want know just wa
Topic 5:
computer graphic think know need university just article wa like
Topic 6:
free program color doe use version gif jpeg file image
Topic 7:
gamma ray did know university ha just like article wa
Topic 8:
tool ha processing using data software color program bit image
Topic 9:
apr men know ha think woman just university article wa
Topic 10:
jpl propulsion mission april mar jet command data spacecraft wa
Topic 11:
russian like ha university redesign point option article space station
Topic 12:
ha van book star material physicist universe physical theory wa
Topic 13:
bank doe book law wa article rushdie muslim islam islamic
Topic 14:
think gopher routine poin

---

Readers may ignore the next cell.

In [21]:
!jupyter nbconvert --to python ch8_part2.ipynb --TemplateExporter.exclude_input_prompt=True

[NbConvertApp] Converting notebook ch8_part2.ipynb to python
[NbConvertApp] Writing 4498 bytes to ch8_part2.py
