In [1]:
import string
import collections
 
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

from sklearn.datasets import fetch_20newsgroups as news

In [2]:
def process_text(text, stem=True):
    """ Tokenize text and stem words removing punctuation """
    tokens = word_tokenize(text)
 
    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]
 
    return tokens

In [3]:
def cluster_texts(texts, clusters=3):
    """ Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('english'),
                                 max_df=0.5,
                                 min_df=0.1,
                                 lowercase=True)
 
    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(tfidf_model)
 
    clustering = collections.defaultdict(list)
 
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
 
    return clustering

In [4]:
data = news(subset='all')

In [7]:
articles = data.data
clusters = cluster_texts(articles, 7) # cluster articles in seven different clusters
for key, value in dict(clusters).items(): # print number of articles in every cluster
    print('The topic {} has {} articles'.format(key,len(value)))

The topic 0 has 1668 articles
The topic 1 has 1247 articles
The topic 2 has 2328 articles
The topic 3 has 5813 articles
The topic 4 has 976 articles
The topic 5 has 5568 articles
The topic 6 has 1246 articles
