In [1]:
from __future__ import print_function

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn.metrics as metrics
from sklearn.datasets import fetch_20newsgroups, fetch_rcv1
from sklearn.feature_extraction.text import *
from sklearn.decomposition import *
from sklearn.pipeline import make_pipeline
from sklearn.cluster import *

In [2]:
# Notebook-wide vectorizer settings.
# NOTE(review): USE_IDF and USE_HASH are not read by any cell visible here;
# presumably they are meant to toggle IDF weighting and the hashing
# vectorizer respectively -- confirm before relying on them.
USE_IDF = True
USE_HASH = True
# Upper bound on the feature-space size handed to the vectorizers
# (as `max_features` / `n_features`).
MAX_FEATURES = 10 ** 5

In [3]:
# Fetch the complete 20 Newsgroups corpus (train + test), shuffled with a
# fixed seed so the document order is the same on every run.
dataset = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)

# Ground-truth newsgroup ids; only used to score the clusterings below.
labels = dataset.target
# Number of distinct newsgroups, i.e. the "true" number of clusters.
true_k = len(np.unique(labels))

print(len(dataset.data))
print("True k: ", true_k)

18846
True k:  20


# Preprocessing

http://brandonrose.org/clustering

In [64]:
import nltk
import re
from collections import defaultdict

def filter_regex(words, regex):
    """Keep only the entries of ``words`` that match ``regex``.

    Matching is delegated to ``pandas.Series.str.match``, so the pattern is
    anchored at the start of each token (``re.match`` semantics).

    Parameters
    ----------
    words : sequence of str
        Tokens to filter; may be empty.
    regex : str
        Regular expression each token is tested against.

    Returns
    -------
    numpy.ndarray
        The matching tokens, in their original order.
    """
    # Force object dtype: an empty input (for instance the empty float64
    # array that np.array([]) produces) would otherwise yield a non-string
    # Series on which the `.str` accessor raises AttributeError.
    w = pd.Series(words, dtype=object)
    # na=False: missing entries count as "no match" instead of putting NaN
    # into the boolean mask, which cannot be used for indexing.
    return w[w.str.match(regex, na=False)].values

def remove_nonwords(words):
    """Discard tokens that are not purely ASCII letters and/or digits.

    Punctuation, hyphenated tokens, and anything containing other
    characters are dropped; filtering is done by ``filter_regex``.
    """
    alphanumeric_only = '^[0-9a-zA-Z]+$'
    return filter_regex(words, alphanumeric_only)

def remove_stops(words, language='english'):
    """Drop stop words from a token sequence.

    Parameters
    ----------
    words : sequence of str
        Tokens to filter.
    language : str
        Name of the NLTK stop-word list to load.

    Returns
    -------
    numpy.ndarray
        The tokens that are not in the stop-word list, order preserved.
    """
    stopword_list = nltk.corpus.stopwords.words(language)
    # Boolean mask marking the tokens that appear in the stop-word list.
    is_stopword = np.in1d(words, stopword_list)
    return np.array(words)[np.logical_not(is_stopword)]

def tokenize(text):
    """Split a raw post into lower-cased word tokens, skipping its header.

    Everything before the first blank line is discarded; the remaining
    paragraphs are joined with single spaces, lower-cased and passed to
    ``nltk.word_tokenize``. A text without any blank line therefore yields
    no tokens at all.
    """
    paragraphs = text.split("\n\n")
    body = " ".join(paragraphs[1:])
    return nltk.word_tokenize(body.lower())

def stemmize(words, language='english'):
    """Reduce every token to its Snowball stem.

    Parameters
    ----------
    words : iterable of str
        Tokens to stem.
    language : str
        Language passed to NLTK's ``SnowballStemmer``.

    Returns
    -------
    list of str
        Stemmed tokens, in input order.
    """
    stemmer = nltk.stem.snowball.SnowballStemmer(language)
    # Build a real list rather than returning `map(...)`: on Python 3 `map`
    # is a lazy one-shot iterator, which cannot be measured with len() or
    # traversed twice. On Python 2 the result is the same list as before.
    return [stemmer.stem(word) for word in words]

# vectorizers

def _count_words(docs):
    df    = defaultdict(lambda: 0)
    vecs  = []
    for words in docs:
        tf = defaultdict(lambda: 0)
        for w   in words:          tf[w] += 1
        for w,c in tf.iteritems(): df[w] += 1
        vecs.append(tf)
    for w,c in df.iteritems():
        df[w] = np.log(len(docs) / float(c))
    return vecs, df

def tf_idf(docs, df=None):
    """Turn tokenised documents into dense TF-IDF vectors.

    Parameters
    ----------
    docs : iterable of iterables of str
        One token sequence per document.
    df : mapping of str -> float, optional
        Per-token weights to use. Its keys define the vocabulary. When
        omitted, the weights computed from ``docs`` by ``_count_words``
        are used.

    Returns
    -------
    vecs : list of list of float
        One vector per document; entry ``j`` is
        ``count(vocab[j]) * df[vocab[j]] / document_length``.
    vocab : list of str
        Sorted vocabulary giving the meaning of each vector position.
    """
    tf, _df = _count_words(docs)
    if df is None:
        df = _df

    vocab = sorted(df.keys())
    vecs = []
    for counts in tf:
        # Built-in sum: np.sum() over a Python 3 dict view does not return
        # a number. float() keeps the division below a true division.
        len_doc = float(sum(counts.values()))
        if len_doc == 0:
            # A document without tokens maps to the zero vector instead of
            # dividing by zero (which used to produce NaN entries).
            vecs.append([0.0] * len(vocab))
            continue
        # .get() avoids inserting zero-count keys into the counts mapping.
        vecs.append([counts.get(w, 0) * df[w] / len_doc for w in vocab])
    return vecs, vocab
    

In [73]:
# Run the preprocessing steps on a 3-document sample.
# List comprehensions (rather than chained `map` calls) keep `docs` a real
# list on Python 3 as well, which `_count_words` needs because it calls
# len(docs).
docs = [tokenize(text) for text in dataset.data[:3]]
docs = [remove_stops(words) for words in docs]
docs = [remove_nonwords(words) for words in docs]
# Optional stemming step, currently disabled:
# docs = [stemmize(words) for words in docs]


[[0.0,
  0.0,
  0.0,
  0.0,
  0.015921917227074055,
  0.0,
  0.0,
  0.0,
  0.015921917227074055,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0058763059146110779,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.015921917227074055,
  0.015921917227074055,
  0.015921917227074055,
  0.0,
  0.047765751681222164,
  0.0,
  0.0,
  0.0,
  0.015921917227074055,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0058763059146110779,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.015921917227074055,
  0.0,
  0.0,
  0.0,
  0.031843834454148109,
  0.0,
  0.015921917227074055,
  0.0,
  0.0,
  0.0,
  0.0058763059146110779,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.015921917227074055,
  0.0,
  0.0,
  0.015921917227074055,
  0.0,
  0.0,
  0.015921917227074055,
  0.0,
  0.031843834454148109,
  0.015921917227074055,
  0.015921917227074055,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.011752611829

In [70]:
# Reference run: scikit-learn's own tokenizer and English stop-word list on
# the same 3-document sample, to compare its vocabulary with the
# hand-written pipeline.
vec = CountVectorizer(stop_words='english')
x = vec.fit_transform(dataset.data[:3]).todense()

# Show the vocabulary size followed by the vocabulary itself.
feature_names = vec.get_feature_names()
len(feature_names), feature_names

(312,
 [u'12',
  u'1280',
  u'21',
  u'2mb',
  u'562',
  u'60',
  u'95',
  u'accurate',
  u'actually',
  u'aghdam',
  u'air',
  u'alias',
  u'andrew',
  u'ankara',
  u'announced',
  u'appeared',
  u'april',
  u'area',
  u'armenia',
  u'armenian',
  u'armenians',
  u'armeniaxn',
  u'arms',
  u'army',
  u'ati',
  u'attack',
  u'away',
  u'azerbadjan',
  u'azerbaijan',
  u'azeri',
  u'azeris',
  u'babylon',
  u'baku',
  u'bashers',
  u'beat',
  u'better',
  u'big',
  u'bit',
  u'black',
  u'bombing',
  u'border',
  u'bowman',
  u'bread',
  u'brother',
  u'bus',
  u'butter',
  u'calendar',
  u'called',
  u'came',
  u'card',
  u'care',
  u'carnegie',
  u'change',
  u'changed',
  u'children',
  u'cmu',
  u'cold',
  u'computer',
  u'confused',
  u'content',
  u'convention',
  u'conversations',
  u'correct',
  u'correspondent',
  u'couple',
  u'davidian',
  u'delete',
  u'demirel',
  u'dept',
  u'devils',
  u'devineni',
  u'diamond',
  u'disappointed',
  u'does',
  u'don',
  u'dream',
  u'dsv'

### Counting

In [None]:
# Bag-of-words counts for the full corpus, capped at MAX_FEATURES terms.
vec = CountVectorizer(stop_words='english', max_features=MAX_FEATURES)
# Fit the vectorizer created above. The original called an undefined
# `count` object, which also left `vec` unfitted for get_feature_names().
X = vec.fit_transform(dataset.data)
vocab = vec.get_feature_names()

### TF IDF

In [None]:
# Hash raw term counts into a fixed-width space, then apply TF-IDF
# weighting on top of the hashed counts.
hasher = HashingVectorizer(
    n_features=MAX_FEATURES,
    stop_words='english',
    non_negative=True,
    norm=None,
    binary=False,
)
tfidf_weighting = TfidfTransformer()
vec = make_pipeline(hasher, tfidf_weighting)
X = vec.fit_transform(dataset.data)

In [51]:
# TF-IDF features with an explicit vocabulary, using the notebook's own
# `tokenize` (which strips the post header) instead of the default tokenizer.
# NOTE(review): min_df=.2 is a proportion, so a term must occur in at least
# 20% of the documents to be kept -- confirm this was not meant to be an
# absolute count.
vec = TfidfVectorizer(
    tokenizer=tokenize,
    max_df=0.5,
    min_df=.2,
    stop_words='english',
    use_idf=USE_IDF,  # honour the notebook-level switch instead of a literal
    max_features=MAX_FEATURES,
)
X = vec.fit_transform(dataset.data)
vocab = vec.get_feature_names()

## LDA

In [73]:
from gensim.corpora import *
from gensim.models.ldamulticore import *
from gensim.models.ldamodel import *

texts = [ tokenize(text) for text in dataset.data ]
print("Parsed %d documents" % len(texts))

dictionary = Dictionary(texts)
dictionary.filter_extremes(no_below=10, no_above=0.6)
corpus = [dictionary.doc2bow(text) for text in texts]

print("Starting trainning")

%time ldamodel = LdaModel(corpus, num_topics=true_k, id2word=dictionary, passes=20)
pd.DataFrame(ldamodel.print_topics(num_topics=true_k, num_words=3))

Parsed 18846 documents
Starting trainning
CPU times: user 7min 9s, sys: 2.45 s, total: 7min 12s
Wall time: 7min 16s


Unnamed: 0,0,1
0,0,"0.033*""key"" + 0.022*""use"" + 0.016*""encrypt"""
1,1,"0.040*""israel"" + 0.034*""jew"" + 0.029*""isra"""
2,2,"0.021*""drive"" + 0.016*""use"" + 0.015*""card"""
3,3,"0.032*""x"" + 0.022*""imag"" + 0.022*""file"""
4,4,"0.013*""use"" + 0.011*""write"" + 0.009*""articl"""
5,5,"0.029*""window"" + 0.021*""use"" + 0.013*""run"""
6,6,"0.021*""space"" + 0.011*""orbit"" + 0.009*""launch"""
7,7,"0.016*""gun"" + 0.011*""peopl"" + 0.010*""articl"""
8,8,"0.016*""fire"" + 0.012*""would"" + 0.012*""write"""
9,9,"0.016*""car"" + 0.011*""write"" + 0.011*""get"""


#### TODO
* How to assess performance
* Optimize parameters (k, filters, ...)
* Visualize results
* Predict new instances
* Explore the library
* Train on big data

## NMF

https://de.dariah.eu/tatom/topic_model_python.html

In [None]:
# NOTE(review): X_c must already exist when this cell runs; presumably it is
# the count matrix from the "Counting" section -- confirm.
X = X_c

# Fit one NMF model per candidate number of topics around true_k.
for i in range(true_k - 2, true_k + 3):
    # Use the loop variable and the matrix selected above; the original
    # referenced `num_topics` and `dtm`, neither of which is defined here.
    nm = NMF(n_components=i, random_state=1)
    docs = nm.fit_transform(X)
    
#     topic_words = []

#     for topic in clf.components_:
#         word_idx = np.argsort(topic)[::-1][0:num_top_words]
#         topic_words.append([vocab[i] for i in word_idx])

## kmeans

In [52]:
# NOTE(review): X_hi must already exist when this cell runs; presumably it is
# the hashed + IDF-weighted matrix -- confirm where it is defined.
X = X_hi

# Sweep the number of clusters around the true number of newsgroups and
# score each clustering against the ground-truth labels.
for i in range(true_k - 2, true_k + 3):
    # random_state pins the k-means++ initialisation; with n_init=1 the
    # scores would otherwise change on every run.
    km = KMeans(n_clusters=i, init='k-means++', max_iter=100, n_init=1,
                n_jobs=-1, random_state=42)
    km.fit(X)
    predicted = km.labels_
    print("======= ", i)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, predicted))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, predicted))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, predicted))
    print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, predicted))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, predicted))


Homogeneity: 0.288
Completeness: 0.388
V-measure: 0.331
Adjusted Rand-Index: 0.092
Silhouette Coefficient: 0.006
Homogeneity: 0.312
Completeness: 0.375
V-measure: 0.340
Adjusted Rand-Index: 0.119
Silhouette Coefficient: 0.006
Homogeneity: 0.260
Completeness: 0.315
V-measure: 0.285
Adjusted Rand-Index: 0.099
Silhouette Coefficient: 0.006
Homogeneity: 0.295
Completeness: 0.345
V-measure: 0.318
Adjusted Rand-Index: 0.113
Silhouette Coefficient: 0.007
Homogeneity: 0.277
Completeness: 0.337
V-measure: 0.304
Adjusted Rand-Index: 0.094
Silhouette Coefficient: 0.007


In [None]:
# if USE_LSA:
#     original_space_centroids = svd.inverse_transform(km.cluster_centers_)
#     order_centroids = original_space_centroids.argsort()[:, ::-1]

print("Top terms per cluster:")

# For every centroid, feature indices sorted by descending weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# NOTE(review): `vec` has to be the vectorizer that produced the matrix `km`
# was fitted on and must expose get_feature_names() -- confirm.
terms = vec.get_feature_names()
# Iterate over the clusters `km` actually has. `km` is the last model of the
# sweep, so its cluster count differs from true_k and range(true_k) would
# silently skip clusters.
for i in range(order_centroids.shape[0]):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

## Fuzzy kmeans

https://gist.github.com/mblondel/1451300