In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from time import time
from sklearn.datasets import load_files

print("loading documents ...")
t = time()
docs = load_files('data')
print("summary: {0} documents in {1} categories.".format(
    len(docs.data), len(docs.target_names)))
print("done in {0} seconds".format(time() - t))

loading documents ...
summary: 7898 documents in 4 categories.
done in 1.2990741729736328 seconds


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

max_features = 20000
print("vectorizing documents ...")
t = time()
vectorizer = TfidfVectorizer(max_df=0.4, 
                             min_df=2, 
                             max_features=max_features, 
                             encoding='latin-1')
X = vectorizer.fit_transform((d for d in docs.data))
print("n_samples: %d, n_features: %d" % X.shape)
print("number of non-zero features in sample [{0}]: {1}".format(
    docs.filenames[0], X[0].getnnz()))
print("done in {0} seconds".format(time() - t))

vectorizing documents ...
n_samples: 7898, n_features: 20000
number of non-zero features in sample [data\sci.electronics\._12249-54259]: 0
done in 1.0180583000183105 seconds


In [4]:
from sklearn.cluster import KMeans

print("clustering documents ...")
t = time()
n_clusters = 4
kmean = KMeans(n_clusters=n_clusters, 
               max_iter=100,
               tol=0.01,
               verbose=1,
               n_init=3)
kmean.fit(X);
print("kmean: k={}, cost={}".format(n_clusters, int(kmean.inertia_)))
print("done in {0} seconds".format(time() - t))

clustering documents ...
Initialization complete
Iteration 0, inertia 3945.0726985741767
Iteration 1, inertia 3846.8833001124417
Iteration 2, inertia 3846.084431659387
Converged at iteration 2: strict convergence.
Initialization complete
Iteration 0, inertia 3940.437835481184
Iteration 1, inertia 3836.424694660833
Iteration 2, inertia 3833.4219715799386
Iteration 3, inertia 3832.311110489677
Iteration 4, inertia 3832.0239038488953
Iteration 5, inertia 3831.9542101315897
Iteration 6, inertia 3831.8704741178003
Iteration 7, inertia 3831.808042972564
Converged at iteration 7: strict convergence.
Initialization complete
Iteration 0, inertia 3944.4676170688795
Iteration 1, inertia 3846.5822821417905
Iteration 2, inertia 3846.3665926444532
Iteration 3, inertia 3846.2096337595704
Converged at iteration 3: strict convergence.
kmean: k=4, cost=3831
done in 1.3730785846710205 seconds


In [5]:
len(kmean.labels_)

7898

In [6]:
kmean.labels_[1000:1010]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [7]:
docs.filenames[1000:1010]

array(['data\\sci.crypt\\11475-15954', 'data\\sci.med\\._13133-59218',
       'data\\sci.med\\._13072-59582', 'data\\sci.crypt\\11228-15855',
       'data\\sci.med\\._13131-58806', 'data\\sci.space\\14343-60918',
       'data\\sci.space\\14001-60226', 'data\\sci.space\\._14348-61339',
       'data\\sci.space\\._14390-61342',
       'data\\sci.electronics\\._12203-54305'], dtype='<U34')

In [9]:
from __future__ import print_function

print("Top terms per cluster:")

order_centroids = kmean.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(n_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Top terms per cluster:
Cluster 0: it that for you edu be this on have are
Cluster 1: oort cloud distribution gamma bursters are they ray that mechanism
Cluster 2: afit af element elements mil tkelso ts kelso shuttle celestial
Cluster 3: uga mcovingt covington ai georgia michael 706 542 0358 30602


In [10]:
a = np.array([[20, 10, 30, 40], [100, 300, 200, 400], [1, 5, 3, 2]])
a.argsort()[:, ::-1]

array([[3, 2, 0, 1],
       [3, 1, 2, 0],
       [1, 2, 3, 0]], dtype=int64)

In [11]:
a = np.array([10, 30, 20, 40])
a.argsort()[::-1]

array([3, 1, 2, 0], dtype=int64)

In [12]:
from sklearn import metrics

label_true = np.random.randint(1, 4, 6)
label_pred = np.random.randint(1, 4, 6)
print("Adjusted Rand-Index for random sample: %.3f"
      % metrics.adjusted_rand_score(label_true, label_pred))
label_true = [1, 1, 3, 3, 2, 2]
label_pred = [3, 3, 2, 2, 1, 1]
print("Adjusted Rand-Index for same structure sample: %.3f"
      % metrics.adjusted_rand_score(label_true, label_pred))

Adjusted Rand-Index for random sample: -0.176
Adjusted Rand-Index for same structure sample: 1.000
