In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
import numpy as np
from scipy.cluster.vq import kmeans2
from scipy.spatial.distance import cdist

import datasets
import preprocessing

### Get data

In [5]:
# BBC corpus
bbc = datasets.get_bbc()

In [6]:
# bbc is a list of strings
type(bbc)

list

In [7]:
type(bbc[0])

str

In [8]:
len(bbc)

2225

In [9]:
# first 100 chars of the 1st doc
bbc[0][:100]

'tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital'

In [10]:
# pre-process and vectorize
processor = preprocessing.NLPProcessor('tf-idf')
bbc_vectorized = processor.fit_transform(bbc)

In [11]:
# sparse matrix
type(bbc_vectorized)

scipy.sparse.csr.csr_matrix

In [12]:
# 2,225 docs with 29,421 sparse features
bbc_vectorized.shape

(2225, 29421)

### Similarity metric

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# cosine similarity between 1st and 2nd doc
cosine_similarity(bbc_vectorized[0], bbc_vectorized[1])

array([[0.09372963]])

In [14]:
# cosine similarity of all 2,225 docs
# in a matrix of 2,225 x 2,225
cosine_similarity(bbc_vectorized)

array([[1.        , 0.09372963, 0.16603523, ..., 0.14290773, 0.23992781,
        0.18738048],
       [0.09372963, 1.        , 0.08616861, ..., 0.04780253, 0.09995931,
        0.08760382],
       [0.16603523, 0.08616861, 1.        , ..., 0.09247675, 0.17008746,
        0.1621905 ],
       ...,
       [0.14290773, 0.04780253, 0.09247675, ..., 1.        , 0.11614638,
        0.08908685],
       [0.23992781, 0.09995931, 0.17008746, ..., 0.11614638, 1.        ,
        0.15640773],
       [0.18738048, 0.08760382, 0.1621905 , ..., 0.08908685, 0.15640773,
        1.        ]])

In [None]:
# ...

### Clustering

In [182]:
from sklearn.cluster import KMeans

In [13]:
k = KMeans(n_clusters=5)

In [14]:
# this will take a while
# k.fit(bbc_vectorized)

In [16]:
# take only the top 1,000 most occurring tokens in the corpus 
p = preprocessing.NLPProcessor(max_features=1000)
bbc_vectorized_features_bound = p.fit_transform(bbc)

In [16]:
bbc_vectorized_features_bound.shape

(2225, 1000)

In [20]:
%%time
k.fit(bbc_vectorized_features_bound)

CPU times: user 1min 17s, sys: 641 ms, total: 1min 18s
Wall time: 26.3 s


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [21]:
k.predict(bbc_vectorized_features_bound)

array([2, 4, 0, ..., 2, 2, 0], dtype=int32)

In [94]:
# ...

### Interactive

In [51]:
import interactive

In [52]:
k = 10  # start with assuming there are 10 clusters
options = (1.1, 25, 0.01, 0)
userU = -1

In [53]:
data = bbc_vectorized_features_bound.todense()
terms = np.array(p.vec.get_feature_names()).reshape((1, 1000))

In [58]:
# run the algo with no user input
x = interactive.icluster(data, terms, [], k, userU=-1)
clusterDocs, clusterKeyterms, keyterms, silhouette_avg = x



In [59]:
# sample docs in clusters 0 to k-1
[x[:5] for x in clusterDocs]

[[1, 22, 25, 28, 37],
 [6, 7, 14, 36, 86],
 [5, 10, 11, 27, 30],
 [2, 29, 31, 35, 52],
 [12, 13, 40, 43, 50],
 [3, 4, 8, 9, 15],
 [16, 19, 20, 21, 32],
 [17, 18, 31, 38, 39],
 [31, 60, 71, 74, 76],
 [26, 33, 37, 41, 45]]

In [60]:
# there seem to be some legitimate clusters forming
# some overlap
# and some catchall cluster for all common words (need stop word removal?)
[x[:5] for x in clusterKeyterms]

[['digital', 'technology', 'video', 'users', 'mobile'],
 ['election', 'labour', 'party', 'leader', 'tory'],
 ['film', 'awards', 'award', 'actor', 'oscar'],
 ['law', 'court', 'ministers', 'secretary', 'lords'],
 ['growth', 'economy', 'prices', 'economic', 'rose'],
 ['game', 'cup', 'coach', 'match', 'injury'],
 ['firm', 'company', 'shares', 'financial', 'stock'],
 ['can', 'you', 'do', 'people', 'they'],
 ['irish', 'williams', 'fans', 'staff', 'rugby'],
 ['band', '000', 'show', 'chart', 'health']]

#### Get user input
Specify number of clusters and top terms

In [61]:
user_input = [x[:5] for x in clusterKeyterms]
user_input

[['digital', 'technology', 'video', 'users', 'mobile'],
 ['election', 'labour', 'party', 'leader', 'tory'],
 ['film', 'awards', 'award', 'actor', 'oscar'],
 ['law', 'court', 'ministers', 'secretary', 'lords'],
 ['growth', 'economy', 'prices', 'economic', 'rose'],
 ['game', 'cup', 'coach', 'match', 'injury'],
 ['firm', 'company', 'shares', 'financial', 'stock'],
 ['can', 'you', 'do', 'people', 'they'],
 ['irish', 'williams', 'fans', 'staff', 'rugby'],
 ['band', '000', 'show', 'chart', 'health']]

In [62]:
# looks like there are 5 clusters at a first glance
user_input = [
    ['film', 'tv', 'music', 'oscar'],
    ['growth', 'economy', 'stock', 'investor'],
    ['technology', 'web', 'software'],
    ['olympic', 'game', 'cup', 'coach', 'game', 'club'],
    ['law', 'government', 'election', 'blair'],
]

In [63]:
# run it with user input
x = interactive.icluster(data, terms, user_input, len(user_input), userU=+1)
clusterDocs, clusterKeyterms, keyterms, silhouette_avg = x



In [64]:
# seem to be getting better clusters
[x[:5] for x in clusterKeyterms]

[['film', 'awards', 'award', 'actor', 'actress'],
 ['growth', 'analysts', 'shares', 'market', 'oil'],
 ['users', 'technology', 'microsoft', 'online', 'digital'],
 ['cup', 'game', 'match', 'coach', 'side'],
 ['mr', 'labour', 'election', 'government', 'party']]

In [79]:
len(bbc)

2225

In [80]:
sum([len(x) for x in clusterDocs])

2491

In [65]:
[x[:5] for x in clusterDocs]

[[5, 10, 11, 27, 30],
 [12, 13, 16, 19, 20],
 [1, 18, 20, 21, 22],
 [3, 4, 8, 9, 15],
 [1, 2, 6, 7, 14]]

In [84]:
clusterDocs[-1][-1]

2224

In [82]:
for k in range(len(clusterDocs)):
    key = ','.join(clusterKeyterms[k][:5])
    sample_doc = bbc[clusterDocs[k][0]][:200]
    print(f'Cluster {k + 1}')
    print(f'Key terms  : {key}')
    print(f'Sample doc : {sample_doc}')
    print('=' * 10)

Cluster 1
Key terms  : film,awards,award,actor,actress
Sample doc : howard hits back at mongrel jibe michael howard has said a claim by peter hain that the tory leader is acting like an  attack mongrel  shows labour is  rattled  by the opposition.  in an upbeat speech
Cluster 2
Key terms  : growth,analysts,shares,market,oil
Sample doc : crude oil prices back above $50 cold weather across parts of the united states and much of europe has pushed us crude oil prices above $50 a barrel for the first time in almost three months.  freezing
Cluster 3
Key terms  : users,technology,microsoft,online,digital
Sample doc : worldcom boss  left books alone  former worldcom boss bernie ebbers  who is accused of overseeing an $11bn (£5.8bn) fraud  never made accounting decisions  a witness has told jurors.  david myers made
Cluster 4
Key terms  : cup,game,match,coach,side
Sample doc : yeading face newcastle in fa cup premiership side newcastle united face a trip to ryman premier league leaders yeading 