In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
import numpy as np
from scipy.cluster.vq import kmeans2
from scipy.spatial.distance import cdist

import datasets
import preprocessing

### Get data

In [5]:
# BBC corpus
bbc = datasets.get_bbc()

In [6]:
# bbc is a list of strings
type(bbc)

list

In [7]:
type(bbc[0])

str

In [8]:
len(bbc)

2225

In [9]:
# first 100 chars of the 1st doc
bbc[0][:100]

'tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital'

In [10]:
# pre-process and vectorize
processor = preprocessing.NLPProcessor('tf-idf')
bbc_vectorized = processor.fit_transform(bbc)

In [11]:
# sparse matrix
type(bbc_vectorized)

scipy.sparse.csr.csr_matrix

In [12]:
# 2,225 docs with 29,421 sparse features
bbc_vectorized.shape

(2225, 29421)

### Similarity metric

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# cosine similarity between 1st and 2nd doc
cosine_similarity(bbc_vectorized[0], bbc_vectorized[1])

array([[0.09372963]])

In [14]:
# cosine similarity of all 2,225 docs
# in a matrix of 2,225 x 2,225
cosine_similarity(bbc_vectorized)

array([[1.        , 0.09372963, 0.16603523, ..., 0.14290773, 0.23992781,
        0.18738048],
       [0.09372963, 1.        , 0.08616861, ..., 0.04780253, 0.09995931,
        0.08760382],
       [0.16603523, 0.08616861, 1.        , ..., 0.09247675, 0.17008746,
        0.1621905 ],
       ...,
       [0.14290773, 0.04780253, 0.09247675, ..., 1.        , 0.11614638,
        0.08908685],
       [0.23992781, 0.09995931, 0.17008746, ..., 0.11614638, 1.        ,
        0.15640773],
       [0.18738048, 0.08760382, 0.1621905 , ..., 0.08908685, 0.15640773,
        1.        ]])

In [None]:
# ...

### Clustering

In [182]:
from sklearn.cluster import KMeans

In [13]:
k = KMeans(n_clusters=5)

In [14]:
# this will take a while
# k.fit(bbc_vectorized)

In [16]:
# take only the top 1,000 most occurring tokens in the corpus 
p = preprocessing.NLPProcessor(max_features=1000)
bbc_vectorized_features_bound = p.fit_transform(bbc)

In [16]:
bbc_vectorized_features_bound.shape

(2225, 1000)

In [20]:
%%time
k.fit(bbc_vectorized_features_bound)

CPU times: user 1min 17s, sys: 641 ms, total: 1min 18s
Wall time: 26.3 s


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [21]:
k.predict(bbc_vectorized_features_bound)

array([2, 4, 0, ..., 2, 2, 0], dtype=int32)

In [94]:
# ...

### Interactive

In [17]:
import interactive

In [18]:
k = 10
options = (1.1, 25, 0.01, 0)
userU = -1

In [19]:
data = bbc_vectorized_features_bound.todense()
terms = np.array(p.vec.get_feature_names()).reshape((1, 1000))

In [20]:
x = interactive.icluster(data, terms, [], k, userU=-1)
clusterDocs, clusterKeyterms, keyterms, silhouette_avg = x



In [21]:
clusterKeyterms

[['sony',
  'film',
  'tv',
  'music',
  'chart',
  'original',
  'sold',
  'million',
  '000',
  'show',
  'films',
  'including',
  'release',
  'band',
  'christmas',
  'award',
  'us',
  'stars',
  'gaming',
  'children',
  'include',
  'series',
  'rock',
  'comedy',
  'awards',
  'shows',
  'entertainment',
  'movie',
  'which',
  'project',
  'version',
  'new',
  'selling',
  'screen',
  'according',
  'festival',
  'box',
  'across',
  'aid',
  'uk',
  'its',
  'prize',
  'success',
  'list',
  '100',
  'television',
  'song',
  'sale',
  'video',
  'production'],
 ['growth',
  'economy',
  'prices',
  'economic',
  'rose',
  'rise',
  'rate',
  '2004',
  'bank',
  'rates',
  'fall',
  'dollar',
  'fell',
  'figures',
  'demand',
  'strong',
  'oil',
  'quarter',
  '2005',
  'analysts',
  'higher',
  'sales',
  'market',
  'profits',
  'markets',
  'interest',
  'euro',
  'spending',
  'annual',
  'however',
  'consumer',
  'sector',
  'december',
  'expected',
  'trade',
  'c

In [22]:
clusterDocs

['["2223"]',
 '["2221"]',
 '["2218"]',
 '["2224"]',
 '["2225"]',
 '["2225"]',
 '["2202"]',
 '["2213"]',
 '["2177"]',
 '["2222"]']

In [23]:
bbc[2223 - 1][:100]

'rem announce new glasgow concert us band rem have announced plans to perform for 10 000 scottish fan'

In [24]:
bbc[2221 - 1][:200]

'cars pull down us retail figures us retail sales fell 0.3% in january  the biggest monthly decline since last august  driven down by a heavy fall in car sales.  the 3.3% fall in car sales had been exp'

In [25]:
bbc[2218 - 1][:200]

'rings of steel combat net attacks gambling is hugely popular  especially with tech-savvy criminals.  many extortionists are targeting net-based betting firms and threatening to cripple their websites '

#### Get user input

In [29]:
user_input = [x[:5] for x in clusterKeyterms]
user_input

[['sony', 'film', 'tv', 'music', 'chart'],
 ['growth', 'economy', 'prices', 'economic', 'rose'],
 ['users', 'technology', 'net', 'software', 'web'],
 ['you', 'do', 'can', 'what', 'them'],
 ['her', 'champion', 'won', 'olympic', 'australian'],
 ['side', 'coach', 'cup', 'game', 'club'],
 ['ministers', 'law', 'government', 'secretary', 'court'],
 ['film', 'oscar', 'actor', 'awards', 'hollywood'],
 ['shares', 'company', 'firm', 'stock', 'investors'],
 ['election', 'labour', 'party', 'blair', 'leader']]

In [36]:
user_input = [
    ['film', 'tv', 'music', 'oscar'],
    ['growth', 'economy', 'stock', 'investor'],
    ['technology', 'web', 'software'],
    ['olympic', 'game', 'cup', 'coach', 'game', 'club'],
    ['law', 'government', 'election', 'blair'],
]

In [37]:
x = interactive.icluster(data, terms, user_input, 5, userU=+1)
clusterDocs, clusterKeyterms, keyterms, silhouette_avg = x



In [38]:
[x[:5] for x in clusterKeyterms]

[['film', 'awards', 'award', 'actor', 'actress'],
 ['growth', 'shares', 'market', 'analysts', 'oil'],
 ['users', 'technology', 'digital', 'software', 'net'],
 ['cup', 'game', 'match', 'coach', 'injury'],
 ['labour', 'government', 'election', 'party', 'blair']]