In [69]:
import datasets
import preprocessing

In [76]:
# BBC corpus
bbc = datasets.get_bbc()

In [77]:
# bbc is a list of strings
type(bbc)

list

In [78]:
type(bbc[0])

str

In [59]:
len(bbc)

2225

In [63]:
# first 100 chars of the 1st doc
bbc[0][:100]

'tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital'

In [73]:
# pre-process and vectorize
processor = preprocessing.NLPProcessor('tf-idf')
bbc_vectorized = processor.fit_transform(bbc)

In [75]:
# sparse matrix
type(bbc_vectorized)

scipy.sparse.csr.csr_matrix

In [74]:
# 2,225 docs with 29,421 sparse features
bbc_vectorized.shape

(2225, 29421)

### Similarity metric

In [70]:
from sklearn.metrics.pairwise import cosine_similarity

In [71]:
# cosine similarity between 1st and 2nd doc
cosine_similarity(vectorized[0], vectorized[1])

array([[0.09372963]])

In [81]:
# cosine similarity of all 2,225 docs
# in a matrix of 2,225 x 2,225
cosine_similarity(vectorized)

array([[1.        , 0.09372963, 0.16603523, ..., 0.14290773, 0.23992781,
        0.18738048],
       [0.09372963, 1.        , 0.08616861, ..., 0.04780253, 0.09995931,
        0.08760382],
       [0.16603523, 0.08616861, 1.        , ..., 0.09247675, 0.17008746,
        0.1621905 ],
       ...,
       [0.14290773, 0.04780253, 0.09247675, ..., 1.        , 0.11614638,
        0.08908685],
       [0.23992781, 0.09995931, 0.17008746, ..., 0.11614638, 1.        ,
        0.15640773],
       [0.18738048, 0.08760382, 0.1621905 , ..., 0.08908685, 0.15640773,
        1.        ]])

In [None]:
# ...

### Clustering

In [82]:
from sklearn.cluster import KMeans

In [87]:
k = KMeans(n_clusters=5)

In [89]:
# this will take a while
# k.fit(bbc_vectorized)

In [90]:
# take only the top 1,000 most occurring tokens in the corpus 
bbc_vectorized_features_bound = preprocessing.NLPProcessor(max_features=1000).fit_transform(bbc)

In [91]:
bbc_vectorized_features_bound.shape

(2225, 1000)

In [92]:
%%time
k.fit(bbc_vectorized_features_bound)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [93]:
k.predict(bbc_vectorized_features_bound)

array([3, 0, 1, ..., 4, 3, 1], dtype=int32)

In [94]:
# ...