In [1]:
import datasets
import preprocessing

In [2]:
# BBC corpus
bbc = datasets.get_bbc()

In [3]:
# bbc is a list of strings
type(bbc)

list

In [4]:
type(bbc[0])

str

In [5]:
len(bbc)

2225

In [6]:
# first 100 chars of the 1st doc
bbc[0][:100]

'tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital'

In [7]:
# pre-process and vectorize
processor = preprocessing.NLPProcessor('tf-idf')
bbc_vectorized = processor.fit_transform(bbc)

In [8]:
# sparse matrix
type(bbc_vectorized)

scipy.sparse.csr.csr_matrix

In [9]:
# 2,225 docs with 29,421 sparse features
bbc_vectorized.shape

(2225, 29421)

### Similarity metric

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
# cosine similarity between 1st and 2nd doc
cosine_similarity(bbc_vectorized[0], bbc_vectorized[1])

array([[0.09372963]])

In [14]:
# cosine similarity of all 2,225 docs
# in a matrix of 2,225 x 2,225
cosine_similarity(bbc_vectorized)

array([[1.        , 0.09372963, 0.16603523, ..., 0.14290773, 0.23992781,
        0.18738048],
       [0.09372963, 1.        , 0.08616861, ..., 0.04780253, 0.09995931,
        0.08760382],
       [0.16603523, 0.08616861, 1.        , ..., 0.09247675, 0.17008746,
        0.1621905 ],
       ...,
       [0.14290773, 0.04780253, 0.09247675, ..., 1.        , 0.11614638,
        0.08908685],
       [0.23992781, 0.09995931, 0.17008746, ..., 0.11614638, 1.        ,
        0.15640773],
       [0.18738048, 0.08760382, 0.1621905 , ..., 0.08908685, 0.15640773,
        1.        ]])

In [17]:
# ...

### Clustering

In [18]:
from sklearn.cluster import KMeans

In [19]:
k = KMeans(n_clusters=5)

In [20]:
# this will take a while
# k.fit(bbc_vectorized)

In [21]:
# take only the top 1,000 most occurring tokens in the corpus 
bbc_vectorized_features_bound = preprocessing.NLPProcessor(max_features=1000).fit_transform(bbc)

In [22]:
bbc_vectorized_features_bound.shape

(2225, 1000)

In [23]:
%%time
k.fit(bbc_vectorized_features_bound)

Wall time: 49.8 s


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [24]:
k.predict(bbc_vectorized_features_bound)

array([3, 0, 2, ..., 4, 3, 2])

In [25]:
# ...

### Topic Modeling

In [26]:
from sklearn.decomposition import LatentDirichletAllocation

In [27]:
lda = LatentDirichletAllocation(learning_method="online")

In [28]:
lda.fit(bbc_vectorized_features_bound)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [31]:
lda.transform(bbc_vectorized_features_bound)

array([[0.00897843, 0.00897843, 0.00897843, ..., 0.00897843, 0.00897843,
        0.00897843],
       [0.0113938 , 0.0113938 , 0.0113938 , ..., 0.0113938 , 0.0113938 ,
        0.0113938 ],
       [0.01030121, 0.01030121, 0.01030121, ..., 0.01030121, 0.01030121,
        0.01030121],
       ...,
       [0.01114821, 0.01114821, 0.01114821, ..., 0.01114821, 0.01114821,
        0.01114821],
       [0.00934198, 0.00934198, 0.00934198, ..., 0.00934198, 0.00934198,
        0.00934198],
       [0.01092845, 0.01092845, 0.01092845, ..., 0.01092845, 0.01092845,
        0.01092845]])