In [39]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

## Wikipedia articles Dataset

In [40]:
articles = pd.read_json('../datasets/wikipedia.json')
articles.head()

Unnamed: 0,article,title
0,A ship with supplies for Gaza will dock at el-...,Gaza aid ship to dock in Egypt after Israel pr...
1,Often acts and directs stories involving an in...,Mel Gibson
2,Cast member Mel Gibson (R) and Oksana Grigorie...,Talent Agency WME drops Mel Gibson
3,"(Adds details) TEHRAN, June 20 (Reuters) - A ...",Suicide bomber killed in Tehran-Fars
4,Tehran - Iran's Guardian Council is ready to r...,Iran's 10% ballot boxes to be recounted


In [41]:
documents = articles['article'].values
titles = articles['title'].values

## tf-idf word-frequency array

In [42]:
# Create a TfidfVectorizer
tfidf = TfidfVectorizer() 

In [43]:
# Apply fit_transform to document
csr_mat = tfidf.fit_transform(documents)
csr_mat.shape

(21, 1848)

In [44]:
# Print result of toarray() method
print(csr_mat.toarray())

[[ 0.0546245   0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.15194684  0.0840541   0.         ...,  0.          0.          0.        ]]


In [45]:
# Get the words
words = tfidf.get_feature_names()
len(words)

1848

## Clustering Wikipedia

In [46]:
# Create a TruncatedSVD instance
svd = TruncatedSVD(n_components=50)

In [47]:
# Create a KMeans instance
kmeans = KMeans(n_clusters=6)

In [48]:
# Create a pipeline
pipeline = make_pipeline(svd, kmeans)

In [49]:
# Fit the pipeline to articles
pipeline.fit(csr_mat)

Pipeline(steps=[('truncatedsvd', TruncatedSVD(algorithm='randomized', n_components=50, n_iter=5,
       random_state=None, tol=0.0)), ('kmeans', KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=6, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0))])

In [50]:
# Calculate the cluster labels
labels = pipeline.predict(csr_mat)

In [51]:
# Create a DataFrame aligning labels and titles
df = pd.DataFrame({'label': labels, 'article': titles})

In [52]:
# Display df sorted by cluster label
print(df.sort_values('label'))

                                              article  label
12  Jimmy Savile: Gary Glitter arrested over sex o...      0
9   Jimmy Savile nephew 'devastated' by sex abuse ...      0
0   Gaza aid ship to dock in Egypt after Israel pr...      1
11  I tried repeatedly to talk the US out of invad...      1
19                       PM Koizumi hangs on to power      1
6   Relatives of disgraced Savile voice their anguish      1
18  Pakistan v Sri Lanka, ICC World Twenty20 final...      1
2                  Talent Agency WME drops Mel Gibson      1
1                                          Mel Gibson      1
16  Al Qaeda leader calls for kidnapping of Wester...      1
13  Source: Jon Huntsman to drop out of presidenti...      2
14      Republican primaries: Jon Huntsman to end bid      2
7          Iran to recount 10 percent of ballot boxes      3
4             Iran's 10% ballot boxes to be recounted      3
5   Iran's Mousavi renews call for cancellation of...      3
3                Suicide