In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

## Wikipedia articles Dataset

In [33]:
articles = pd.read_json('../datasets/wikipedia/News-article-wikipedia.json', lines=True)
articles.head()

Unnamed: 0,_unit_id,article,newdescp
0,691201838,Gaza aid ship to dock in Egypt after Israel pr...,A ship with supplies for Gaza will dock at el...
1,691201839,Mel Gibson,Often acts and directs stories involving an i...
2,691201840,Talent Agency WME drops Mel Gibson,Cast member Mel Gibson (R) and Oksana Grigori...
3,691201841,Suicide bomber killed in Tehran-Fars,"(Adds details) TEHRAN, June 20 (Reuters) - A..."
4,691201842,Iran's 10% ballot boxes to be recounted,Tehran - Iran's Guardian Council is ready to ...


In [47]:
titles = articles['article'].values
documents = articles['newdescp'].values

## tf-idf word-frequency array

In [48]:
# Create a TfidfVectorizer
tfidf = TfidfVectorizer() 

In [49]:
# Apply fit_transform to document
# csr = Compressed Sparse Row
csr_documents = tfidf.fit_transform(documents)
csr_documents.shape

(3000, 31726)

In [50]:
# Print result of toarray() method
print(csr_documents.toarray())

[[ 0.          0.03522439  0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]


In [51]:
# Get the words
words = tfidf.get_feature_names()
len(words)

31726

## Clustering Wikipedia

In [52]:
# Create a TruncatedSVD instance
svd = TruncatedSVD(n_components=50)

In [53]:
svd_documents = svd.fit_transform(csr_documents)

In [54]:
svd_documents.shape

(3000, 50)

In [55]:
# Create a KMeans instance
kmeans = KMeans(n_clusters=6)

In [56]:
# Fit the pipeline to articles
kmeans.fit(svd_documents)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=6, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [57]:
# Calculate the cluster labels
labels = kmeans.predict(svd_documents)

In [58]:
# Create a DataFrame aligning labels and titles
df = pd.DataFrame({'label': labels, 'article': titles})

In [60]:
# Display df sorted by cluster label
df.sort_values('label')

Unnamed: 0,article,label
1499,BP boss hands over oil spill management to Ame...,0
1843,Opposition backers shot in Southern Sudan prot...,0
1849,Malaysia soldiers attack armed Filipino clan i...,0
1850,Galloway defends himself at US Senate,0
813,Nigeria postpones parliamentary election,0
1851,Refugees flee as second city braces in fear of...,0
1853,Bomb was designed to explode on cargo plane,0
1842,Eurosceptic 'earthquake' rocks EU elections,0
1855,Russia's Vladimir Putin 'to respect' Ukraine vote,0
1857,Ireland and the Czech Republic vote in Europea...,0
