<img src="../Pics/MLSb-T.png" width="160">
<br><br>
<center><u><H1>Latent Semantic Analysis (LSA)</H1></u></center>

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans, MiniBatchKMeans

In [None]:
# Newsgroup topics used for this analysis; passed to fetch_20newsgroups below.
categories = [
    'rec.autos',
    'sci.electronics',
    'comp.graphics',
]

In [None]:
# Fetch the 20 Newsgroups posts for the selected categories. The shuffle is
# seeded so the document order is the same on every run.
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=2017,
)

In [None]:
# Turn the raw posts into a TF-IDF document-term matrix, with the built-in
# English stop-word list filtered out.
vectorizer = TfidfVectorizer(stop_words='english')
documents = data.data
X = vectorizer.fit_transform(documents)

In [None]:
# K-means with one cluster per requested newsgroup category. With n_init=1
# there is a single k-means++ initialisation, so it is seeded here to make
# the cluster assignment reproducible across runs.
km = KMeans(n_clusters=3, init='k-means++', max_iter=100, n_init=1,
            random_state=2017)

# n_components is the desired dimensionality of the output data. The SVD is
# seeded as well so the explained variance and the projected coordinates are
# stable between runs (the same seed as the data shuffle is reused).
svd = TruncatedSVD(n_components=2000, random_state=2017)

# LSA pipeline: SVD projection followed by per-document re-normalisation.
# copy=False lets the Normalizer work on the SVD output in place.
lsa = make_pipeline(svd, Normalizer(copy=False))

In [None]:
# Project the TF-IDF matrix into LSA space and cluster the result.
# X is rebound to the reduced matrix here (later cells read it under that
# name), so the TF-IDF input is rebuilt from the already-fitted vectorizer
# instead of being read from X. That keeps this cell safe to re-run: a second
# execution no longer feeds the LSA output back into the LSA pipeline.
X = lsa.fit_transform(vectorizer.transform(data.data))
km.fit(X)

In [None]:
# Fraction of the variance retained by the fitted SVD: the per-component
# ratios are summed into a single figure and shown as the cell output.
variance_ratios = svd.explained_variance_ratio_
explained_variance = variance_ratios.sum()
explained_variance

In [None]:
# Vocabulary lookup: position i holds the term for TF-IDF column i.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# Map the K-means centroids from the reduced LSA space back to term space.
original_centroids = svd.inverse_transform(km.cluster_centers_)

# argsort is ascending; reversing each row gives, per cluster, the term
# indices ordered from the largest centroid weight to the smallest.
order_centroids = original_centroids.argsort()[:, ::-1]

In [None]:
# Print the ten highest-weighted terms of every cluster. Iterating over the
# rows of order_centroids (one row per cluster) avoids a hard-coded cluster
# count that would have to be kept in sync with n_clusters.
for i, ranked_terms in enumerate(order_centroids):
    print("Cluster %d:" % i)
    for ix in ranked_terms[:10]:
        print(' %s' % terms[ix])

In [None]:
# Mini-batch variant of K-means with the same number of clusters.
# random_state is fixed so the initialisation is repeatable and the
# resulting clusters are the same on every run.
kmini = MiniBatchKMeans(
    n_clusters=3,
    init='k-means++',
    n_init=1,
    init_size=1000,
    batch_size=1000,
    random_state=2017,
)

In [None]:
# Fit the mini-batch model on X (by this point X holds the LSA-reduced
# matrix, not the TF-IDF one) and keep whatever fit() returns as kmini_model.
kmini_model = kmini.fit(X)

In [None]:
# Take the mini-batch centroids back to term space, then rank the term
# indices of each cluster from highest to lowest weight. Note that this
# rebinds original_centroids / order_centroids to the mini-batch results.
mini_centers = kmini_model.cluster_centers_
original_centroids = svd.inverse_transform(mini_centers)
ascending_order = original_centroids.argsort()
order_centroids = ascending_order[:, ::-1]

In [None]:
# Print the ten highest-weighted terms of every mini-batch cluster. The loop
# walks the rows of order_centroids (one per cluster) rather than a
# hard-coded range(3), so it follows whatever n_clusters was used.
for i, ranked_terms in enumerate(order_centroids):
    print("Cluster %d:" % i)
    for ix in ranked_terms[:10]:
        print(' %s' % terms[ix])

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
centroids = km.cluster_centers_
f, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X[:,0], X[:,1],cmap=plt.cm.brg, c=km.labels_)
ax.scatter(centroids[:,0], centroids[:,1], marker='x', s=300, linewidths=5, c='black')

## References:

http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD

http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer
