# Text Mining of BBC News Data

## Part 4: Similarity and Visualization in Latent Space

In [None]:
from pathlib import Path
import numpy as np

# Gather every article stored as bbc/<category>/<name>.txt; sorting keeps the
# document order (and therefore every row index used below) deterministic.
text_filepaths = sorted(Path("bbc").glob("*/*.txt"))

# The name of the folder holding an article is used as its category label.
categories = np.array(
    [article_path.parent.name for article_path in text_filepaths]
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


# The vectorizer is handed file paths and reads each file itself.
tfidf_vectorizer = TfidfVectorizer(
    input="filename",
    encoding="utf-8",
    decode_error="ignore",  # skip undecodable bytes instead of failing
    min_df=5,               # drop terms seen in fewer than 5 documents
    max_df=0.8,             # drop terms seen in more than 80% of documents
)

# Learn the vocabulary and build the document-term matrix in one pass.
tfidf_docs = tfidf_vectorizer.fit_transform(text_filepaths)

In [None]:
from sklearn.decomposition import TruncatedSVD

# Fit a 50-component truncated SVD on the TF-IDF matrix.
# The default solver is randomized, so the seed is pinned to make the
# components (and every plot and score derived from them) reproducible.
svd = TruncatedSVD(n_components=50, random_state=0).fit(tfidf_docs)

In [None]:
# Display the variance captured by each fitted SVD component.
svd.explained_variance_

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Plot the fraction of variance explained by each component, in component
# order; the trailing semicolon hides the returned object in the notebook.
plt.plot(svd.explained_variance_ratio_);

In [None]:
# Project every TF-IDF document vector onto the fitted SVD components.
svd_docs = svd.transform(tfidf_docs)

In [None]:
# All documents on SVD components 0 and 1, without category coloring.
plt.scatter(svd_docs[:, 0], svd_docs[:, 1]);

In [None]:
# All documents on SVD components 1 and 2, without category coloring.
plt.scatter(svd_docs[:, 1], svd_docs[:, 2]);

In [None]:
# Display the distinct category labels (np.unique returns them sorted).
np.unique(categories)

In [None]:
# One scatter series per category on SVD components 0 and 1.
for category in np.unique(categories):
    in_category = categories == category
    category_docs = svd_docs[in_category]
    plt.scatter(category_docs[:, 0], category_docs[:, 1], label=category)
plt.legend();

In [None]:
# One scatter series per category on SVD components 1 and 2.
for category in np.unique(categories):
    in_category = categories == category
    category_docs = svd_docs[in_category]
    plt.scatter(category_docs[:, 1], category_docs[:, 2], label=category)

plt.legend();

In [None]:
# One scatter series per category on SVD components 2 and 3.
for category in np.unique(categories):
    in_category = categories == category
    category_docs = svd_docs[in_category]
    plt.scatter(category_docs[:, 2], category_docs[:, 3], label=category)

plt.legend();

## Visualization of KMeans Centers and NMF Topics in SVD and TSNE Space

In [None]:
from sklearn.cluster import KMeans

# Cluster the documents in SVD space into 5 groups.
# random_state pins the centroid initialisation so cluster ids and plots are
# reproducible; n_init is spelled out because its default differs between
# scikit-learn releases.
kmeans_predictions = KMeans(
    n_clusters=5, n_init=10, random_state=0
).fit_predict(svd_docs)

In [None]:
# One scatter series per KMeans cluster on SVD components 2 and 3.
for cluster_id in np.unique(kmeans_predictions):
    cluster_docs = svd_docs[kmeans_predictions == cluster_id]
    plt.scatter(cluster_docs[:, 2], cluster_docs[:, 3], label=cluster_id)

plt.legend();

In [None]:
from sklearn.decomposition import NMF

In [None]:
# Factorize the TF-IDF matrix into 10 non-negative topics. The seed is pinned
# so that topic numbering and the plots below are the same on every run.
nmf = NMF(n_components=10, random_state=0).fit(tfidf_docs)
nmf_docs = nmf.transform(tfidf_docs)

# Rescale by the single largest weight so every value can be used as an alpha
# (opacity) value; topics stay comparable because one global scale is used.
nmf_docs /= nmf_docs.max()

# One RGBA row per document: pure blue, with alpha filled in per topic below.
rgba_colors = np.zeros((nmf_docs.shape[0], 4))
rgba_colors[:, 2] = 1.0

# One figure per topic: documents are drawn on SVD components 2 and 3 and are
# more opaque the more strongly they load on the topic.
for topic in range(nmf_docs.shape[1]):
    plt.figure()
    rgba_colors[:, 3] = nmf_docs[:, topic]  # alpha channel
    plt.scatter(svd_docs[:, 2], svd_docs[:, 3], color=rgba_colors)
    plt.title(f"Topic #{topic}")

In [None]:
from sklearn.manifold import TSNE

In [None]:
%%time
# Embed the SVD-reduced documents with t-SNE for visualization.
# t-SNE is stochastic, so the seed is pinned to make the layout reproducible.
tsne_docs = TSNE(perplexity=30, random_state=0).fit_transform(svd_docs)

In [None]:
# Display the shape of the SVD representation that was fed to t-SNE.
svd_docs.shape

In [None]:
# Display the shape of the t-SNE embedding (same number of rows expected;
# the column count is whatever TSNE's n_components defaults to).
tsne_docs.shape

In [None]:
# One scatter series per category in the 2-D t-SNE embedding.
for category in np.unique(categories):
    in_category = categories == category
    category_points = tsne_docs[in_category]
    plt.scatter(category_points[:, 0], category_points[:, 1], label=category)

plt.legend();

In [None]:
# One figure per NMF topic: documents are drawn in t-SNE space and are more
# opaque the more strongly they load on the topic.
for topic, topic_weights in enumerate(nmf_docs.T):
    plt.figure()
    rgba_colors[:, 3] = topic_weights  # alpha channel
    plt.scatter(tsne_docs[:, 0], tsne_docs[:, 1], color=rgba_colors)
    plt.title(f"Topic #{topic}")

## Similarities in Truncated SVD Space (Latent Semantic Indexing)


First, let's compute similarities in the original high-dimensional TF-IDF space, as we did previously:

In [None]:
def sparse_dot_products(query_vector, other_vectors):
    """Return the dot product of a query row with every row of a matrix.

    Parameters
    ----------
    query_vector : sparse matrix with a single row
        Must provide a ``multiply`` method (element-wise product).
    other_vectors : sparse matrix
        One row per vector to compare against the query.

    Returns
    -------
    numpy.ndarray
        Flat array holding one dot product per row of ``other_vectors``.
    """
    # Element-wise product first, then add up each row.
    elementwise = query_vector.multiply(other_vectors)
    row_totals = elementwise.sum(axis=1)
    # The row sums come back 2-D; convert to a plain 1-D ndarray.
    return np.asarray(row_totals).reshape(-1)

In [None]:
# Row 0 is the query document; every remaining row is a candidate neighbour.
tfidf_other_docs = tfidf_docs[1:, :]
tfidf_first_doc = tfidf_docs[0, :]
tfidf_similarities = sparse_dot_products(
    query_vector=tfidf_first_doc, other_vectors=tfidf_other_docs)
# Show how many similarities were computed (one per candidate document).
tfidf_similarities.shape[0]

In [None]:
import pandas as pd

# Pair each candidate file with its similarity to the first document and keep
# the 15 highest-scoring rows. The file list skips index 0 to stay aligned
# with the similarity array.
tfidf_most_similar = pd.DataFrame(
    dict(filepath=text_filepaths[1:], tfidf_similarity=tfidf_similarities)
).nlargest(n=15, columns="tfidf_similarity")

tfidf_most_similar

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Same query as before, now in SVD space. The query is kept 2-D (one row)
# because cosine_similarity works on matrices of row vectors.
svd_first_doc = svd_docs[:1]
svd_other_docs = svd_docs[1:]
svd_similarities = cosine_similarity(svd_first_doc, svd_other_docs).reshape(-1)

# Keep the 15 candidates most similar to the first document.
svd_most_similar = pd.DataFrame(
    dict(filepath=text_filepaths[1:], svd_similarity=svd_similarities)
).nlargest(n=15, columns="svd_similarity")

svd_most_similar

As you can see, the SVD space seems to preserve some of the neighborhood structure of the original TF-IDF space.

Let's try to quantify this using K-NN classification in a cross-validation loop:

In [None]:

# Category label of every document, taken from its parent folder name.
# Kept as a NumPy array (not a plain list) so that element-wise comparisons
# such as `categories == c` in the plotting cells still produce boolean masks
# if those cells are re-run after this one.
categories = np.array([path.parent.name for path in text_filepaths])

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [None]:
%%time
# 10-fold cross-validated accuracy of a 5-nearest-neighbour cosine classifier
# on the full TF-IDF representation.
knn_on_tfidf = KNeighborsClassifier(n_neighbors=5, metric="cosine")
cv_scores = cross_val_score(knn_on_tfidf, tfidf_docs, categories, cv=10)
print(f"Val. accuracy: {cv_scores.mean():.3f} (+/-{cv_scores.std():.3f})")

In [None]:
%%time
# Same evaluation as for TF-IDF, but on the 50-dimensional SVD representation.
knn_on_svd = KNeighborsClassifier(n_neighbors=5, metric="cosine")
cv_scores = cross_val_score(knn_on_svd, svd_docs, categories, cv=10)
print(f"Val. accuracy: {cv_scores.mean():.3f} (+/-{cv_scores.std():.3f})")

Interestingly, using the 50-dimensional representation produced by Truncated SVD improves both the computational speed and the generalization ability of the K-NN classification model.

Using an SVD to reduce the dimensionality of Bag-of-Words text data in order to run cosine similarity queries is called [Latent Semantic Analysis](https://en.wikipedia.org/wiki/Latent_semantic_analysis).