# Text Feature Extraction

In [None]:
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

import sys

# Add the parent directory (relative to the working directory) to the module
# search path so that modules located one level up can be imported.
sys.path.append('../')

# Select Altair's 'default' renderer for displaying charts.
alt.renderers.enable('default')

In [None]:
from sklearn import decomposition

def plot_proj(X, y):
    """Scatter-plot samples in two dimensions, coloured by label.

    Inputs with more than two columns are first reduced to two components
    with truncated SVD; any other input is plotted as given.

    Args:
        X: 2-D feature matrix exposing ``shape`` and ``[:, j]`` indexing.
        y: One label per row of ``X``.

    Returns:
        An Altair point chart with ``dim-1`` on the x axis, ``dim-2`` on the
        y axis and ``label`` as a nominal colour channel.
    """
    if X.shape[1] > 2:
        svd = decomposition.TruncatedSVD(n_components=2)
        coords = svd.fit_transform(X)
    else:
        # Already at most two columns: plot the raw values.
        coords = X

    frame = pd.DataFrame({
        'dim-1': coords[:, 0],
        'dim-2': coords[:, 1],
        'label': y,
    })
    points = alt.Chart(frame).mark_point()
    return points.encode(x='dim-1:Q', y='dim-2:Q', color='label:N')

In [None]:
import json
from sklearn.datasets import fetch_20newsgroups

# Load every split of 20 Newsgroups as (documents, targets), asking the loader
# to remove headers, footers and quoted replies from each post.
X, y = fetch_20newsgroups(subset='all', remove=['headers', 'footers', 'quotes'], return_X_y=True)
# Alternative loads kept for experimentation (four categories only / unstripped text):
#X, y = fetch_20newsgroups(subset='all', remove=['headers', 'footers', 'quotes'], return_X_y=True, categories=['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'])
#X, y = fetch_20newsgroups(subset='all', return_X_y=True)

# Drop documents whose body is empty, keeping X and y aligned.
indices = [i for i, doc in enumerate(X) if not doc]
# Membership tests against a set are O(1); testing against the list made the
# two filters below quadratic in the number of documents.
empty_positions = set(indices)
X = [doc for i, doc in enumerate(X) if i not in empty_positions]
y = [label for i, label in enumerate(y) if i not in empty_positions]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# TF-IDF features: English stop words removed, terms present in more than 95%
# of the documents or in fewer than 2 documents discarded, vocabulary capped
# at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    min_df=2,
    max_df=0.95,
)
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Factorise the TF-IDF matrix into 20 non-negative components.
# NOTE(review): newer scikit-learn releases replaced the `alpha` keyword with
# `alpha_W` / `alpha_H` -- confirm against the installed version.
nmf = NMF(
    n_components=20,
    init='random',
    random_state=0,
    alpha=.1,
    l1_ratio=.5,
)
X_nmf = nmf.fit_transform(X_tfidf)
print(X_nmf.shape)

# Plot only the first 5000 documents.
nmf_chart = plot_proj(X_nmf[:5000], y[:5000])
nmf_chart.interactive()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Raw term counts: terms present in more than 95% of the documents or in
# fewer than 2 documents discarded, vocabulary capped at 1000 terms.
tf_vectorizer = CountVectorizer(
    max_features=1000,
    min_df=2,
    max_df=0.95,
)
X_tf = tf_vectorizer.fit_transform(X)

# Fit a 50-component LDA model with the online learning method (5 iterations,
# fixed seed) and obtain its output for every document.
lda = LatentDirichletAllocation(
    n_components=50,
    learning_method='online',
    max_iter=5,
    random_state=0,
)
X_lda = lda.fit_transform(X_tf)
print(X_lda.shape)

# Plot only the first 5000 documents.
lda_chart = plot_proj(X_lda[:5000], y[:5000])
lda_chart.interactive()