## Topic Modeling

In [2]:
import numpy as np
import pandas as pd
import json

from scipy.sparse.csr import csr_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import check_random_state
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from gensim.matutils import Sparse2Corpus
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

import warnings
warnings.filterwarnings("ignore") 

from nose.tools import assert_equal, assert_is_instance, assert_true, assert_almost_equal
from numpy.testing import assert_array_equal, assert_array_almost_equal

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Load the training and test splits with identical settings. Each split gets
# its own freshly seeded RandomState so that the shuffles are independent of
# the order in which the two splits are fetched.
train, test = (
    fetch_20newsgroups(
        data_home='textdm',
        subset=split,
        shuffle=True,
        random_state=check_random_state(0),
        remove=('headers', 'footers', 'quotes'),
    )
    for split in ('train', 'test')
)

### Document term matrix

In [5]:
def get_document_term_matrix(train_data, test_data):
    '''
    Uses TfidfVectorizer to create document term matrices for the training
    and test documents.

    The vectorizer is fitted on "train_data" only; "test_data" is transformed
    with the vocabulary and weights learned from the training documents.

    Parameters
    ----------
    train_data: A list of strings
    test_data: A list of strings

    Returns
    -------
    A 3-tuple of (model, train_matrix, test_matrix).
    model: A TfidfVectorizer instance
    train_matrix: A scipy.csr_matrix
    test_matrix: A scipy.csr_matrix
    '''

    cv = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 2),
        lowercase=True,
        min_df=2,
        max_features=20000,
    )
    # fit_transform learns the vocabulary and builds the training matrix in a
    # single pass, instead of tokenizing the training corpus twice with a
    # separate fit() followed by transform().
    train_matrix = cv.fit_transform(train_data)
    test_matrix = cv.transform(test_data)
    return cv, train_matrix, test_matrix

In [6]:
# Vectorize the raw newsgroup posts; the vectorizer is kept for later cells.
cv, train_data, test_data = get_document_term_matrix(
    train_data=train['data'],
    test_data=test['data'],
)

In [7]:
# The vectorizer and both matrices must have the expected types.
assert_is_instance(cv, TfidfVectorizer)
for matrix in (train_data, test_data):
    assert_is_instance(matrix, csr_matrix)

# The vectorizer must have been configured with the required settings.
for attr, expected in (
    ('stop_words', 'english'),
    ('ngram_range', (1, 2)),
    ('min_df', 2),
    ('max_features', 20000),
):
    assert_equal(getattr(cv, attr), expected)

### Non-negative matrix factorization

In [16]:
def apply_nmf(data, random_state):
    '''
    Fits a 60-component non-negative matrix factorization (NMF) model and
    returns it together with the L1-normalized topic weights of "data".

    Parameters
    ----------
    data: A csr_matrix
    random_state: A RandomState instance for NMF

    Returns
    -------
    A tuple of (nmf, transformed_data)
    nmf: An sklearn.NMF instance, fitted on "data"
    transformed_data: A numpy.ndarray of topic weights, normalized with the
        L1 norm along each row
    '''

    n_topics = 60
    model = NMF(n_components=n_topics, random_state=random_state)
    model.fit(data)

    # Project the documents onto the learned topics, then rescale each row so
    # that documents of different lengths are comparable.
    topic_weights = model.transform(data)
    return model, normalize(topic_weights, norm='l1', axis=1)

In [17]:
# Fit the topic model on the training matrix with a fixed seed.
nmf, td_norm = apply_nmf(
    data=train_data,
    random_state=check_random_state(0),
)

In [18]:
# One row per training document, one column per topic, plus its target label.
df = pd.DataFrame(td_norm).fillna(value=0)
df['label'] = pd.Series(train['target'])

# Average the topic weights within each newsgroup and attach its name.
df_label = df.groupby('label').mean()
df_label['names'] = pd.Series(train['target_names'], dtype="category")

# Show only the last five columns. `.iloc` selects them by position; the
# original `.ix` indexer was deprecated in pandas 0.20 and removed in 1.0.
print(df_label.iloc[:, -5:])

             56        57        58        59                     names
label                                                                  
0      0.005508  0.003736  0.032166  0.232558               alt.atheism
1      0.033090  0.006448  0.023974  0.025799             comp.graphics
2      0.022950  0.036914  0.015799  0.013004   comp.os.ms-windows.misc
3      0.017276  0.036642  0.013918  0.012025  comp.sys.ibm.pc.hardware
4      0.012614  0.031106  0.013396  0.016364     comp.sys.mac.hardware
5      0.021772  0.025961  0.022872  0.016832            comp.windows.x
6      0.005221  0.012886  0.009409  0.007000              misc.forsale
7      0.010737  0.007605  0.021220  0.030602                 rec.autos
8      0.009730  0.007368  0.019422  0.028687           rec.motorcycles
9      0.011176  0.007581  0.016163  0.034431        rec.sport.baseball
10     0.012147  0.002637  0.014880  0.025542          rec.sport.hockey
11     0.008274  0.003525  0.019598  0.039459                 sc

In [19]:
# Types of the fitted model and of the normalized topic matrix.
assert_is_instance(nmf, NMF)
assert_is_instance(td_norm, np.ndarray)

# Model settings, then the expected (documents, topics) shape.
for attr, expected in (('n_components', 60), ('max_iter', 200)):
    assert_equal(getattr(nmf, attr), expected)
assert_equal(td_norm.shape, (11314, 60))

### Topic-based Classification

In [20]:
def classify_topics(nmf, X_train, y_train, X_test, random_state):
    '''
    Trains a random forest on topic features and predicts labels for the
    test documents.

    "X_train" is used as given, whereas "X_test" is first transformed with
    "nmf" before being passed to the classifier.

    Parameters
    ----------
    nmf: An sklearn.NMF model.
    X_train: A numpy array.
    y_train: A numpy array.
    X_test: A scipy csr_matrix.
    random_state: A RandomState instance for RandomForestClassifier.

    Returns
    -------
    A tuple of (clf, y_pred)
    clf: A RandomForestClassifier instance, fitted on "X_train" and "y_train".
    y_pred: A numpy array of predictions for "X_test".
    '''

    forest = RandomForestClassifier(random_state=random_state)
    forest.fit(X_train, y_train)

    # The test documents arrive as a document term matrix and must be mapped
    # into topic space before the classifier can score them.
    test_topics = nmf.transform(X_test)
    return forest, forest.predict(test_topics)

In [21]:
# Train on the NMF topic weights of the training set and predict the test set.
clf, ts_preds = classify_topics(
    nmf=nmf,
    X_train=nmf.transform(train_data),
    y_train=train['target'],
    X_test=test_data,
    random_state=check_random_state(0),
)
print(
    classification_report(
        y_true=test['target'],
        y_pred=ts_preds,
        target_names=test['target_names'],
    )
)

                          precision    recall  f1-score   support

             alt.atheism       0.25      0.31      0.28       319
           comp.graphics       0.35      0.45      0.39       389
 comp.os.ms-windows.misc       0.46      0.58      0.52       394
comp.sys.ibm.pc.hardware       0.41      0.40      0.40       392
   comp.sys.mac.hardware       0.43      0.46      0.45       385
          comp.windows.x       0.57      0.52      0.54       395
            misc.forsale       0.70      0.70      0.70       390
               rec.autos       0.41      0.67      0.51       396
         rec.motorcycles       0.70      0.59      0.64       398
      rec.sport.baseball       0.59      0.59      0.59       397
        rec.sport.hockey       0.68      0.65      0.66       399
               sci.crypt       0.67      0.56      0.61       396
         sci.electronics       0.38      0.26      0.31       393
                 sci.med       0.60      0.61      0.61       396
         

In [22]:
# The classifier must be a random forest and the predictions a numpy array.
for obj, expected_type in ((clf, RandomForestClassifier), (ts_preds, np.ndarray)):
    assert_is_instance(obj, expected_type)

# There must be exactly one prediction per test document.
assert_equal(len(ts_preds), len(test['target']))

### Topic Modeling with Gensim

In [23]:
def get_topics(cv, train_data):
    '''
    Uses gensim to fit a 20-topic LDA model and returns its top topics.

    Parameters
    ----------
    cv: A TfidfVectorizer instance; its "vocabulary_" supplies the mapping
        from column index to term.
    train_data: A scipy csr_matrix with one document per row.

    Returns
    -------
    The result of LdaModel.top_topics with 5 words per topic. Judging by the
    checks elsewhere in this notebook, this is a list of 20 (topic, score)
    tuples, where "topic" is a list of 5 (weight, word) pairs and "score" is
    a float.
    '''

    # Rows are documents, hence documents_columns=False.
    corpus = Sparse2Corpus(train_data, documents_columns=False)

    # vocabulary_ maps term -> column index; gensim needs the inverse mapping.
    id_to_word = {index: term for term, index in cv.vocabulary_.items()}
    dictionary = Dictionary.from_corpus(corpus, id2word=id_to_word)

    # NOTE(review): no seed is passed to LdaModel, so the topics presumably
    # differ between runs -- confirm before relying on exact output.
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
    return lda.top_topics(corpus=corpus, num_words=5)

In [24]:
# Run LDA on the training matrix, reusing the fitted vectorizer's vocabulary.
topics = get_topics(cv=cv, train_data=train_data)

In [26]:
# Print every topic as a small table of its terms and their weights.
for topic_no, (term_weights, score) in enumerate(topics):
    print('Topic {0}'.format(topic_no))
    print('-' * 35)
    # Each entry is a (weight, word) pair; the word is printed first.
    for weight, word in term_weights:
        print('    {0:20s}: {1:5.4f}'.format(word, weight))
    print('-' * 35)

Topic 0
-----------------------------------
    game                : 0.0088
    team                : 0.0061
    games               : 0.0049
    season              : 0.0048
    year                : 0.0045
-----------------------------------
Topic 1
-----------------------------------
    people              : 0.0051
    don                 : 0.0044
    think               : 0.0040
    just                : 0.0039
    god                 : 0.0036
-----------------------------------
Topic 2
-----------------------------------
    controller          : 0.0051
    port                : 0.0049
    ide                 : 0.0043
    serial              : 0.0040
    card                : 0.0038
-----------------------------------
Topic 3
-----------------------------------
    apple               : 0.0035
    just                : 0.0032
    drive               : 0.0031
    problem             : 0.0029
    mac                 : 0.0027
-----------------------------------
Topic 4
------------

In [27]:
# There must be exactly 20 topics, returned as a list.
assert_is_instance(topics, list)
assert_equal(len(topics), 20)

# Every topic pairs a list of five (weight, word) tuples with a float score.
for topic, score in topics:
    assert_is_instance(topic, list)
    assert_is_instance(score, float)
    assert_equal(len(topic), 5)
    for weight, word in topic:
        assert_is_instance(word, str)
        assert_is_instance(weight, float)