## Text Mining

In [21]:
import numpy as np
import pandas as pd
import string
import collections

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.utils import check_random_state
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

import nltk
from nltk.corpus import brown
from nltk.stem.snowball import EnglishStemmer

from nose.tools import (
    assert_equal,
    assert_is_instance,
    assert_almost_equal,
    assert_true
)
from numpy.testing import assert_array_equal

### Data Preprocessing

In [276]:
def select_genres(n):
    '''
    Selects the Brown corpus genres that contain more than n files and returns
    the raw text and the genre label of every file in those genres.

    Both arrays are ordered by file id, and entry i of `genres` is the genre
    under which file i of `raw` was selected.

    Parameters
    ----------
    n: An integer. A genre is kept only if it has strictly more than n files.

    Returns
    -------
    A tuple of (raw, genres)
    raw: A 1d numpy array. The raw text of each selected file.
    genres: A 1d numpy array. The genre of each selected file.
    '''

    # Collect (file id, genre) pairs in a single pass so the text and its label
    # can never drift apart, and so each genre's file list is fetched only once.
    selected = []
    for genre in brown.categories():
        fileids = brown.fileids(categories=genre)
        if len(fileids) > n:
            selected.extend((fileid, genre) for fileid in fileids)

    # Sorting the pairs orders the output by file id.
    selected.sort()

    raw = np.array([brown.raw(fileid) for fileid, _ in selected])
    genres = np.array([genre for _, genre in selected])
    return raw, genres

In [278]:
# With a threshold of 70 the expected labels cover two genres: belles_lettres and learned.
t1_raw, t1_genres = select_genres(70)
for t1_array in (t1_raw, t1_genres):
    assert_equal(np.shape(t1_array), (155,))
assert_array_equal(t1_genres, ['belles_lettres'] * 75 + ['learned'] * 80)

# Spot-check slices of the tagged corpus text.
assert_equal(t1_raw[5][:50], 'Die/fw-at Frist/fw-nn ist/fw-bez um/fw-rb ,/, und/')
assert_equal(t1_raw[120][120:160], 'agricultural/jj areas/nns in/in the/at w')


In [279]:
# With a threshold of 29 the expected labels cover six genres, in file-id order.
t2_raw, t2_genres = select_genres(29)
assert_equal(np.shape(t2_raw), (313,))
assert_equal(np.shape(t2_genres), (313,))
assert_array_equal(
    t2_genres,
    ['news']*44 + ['hobbies']*36 + ['lore']*48 + ['belles_lettres']*75 + ['government']*30 + ['learned']*80
)

# Spot-check slices of the tagged corpus text.
assert_equal(t2_raw[300][-80:], " is/bez not/* generally/rb used/vbn over-hand/rb ,/, but/cc under/rb ''/'' ./.\n\n")
assert_equal(t2_raw[249][490:530], 's from/in the/at cortex/nn to/in the/at ')

### Training and Testing Sets

In [280]:
# Keep the genres with more than 27 files, then hold out 30% of the documents
# for testing. The seeded RandomState makes the split reproducible.
t_raw, t_genres = select_genres(27)
t_split = train_test_split(
    t_raw,
    t_genres,
    test_size=0.3,
    random_state=check_random_state(0),
)
t_X_train, t_X_test, t_y_train, t_y_test = t_split

### n-grams

In [281]:
def ngram(X_train, y_train, X_test):
    '''
    Classifies documents with a TF-IDF document term matrix fed into a
    KNeighborsClassifier. The vectorizer uses unigrams, bigrams, and trigrams.

    Parameters
    ----------
    X_train: A 1d numpy array of strings. Training documents.
    y_train: A 1d numpy array of strings. Training labels.
    X_test: A 1d numpy array of strings. Documents to classify.

    Returns
    -------
    A tuple of (clf, y_pred)
    clf: A Pipeline instance with steps 'tf' and 'knc', fitted on the training data.
    y_pred: A 1d numpy array. The predicted label of each test document.
    '''

    # Configure the vectorizer up front instead of patching it via set_params.
    vectorizer = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 3),
        lowercase=True,
        min_df=3,
        max_df=0.7,
    )
    clf = Pipeline([('tf', vectorizer), ('knc', KNeighborsClassifier())])

    clf.fit(X_train, y_train)
    return clf, clf.predict(X_test)

In [282]:
clf1, y_pred1 = ngram(t_X_train, t_y_train, t_X_test)
# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the documented order keeps the call consistent with other metrics.
score1 = accuracy_score(t_y_test, y_pred1)
print("KNC prediction accuracy = {0:5.1f}%".format(100.0 * score1))

KNC prediction accuracy =  52.5%


In [283]:
# Structure of the returned pipeline.
assert_is_instance(clf1, Pipeline)
assert_is_instance(y_pred1, np.ndarray)
tf1 = clf1.named_steps['tf']
assert_is_instance(tf1, TfidfVectorizer)
assert_is_instance(clf1.named_steps['knc'], KNeighborsClassifier)

# Vectorizer settings.
for tf1_param, tf1_expected in (
    ('stop_words', 'english'),
    ('ngram_range', (1, 3)),
    ('min_df', 3),
    ('max_df', 0.7),
):
    assert_equal(getattr(tf1, tf1_param), tf1_expected)

# Predictions and accuracy.
assert_equal(len(y_pred1), len(t_y_test))
assert_array_equal(
    y_pred1[:5],
    ['belles_lettres', 'government', 'romance', 'belles_lettres', 'government']
)
assert_array_equal(
    y_pred1[-5:],
    ['government', 'lore', 'government', 'learned', 'adventure']
)
assert_almost_equal(score1, 0.525)

### Stemming

In [287]:
def tokenize(text):
    '''
    Splits text into word tokens, discards punctuation tokens, and stems what
    remains with the Snowball (English) stemmer.

    Parameters
    ----------
    text: a string.

    Returns
    -------
    tokens: a map object that yields the stemmed tokens.
    '''

    kept = []
    for word in nltk.word_tokenize(text):
        # string.punctuation is a str, so this is a substring test: a token is
        # dropped when it occurs anywhere inside the punctuation string.
        if word in string.punctuation:
            continue
        kept.append(word)

    return map(EnglishStemmer().stem, kept)

In [288]:
def stem(X_train, y_train, X_test):
    '''
    Classifies documents with a TF-IDF document term matrix fed into a
    KNeighborsClassifier. Documents are tokenized with `tokenize`, which
    applies the Snowball stemmer.

    Parameters
    ----------
    X_train: A 1d numpy array of strings. Training documents.
    y_train: A 1d numpy array of strings. Training labels.
    X_test: A 1d numpy array of strings. Documents to classify.

    Returns
    -------
    A tuple of (clf, y_pred)
    clf: A Pipeline instance with steps 'tf' and 'knc', fitted on the training data.
    y_pred: A 1d numpy array. The predicted label of each test document.
    '''

    # Same vectorizer settings as the plain n-gram model, plus the stemming tokenizer.
    vectorizer = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 3),
        lowercase=True,
        min_df=3,
        max_df=0.7,
        tokenizer=tokenize,
    )
    clf = Pipeline([('tf', vectorizer), ('knc', KNeighborsClassifier())])

    clf.fit(X_train, y_train)
    return clf, clf.predict(X_test)

In [289]:
# Stemming is slow, so only the first 100 training and 50 test documents are used.
clf2, y_pred2 = stem(t_X_train[:100], t_y_train[:100], t_X_test[:50])
# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the documented order keeps the call consistent with other metrics.
score2 = accuracy_score(t_y_test[:50], y_pred2)
print("KNC prediction accuracy = {0:5.1f}%".format(100.0 * score2))

KNC prediction accuracy =  42.0%


In [290]:
# Structure of the returned pipeline.
assert_is_instance(clf2, Pipeline)
assert_is_instance(y_pred2, np.ndarray)
tf2 = clf2.named_steps['tf']
assert_is_instance(tf2, TfidfVectorizer)
assert_is_instance(clf2.named_steps['knc'], KNeighborsClassifier)

# Vectorizer settings.
for tf2_param, tf2_expected in (
    ('stop_words', 'english'),
    ('ngram_range', (1, 3)),
    ('min_df', 3),
    ('max_df', 0.7),
):
    assert_equal(getattr(tf2, tf2_param), tf2_expected)

# Predictions and accuracy on the 50-document test subset.
assert_equal(len(y_pred2), 50)
assert_array_equal(
    y_pred2[:5],
    ['lore', 'learned', 'romance', 'belles_lettres', 'learned']
)
assert_array_equal(
    y_pred2[-5:],
    ['fiction', 'romance', 'belles_lettres', 'romance', 'learned']
)
assert_almost_equal(score2, 0.42)

### Clustering Analysis

In [318]:
def get_top_tokens(X_train, y_train, X_test, random_state, k, n):
    '''
    Clusters the training documents with KMeans on a TF-IDF unigram matrix,
    then returns, for every cluster, the n terms with the largest weights in
    that cluster's centroid.

    Parameters
    ----------
    X_train: A 1d numpy array of strings. The documents to cluster.
    y_train: A 1d numpy array of strings. Not used by this function.
    X_test: A 1d numpy array of strings. Not used by this function.
    random_state: A np.random.RandomState instance for KMeans.
    k: An int. The number of clusters.
    n: An int. Specifies how many tokens for each cluster should be returned.

    Returns
    -------
    A tuple of (clf, tokens)
    clf: A Pipeline instance with steps 'tf' and 'km', fitted on X_train.
    tokens: A 2d numpy array of strings with shape of (n_clusters, n_tokens of each cluster).
        Row i lists the top tokens of cluster i, highest weight first. If the
        vocabulary holds fewer than n terms, each row holds the whole vocabulary.
    '''

    vectorizer = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 1),
        lowercase=True,
        min_df=3,
        max_df=0.7,
    )
    clusterer = KMeans(n_clusters=k, random_state=random_state)
    clf = Pipeline([('tf', vectorizer), ('km', clusterer)])
    clf.fit(X_train)

    # For each centroid, the feature indices sorted by descending weight.
    order_centroids = clf.named_steps['km'].cluster_centers_.argsort()[:, ::-1]

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); support whichever the installed version offers.
    fitted_tf = clf.named_steps['tf']
    if hasattr(fitted_tf, 'get_feature_names_out'):
        terms = fitted_tf.get_feature_names_out()
    else:
        terms = fitted_tf.get_feature_names()

    # Build the array row by row so that a vocabulary smaller than n yields
    # shorter rows instead of a reshape error.
    tokens = np.array([[terms[j] for j in row[:n]] for row in order_centroids])
    return clf, tokens

In [325]:
# Use one cluster per distinct genre and show the five strongest tokens of each.
k3 = np.unique(t_genres).size
n3 = 5
clf3, tokens3 = get_top_tokens(t_X_train, t_y_train, t_X_test, check_random_state(0), k3, n3)

print('Top {} tokens per cluster:'.format(n3))
print('-'*45)
for cluster_id, cluster_tokens in enumerate(tokens3):
    print("Cluster {0}: {1}".format(cluster_id, ' '.join(cluster_tokens)))

Top 5 tokens per cluster:
---------------------------------------------
Cluster 0: fw nil bridge pont nps
Cluster 1: men man said eyes dod
Cluster 2: hl costs shelter foam foods
Cluster 3: college mrs students school education
Cluster 4: said dod uh ll bem
Cluster 5: hl nps state president law
Cluster 6: hl year tax sales 1960
Cluster 7: af hl temperature pressure fig
Cluster 8: nc fw man human experience


In [326]:
# Structure of the returned pipeline.
assert_is_instance(clf3, Pipeline)
tf3 = clf3.named_steps['tf']
assert_is_instance(tf3, TfidfVectorizer)
km3 = clf3.named_steps['km']
assert_is_instance(km3, KMeans)

# Vectorizer and clusterer settings.
for tf3_param, tf3_expected in (
    ('stop_words', 'english'),
    ('ngram_range', (1, 1)),
    ('min_df', 3),
    ('max_df', 0.7),
):
    assert_equal(getattr(tf3, tf3_param), tf3_expected)
assert_equal(km3.n_clusters, k3)

# One row of five tokens per cluster.
expected_tokens3 = [
    ['fw', 'nil', 'bridge', 'pont', 'nps'],
    ['men', 'man', 'said', 'eyes', 'dod'],
    ['hl', 'costs', 'shelter', 'foam', 'foods'],
    ['college', 'mrs', 'students', 'school', 'education'],
    ['said', 'dod', 'uh', 'll', 'bem'],
    ['hl', 'nps', 'state', 'president', 'law'],
    ['hl', 'year', 'tax', 'sales', '1960'],
    ['af', 'hl', 'temperature', 'pressure', 'fig'],
    ['nc', 'fw', 'man', 'human', 'experience'],
]
assert_equal(np.shape(tokens3), (9, 5))
assert_array_equal(tokens3, expected_tokens3)

In [327]:
# Asking for three tokens per cluster instead of five.
clf4, tokens4 = get_top_tokens(t_X_train, t_y_train, t_X_test, check_random_state(0), k3, 3)
for cluster_id4, expected_row4 in (
    (0, ['fw', 'nil', 'bridge']),
    (6, ['hl', 'year', 'tax']),
):
    assert_array_equal(tokens4[cluster_id4], expected_row4)