# A Brief Tutorial on Text Processing Using NLTK and Scikit-Learn
###################################################################################################

# Tokenization in NLTK

In [1]:
import os
os.getcwd()

'E:\\CPEEB25\\CSE7306c\\20170325_Batch25_Day2_TextMining'

In [5]:
os.chdir("E:\\CPEEB25\\CSE7306c\\20170325_Batch25_Day2_TextMining\\data")
os.getcwd()

'E:\\CPEEB25\\CSE7306c\\20170325_Batch25_Day2_TextMining\\data'

In [6]:
os.listdir(os.getcwd())

['shakespeare-macbeth.txt', 'sms.tsv']

In [8]:
import nltk
import string

from collections import Counter

def get_tokens():
    with open('anb-jarena-lee.txt', 'r') as shakes:
        text = shakes.read()
        lowers = text.lower()
        #remove the punctuation using the character deletion step of translate
        no_punctuation = lowers.translate(None, string.punctuation)
        tokens = nltk.word_tokenize(no_punctuation)
        return tokens

tokens = get_tokens()
count = Counter(tokens)
print count.most_common(100)

[('of', 9), ('and', 8), ('in', 5), ('she', 4), ('the', 4), ('lee', 3), ('to', 2), ('new', 2), ('congregations', 2), ('jersey', 2), ('church', 2), ('an', 2), ('methodists', 2), ('founder', 1), ('domestic', 1), ('four', 1), ('meditation', 1), ('children', 1), ('had', 1), ('vigor', 1), ('africanamerican', 1), ('they', 1), ('feelings', 1), ('profound', 1), ('married', 1), ('joined', 1), ('spiritual', 1), ('infancy', 1), ('energy', 1), ('experienced', 1), ('depression', 1), ('richard', 1), ('worshiped', 1), ('demons', 1), ('reverend', 1), ('various', 1), ('hill', 1), ('who', 1), ('eternal', 1), ('emotional', 1), ('by', 1), ('extreme', 1), ('on', 1), ('baptism', 1), ('inspired', 1), ('months', 1), ('prior', 1), ('roman', 1), ('mixed', 1), ('among', 1), ('hearing', 1), ('visions', 1), ('sermon', 1), ('pastored', 1), ('from', 1), ('her', 1), ('there', 1), ('protracted', 1), ('philadelphia', 1), ('whom', 1), ('joseph', 1), ('terrifying', 1), ('white', 1), ('was', 1), ('catholics', 1), ('methodi

# Stop Word Removal
These are uninformative, so let's remove the stop words.

In [9]:
from nltk.corpus import stopwords

tokens = get_tokens()
stop = stopwords.words("english")
token_withOutStop = [token for token in tokens if token not in stop]
count = Counter(token_withOutStop)
print count.most_common(100)

[('lee', 3), ('church', 2), ('methodists', 2), ('new', 2), ('congregations', 2), ('jersey', 2), ('among', 1), ('spiritual', 1), ('married', 1), ('founder', 1), ('infancy', 1), ('roman', 1), ('energy', 1), ('prayer', 1), ('domestic', 1), ('hearing', 1), ('four', 1), ('episcopal', 1), ('fever', 1), ('periods', 1), ('allen', 1), ('meditation', 1), ('visions', 1), ('children', 1), ('sermon', 1), ('physical', 1), ('pastored', 1), ('conversion', 1), ('six', 1), ('jarena', 1), ('richard', 1), ('stages', 1), ('worshiped', 1), ('philadelphia', 1), ('demons', 1), ('ecstasy', 1), ('reverend', 1), ('joseph', 1), ('various', 1), ('terrifying', 1), ('perdition', 1), ('white', 1), ('several', 1), ('hill', 1), ('vigor', 1), ('catholics', 1), ('bethel', 1), ('experienced', 1), ('protracted', 1), ('moved', 1), ('eternal', 1), ('depression', 1), ('africanamerican', 1), ('1804', 1), ('1811', 1), ('baptized', 1), ('feelings', 1), ('ennui', 1), ('died', 1), ('extreme', 1), ('1807', 1), ('fasting', 1), ('anx

# Stemming using NLTK

In [10]:
from nltk.stem.porter import *

stemmer = PorterStemmer()
tokens_stemmed = [stemmer.stem(token) for token in token_withOutStop]
count = Counter(tokens_stemmed)
print count.most_common(20)

[(u'methodist', 3), (u'lee', 3), (u'church', 2), (u'new', 2), (u'congreg', 2), (u'jersey', 2), (u'among', 1), (u'founder', 1), (u'ecstasi', 1), (u'feel', 1), (u'1811', 1), (u'move', 1), (u'period', 1), (u'terrifi', 1), (u'month', 1), (u'protract', 1), (u'children', 1), (u'fever', 1), (u'baptiz', 1), (u'infanc', 1)]


# Tf-Idf in Scikit-Learn
Unfortunately, calculating tf-idf is not available in NLTK so we'll use another data analysis library, scikit-learn. Scikit-learn has a built in Tf-Idf implementation and we can use NLTK's tokenizer and stemmer to preprocess the text.

In [11]:
# Import statements
import nltk
import string
import os
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

In [12]:
## a) Read 'anb-jarena-lee.txt' file.
## b) Convert all characters to lower case.
## c) remove all punctuations from the text.
token_dict = {}
shakes = open('anb-jarena-lee.txt', 'r')
text = shakes.read()
lowers = text.lower()
no_punctuation = lowers.translate(None, string.punctuation)

print(no_punctuation)

in 1804 after several months of profound spiritual anxiety jarena lee
moved from new jersey to philadelphia there she labored as a domestic
and worshiped among white congregations of roman catholics and mixed
congregations of methodists on hearing an inspired sermon by the
reverend richard allen founder of the bethel african methodist
episcopal church lee joined the methodists she was baptized in 1807
prior to her baptism she experienced the various physical and emotional
stages of conversion terrifying visions of demons and eternal
perdition extreme feelings of ecstasy and depression protracted
periods of meditation fasting and prayer ennui and fever energy and
vigor in 1811 she married joseph lee who pastored an africanamerican
church in snow hill new jersey they had six children four of whom
died in infancy



In [13]:
#Store the pre-processed text to Token_dictionary variable as type files
token_dict[file] = no_punctuation

print token_dict
print(token_dict.values())    

{<type 'file'>: 'in 1804 after several months of profound spiritual anxiety jarena lee\nmoved from new jersey to philadelphia there she labored as a domestic\nand worshiped among white congregations of roman catholics and mixed\ncongregations of methodists on hearing an inspired sermon by the\nreverend richard allen founder of the bethel african methodist\nepiscopal church lee joined the methodists she was baptized in 1807\nprior to her baptism she experienced the various physical and emotional\nstages of conversion terrifying visions of demons and eternal\nperdition extreme feelings of ecstasy and depression protracted\nperiods of meditation fasting and prayer ennui and fever energy and\nvigor in 1811 she married joseph lee who pastored an africanamerican\nchurch in snow hill new jersey they had six children four of whom\ndied in infancy\n'}
['in 1804 after several months of profound spiritual anxiety jarena lee\nmoved from new jersey to philadelphia there she labored as a domestic\na

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
tf = CountVectorizer()

# learn the 'vocabulary' of the training data (occurs in-place)
tf.fit(token_dict.values())

# examine the fitted vocabulary
features = tf.get_feature_names()
features[1:10]


[u'1807',
 u'1811',
 u'african',
 u'africanamerican',
 u'after',
 u'allen',
 u'among',
 u'an',
 u'and']

In [16]:
# transform training data into a 'document-term matrix'
tf_dtm = tf.transform(token_dict.values())
# examine the vocabulary and document-term matrix together
pd.DataFrame(tf_dtm.toarray(), columns=tf.get_feature_names())


Unnamed: 0,1804,1807,1811,african,africanamerican,after,allen,among,an,and,...,they,to,various,vigor,visions,was,white,who,whom,worshiped
0,1,1,1,1,1,1,1,1,2,8,...,1,2,1,1,1,1,1,1,1,1


### Reading a text-based dataset into pandas

In [17]:
os.chdir("E:\\CPEEB25\\CSE7306c\\20170325_Batch25_Day2_TextMining\\data")
os.getcwd()
os.listdir(os.getcwd())

['anb-jarena-lee.txt', 'shakespeare-macbeth.txt', 'sms.tsv']

In [18]:
path = 'sms.tsv'
sms = pd.read_table(path, header=None, names=['label', 'message'])

In [19]:
# examine the shape
sms.shape

(5572, 2)

In [20]:
# examine the first 10 rows
sms.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [21]:
# examine the class distribution
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [22]:
# convert label to a numerical variable
sms['label_num'] = sms.label.map({'ham':0, 'spam':1})

In [23]:
# check that the conversion worked
sms.head(10)

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


In [24]:
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
X = sms.message
y = sms.label_num
print(X.shape)
print(y.shape)

(5572L,)
(5572L,)


In [25]:
# split X and y into training and testing sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4179L,)
(1393L,)
(4179L,)
(1393L,)


#### Vectorizing our dataset

In [26]:
# instantiate the vectorizer
vect = CountVectorizer()

# learn training data vocabulary, then use it to create a document-term matrix
vect.fit(X_train)
X_train_dtm_a = vect.transform(X_train)

In [27]:
# equivalently: combine fit and transform into a single step
X_train_dtm_b = vect.fit_transform(X_train)

In [66]:
#print X_train_dtm_a
#print X_train_dtm_b

In [28]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<1393x7456 sparse matrix of type '<type 'numpy.int64'>'
	with 17604 stored elements in Compressed Sparse Row format>

### Building and evaluating a model

We will use multinomial Naive Bayes:

The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [29]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb_a = MultinomialNB()

In [30]:
# train the model using X_train_dtm_a
nb_a.fit(X_train_dtm_a, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [31]:
# make class predictions for X_test_dtm_a
y_pred_class_a = nb_a.predict(X_test_dtm)

In [32]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class_a)

0.98851399856424982

In [33]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class_a)

array([[1203,    5],
       [  11,  174]])

In [34]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb_b = MultinomialNB()

In [74]:
# train the model using X_train_dtm_a
nb_b.fit(X_train_dtm_b, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [36]:
# make class predictions for X_test_dtm
y_pred_class_b = nb_a.predict(X_test_dtm)

In [37]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class_a)

0.98851399856424982

In [38]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class_a)

array([[1203,    5],
       [  11,  174]])