# Building Supervised NLP Models

In [17]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import re
import nltk
from nltk.corpus import gutenberg, stopwords
import spacy

from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_val_score

In [2]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [3]:
bible = gutenberg.raw('bible-kjv.txt')
melville = gutenberg.raw('melville-moby_dick.txt')

print('\nRaw:\n', bible[0:150])


Raw:
 [The King James Bible]

The Old Testament of the King James Bible

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the hea


In [4]:
def text_cleaner(text):
    text = re.sub(r'--',' ',text) # Removing potential dashes
    text = re.sub("[\[].*?[\]]", "", text) # This pattern matches all text between square brackets.
    text = re.sub(r'The Old Testament of the King James Bible','', text) # Removing the heading
    text = re.sub(r'\d+:\d+ ','', text) # Removing all references
    text = ' '.join(text.split()) # Removing excess white space
    return text

# Clean the Bible data.
bible = text_cleaner(bible)

# Clean the Milton data.
melville = re.sub(r'VOLUME \w+', '', melville)
melville = re.sub(r'CHAPTER \w+', '', melville)
melville = text_cleaner(melville)

# Print the first 400 characters of Bible 
print('Preview:\n', bible[0:400])

Preview:
 The First Book of Moses: Called Genesis In the beginning God created the heaven and the earth. And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters. And God said, Let there be light: and there was light. And God saw the light, that it was good: and God divided the light from the darkness. And God called the li


### Model 1: Logistic Regression with BoW

In [5]:
# Parse the cleaned corpuses
nlp = spacy.load('en')
bible_doc = nlp(bible[:25000])        # Restricting data size: original is over 3,000,000
print('Bible: Done!')

melville_doc = nlp(melville[:25000])      # Restricting data size: original is over 400,000
print('Paradise: Done!')

Bible: Done!
Paradise: Done!


In [6]:
# Group into sentences.
bible_sents = [[sent, "Bible"] for sent in bible_doc.sents]
melville_sents = [[sent, "Milton"] for sent in melville_doc.sents]

# Combine the sentences from the two corpuses into one data frame.
sentences = pd.DataFrame(bible_sents + melville_sents)
sentences.head()

Unnamed: 0,0,1
0,"(The, First, Book, of, Moses, :, Called, Genesis)",Bible
1,"(In, the, beginning, God, created, the, heaven...",Bible
2,"(And, the, earth, was, without, form, ,, and, ...",Bible
3,"(And, the, Spirit, of, God, moved, upon, the, ...",Bible
4,"(And, God, said, ,, Let, there, be, light, :, ...",Bible


In [7]:
def bag_of_words(text):
    """Utility function to create a list of the 2000 most common words."""
    allwords = [token.lemma_                    # Filter out punctuation and stop words.
                for token in text
                if not token.is_punct
                and not token.is_stop]
    return [item[0] for item in Counter(allwords).most_common(2000)]     # Return the most common words.
    
def bow_features(sentences, common_words):
    """Create DataFrame with features for each word in common word set where value is count of appearances in sentence."""
    df = pd.DataFrame(columns=common_words)     # Scaffold the data frame
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0                 # Initiliaze counts to zero
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        words = [token.lemma_                   # Convert the sentence to lemmas
                 for token in sentence
                 if (
                     not token.is_punct         # Filter punctuation, stop words, and uncommon words
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        for word in words:                      # Populate the row with word counts.
            df.loc[i, word] += 1
        if i % 50 == 0:                         # Counter to ensure kernel doesn't hang
            print("Processing row {}".format(i))
    return df

# Set up the bags.
biblewords = bag_of_words(bible_doc)
melvillewords = bag_of_words(melville_doc)

# Combine bags to create a set of unique words.
common_words = set(biblewords + melvillewords)

In [8]:
# Creating features for sentences. Takes a few minutes to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300
Processing row 350
Processing row 400
Processing row 450
Processing row 500
Processing row 550
Processing row 600
Processing row 650
Processing row 700


Unnamed: 0,asswage,wish,13,serpent,float,board,shady,limit,TOOKE,Mr.,...,extremity,content,sw,Heaven,true,picture,give,vent,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, First, Book, of, Moses, :, Called, Genesis)",Bible
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(In, the, beginning, God, created, the, heaven...",Bible
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(And, the, earth, was, without, form, ,, and, ...",Bible
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(And, the, Spirit, of, God, moved, upon, the, ...",Bible
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(And, God, said, ,, Let, there, be, light, :, ...",Bible


In [9]:
y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

lr = LogisticRegression(penalty='l2')
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:\t', lr.score(X_train, y_train))
print('Test set score:\t\t', lr.score(X_test, y_test))



(421, 1680) (421,)
Training set score:	 0.9833729216152018
Test set score:		 0.9466192170818505


In [10]:
coef_scores = pd.DataFrame()
coef_scores['features'] = word_counts.drop(['text_sentence','text_source'], 1).columns
coef_scores['coefficients'] = lr.coef_[0,:]

coef_scores[coef_scores.coefficients == coef_scores.coefficients.max()]

Unnamed: 0,features,coefficients
749,whale,1.017759


### Model 2: Logistic Regression with TF-IDF

In [11]:
bible_paragraphs = gutenberg.paras('bible-kjv.txt')
melville_paragraphs = gutenberg.paras('melville-moby_dick.txt')

In [12]:
bible_paras=[]
melville_paras=[]

def paragraph_split(raw_corpus, split_corpus):
    """Splits the paragraph corpus into a list of separate paragraphs."""
    for paragraph in raw_corpus:
        para=paragraph[0]
        para=[re.sub(r'--','',word) for word in para]
        split_corpus.append(' '.join(para))

paragraph_split(bible_paragraphs, bible_paras)   
paragraph_split(melville_paragraphs, melville_paras)

In [13]:
bible_df = [[para, "Bible"] for para in bible_paras]
melville_df = [[para, "Melville"] for para in melville_paras]

In [14]:
# Combine the paragraphs from the two corpuses into one data frame.
paragraphs = pd.DataFrame(bible_df + melville_df, columns=['paragraph','source'])
paragraphs.head()

Unnamed: 0,paragraph,source
0,[ The King James Bible ],Bible
1,The Old Testament of the King James Bible,Bible
2,The First Book of Moses : Called Genesis,Bible
3,1 : 1 In the beginning God created the heaven ...,Bible
4,"1 : 2 And the earth was without form , and voi...",Bible


In [15]:
y = paragraphs['source']
X = list(paragraphs['paragraph'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [16]:
vectorizer = TfidfVectorizer(max_df=0.50, # drop words in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Correction factor that treat long & short paragraphs equally
                             smooth_idf=True #Adds 1 to all document frequencies. Prevents divide-by-zero errors
                            )
feature_reduction = TruncatedSVD(1000)
classifier = LogisticRegression()

text_clf = Pipeline([('vect', vectorizer),
                     ('svd', feature_reduction),
                     ('clf', classifier), ])

text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
print("Accuracy of predicted: ", np.mean(predicted == y_test))



Accuracy of predicted:  0.9620472584618192


### Cross-Validation for Two Models

In [19]:
# Cross-Validation for Logistic Regression of BoW
y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))
lr = LogisticRegression(penalty='l2')

scores = cross_val_score(lr, X, y, cv=5)
print(scores)
print(scores.mean())



[0.86524823 0.88652482 0.84397163 0.88571429 0.96402878]
0.8890975487087533


In [20]:
# Cross-Validation for Logistic Regression pipeline using TF-IDF
y = paragraphs['source']
X = list(paragraphs['paragraph'])

scores = cross_val_score(text_clf, X, y, cv=5)
print(scores)
print(scores.mean())



[0.95274585 0.9649699  0.96734173 0.96678226 0.96568717]
0.9635053799984477


In conclusion, our TF-IDF pipeline using the Logistic Regression model easily outperformed the other Logistic Regression using BoW. The model does take considerably longer to run; however, it ultimately is a more accurate model with considerably less variance among the cross-validation folds.