# Natural language processing
# Challenge: Build your own NLP model

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

from sklearn.model_selection import cross_val_score

## BoW

### Data Cleaning for BoW

In [2]:
# Grab and process the raw data.
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [3]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [4]:
moby = gutenberg.raw('melville-moby_dick.txt')
paradise = gutenberg.raw('milton-paradise.txt')

# Print the first 500 characters of Moby Dick.
print('\nRaw Moby Dick:\n', moby[0:500])

# Print the first 500 characters of Paradise
print('\nRaw Paradise:\n', paradise[0:500])


Raw Moby Dick:
 [Moby Dick by Herman Melville 1851]


ETYMOLOGY.

(Supplied by a Late Consumptive Usher to a Grammar School)

The pale Usher--threadbare in coat, heart, body, and brain; I see him
now.  He was ever dusting his old lexicons and grammars, with a queer
handkerchief, mockingly embellished with all the gay flags of all the
known nations of the world.  He loved to dust his old grammars; it
somehow mildly reminded him of his mortality.

"While you take in hand to school others, and to teac

Raw Paradise:
 [Paradise Lost by John Milton 1667] 
 
 
Book I 
 
 
Of Man's first disobedience, and the fruit 
Of that forbidden tree whose mortal taste 
Brought death into the World, and all our woe, 
With loss of Eden, till one greater Man 
Restore us, and regain the blissful seat, 
Sing, Heavenly Muse, that, on the secret top 
Of Oreb, or of Sinai, didst inspire 
That shepherd who first taught the chosen seed 
In the beginning how the heavens and earth 
Rose out of Chaos: 

In [5]:
# This pattern matches all text between square brackets.
pattern = "[\[].*?[\]]"
moby = re.sub(pattern, "", moby)
paradise = re.sub(pattern, "", paradise)

# Now we'll match and remove chapter headings.
# The Chapter indicator is idiosyncratic
moby = re.sub(r'Chapter \d+', '', moby)
paradise = re.sub(r'Book .*', '', paradise)
    
# Ok, what's it look like now?
print('Chapter headings removed Moby Dick:\n', moby[0:100])

print('\nChapter headings removed Paradise:\n', paradise[0:100])

Chapter headings removed Moby Dick:
 


ETYMOLOGY.

(Supplied by a Late Consumptive Usher to a Grammar School)

The pale Usher--th

Chapter headings removed Paradise:
  
 
 

 
 
Of Man's first disobedience, and the fruit 
Of that forbidden tree whose mortal taste 
Br


### Create Features Using BoW

In [6]:
moby = text_cleaner(moby)
paradise = text_cleaner(paradise)

In [7]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
nlp.max_length = 2000000
moby_doc = nlp(moby)
paradise_doc = nlp(paradise)

In [8]:
# Group into sentences.
moby_sents = [[sent, "Melville"] for sent in moby_doc.sents]
paradise_sents = [[sent, "Milton"] for sent in paradise_doc.sents]

#Moby Dick is quite long, let's cut it down to the same length as Paradise.
moby_sents = moby_sents[0:len(paradise_sents)]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(moby_sents + paradise_sents)
sentences.head()

Unnamed: 0,0,1
0,"(ETYMOLOGY, .)",Melville
1,"((, Supplied, by, a, Late, Consumptive)",Melville
2,"(Usher, to, a, Grammar, School, ))",Melville
3,"(The, pale, Usher, threadbare, in, coat, ,, he...",Melville
4,"(;, I, see, him, now, .)",Melville


In [9]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
mobywords = bag_of_words(moby_doc)
paradisewords = bag_of_words(paradise_doc)

# Combine bags to create a set of unique words.
common_words = set(mobywords + paradisewords)

In [10]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000
Processing row 5500


Unnamed: 0,fame,restraint,cheek,speech,critical,lima,ill,mood,invariably,allude,...,dumb,dalliance,stroke,stir,desire,end,length,win,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(ETYMOLOGY, .)",Melville
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"((, Supplied, by, a, Late, Consumptive)",Melville
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Usher, to, a, Grammar, School, ))",Melville
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, pale, Usher, threadbare, in, coat, ,, he...",Melville
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(;, I, see, him, now, .)",Melville


In [12]:
#Training and Test Sets 

from sklearn.model_selection import train_test_split
# Set variables.
Y_bow = word_counts['text_source']
X_bow = np.array(word_counts.drop(['text_sentence','text_source'], 1))

# Train, test split.
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X_bow,
                                                                    Y_bow, 
                                                                    test_size=0.3, 
                                                                    random_state= None)

### Modeling Using Bow

#### Random Forest Classifier 

In [21]:
from sklearn import ensemble
rfc = ensemble.RandomForestClassifier(max_depth=20,max_features='auto', n_estimators=100)
rfc.fit(X_train_bow, y_train_bow)

print('Training set score:', rfc.score(X_train_bow, y_train_bow))
print('\nTest set score:', rfc.score(X_test_bow, y_test_bow))

cv_train = cross_val_score(rfc, X_train_bow, y_train_bow, cv=5)

### Put this with the Cros validation score.
plusminus = u"\u00B1"
 
print('\nCross validation results: {:.3%} {} {:.3%} \n \n {}'.format(cv_train.mean(), plusminus, 2*cv_train.std(), cv_train))


Training set score: 0.888198757764

Test set score: 0.838768115942

Cross validation results: 85.507% ± 1.690% 
 
 [ 0.84864166  0.84217335  0.86287193  0.86416559  0.85751295]


This is overfitting but fitting well on the Training set

#### Logistic Regression : Lasso

In [23]:
from sklearn.linear_model import LogisticRegression

ridgeregr = LogisticRegression(penalty='l1')
ridgeregr.fit(X_train_bow, y_train_bow)

print('Training set score:', ridgeregr.score(X_train_bow, y_train_bow))
print('\nTest set score:', ridgeregr.score(X_test_bow, y_test_bow))

cv_train = cross_val_score(ridgeregr, X_train_bow, y_train_bow, cv=5)

### Put this with the Cros validation score.
plusminus = u"\u00B1"
 
print('\nCross validation results: {:.3%} {} {:.3%} \n \n {}'.format(cv_train.mean(), plusminus, 2*cv_train.std(), cv_train))


Training set score: 0.937370600414

Test set score: 0.878019323671

Cross validation results: 87.966% ± 1.403% 
 
 [ 0.87063389  0.88486417  0.87839586  0.89003881  0.87435233]


This is overfitting but fitting well on the training set.   
The standard deviation is high compare to the random forest classifier

## tf-idf

### Data Cleaning for tf-idf

In [15]:
#reading in the data, this time in the form of paragraphs
moby_p =gutenberg.paras('melville-moby_dick.txt')
#processing
moby_paras=[]
for paragraph in moby_p:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    moby_paras.append(' '.join(para))

print('\nMoby Dick:\n',moby_paras[0:4])


paradise_p =gutenberg.paras('milton-paradise.txt')
#processing
paradise_paras=[]
for paragraph in paradise_p:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    paradise_paras.append(' '.join(para))

print('\nParadise:\n',paradise_paras[0:4])


Moby Dick:
 ['[ Moby Dick by Herman Melville 1851 ]', 'ETYMOLOGY .', '( Supplied by a Late Consumptive Usher to a Grammar School )', 'The pale Usher  threadbare in coat , heart , body , and brain ; I see him now .']

Paradise:
 ['[ Paradise Lost by John Milton 1667 ]', 'Book I', "Of Man ' s first disobedience , and the fruit Of that forbidden tree whose mortal taste Brought death into the World , and all our woe , With loss of Eden , till one greater Man Restore us , and regain the blissful seat , Sing , Heavenly Muse , that , on the secret top Of Oreb , or of Sinai , didst inspire That shepherd who first taught the chosen seed In the beginning how the heavens and earth Rose out of Chaos : or , if Sion hill Delight thee more , and Siloa ' s brook that flowed Fast by the oracle of God , I thence Invoke thy aid to my adventurous song , That with no middle flight intends to soar Above th ' Aonian mount , while it pursues Things unattempted yet in prose or rhyme .", 'Book II']


In [16]:
# Group into sentences.
moby_sents_tfidf = [[sent, "Melville"] for sent in moby_paras]
paradise_sents_tfidf = [[sent, "Milton"] for sent in paradise_paras]

# Combine the sentences from the two novels into one data frame.
sentences_tfidf = pd.DataFrame(moby_sents_tfidf + paradise_sents_tfidf)
sentences_tfidf.head()

Unnamed: 0,0,1
0,[ Moby Dick by Herman Melville 1851 ],Melville
1,ETYMOLOGY .,Melville
2,( Supplied by a Late Consumptive Usher to a Gr...,Melville
3,"The pale Usher threadbare in coat , heart , b...",Melville
4,""" While you take in hand to school others , an...",Melville


### Create Feature Using tf-idf

In [17]:
# Training and Test Sets 

from sklearn.feature_extraction.text import TfidfVectorizer

X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(sentences_tfidf[0],
                                                                            sentences_tfidf[1], 
                                                                            test_size=0.4, 
                                                                            random_state=None)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
X_train_tfidf=vectorizer.fit_transform(X_train_tfidf)
X_test_tfidf=vectorizer.transform(X_test_tfidf)



### Modeling Using tf-idf

#### Random Forest Classifier 

In [18]:
from sklearn import ensemble
rfc = ensemble.RandomForestClassifier()
rfc.fit(X_train_tfidf, y_train_tfidf)

print('Training set score:', rfc.score(X_train_tfidf, y_train_tfidf))
print('\nTest set score:', rfc.score(X_test_tfidf, y_test_tfidf))

cv_train = cross_val_score(rfc, X_train_tfidf, y_train_tfidf, cv=5)

### Put this with the Cros validation score.
plusminus = u"\u00B1"
 
print('\nCross validation results: {:.3%} {} {:.3%} \n \n {}'.format(cv_train.mean(), plusminus, 2*cv_train.std(), cv_train))


Training set score: 0.995274660366

Test set score: 0.990256864482

Cross validation results: 99.055% ± 0.444% 
 
 [ 0.99115044  0.99410029  0.99115044  0.98816568  0.98816568]


#### Logistic Regression : Ridge

In [19]:
from sklearn.linear_model import LogisticRegression

ridgeregr = LogisticRegression()
ridgeregr.fit(X_train_tfidf, y_train_tfidf)

print('Training set score:', ridgeregr.score(X_train_tfidf, y_train_tfidf))
print('\nTest set score:', ridgeregr.score(X_test_tfidf, y_test_tfidf))


cv_train = cross_val_score(ridgeregr, X_train_tfidf, y_train_tfidf, cv=5)

### Put this with the Cros validation score.
plusminus = u"\u00B1"
 
print('\nCross validation results: {:.3%} {} {:.3%} \n \n {}'.format(cv_train.mean(), plusminus, 2*cv_train.std(), cv_train))


Training set score: 0.989367985824

Test set score: 0.990256864482

Cross validation results: 98.937% ± 0.286% 
 
 [ 0.98820059  0.98820059  0.98820059  0.99112426  0.99112426]


We can see that using tf-idf in comparison to BoW is giving us the better results.   

### Improve on one of the model 
Pick one of the models and try to increase accuracy by at least 5 percentage points.

From the work above, it looks like the Ridge Regression and Random Forest Classifier performs about the same  using features from __tf-idf__.  
Using BoW, I will try to improve on what we have on the random Forest Classifier.    

#### Random Forest Classifier 

In [26]:

rfc = ensemble.RandomForestClassifier(max_depth=50,max_features='auto', n_estimators=200)
rfc.fit(X_train_bow, y_train_bow)

print('Training set score:', rfc.score(X_train_bow, y_train_bow))
print('\nTest set score:', rfc.score(X_test_bow, y_test_bow))

cv_train = cross_val_score(rfc, X_train_bow, y_train_bow, cv=5)

### Put this with the Cros validation score.
plusminus = u"\u00B1"
 
print('\nCross validation results: {:.3%} {} {:.3%} \n \n {}'.format(cv_train.mean(), plusminus, 2*cv_train.std(), cv_train))


Training set score: 0.930383022774

Test set score: 0.852657004831

Cross validation results: 86.258% ± 1.449% 
 
 [ 0.85510996  0.85640362  0.85899094  0.86934023  0.87305699]


We were able to improve our result.