In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

In [2]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [4]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en_core_web_sm')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [5]:
#dataframe practice
list1 = [["abc", "A"], ["bcd", "A"]]
list2 = [["ddd", "B"], ["rrr", "B"]]

df_sample = pd.DataFrame(list1 + list2)
df_sample

Unnamed: 0,0,1
0,abc,A
1,bcd,A
2,ddd,B
3,rrr,B


In [6]:
# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


In [7]:
len(sentences)

6013

In [8]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_.lower()
                for token in text
                if not token.is_punct
                # simply calling token.is_stop mixes stop words in uppercase
                #and not token.is_stop and token.lemma_ != "-PRON-"]
                and not token.lemma_.lower() in stopwords.words('english')
                and not token.is_digit
                and token.lemma_ != "-PRON-"]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]


# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [9]:
# # tokenization practice
# for token in alice_doc[190:220]:
#     if token.lemma_.lower() in stopwords.words('english'):
#         print(token.lower_, " is stop word")
#     else:
#         print(token.lower_)
# check if any stop words are mixed
np.sum([word in stopwords.words('english') for word in alicewords])

0

In [10]:
common_words.remove("`")
common_words.remove("'s")

In [11]:
# Use custom tokenizer and lemmatizer
# a spacy span object is just a sentence nothing fancy
def lemmatizer_tokenize(spacy_span_input):
    words = [token.lemma_ for token in spacy_span_input
                if (
                    not token.is_digit and
                    not token.is_punct and 
                    not token.is_stop and 
                    token.lemma_ in common_words
                 )]
    return words

In [12]:
# Optimizing the code by using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# these are stop words used by spacy
# stopwords.words('english'))
vectorizer = CountVectorizer(vocabulary=common_words, stop_words=stopwords.words('english'),
                             tokenizer=lemmatizer_tokenize, lowercase=False)
bow_features = pd.DataFrame(vectorizer.transform(sentences[0]).toarray(), columns = vectorizer.get_feature_names())
bow_features.head()

Unnamed: 0,abide,ability,able,abroad,absence,absent,absolute,absolutely,absurd,abuse,...,yes,yesterday,yet,yield,young,youth,zeal,zealand,zealous,zigzag
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
vectorizer.get_feature_names()[:10]

['abide',
 'ability',
 'able',
 'abroad',
 'absence',
 'absent',
 'absolute',
 'absolutely',
 'absurd',
 'abuse']

In [14]:
len(common_words)

3012

In [15]:
bow_features.shape

(6013, 3012)

In [16]:
sentences.shape

(6013, 2)

In [17]:
bow_features.values

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = sentences[1]
X = bow_features.values

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))



Training set score: 0.9561962850013862

Test set score: 0.7834580216126351


In [19]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))



(3607, 3012) (3607,)
Training set score: 0.9026892154144719

Test set score: 0.8316708229426434


In [20]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.8114776822844469

Test set score: 0.773067331670823


In [21]:
# Clean the Emma data.
emma = gutenberg.raw('austen-emma.txt')
emma = re.sub(r'VOLUME \w+', '', emma)
emma = re.sub(r'CHAPTER \w+', '', emma)
emma = text_cleaner(emma)
print(emma[:100])

# Parse our cleaned data.
emma_doc = nlp(emma)

# Parse our cleaned data.
emma_doc = nlp(emma)
# Group into sentences.
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents]

# Emma is quite long, let's cut it down to the same length as Alice.
emma_sents = emma_sents[0:len(alice_sents)]

# Build a new Bag of Words data frame for Emma word counts.
# We'll use the same common words from Alice and Persuasion.
emma_sentences = pd.DataFrame(emma_sents)
#emma_bow = bow_features(emma_sentences, common_words)

# Optimizing the code by using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# these are stop words used by spacy
# stopwords.words('english'))
vectorizer = CountVectorizer(vocabulary=common_words, stop_words=stopwords.words('english'),
                             tokenizer=lemmatizer_tokenize, lowercase=False)
bow_features_ema = pd.DataFrame(vectorizer.transform(emma_sentences[0]).toarray(), columns = vectorizer.get_feature_names())
bow_features_ema.head()

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to


Unnamed: 0,abide,ability,able,abroad,absence,absent,absolute,absolutely,absurd,abuse,...,yes,yesterday,yet,yield,young,youth,zeal,zealand,zealous,zigzag
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
bow_features_ema.shape

(1716, 3012)

In [23]:
# Now we can model it!
# Let's use logistic regression again.

# Combine the Emma sentence data with the Alice data from the test set.
# we are using the train set on the code below
# I don't know why it says testset
X_Emma_test = np.concatenate((
    X_train[y_train[y_train=='Carroll'].index],
    bow_features_ema.values
    
), axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * bow_features_ema.shape[0])])

# Model.
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.6689655172413793


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1614,102
Carroll,810,229


## Challenge 0

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(vocabulary=common_words, stop_words=stopwords.words('english'),
                             tokenizer=lemmatizer_tokenize, lowercase=False)
bow_features = pd.DataFrame(vectorizer.transform(sentences[0]).toarray(), columns = vectorizer.get_feature_names())
bow_features.head()

Unnamed: 0,abide,ability,able,abroad,absence,absent,absolute,absolutely,absurd,abuse,...,yes,yesterday,yet,yield,young,youth,zeal,zealand,zealous,zigzag
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
bow_features["Num_of_Words"] = bow_features.sum(axis=1)
punc_count = []
cconcj_count = []
num_count = []
particle_count = []
the_poss = []
for sent in sentences[0]:
    pcount = 0
    cccount = 0
    ncount = 0
    pacount = 0
    for token in sent:
        the_poss.append(token.pos_)
        if(token.pos_ == 'PUNCT'):
            pcount += 1
        if(token.pos_ == "CCONJ"):
            cccount += 1
        if(token.pos_ == "NUM"):
            ncount += 1
        if(token.pos_ == "PART"):
            pacount += 1
    punc_count.append(pcount)
    cconcj_count.append(cccount)
    num_count.append(ncount)
    particle_count.append(pacount)

#print(punc_count[0:10])
bow_features["punctuation_count"] = punc_count
bow_features["conjuction_count"] = cconcj_count
bow_features["Numeric_count"] = num_count
bow_features["Particle_count"] = particle_count
#bow_features["previous_state_num_words"] = bow_features.Num_of_Words.shift(1).fillna(0)
#bow_features["next_state_num_words"] = bow_features.Num_of_Words.shift(-1).fillna(0)

Counter(the_poss)

Counter({'PROPN': 6392,
         'VERB': 24103,
         'PART': 3510,
         'ADV': 10136,
         'ADJ': 7276,
         'ADP': 13823,
         'DET': 13470,
         'NOUN': 15649,
         'PUNCT': 21301,
         'CCONJ': 4949,
         'PRON': 10792,
         'INTJ': 378,
         'NUM': 731,
         'AUX': 211,
         'X': 1})

In [26]:
# split into train and test set
Y = sentences[1]
X = bow_features.values

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)

In [27]:
lr = LogisticRegression(C=1.5)
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))



(3607, 3017) (3607,)
Training set score: 0.9179373440532298

Test set score: 0.8391521197007481


In [28]:
from sklearn.svm import LinearSVC

lsvc = LinearSVC()
train = lsvc.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lsvc.score(X_train, y_train))
print('\nTest set score:', lsvc.score(X_test, y_test))



(3607, 3017) (3607,)
Training set score: 0.9478791239257001

Test set score: 0.8395677472984207


## Challenge 1

In [29]:
# Clean the Emma data.
emma = gutenberg.raw('austen-emma.txt')
emma = re.sub(r'VOLUME \w+', '', emma)
emma = re.sub(r'CHAPTER \w+', '', emma)
emma = text_cleaner(emma)
print(emma[:100])
# Parse our cleaned data.
emma_doc = nlp(emma)
# Group into sentences.
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents]

# Emma is quite long, let's cut it down to the same length as Alice.
emma_sents = emma_sents[0:len(alice_sents)]

# Build a new Bag of Words data frame for Emma word counts.
# We'll use the same common words from Alice and Persuasion.
emma_sentences = pd.DataFrame(emma_sents)
#emma_bow = bow_features(emma_sentences, common_words)

# Optimizing the code by using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# these are stop words used by spacy
# stopwords.words('english'))
vectorizer = CountVectorizer(vocabulary=common_words, stop_words=stopwords.words('english'),
                             tokenizer=lemmatizer_tokenize, lowercase=False)
bow_features_ema = pd.DataFrame(vectorizer.transform(emma_sentences[0]).toarray(), columns = vectorizer.get_feature_names())
bow_features_ema.head()

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to


Unnamed: 0,abide,ability,able,abroad,absence,absent,absolute,absolutely,absurd,abuse,...,yes,yesterday,yet,yield,young,youth,zeal,zealand,zealous,zigzag
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
bow_features_ema["Num_of_Words"] = bow_features_ema.sum(axis=1)
punc_count = []
cconcj_count = []
num_count = []
particle_count = []
the_poss = []
for sent in emma_sentences[0]:
    pcount = 0
    cccount = 0
    ncount = 0
    pacount = 0
    for token in sent:
        the_poss.append(token.pos_)
        if(token.pos_ == 'PUNCT'):
            pcount += 1
        if(token.pos_ == "CCONJ"):
            cccount += 1
        if(token.pos_ == "NUM"):
            ncount += 1
        if(token.pos_ == "PART"):
            pacount += 1
    punc_count.append(pcount)
    cconcj_count.append(cccount)
    num_count.append(ncount)
    particle_count.append(pacount)

#print(punc_count[0:10])
bow_features_ema["punctuation_count"] = punc_count
bow_features_ema["conjuction_count"] = cconcj_count
bow_features_ema["Numeric_count"] = num_count
bow_features_ema["Particle_count"] = particle_count
#bow_features["previous_state_num_words"] = bow_features.Num_of_Words.shift(1).fillna(0)
#bow_features["next_state_num_words"] = bow_features.Num_of_Words.shift(-1).fillna(0)

Counter(the_poss)

Counter({'PROPN': 1360,
         'PUNCT': 4533,
         'ADJ': 1983,
         'CCONJ': 1212,
         'ADP': 3043,
         'DET': 3077,
         'NOUN': 3735,
         'VERB': 5617,
         'PART': 771,
         'ADV': 2474,
         'NUM': 159,
         'PRON': 2764,
         'AUX': 27,
         'INTJ': 106})

In [31]:
# Let's use logistic regression again.

# Combine the Emma sentence data with the Alice data from the test set.
# we are using the train set on the code below
# I don't know why it says testset
X_Emma_test = np.concatenate((
    X_train[y_train[y_train=='Carroll'].index],
    bow_features_ema.values
    
), axis=0)
y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * bow_features_ema.shape[0])])

# Model.
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.6809437386569873


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1621,95
Carroll,784,255


In [None]:
# Clean the Emma data.
caesar = gutenberg.raw('shakespeare-caesar.txt')
caesar = re.sub(r'VOLUME \w+', '', caesar)
caesar = re.sub(r'CHAPTER \w+', '', caesar)
caesar = text_cleaner(caesar)
print(caesar[:100])
# Parse our cleaned data.
ceasar_doc = nlp(caesar)
# Group into sentences.
#persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
ceasar_sents = [[sent, "Caesar"] for sent in ceasar_doc.sents]

# Emma is quite long, let's cut it down to the same length as Alice.
ceasar_sents = emma_sents[0:len(alice_sents)]

ceasar_sentences = pd.DataFrame(ceasar_sents)
#emma_bow = bow_features(emma_sentences, common_words)

# Optimizing the code by using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# these are stop words used by spacy
# stopwords.words('english'))
vectorizer = CountVectorizer(vocabulary=common_words, stop_words=stopwords.words('english'),
                             tokenizer=lemmatizer_tokenize, lowercase=False)
bow_features_ceasar = pd.DataFrame(vectorizer.transform(ceasar_sentences[0]).toarray(), columns = vectorizer.get_feature_names())
bow_features_ceasar.head()

Actus Primus. Scoena Prima. Enter Flauius, Murellus, and certaine Commoners ouer the Stage. Flauius.
