In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk

import warnings
# Suppressing annoying harmless error
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)

from warnings import simplefilter
warnings.simplefilter('ignore')
simplefilter(action='ignore', category=FutureWarning)

# Classifiers evaluation metrics
from sklearn.metrics import accuracy_score, roc_auc_score, auc, roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef

# Libraries for text tf-idf classfication and test-train split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [2]:
# This is the utility function for standard text cleaning
# Removes any double dashes in the text such as '--'
# Removes any special characters such as left and right brackets ([]), and
# retains alpha-numeric text only
# Returns the first 900,000 words because of limited processing power in my local machine
#

def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--', ' ', text)
    text = re.sub("[\[].*?[\]]", "", text)
    return text


def read_till_end(filename):
    # Beyond the last chapter of the book there are some extra text about the author, publisher and other
    # information that are not related to the main story.
    # As these texts do not pertain to the main story we want to strip them off the text
    # All books have 'THE END' indicating end of the story, after which some additional text
    # appears about gutenberg organization

    f = open(filename, 'r')
    string = "THE END"
    return_string = ''
    finished = False
    for line in f:
        if not finished:
            if line.find(string) == -1:
                return_string += line
            else:
                finished = True
                return return_string


def strip_intro(text):
    # There is also some text at the very beginning of the book that are not related to the story. For example
    # licensing method, publisher's name etc. These should be stripped as well.

    content_pos = text.find('CONTENTS') + 9
    return text[content_pos:]

# Import the following 6 novels in .txt form (imported from https://www.gutenberg.org/browse/scores/top)
# Huckleberry Finn - Mark Twain
# Tom Sawyer - Mark Twain
# Ulysses - James Joyce
# Tale of Two Cities - Charles Dickens
# Jekyll & Hyde - R L Stevenson, and
# Importance of being earnest - Oscal Wilde
#

# Intitlize the strings where book texts will be stored

h_finn = ""  # Huckleberry Finn
t_sawyer = ""  # Tom Sawyer
ulysses = ""  # Ulysses
t_cities = ""  # Tale of Two Cities
j_and_h = ""  # Jekyll and Hyde
oscar_w = ""  # Importance of being earnest - Oscar Wilde

# Read and strip unwanted text in Huckleberry Finn
filename = 't_sawyer_h_finn.txt'
h_finn = read_till_end(filename)
h_finn = strip_intro(h_finn)

# Read and strip unwanted text in Tom Sawyer
filename = 't_sawyer_twain.txt'
t_sawyer = read_till_end(filename)
t_sawyer = strip_intro(t_sawyer)

# Read and strip unwanted text in Ulysses
filename = 'j_joyce_ulysses.txt'
ulysses = read_till_end(filename)
ulysses = strip_intro(ulysses)

# Read and strip unwanted text in Tale of Two Cities
filename = 'c_dickens.txt'
t_cities = read_till_end(filename)
t_cities = strip_intro(t_cities)

# Read and strip unwanted text in J & H
filename = 'rl_stevenson.txt'
j_and_h = read_till_end(filename)
j_and_h = strip_intro(j_and_h)

# Read and strip unwanted text in Oscar Wilde novel
filename = 'oscar_wilde.txt'
oscar_w = read_till_end(filename)
oscar_w = strip_intro(oscar_w)

# Clean the data.
h_finn_clean = text_cleaner(h_finn)
t_sawyer_clean = text_cleaner(t_sawyer)
ulysses_clean = text_cleaner(ulysses)
t_cities_clean = text_cleaner(t_cities)
j_and_h_clean = text_cleaner(j_and_h)
oscar_w_clean = text_cleaner(oscar_w)

In [3]:
nlp = spacy.load('en')
j_and_h_doc = nlp(j_and_h_clean)

In [4]:
# For word2vec verification, organize the parsed doc into sentences, while filtering out punctuation
# and stop words, and converting words to lower case lemmas.
sentences = []
for sentence in j_and_h_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)


print(sentences[20])
print('We have {} sentences and {} tokens.'.format(len(sentences), len(j_and_h_doc)))

['in', 'character', 'frequently', 'fortune', '\n', 'reputable', 'acquaintance', 'good', 'influence', '\n', 'life', 'downgo', 'man']
We have 1805 sentences and 33006 tokens.


In [5]:
import gensim
from gensim.models import word2vec

model = word2vec.Word2Vec(
    j_and_h_doc,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

TypeError: 'spacy.tokens.token.Token' object is not iterable

In [None]:
# List of words in model.
vocab = model.wv.vocab.keys()

#print(model.wv.most_similar(
#    positive=['lady', 'man'], negative=['woman']), '\n')

# Similarity is calculated using the cosine
# So a 1 is total similarity and 0 is no similarity
print('R L Stevenson similarity ', model.wv.similarity('screaming', 'resistance'))

# One of these things is not like the other...
#print('Word different in breakfast/marriage/dinner/lunch is =',
#      model.doesnt_match("breakfast marriage dinner lunch".split()))