In [19]:
import numpy as np
import nltk
import glob
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from scipy.cluster.vq import whiten
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
 
# Load data
data_folder = r"./books/"
files = sorted(glob.glob(os.path.join(data_folder, "chapter*.txt")))
chapters = []
for fn in files:
    with open(fn) as f:
        chapters.append(f.read().replace('\n', ' '))
all_text = ' '.join(chapters)

In [20]:
import nltk

In [21]:
print all_text

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [22]:
for num, ch_text in enumerate(chapters):
    ch_text = ch_text.decode('utf-8','ignore')
    chapters[num] = ch_text
all_text = all_text.decode('utf-8','ignore')

In [23]:
# create feature vectors
num_chapters = len(chapters)
fvs_lexical = np.zeros((len(chapters), 3), np.float64)
fvs_punct = np.zeros((len(chapters), 3), np.float64)
for e, ch_text in enumerate(chapters):
    # note: the nltk.word_tokenize includes punctuation
    

    tokens = nltk.word_tokenize(ch_text.lower())
    words = word_tokenizer.tokenize(ch_text.lower())
    sentences = sentence_tokenizer.tokenize(ch_text)
    vocab = set(words)
    words_per_sentence = np.array([len(word_tokenizer.tokenize(s))
                                   for s in sentences])
 
    # average number of words per sentence
    fvs_lexical[e, 0] = words_per_sentence.mean()
    # sentence length variation
    fvs_lexical[e, 1] = words_per_sentence.std()
    # Lexical diversity
    fvs_lexical[e, 2] = len(vocab) / float(len(words))
 
    # Commas per sentence
    fvs_punct[e, 0] = tokens.count(',') / float(len(sentences))
    # Semicolons per sentence
    fvs_punct[e, 1] = tokens.count(';') / float(len(sentences))
    # Colons per sentence
    fvs_punct[e, 2] = tokens.count(':') / float(len(sentences))
 
# apply whitening to decorrelate the features
fvs_lexical = whiten(fvs_lexical)
fvs_punct = whiten(fvs_punct)

In [24]:
i = 0
for chapter in chapters:
    print(i,':',chapter[:40])
    i+=1

(0, ':', u' 10-K 1 a2016form10-k.htm FORM 10-K     ')
(1, ':', u' 10-K 1 a2015form10-k.htm FORM 10-K     ')
(2, ':', u' 10-K 1 a2014form10-kq42014.htm FORM 10-')
(3, ':', u' 10-K 1 a2013form10-kq42013.htm FORM 10-')
(4, ':', u' 10-K 1 amzn-20161231x10k.htm FORM 10-K ')
(5, ':', u' 10-K 1 amzn-20151231x10k.htm FORM 10-K ')
(6, ':', u' 10-K 1 amzn-20141231x10k.htm FORM 10-K ')
(7, ':', u' 10-K 1 amzn-20131231x10k.htm FORM 10-K ')


In [25]:
print(fvs_lexical)
print(fvs_punct)

[[ 13.13704919  19.41564851   3.40078282]
 [ 13.04144336  19.11711712   3.48684797]
 [ 13.00195036  19.08429878   3.77173869]
 [ 12.76193038  18.70033585   3.69637967]
 [ 15.08392111  17.10007293   5.49206889]
 [ 14.79174992  16.82074908   5.43014283]
 [ 14.87933335  17.04965492   5.4731785 ]
 [ 15.11225691  17.68816674   5.85598941]]
[[ 5.10424566  5.39541748  5.20753883]
 [ 5.1632636   5.53303653  5.06911569]
 [ 5.1578059   5.44373908  4.77052986]
 [ 4.91566093  5.22923259  4.35303042]
 [ 7.26871103  6.56215711  2.7929527 ]
 [ 6.96301008  6.25905929  2.70419361]
 [ 6.99313078  6.72362715  2.94898217]
 [ 7.06733365  8.43264667  3.26638949]]


In [26]:
# get most common words in the whole book
NUM_TOP_WORDS = 20
all_tokens = nltk.word_tokenize(all_text)
all_tokens_lst = [nltk.word_tokenize(chapter) for chapter in chapters]

print(all_tokens[:10])
fdist = nltk.FreqDist(all_tokens)
vocab = fdist.keys()[:NUM_TOP_WORDS]

fdist_lst = [nltk.FreqDist(all_tokens_chapter) for all_tokens_chapter in all_tokens_lst]
vocab_lst = []

for fdist in fdist_lst:
    vocab = fdist.keys()[:NUM_TOP_WORDS]
    vocab_lst.append(vocab)

# print(vocab_lst)
import itertools
vocab = itertools.chain(*vocab_lst)
vocab = set(vocab)
vocab = list(vocab)

# use sklearn to create the bag for words feature vector for each chapter
vectorizer = CountVectorizer(vocabulary=vocab, tokenizer=nltk.word_tokenize)
print(vectorizer.fit_transform(chapters).toarray().astype(np.float64))
fvs_bow = vectorizer.fit_transform(chapters).toarray().astype(np.float64)

# normalise by dividing each row by its Euclidean norm
# print(np.c_[np.apply_along_axis(np.linalg.norm, 1, fvs_bow)]==0)
fvs_bow /= np.c_[np.apply_along_axis(np.linalg.norm, 1, fvs_bow)]

[u'10-K', u'1', u'a2016form10-k.htm', u'FORM', u'10-K', u'Document', u'UNITED', u'STATES', u'SECURITIES', u'AND']
[[  0.   1.   1.   1.   0.   0.   2.   0.   0.   1.   1.   0.   0.   1.
   13.   1.   0.   0.   0.   0.   0.   1.   0.   0.   3.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   9.   0.   0.   0.   0.   1.   1.
    0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   1.   0.
    0.   1.   0.   0.   5.   2.   0.   0.   0.   0.   0.   0.   0.]
 [  0.   3.   1.   1.   0.   1.   2.   0.   0.   0.   0.   0.   0.   1.
   14.   2.   1.   0.   0.   1.   0.   0.   0.   0.   4.   0.   0.   0.
    0.   0.   0.   1.   0.   0.   0.   8.   0.   0.   0.   0.   0.   1.
    0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   1.   0.
    0.   1.   0.   0.   5.   2.   1.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   1.   1.   0.   3.   1.   0.   0.   0.   0.   0.   0.   0.
   15.   2.   1.   0.   0.   1.   0.   0.   0.   0.   5.   0.   1.   1.
    0.   0.   0.   0.   0.   0

In [27]:
print(fvs_bow)

[[ 0.          0.05725983  0.05725983  0.05725983  0.          0.
   0.11451967  0.          0.          0.05725983  0.05725983  0.          0.
   0.05725983  0.74437783  0.05725983  0.          0.          0.          0.
   0.          0.05725983  0.          0.          0.1717795   0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.5153385   0.          0.          0.          0.
   0.05725983  0.05725983  0.          0.          0.          0.          0.
   0.          0.05725983  0.          0.          0.          0.          0.
   0.05725983  0.          0.          0.05725983  0.          0.
   0.28629917  0.11451967  0.          0.          0.          0.          0.
   0.          0.        ]
 [ 0.          0.1641527   0.05471757  0.05471757  0.          0.05471757
   0.10943513  0.          0.          0.          0.          0.          0.
   0.05471757  0.76604592  0.10943513  0.05471757  0.          0.
   0.054717

In [28]:
# get part of speech for each token in each chapter
from nltk.data import load
def token_to_pos(ch):
    tokens = nltk.word_tokenize(ch)
    return [p[1] for p in nltk.pos_tag(tokens)]
chapters_pos = [token_to_pos(ch) for ch in chapters]
 
# count frequencies for common POS types
pos_list = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS']
# tagdict = load('help/tagsets/upenn_tagset.pickle')
# pos_list = tagdict.keys()

fvs_syntax = np.array([[ch.count(pos) for pos in pos_list]
                       for ch in chapters_pos]).astype(np.float64)

print(fvs_syntax)
# normalise by dividing each row by number of tokens in the chapter
fvs_syntax /= np.c_[np.array([len(ch) for ch in chapters_pos])]

[[ 13335.  14054.   6433.  11021.   6915.   6844.]
 [ 12918.  13369.   6249.  10596.   6636.   6588.]
 [ 11686.  12416.   5529.   9806.   6250.   6372.]
 [ 12172.  13033.   5932.  10247.   6372.   6579.]
 [  6229.   4042.   1983.   4705.   3615.   4182.]
 [  6240.   3898.   1988.   4707.   3692.   4185.]
 [  6348.   4165.   2047.   4849.   3708.   4227.]
 [  5868.   4161.   1871.   4507.   3369.   3882.]]


In [29]:
from nltk.data import load
tagdict = load('help/tagsets/upenn_tagset.pickle')
pos_list = tagdict.keys()
pos_list

['PRP$',
 'VBG',
 'VBD',
 '``',
 'VBN',
 ',',
 "''",
 'VBP',
 'WDT',
 'JJ',
 'WP',
 'VBZ',
 'DT',
 'RP',
 '$',
 'NN',
 ')',
 '(',
 'FW',
 'POS',
 '.',
 'TO',
 'LS',
 'RB',
 ':',
 'NNS',
 'NNP',
 'VB',
 'WRB',
 'CC',
 'PDT',
 'RBS',
 'RBR',
 'CD',
 'PRP',
 'EX',
 'IN',
 'WP$',
 'MD',
 'NNPS',
 '--',
 'JJS',
 'JJR',
 'SYM',
 'UH']

In [30]:
print(pos_list)

['PRP$', 'VBG', 'VBD', '``', 'VBN', ',', "''", 'VBP', 'WDT', 'JJ', 'WP', 'VBZ', 'DT', 'RP', '$', 'NN', ')', '(', 'FW', 'POS', '.', 'TO', 'LS', 'RB', ':', 'NNS', 'NNP', 'VB', 'WRB', 'CC', 'PDT', 'RBS', 'RBR', 'CD', 'PRP', 'EX', 'IN', 'WP$', 'MD', 'NNPS', '--', 'JJS', 'JJR', 'SYM', 'UH']


In [31]:
print(fvs_syntax)

[[ 0.13690262  0.14428417  0.06604384  0.11314614  0.07099225  0.07026333]
 [ 0.13769066  0.14249779  0.06660698  0.11294088  0.07073194  0.07022032]
 [ 0.13411373  0.14249154  0.06345326  0.11253802  0.07172778  0.0731279 ]
 [ 0.13435323  0.14385686  0.06547678  0.11310529  0.07033345  0.0726183 ]
 [ 0.13751159  0.08923131  0.04377677  0.10386772  0.07980485  0.09232196]
 [ 0.13812032  0.08628093  0.04400372  0.10418788  0.08172119  0.09263358]
 [ 0.13805402  0.09057892  0.04451742  0.10545431  0.08064025  0.09192728]
 [ 0.13698438  0.09713565  0.04367719  0.10521278  0.07864696  0.09062259]]


In [32]:
def PredictAuthors(fvs):
    km = KMeans(n_clusters=2, init='k-means++', n_init=10, verbose=0)
    km.fit(fvs)
 
    return km

In [33]:
print('Lexical:', PredictAuthors(fvs_lexical).labels_)
# print('Punc:',PredictAuthors(fvs_punct).labels_)
print('Syntax:',PredictAuthors(fvs_syntax).labels_)
lexical_predict = PredictAuthors(fvs_lexical).labels_
syntax_predict = PredictAuthors(fvs_syntax).labels_


('Lexical:', array([1, 1, 1, 1, 0, 0, 0, 0]))
('Syntax:', array([0, 0, 0, 0, 1, 1, 1, 1]))


In [34]:
print('BOW:',PredictAuthors(fvs_bow).labels_)
bow_predict = PredictAuthors(fvs_bow).labels_

('BOW:', array([0, 0, 0, 0, 1, 1, 1, 1]))


In [35]:
true_classification = np.array([0,0,0,0,1,1,1,1])
true_classification = 1-true_classification
print(100*sum(lexical_predict - true_classification)/len(true_classification))
print(100*sum(syntax_predict - true_classification)/len(true_classification))
print(100*sum(bow_predict - true_classification)/len(true_classification))


0
0
0


In [None]:
PredictAuthors(fvs_syntax).cluster_centers_

In [None]:
PredictAuthors(fvs_syntax).predict([[0.12593638,  0.04548909,  0.08663554,  0.09901205, 0.05591141,  0.02073608]])

In [None]:
PredictAuthors(fvs_bow).predict( [[ 0.,          0.,          0.,          0.,          0.,          0.98058068,
   0.,          0.,          0.,          0.19611614]])

In [None]:
PredictAuthors(fvs_bow).cluster_centers_

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline  

centroids = PredictAuthors(fvs_bow).cluster_centers_

print(centroids)

plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
