In [1]:
import os

In [2]:
# Directory (relative to the notebook) that holds the corpus files.
CORPUS_PATH = 'data'

In [3]:
# Paths of every entry in the corpus directory, in lexicographic order.
# Sorting the bare names first gives the same order as sorting the joined
# paths, because every path shares the same CORPUS_PATH prefix.
filenames = [os.path.join(CORPUS_PATH, entry) for entry in sorted(os.listdir(CORPUS_PATH))]

In [4]:
len(filenames)

11

In [5]:
filenames[:11]

['data/Cluster0',
 'data/Cluster1',
 'data/Cluster10',
 'data/Cluster2',
 'data/Cluster3',
 'data/Cluster4',
 'data/Cluster5',
 'data/Cluster6',
 'data/Cluster7',
 'data/Cluster8',
 'data/Cluster9']

In [6]:
import numpy as np
import itertools
import operator

In [7]:
def grouper(n, iterable, fillvalue=None):
    """Split ``iterable`` into consecutive tuples of length ``n``.

    The last tuple is padded with ``fillvalue`` when the input runs out
    before it is full.  Returns a lazy ``itertools.zip_longest`` iterator.
    """
    source = iter(iterable)
    # Handing the *same* iterator to zip_longest n times makes it draw
    # n consecutive items for every tuple it produces.
    return itertools.zip_longest(*itertools.repeat(source, n), fillvalue=fillvalue)

In [8]:
doctopic_triples = []
mallet_docnames = []

In [9]:
# Parse the doc-topic table (presumably MALLET's --output-doc-topics file; verify)
# into (docname, topic, share) triples.  Each data row is tab-separated:
# a document number, a document name, then alternating topic / share columns.
# Both accumulators are reset here so that re-running this cell replaces the
# parsed data instead of appending a second copy of every row.
doctopic_triples = []
mallet_docnames = []
with open("/tmp/doc-topic.txt") as f:
    f.readline()  # read one line in order to skip the header
    for line in f:
        line = line.rstrip()
        if not line:
            continue  # skip blank lines instead of failing on the unpack below
        docnum, docname, *values = line.split('\t')
        mallet_docnames.append(docname)
        # values come in (topic, share) pairs
        for topic, share in grouper(2, values):
            doctopic_triples.append((docname, int(topic), float(share)))

In [10]:
doctopic_triples = sorted(doctopic_triples, key=operator.itemgetter(0,1))

In [11]:
mallet_docnames = sorted(mallet_docnames)

In [12]:
num_docs = len(mallet_docnames)

In [13]:
num_topics = len(doctopic_triples) // len(mallet_docnames)

In [14]:
doctopic = np.zeros((num_docs, num_topics))

In [15]:
# Fill the document-topic matrix: one row per document, one column per topic.
# The row of each document name is looked up in a dict built once up front,
# instead of calling list.index() (a linear scan) for every single triple.
row_of = {}
for row_num, docname in enumerate(mallet_docnames):
    row_of.setdefault(docname, row_num)  # keep the first occurrence, as list.index() does

for docname, topic, share in doctopic_triples:
    doctopic[row_of[docname], topic] = share

In [16]:
#doctopic = np.zeros((num_docs, num_topics))

In [17]:
#for i, (doc_name, triples) in enumerate(itertools.groupby(doctopic_triples, key=operator.itemgetter(0))):
#    doctopic[i, :] = np.array([share for _, _, share in triples])

In [18]:
novel_names = []

In [19]:
# One group label per corpus file: the file's base name without its extension.
# The list is rebuilt from scratch (rather than appended to) so that re-running
# this cell cannot duplicate entries, and so that it still works after
# novel_names has been converted to a NumPy array further down.
novel_names = [os.path.splitext(os.path.basename(fn))[0] for fn in filenames]

Cluster0
Cluster1
Cluster10
Cluster2
Cluster3
Cluster4
Cluster5
Cluster6
Cluster7
Cluster8
Cluster9


In [20]:
novel_names = np.asarray(novel_names)

In [21]:
doctopic_orig = doctopic.copy()

In [22]:
num_groups = len(set(novel_names))

In [24]:
doctopic_grouped = np.zeros((num_groups, num_topics))

In [25]:
# Average the topic shares of all rows that carry the same group name.
# Groups are visited in sorted order, so row i holds the i-th sorted name.
for i, name in enumerate(sorted(set(novel_names))):
    in_group = novel_names == name  # boolean row mask (novel_names is an array here)
    doctopic_grouped[i] = doctopic[in_group].mean(axis=0)

In [30]:
doctopic = doctopic_grouped
#print (doctopic)

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
CORPUS_PATH_UNSPLIT = os.path.join('data')

In [33]:
filenames = [os.path.join(CORPUS_PATH_UNSPLIT, fn) for fn in sorted(os.listdir(CORPUS_PATH_UNSPLIT))]

In [34]:
vectorizer = CountVectorizer(input='filename')

In [35]:
dtm = vectorizer.fit_transform(filenames)  # a sparse matrix

In [37]:
print (dtm)

  (0, 1574)	1
  (0, 1373)	1
  (0, 2794)	1
  (0, 3818)	1
  (0, 1412)	1
  (0, 4137)	1
  (0, 2076)	2
  (0, 3306)	1
  (0, 1219)	1
  (0, 841)	1
  (0, 3232)	1
  (0, 3555)	1
  (0, 4073)	1
  (0, 2412)	1
  (0, 4096)	1
  (0, 1855)	1
  (0, 1468)	1
  (0, 3652)	1
  (0, 2281)	1
  (0, 2224)	1
  (0, 2688)	1
  (0, 1614)	1
  (0, 546)	1
  (0, 3547)	1
  (0, 1146)	2
  :	:
  (10, 2795)	3
  (10, 2240)	8
  (10, 2236)	28
  (10, 2025)	1
  (10, 390)	97
  (10, 857)	1
  (10, 3959)	47
  (10, 785)	1
  (10, 2620)	4
  (10, 460)	18
  (10, 3909)	19
  (10, 1717)	1
  (10, 3828)	1
  (10, 3266)	1
  (10, 2035)	46
  (10, 1111)	2
  (10, 1691)	35
  (10, 561)	9
  (10, 1890)	10
  (10, 4107)	7
  (10, 1824)	13
  (10, 2790)	15
  (10, 2772)	107
  (10, 2008)	1
  (10, 3910)	148


In [38]:
dtm.shape

(11, 4296)

In [39]:
dtm.data.nbytes    # bytes used by the stored (non-zero) values array only; assuming dtm is a SciPy sparse matrix, its index arrays are NOT counted here

81056

In [40]:
dtm.toarray().data.nbytes      # number of bytes dtm as array takes up

378048

In [41]:
doctopic_orig.shape

(11, 20)

In [42]:
doctopic_orig.data.nbytes  # number of bytes document-topic shares take up

1760

In [43]:
novels = sorted(set(novel_names))

In [44]:
print("Top topics in...")

Top topics in...


In [45]:
# For every row of the document-topic matrix, print the indices of its three
# largest topic shares next to the corresponding group name.
for i, shares in enumerate(doctopic):
    # argsort is ascending, so reverse it and keep the first three
    top_topics = np.argsort(shares)[::-1][:3]
    top_topics_str = ' '.join(map(str, top_topics))
    print("{}: {}".format(novels[i], top_topics_str))

Cluster0: 19 1 18
Cluster1: 16 6 14
Cluster10: 10 13 1
Cluster2: 15 4 14
Cluster3: 17 6 14
Cluster4: 7 0 2
Cluster5: 2 6 1
Cluster6: 5 13 18
Cluster7: 9 0 13
Cluster8: 12 0 4
Cluster9: 8 11 4


In [46]:
# Read every line of the topic-keys file into memory.  The handle is
# deliberately not called `input`: at notebook top level that name would stay
# bound to the (closed) file and shadow the built-in input() in later cells.
with open('/tmp/topic-keys.txt') as topic_keys_file:
    topic_keys_lines = topic_keys_file.readlines()

In [47]:
topic_words = []

In [48]:
# Each line of the topic-keys file must have exactly three tab-separated
# columns; only the third (a space-separated word list) is kept.
# Built as a comprehension so that (a) re-running the cell replaces the list
# instead of appending duplicates and (b) the throwaway column names stay
# local rather than rebinding the notebook-level `_`, which IPython uses for
# the previous cell's output.
topic_words = [
    words.rstrip().split(' ')  # rstrip drops the trailing '\n'
    for _col0, _col1, words in (line.split('\t') for line in topic_keys_lines)
]

In [49]:
topic_words[0]

['public',
 'management',
 'systems',
 'sector',
 'paper',
 'organizational',
 'case',
 'change',
 'processes',
 'success',
 'organization',
 'service',
 'social',
 'human',
 'agencies',
 'purpose',
 'integrated',
 'strategy',
 'developing']

In [50]:
N_WORDS_DISPLAY = 19

In [51]:
# Print the first N_WORDS_DISPLAY words of every topic, one topic per line.
for t, terms in enumerate(topic_words):
    shown = ' '.join(terms[:N_WORDS_DISPLAY])
    print("Topic {}: {}".format(t, shown))

Topic 0: public management systems sector paper organizational case change processes success organization service social human agencies purpose integrated strategy developing
Topic 1: information adoption influence findings related survey show media explain experience states suggest found issues behavior national community growing addition
Topic 2: performance transparency participation studies capability managerial bureaucrats municipal city greater increased eas consistent countries initiatives measurement programs higher evolution
Topic 3: abstract study level future citizen implications practice characteristics structure policy perspective technological concept include set years general power nature
Topic 4: information research framework system development theoretical implementation institutional support challenges organizations important proposed develop theories effective building identified corporate
Topic 5: development countries governance developing economic innovation infra

In [44]:
austen_indices, cbronte_indices = [], []

In [45]:
# Collect the positions (within the sorted unique group names) whose name
# contains "Austen" or "CBronte"; a name containing both is counted as Austen only.
# NOTE(review): the file names listed earlier in this notebook are Cluster0 .. Cluster10,
# which contain neither substring, so both index lists appear to stay empty and the
# averages computed from them come out as NaN -- confirm which groups should be compared.
for index, fn in enumerate(sorted(set(novel_names))):
    if "Austen" in fn:
        austen_indices.append(index)
    elif "CBronte" in fn:
        cbronte_indices.append(index)

In [46]:
austen_avg = np.mean(doctopic[austen_indices, :], axis=0)

  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


In [47]:
cbronte_avg = np.mean(doctopic[cbronte_indices, :], axis=0)

  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


In [48]:
keyness = np.abs(austen_avg - cbronte_avg)

In [49]:
ranking = np.argsort(keyness)[::-1]  # from highest to lowest; [::-1] reverses order in Python sequences

In [50]:
ranking[:10]

array([19, 18,  1,  2,  3,  4,  5,  6,  7,  8])