# Configuration

In [363]:
corpus_db = '../2019-02-28_Lab07/novels.db'
max_words = 10000

# For MALLET
num_topics = 20
num_iters = 1000
show_interval = 100

#  Libraries

In [378]:
import pandas as pd
import sqlite3
import textman as tx

# Process

## Import novel corpus from database

We use SQL to get what we want quickly.

In [365]:
sql = """
SELECT * FROM token 
WHERE term_id IN (
    SELECT term_id FROM vocab 
    WHERE stop = 0 
    AND term_str NOT IN ('said')
    ORDER BY tfidf_sum DESC LIMIT {}
)
-- AND (author = 'poe' OR author = 'austen') 
AND (pos NOT LIKE 'NNP%')
""".format(max_words)

In [366]:
with sqlite3.connect(corpus_db) as db:
    tokens = pd.read_sql(sql, db)

## Fix tokens dataframe

In [367]:
tokens = tokens.set_index(['author','book','chapter'])

In [368]:
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,genre,para_num,sent_num,token_num,pos,token_str,punc,num,term_str,term_id
author,book,chapter,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
christie,secretadversary,1,d,1,0,0,JJ,"“TOMMY,",0,0,tommy,24529
christie,secretadversary,1,d,1,0,1,JJ,old,0,0,old,16509
christie,secretadversary,1,d,1,0,2,NN,thing!”,0,0,thing,24202
christie,secretadversary,1,d,2,0,0,JJ,"“Tuppence,",0,0,tuppence,25026
christie,secretadversary,1,d,2,0,1,JJ,old,0,0,old,16509


In [369]:
len(tokens.term_str.unique())

9606

## Convert tokens to a corpus for MALLET input

In [370]:
corpus = tx.gather_tokens(tokens, level=2, col='term_str')\
    .reset_index().rename(columns={'term_str':'doc_content'})
corpus['doc_label'] = corpus.apply(lambda x: "doyle-{}-{}".format(x.book, x.chapter), 1)

In [371]:
corpus.head()

Unnamed: 0,author,book,chapter,doc_content,doc_label
0,austen,northangerabbey,1,seen infancy supposed born heroine situation l...,doyle-northangerabbey-1
1,austen,northangerabbey,2,addition already personal mental difficulties ...,doyle-northangerabbey-2
2,austen,northangerabbey,3,morning brought regular duties shops visited n...,doyle-northangerabbey-3
3,austen,northangerabbey,4,usual eagerness hasten pump room next day secu...,doyle-northangerabbey-4
4,austen,northangerabbey,5,engaged theatre evening returning smiles certa...,doyle-northangerabbey-5


## Dump corpus to CSV file

In [372]:
corpus[['doc_label','doc_content']].to_csv('novels-corpus.csv', index=False)

## MALLET Time

### Show MALLET options

In [373]:
!mallet 

Unrecognized command: 
Mallet 2.0 commands: 

  import-dir         load the contents of a directory into mallet instances (one per file)
  import-file        load a single file into mallet instances (one per line)
  import-svmlight    load SVMLight format data files into Mallet instances
  info               get information about Mallet instances
  train-classifier   train a classifier from Mallet data files
  classify-dir       classify data from a single file with a saved classifier
  classify-file      classify the contents of a directory with a saved classifier
  classify-svmlight  classify data from a single file in SVMLight format
  train-topics       train a topic model from Mallet data files
  infer-topics       use a trained topic model to infer topics for new documents
  evaluate-topics    estimate the probability of new documents under a trained model
  prune              remove features based on frequency or information gain
  split              divide data into testing, tr

### Import corpus

In [375]:
!mallet import-file --input novels-corpus.csv --output novels-corpus.mallet --keep-sequence TRUE

### Train topics

In [377]:
!mallet train-topics --input novels-corpus.mallet --num-topics {num_topics} --num-iterations {num_iters} \
--output-doc-topics novels-doc-topics.txt \
--output-topic-keys novels-topic-keys.txt \
--word-topic-counts-file novels-word-topic-counts-file.txt \
--topic-word-weights-file novels-topic-word-weights-file.txt \
--xml-topic-report novels-topic-report.xml \
--xml-topic-phrase-report novels-topic-phrase-report.xml \
--show-topics-interval {show_interval} \
--use-symmetric-alpha false  \
--optimize-interval 100 \
--diagnostics-file novels-diagnostics.xml


Mallet LDA: 20 topics, 5 topic bits, 11111 topic mask
Data loaded.
max tokens: 7089
total tokens: 526511
<10> LL/token: -9.53923
<20> LL/token: -9.09263
<30> LL/token: -8.947
<40> LL/token: -8.87387
<50> LL/token: -8.83091
<60> LL/token: -8.80362
<70> LL/token: -8.78691
<80> LL/token: -8.77197
<90> LL/token: -8.75831

0	0.25	found great young part thus body brought let make death character within account several certain opinion nothing murder bring affair 
1	0.25	know dont tell got think get girl come want yes head good going sure thing asked right something young cant 
2	0.25	good round look take anything back got name right boy things later man morning found suddenly four evening find returned 
3	0.25	heart hope returned left happiness give love return conduct affection passed till manner saw leave told spoke father exclaimed situation 
4	0.25	mountains whose along sun scene moon sea air little often light gave wild eye fire appeared beneath journey green around 
5	0.25	light night o