# Configuration

In [2]:
# SQLite database file holding the tokenized corpus.
corpus_db = "HarryPotter.db"
# Intended word cap; note that the visible SQL query does not apply it.
max_words = 10000

# MALLET training parameters
num_topics = 20       # passed to --num-topics
num_iters = 1000      # passed to --num-iterations
show_interval = 100   # passed to --show-topics-interval

# Libraries

In [3]:
import pandas as pd
import numpy as np
import sqlite3
import re
import random
import textman as tx
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# Process

## Import novel corpus from database

We use SQL to load the full `token` table from the corpus database into a dataframe in a single query.

In [4]:
# Query that loads every row and column of the `token` table.
# The string has no `{}` placeholder, so nothing is interpolated into it;
# calling `.format(...)` on it would be a no-op and only suggest a limit
# that is never applied.
sql = """
SELECT * FROM token 
"""

In [None]:
# Read the token table into a dataframe.
# A sqlite3 connection used as a context manager only commits/rolls back the
# transaction; it does NOT close the connection. Close it explicitly so the
# database file handle is released even if the read fails.
db = sqlite3.connect(corpus_db)
try:
    tokens = pd.read_sql(sql, db)
finally:
    db.close()

In [None]:
# Preview the first five rows of the freshly loaded token table.
tokens.head(n=5)

## Fix tokens dataframe

In [6]:
# Index the tokens by (book, chapter). This rebinds `tokens`, so re-running the
# cell without reloading will fail because the two columns are already in the index.
tokens = tokens.set_index(keys=['book_num', 'chap_num'])

In [7]:
# Show the first three rows to confirm the new (book_num, chap_num) index.
tokens.head(n=3)

Unnamed: 0_level_0,Unnamed: 1_level_0,para_num,sent_num,token_num,pos,token_str,punc,num,term_str,term_id
book_num,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,0,0,NN,CHAPTER,0,0,chapter,4164
0,0,0,0,1,CD,ONE,0,0,one,14357
0,0,1,0,0,DT,THE,0,0,the,20883


In [8]:
# Number of distinct normalized terms in the token table.
len(tokens['term_str'].unique())

23717

## Convert tokens to a corpus for MALLET input

In [9]:
# Keep only rows whose term_id is not -1 (presumably the marker for tokens
# without a term — confirm against the corpus builder), then move the
# (book_num, chap_num) index back into ordinary columns.
has_term = tokens['term_id'] != -1
tokens = tokens.loc[has_term].reset_index()
tokens.head()

Unnamed: 0,book_num,chap_num,para_num,sent_num,token_num,pos,token_str,punc,num,term_str,term_id
0,0,0,0,0,0,NN,CHAPTER,0,0,chapter,4164
1,0,0,0,0,1,CD,ONE,0,0,one,14357
2,0,0,1,0,0,DT,THE,0,0,the,20883
3,0,0,1,0,1,NNP,BOY,0,0,boy,3351
4,0,0,1,0,2,NNP,WHO,0,0,who,23170


In [10]:
# Build a single "<book>_<chapter>" key per token and index the table by it,
# so each distinct key identifies one document.
_doc_key = tokens['book_num'].map(str).str.cat(tokens['chap_num'].map(str), sep='_')
tokens['book_chap_num'] = _doc_key
tokens = tokens.set_index('book_chap_num')

In [11]:
# Gather the term strings at index level 0 (book_chap_num) and expose the
# result as a two-column frame: the document key and its text (`doc_content`).
corpus = (
    tx.gather_tokens(tokens, level=0, col='term_str')
    .reset_index()
    .rename(columns={'term_str': 'doc_content'})
)

In [12]:
# (rows, columns) of the corpus: one row per book_chap_num document,
# with the document key and its `doc_content` text as the two columns.
corpus.shape

(199, 2)

In [13]:
## Remove insignificant words

In [14]:
# Fit TF-IDF over the chapter documents to get an IDF weight for every term.
# Tokens must be purely alphabetic and at least three letters long; English
# stop words are dropped by the vectorizer.
vectorizer = TfidfVectorizer(
    use_idf=True,
    stop_words='english',
    token_pattern=r'[A-Za-z][A-Za-z][A-Za-z]+',
)
X = vectorizer.fit_transform(corpus.doc_content.values.tolist())

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases so the
# cell runs on both.
if hasattr(vectorizer, 'get_feature_names_out'):
    _feature_names = vectorizer.get_feature_names_out()
else:
    _feature_names = vectorizer.get_feature_names()

# Vocabulary table: one row per term with its inverse document frequency.
v = pd.DataFrame(list(_feature_names), columns=['term_str'])
v['idf'] = vectorizer.idf_

## Dump corpus to CSV file

In [15]:
# Write the documents to disk (header row included, no pandas row index).
corpus.to_csv(path_or_buf='novels-corpus.csv', index=False)

In [16]:
# Read the file back as a sanity check that the dump round-trips.
test1 = pd.read_csv(filepath_or_buffer='novels-corpus.csv')
test1.head(n=5)

Unnamed: 0,book_chap_num,doc_content
0,0_0,chapter one the boy who lived mr and mrs dursl...
1,0_1,chapter two the vanishing glass nearly ten yea...
2,0_10,chapter eleven quidditch as they entered novem...
3,0_11,chapter twelve the mirror of erised christmas ...
4,0_12,chapter thirteen nicolas flamel dumbledore had...


## MALLET Time

In [11]:
# Location of the MALLET launcher script, relative to the notebook's directory.
path = "./mallet-2.0.8/bin/mallet"


### Show MALLET options

In [12]:
# Run the launcher with no sub-command: MALLET reports "Unrecognized command"
# and lists the commands it supports.
!{path}

Unrecognized command: 
Mallet 2.0 commands: 

  import-dir         load the contents of a directory into mallet instances (one per file)
  import-file        load a single file into mallet instances (one per line)
  import-svmlight    load SVMLight format data files into Mallet instances
  info               get information about Mallet instances
  train-classifier   train a classifier from Mallet data files
  classify-dir       classify data from a single file with a saved classifier
  classify-file      classify the contents of a directory with a saved classifier
  classify-svmlight  classify data from a single file in SVMLight format
  train-topics       train a topic model from Mallet data files
  infer-topics       use a trained topic model to infer topics for new documents
  evaluate-topics    estimate the probability of new documents under a trained model
  prune              remove features based on frequency or information gain
  split              divide data i

In [None]:
# Alternative invocation driven by a MALLET config file; left disabled.
# NOTE(review): `mallet_config_file` is not defined in any visible cell —
# define it before enabling this line.
# !{path} train-topics --config {mallet_config_file}

### Import corpus

In [37]:
# !{path} import-file --input novels-corpus.csv --output novels-corpus.mallet --keep-sequence TRUE

### Train topics

In [13]:
!{path} train-topics --input novels-corpus.mallet --num-topics {num_topics} --num-iterations {num_iters} \
--output-doc-topics novels-doc-topics.txt \
--output-topic-keys novels-topic-keys.txt \
--word-topic-counts-file novels-word-topic-counts-file.txt \
--topic-word-weights-file novels-topic-word-weights-file.txt \
--xml-topic-report novels-topic-report.xml \
--xml-topic-phrase-report novels-topic-phrase-report.xml \
--show-topics-interval {show_interval} \
--use-symmetric-alpha false  \
--optimize-interval 100 \
--diagnostics-file novels-diagnostics.xml


Mallet LDA: 20 topics, 5 topic bits, 11111 topic mask
Data loaded.
max tokens: 7303
total tokens: 892340
<10> LL/token: -9.2266
<20> LL/token: -8.81185
<30> LL/token: -8.63769
<40> LL/token: -8.53073
<50> LL/token: -8.45448
<60> LL/token: -8.40038
<70> LL/token: -8.35818
<80> LL/token: -8.32151
<90> LL/token: -8.28618

0	0.25	harry said you the and malfoy neville him what over back all had here out going get just into this 
1	0.25	the hermione ron they harry had and them were from out back that was but thought what around how two 
2	0.25	the been and was harry for that know said his can you there here this out think have right hogwarts 
3	0.25	dumbledore you that said have not but harry for the and are your did voldemort this yes n't slughorn will 
4	0.25	said she you her and with ron harry for well just that not who hermione what was the table like 
5	0.25	had the his and that was not one when would about for even harry some say little with most after 
6	0.25	the and harry they was on

[beta: 0.01955] 
<400> LL/token: -7.83153
<410> LL/token: -7.82971
<420> LL/token: -7.82282
<430> LL/token: -7.81572
<440> LL/token: -7.81783
<450> LL/token: -7.81196
<460> LL/token: -7.80935
<470> LL/token: -7.80236
<480> LL/token: -7.80863
<490> LL/token: -7.8026

0	0.2663	harry malfoy slughorn dobby lockhart ginny said didn sir don potter potion riddle myrtle into filch nick slytherin chamber crabbe 
1	0.39798	ron hermione harry said the they that them had what dumbledore think but going get around about out then yeah 
2	0.49278	you but what and there know this out back they how been have can all looked the told was think 
3	0.27774	dumbledore you said harry have not voldemort that but will your more yes are think professor has lord never riddle 
4	0.46724	said you she her and with well just looking are the like your not who down got them more asked 
5	0.52817	the that and was had his for him not would when with were been have now who could from about 
6	0.50047	the and harry they t

[beta: 0.02183] 
<700> LL/token: -7.75228
<710> LL/token: -7.75211
<720> LL/token: -7.74749
<730> LL/token: -7.74952
<740> LL/token: -7.74396
<750> LL/token: -7.74374
<760> LL/token: -7.74068
<770> LL/token: -7.73668
<780> LL/token: -7.73225
<790> LL/token: -7.72954

0	0.11027	malfoy harry slughorn lockhart ginny dobby ron potter didn don sir riddle snape very myrtle potion filch school chamber book 
1	0.45167	ron hermione they them said the think around that just dumbledore about their harry off get got what going yeah 
2	0.83417	you harry said but what and was there know out have back then how all looked into been room they 
3	0.26447	dumbledore you have not that voldemort said but your will professor yes more did has riddle very are think than 
4	0.70425	said she her and with you well looking your harry who not asked the look down just over all face 
5	0.94362	the that had his was and for him not would when with who were been now from have about could 
6	0.83859	the and harry they h

[beta: 0.02253] 
<1000> LL/token: -7.70282

Total time: 1 minutes 9 seconds
