# Creating Token Table
Leah Hogenmiller (lmh2ur)

## Set Up

In [44]:
import pandas as pd
import numpy as np
import os 
from glob import glob
import re
import nltk
from textparser import TextParser

In [45]:
# Index levels of the token table, listed from the coarsest (book) to the finest (token).
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']

# Leading slices of OHCO, usable as grouping keys at each level of the hierarchy.
BOOKS, CHAPS, PARAS, SENTS = (OHCO[:depth] for depth in (1, 2, 3, 4))

## Chunking Patterns

In [46]:
# Chapter-heading regex for each book, as (book_id, pattern) pairs.
# Plain raw strings are used because none of the patterns interpolate values.
# NOTE(review): ids 2097, 3070 and 3289 do not appear in the LIB table displayed
# below — confirm whether they are still needed.
ohco_pat_list = [
    (244,   r"^\s*CHAPTER\s+[IVXLCM]+\.+\s*$"),
    (834,   r"^\s*Chapter\s+[IVXLCM]+\."),
    (863,   r"^\s*CHAPTER\s+[IVXLCM]+\.+\s*$"),
    (1155,  r"^\s*(?:CHAPTER|PROLOGUE)+"),
    (1661,  r"^\s*Chapter\s+[IVXLCM]+\."),
    (2097,  r"^\s*Chapter\s+[IVXLCM]+\s*$"),
    (3070,  r"^\s*Chapter\s+\d+$"),
    (3289,  r"^\s*Chapter\s+\d+--"),
    (58866, r"^\s*Chapter\s+"),
    (61168, r"^\s*(?:CHAPTER|PROLOGUE)+"),
    (65238, r"^\s*Chapter\s+"),
    (69087, r"^\s*CHAPTER\s+[IVXLCM]+\s*$"),
    # NOTE(review): the trailing '.' is unescaped, so it matches any character,
    # not only a literal period — confirm this is intended.
    (70114, r"^\s*Chapter\s+\d+."),
]

## LIB Table

In [47]:
# Load the library table and key it by book_id in a single read.
LIB = pd.read_csv('LIB.csv', index_col='book_id')

In [48]:
LIB

Unnamed: 0_level_0,title,author,date,source_file_path
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
863,The Mysterious Affair at Styles,Agatha Christie,1920,pg863.txt
1155,The Secret Adversary,Agatha Christie,1922,pg1155.txt
58866,The Murder on the Links,Agatha Christie,1923,pg58866.txt
61168,The Man in the Brown Suit,Agatha Christie,1924,pg61168.txt
65238,The Secret of Chimneys,Agatha Christie,1925,pg65238.txt
69087,The Murder of Roger Ackroyd,Agatha Christie,1926,pg69087.txt
70114,The Big Four,Agatha Christie,1927,pg70114.txt
244,A Study in Scarlet,Arthur Conan Doyle,1887,pg244.txt
834,The Memoirs of Sherlock Holmes,Arthur Conan Doyle,1893,pg834.txt
1661,The Adventures of Sherlock Holmes,Arthur Conan Doyle,1892,pg1661.txt


## Save Chapter Regexes

In [49]:
# Attach each book's chapter-heading pattern, looked up by book_id.
# Ids in LIB that have no entry in ohco_pat_list receive NaN.
LIB['chap_regex'] = LIB.index.map(dict(ohco_pat_list))

In [50]:
LIB

Unnamed: 0_level_0,title,author,date,source_file_path,chap_regex
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
863,The Mysterious Affair at Styles,Agatha Christie,1920,pg863.txt,^\s*CHAPTER\s+[IVXLCM]+\.+\s*$
1155,The Secret Adversary,Agatha Christie,1922,pg1155.txt,^\s*(?:CHAPTER|PROLOGUE)+
58866,The Murder on the Links,Agatha Christie,1923,pg58866.txt,^\s*Chapter\s+
61168,The Man in the Brown Suit,Agatha Christie,1924,pg61168.txt,^\s*(?:CHAPTER|PROLOGUE)+
65238,The Secret of Chimneys,Agatha Christie,1925,pg65238.txt,^\s*Chapter\s+
69087,The Murder of Roger Ackroyd,Agatha Christie,1926,pg69087.txt,^\s*CHAPTER\s+[IVXLCM]+\s*$
70114,The Big Four,Agatha Christie,1927,pg70114.txt,^\s*Chapter\s+\d+.
244,A Study in Scarlet,Arthur Conan Doyle,1887,pg244.txt,^\s*CHAPTER\s+[IVXLCM]+\.+\s*$
834,The Memoirs of Sherlock Holmes,Arthur Conan Doyle,1893,pg834.txt,^\s*Chapter\s+[IVXLCM]+\.
1661,The Adventures of Sherlock Holmes,Arthur Conan Doyle,1892,pg1661.txt,^\s*Chapter\s+[IVXLCM]+\.


## Tokenize Corpus

In [51]:
def tokenize_collection(LIB, verbose=True):
    """Tokenize every book listed in LIB and return one combined token table.

    Parameters
    ----------
    LIB : pandas.DataFrame
        Library table indexed by book_id. Each row must provide the columns
        `title`, `chap_regex` (chapter-heading pattern) and `source_file_path`.
    verbose : bool, default True
        When True, print a progress line per book plus a final "Done", and
        turn on the parser's own logging. Pass False to keep the cell output
        short.

    Returns
    -------
    pandas.DataFrame
        The per-book TOKENS tables concatenated and sorted by index. The index
        is `book_id` followed by the parser's OHCO levels.

    Raises
    ------
    ValueError
        If LIB has no rows (there would be nothing to concatenate).
    """
    # Patterns marking where the body of each source file starts and ends
    # (presumably the Project Gutenberg header/footer banners — confirm).
    clip_pats = [
        r"\*\*\*\s*START OF",
        r"\*\*\*\s*END OF"
    ]

    # Fail early with a clear message instead of pandas' generic
    # "No objects to concatenate" error.
    if len(LIB.index) == 0:
        raise ValueError("LIB contains no books to tokenize")

    books = []
    for book_id in LIB.index:

        # Fetch the row once instead of re-indexing LIB for every field
        book = LIB.loc[book_id]

        # Announce
        if verbose:
            print("Tokenizing", book_id, book['title'])

        # Chapters are the only milestone-based level; the parser derives the
        # remaining levels (paragraph, sentence, token) itself.
        ohco_pats = [('chap', book['chap_regex'], 'm')]

        # Create object
        text = TextParser(book['source_file_path'], ohco_pats=ohco_pats, clip_pats=clip_pats, use_nltk=True)

        # Define parameters
        text.verbose = verbose
        text.strip_hyphens = True
        text.strip_whitespace = True

        # Parse
        text.import_source().parse_tokens()

        # Prepend book_id to the parser's own index levels
        text.TOKENS['book_id'] = book_id
        text.TOKENS = text.TOKENS.reset_index().set_index(['book_id'] + text.OHCO)

        books.append(text.TOKENS)

    # Combine into a single dataframe; locals are released on return, so no
    # explicit `del` is required.
    CORPUS = pd.concat(books).sort_index()

    if verbose:
        print("Done")

    return CORPUS

In [52]:
CORPUS = tokenize_collection(LIB)

Tokenizing 863 The Mysterious Affair at Styles
Importing  pg863.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^\s*CHAPTER\s+[IVXLCM]+\.+\s*$
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 1155 The Secret Adversary
Importing  pg1155.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^\s*(?:CHAPTER|PROLOGUE)+
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 58866 The Murder on the Links
Importing  pg58866.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^\s*Chapter\s+
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK

In [53]:
# Keep only tokens whose normalized term string is non-empty.
CORPUS = CORPUS.loc[CORPUS['term_str'].ne('')]

In [81]:
# NOTE(review): `book_len` is only assigned in the next cell, so on a fresh
# Restart & Run All this display will not contain that column, unlike the
# saved output. Consider moving this display after the assignment.
LIB

Unnamed: 0_level_0,title,author,date,source_file_path,chap_regex,book_len
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
863,The Mysterious Affair at Styles,Agatha Christie,1920,pg863.txt,^\s*CHAPTER\s+[IVXLCM]+\.+\s*$,56672
1155,The Secret Adversary,Agatha Christie,1922,pg1155.txt,^\s*(?:CHAPTER|PROLOGUE)+,75864
58866,The Murder on the Links,Agatha Christie,1923,pg58866.txt,^\s*Chapter\s+,64531
61168,The Man in the Brown Suit,Agatha Christie,1924,pg61168.txt,^\s*(?:CHAPTER|PROLOGUE)+,75417
65238,The Secret of Chimneys,Agatha Christie,1925,pg65238.txt,^\s*Chapter\s+,74656
69087,The Murder of Roger Ackroyd,Agatha Christie,1926,pg69087.txt,^\s*CHAPTER\s+[IVXLCM]+\s*$,69753
70114,The Big Four,Agatha Christie,1927,pg70114.txt,^\s*Chapter\s+\d+.,56052
244,A Study in Scarlet,Arthur Conan Doyle,1887,pg244.txt,^\s*CHAPTER\s+[IVXLCM]+\.+\s*$,43522
834,The Memoirs of Sherlock Holmes,Arthur Conan Doyle,1893,pg834.txt,^\s*Chapter\s+[IVXLCM]+\.,96012
1661,The Adventures of Sherlock Holmes,Arthur Conan Doyle,1892,pg1661.txt,^\s*Chapter\s+[IVXLCM]+\.,105045


In [55]:
# Number of (non-null) term strings per book, aligned to LIB on book_id.
LIB['book_len'] = CORPUS['term_str'].groupby(level='book_id').count()

In [100]:
LIB

Unnamed: 0_level_0,title,author,date,source_file_path,chap_regex,book_len
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
863,The Mysterious Affair at Styles,Agatha Christie,1920,pg863.txt,^\s*CHAPTER\s+[IVXLCM]+\.+\s*$,56672
1155,The Secret Adversary,Agatha Christie,1922,pg1155.txt,^\s*(?:CHAPTER|PROLOGUE)+,75864
58866,The Murder on the Links,Agatha Christie,1923,pg58866.txt,^\s*Chapter\s+,64531
61168,The Man in the Brown Suit,Agatha Christie,1924,pg61168.txt,^\s*(?:CHAPTER|PROLOGUE)+,75417
65238,The Secret of Chimneys,Agatha Christie,1925,pg65238.txt,^\s*Chapter\s+,74656
69087,The Murder of Roger Ackroyd,Agatha Christie,1926,pg69087.txt,^\s*CHAPTER\s+[IVXLCM]+\s*$,69753
70114,The Big Four,Agatha Christie,1927,pg70114.txt,^\s*Chapter\s+\d+.,56052
244,A Study in Scarlet,Arthur Conan Doyle,1887,pg244.txt,^\s*CHAPTER\s+[IVXLCM]+\.+\s*$,43522
834,The Memoirs of Sherlock Holmes,Arthur Conan Doyle,1893,pg834.txt,^\s*Chapter\s+[IVXLCM]+\.,96012
1661,The Adventures of Sherlock Holmes,Arthur Conan Doyle,1892,pg1661.txt,^\s*Chapter\s+[IVXLCM]+\.,105045


## VOCAB Table

In [57]:
# One row per distinct term:
#   n       - corpus-wide count
#   n_chars - length of the term string
#   p       - relative frequency of the term
#   i       - -log2(p)
VOCAB = (
    CORPUS['term_str']
    .value_counts()
    .to_frame('n')
    .sort_index()
    .rename_axis('term_str')
    .assign(
        n_chars=lambda v: v.index.str.len(),
        p=lambda v: v['n'] / v['n'].sum(),
        i=lambda v: -np.log2(v['p']),
    )
)

In [58]:
# Most frequent POS tag per term: count (term, pos) pairs, spread tags across
# columns, then take the column label with the largest count in each row.
# NOTE(review): the winning tag is cut to its first two characters only after
# the full-tag maximum is chosen — confirm that is the intended grouping.
term_pos_counts = CORPUS[['term_str', 'pos']].value_counts().unstack(fill_value=0)
VOCAB['max_pos'] = term_pos_counts.idxmax(axis=1).str[:2]

In [59]:
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,31,1,0.000035,14.790204,CD
10,7,2,0.000008,16.937045,CD
100,5,3,0.000006,17.422472,CD
1000,9,4,0.000010,16.574475,CD
100000,1,6,0.000001,19.744400,NN
...,...,...,...,...,...
éliseif,1,7,0.000001,19.744400,NN
élises,1,6,0.000001,19.744400,JJ
émigrés,1,7,0.000001,19.744400,NN
épatant,1,7,0.000001,19.744400,JJ


## BOW Table

In [60]:
# Bag of words at chapter level: occurrences of each term within each chapter.
BOW = (
    CORPUS
    .groupby([*CHAPS, 'term_str'])['term_str']
    .count()
    .to_frame('n')
)

In [61]:
BOW.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
book_id,chap_id,term_str,Unnamed: 3_level_1
244,1,1878,1
244,1,a,87
244,1,able,2
244,1,about,9
244,1,absorbed,1


## DTCM Table

In [62]:
# Document-term count matrix: one row per (book, chapter), one column per term,
# with 0 where a term does not occur in a chapter.
DTCM = BOW['n'].unstack(fill_value=0).astype('int')

In [63]:
DTCM.head()

Unnamed: 0_level_0,term_str,1,10,100,1000,100000,1015,1019,1023,1030,1040,...,zum,à,æsthetic,ça,élise,éliseif,élises,émigrés,épatant,épouvantable
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
244,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
244,2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
244,3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
244,4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
244,5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Compute TFIDF

### Compute TF

In [64]:
# Term frequency. DTCM is transposed so that each column is one chapter; the
# column-wise sum/max is then that chapter's total / largest term count.
tf_method = 'sum'
tf_norm_k = 0.5  # smoothing constant, used only by 'double_norm'

print('TF method:', tf_method)
if tf_method == 'sum':
    TF = DTCM.T / DTCM.T.sum()
elif tf_method == 'max':
    TF = DTCM.T / DTCM.T.max()
elif tf_method == 'log':
    TF = np.log2(1 + DTCM.T)
elif tf_method == 'raw':
    TF = DTCM.T
elif tf_method == 'double_norm':
    # K + (1 - K) * tf / max(tf); terms absent from a chapter stay at 0.
    TF = DTCM.T / DTCM.T.max()
    TF = (tf_norm_k + (1 - tf_norm_k) * TF).where(DTCM.T > 0, 0)
elif tf_method == 'binary':
    TF = DTCM.T.astype('bool').astype('int')
else:
    # Without this guard a typo would silently reuse (and re-transpose) a TF
    # left over from an earlier run of this cell.
    raise ValueError(f"Unknown tf_method: {tf_method!r}")
# Back to chapters as rows, terms as columns.
TF = TF.T

TF method: sum


In [65]:
TF.head()

Unnamed: 0_level_0,term_str,1,10,100,1000,100000,1015,1019,1023,1030,1040,...,zum,à,æsthetic,ça,élise,éliseif,élises,émigrés,épatant,épouvantable
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
244,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
244,2,0.000281,0.000281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
244,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
244,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
244,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Compute DF

In [66]:
# Document frequency: number of chapters in which each term has a non-zero count.
DF = DTCM.ne(0).sum()

In [67]:
DF

term_str
1               22
10               5
100              4
1000             5
100000           1
                ..
éliseif          1
élises           1
émigrés          1
épatant          1
épouvantable     1
Length: 25084, dtype: int64

### Compute IDF

In [68]:
# Inverse document frequency. N is the number of rows of DTCM, i.e. the number
# of (book, chapter) documents.
N = DTCM.shape[0]
idf_method = 'standard'

print('IDF method:', idf_method)
if idf_method == 'standard':
    IDF = np.log2(N / DF)
elif idf_method == 'max':
    IDF = np.log2(DF.max() / DF) 
elif idf_method == 'smooth':
    IDF = np.log2((1 + N) / (1 + DF)) + 1
else:
    # Without this guard a typo would silently keep an IDF left over from an
    # earlier run of this cell (or fail later with a NameError).
    raise ValueError(f"Unknown idf_method: {idf_method!r}")

IDF method: standard


In [69]:
IDF

term_str
1               3.568474
10              5.705978
100             6.027906
1000            5.705978
100000          8.027906
                  ...   
éliseif         8.027906
élises          8.027906
émigrés         8.027906
épatant         8.027906
épouvantable    8.027906
Length: 25084, dtype: float64

### Compute TFIDF

In [70]:
# Weight every chapter's term frequencies by the per-term IDF (aligned on the term columns).
TFIDF = TF.mul(IDF, axis='columns')

In [71]:
TFIDF.head()

Unnamed: 0_level_0,term_str,1,10,100,1000,100000,1015,1019,1023,1030,1040,...,zum,à,æsthetic,ça,élise,éliseif,élises,émigrés,épatant,épouvantable
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
244,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
244,2,0.001002,0.001601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
244,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
244,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
244,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# Reshape the wide TF / TFIDF matrices to long form and attach them to BOW;
# values are aligned on BOW's (book_id, chap_id, term_str) index.
BOW = BOW.assign(
    tf=TF.stack(),
    tfidf=TFIDF.stack(),
)

In [73]:
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,tf,tfidf
book_id,chap_id,term_str,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
244,1,1878,1,0.000359,0.002311
244,1,a,87,0.031205,0.000000
244,1,able,2,0.000717,0.000539
244,1,about,9,0.003228,0.000072
244,1,absorbed,1,0.000359,0.001235
...,...,...,...,...,...
70114,18,your,14,0.004610,0.000313
70114,18,yours,1,0.000329,0.000461
70114,18,yourself,2,0.000659,0.000539
70114,18,youth,1,0.000329,0.000915


In [74]:
BOW.to_csv('BOW.csv')

### Compute DFIDF

In [75]:
# Attach per-term document frequency and IDF (aligned on the term index).
VOCAB = VOCAB.assign(df=DF, idf=IDF)

In [76]:
# DFIDF: document frequency multiplied by inverse document frequency.
VOCAB['dfidf'] = VOCAB['df'].mul(VOCAB['idf'])

In [77]:
# Per-term summaries of TFIDF across all chapters (column-wise reductions).
VOCAB = VOCAB.assign(
    tfidf_mean=TFIDF.mean(),
    tfidf_sum=TFIDF.sum(),
    tfidf_median=TFIDF.median(),
    tfidf_max=TFIDF.max(),
)

In [78]:
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,df,idf,dfidf,tfidf_mean,tfidf_sum,tfidf_median,tfidf_max
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,31,1,3.5e-05,14.790204,CD,22,3.568474,78.506436,0.000169,0.044195,0.0,0.007534
10,7,2,8e-06,16.937045,CD,5,5.705978,28.52989,3.2e-05,0.00839,0.0,0.003838
100,5,3,6e-06,17.422472,CD,4,6.027906,24.111624,2.3e-05,0.006123,0.0,0.002503
1000,9,4,1e-05,16.574475,CD,5,5.705978,28.52989,2.1e-05,0.005444,0.0,0.002935
100000,1,6,1e-06,19.7444,NN,1,8.027906,8.027906,1.6e-05,0.004234,0.0,0.004234


In [79]:
VOCAB.to_csv('VOCAB.csv')