In [1]:
import pandas as pd
from textparser import TextParser
import os
import re
import numpy as np
import nltk
from glob import glob
from nltk.stem.porter import PorterStemmer
from numpy.linalg import norm

In [2]:
os.getcwd()

'/Users/michaelhammer/Desktop/ETA_Final'

# Parsing Data

In [3]:
source_files = '/Users/michaelhammer/Desktop/ETA_Final/novels'
output_dir = '/Users/michaelhammer/Desktop/ETA_Final/output'

In [4]:
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']
SENT = OHCO[:4]
PARA = OHCO[:3]
CHAP = OHCO[:2]
BOOK = OHCO[:1]

In [5]:
clip_pats = [
    r"\s*START OF BOOK",
    r"\s*THE END"
]

ohco_pat_list = [
    (1,  rf"^CHAPTER\s"),
    (2,   rf"^CHAPTER\s"),
    (3,   rf"^CHAPTER\s"),
    (4,   rf"^CHAPTER\s"),
    (5,   rf"^CHAPTER\s"),
    (6,   rf"^CHAPTER\s"),
    (7,   rf"^CHAPTER\s|^EPILOGUE\s")
]

In [6]:
source_file_list = sorted(glob(f"{source_files}/*.*"))
source_file_list

["/Users/michaelhammer/Desktop/ETA_Final/novels/Book1_The_Sorcerer's_Stone.txt",
 '/Users/michaelhammer/Desktop/ETA_Final/novels/Book2_The_Chamber_of_Secrets.txt',
 '/Users/michaelhammer/Desktop/ETA_Final/novels/Book3_The_Prisoner_of_Azkaban.txt',
 '/Users/michaelhammer/Desktop/ETA_Final/novels/Book4_The_Goblet_of_Fire.txt',
 '/Users/michaelhammer/Desktop/ETA_Final/novels/Book5_The_Order_Of_The_Phoenix.txt',
 '/Users/michaelhammer/Desktop/ETA_Final/novels/Book6_The_Half_Blood_Prince.txt',
 '/Users/michaelhammer/Desktop/ETA_Final/novels/Book7_The_Deathly_Hallows.txt']

In [7]:
book_data = []
for source_file_path in source_file_list:
    book_id = int(source_file_path.split('/')[-1].split('_')[0].replace('Book',''))
    book_title = 'Harry Potter And ' + source_file_path.split('/')[-1].split('-')[0].replace('_', ' ')[6:].split('.')[0]
    book_data.append((book_id, source_file_path, book_title))
book_data 

[(1,
  "/Users/michaelhammer/Desktop/ETA_Final/novels/Book1_The_Sorcerer's_Stone.txt",
  "Harry Potter And The Sorcerer's Stone"),
 (2,
  '/Users/michaelhammer/Desktop/ETA_Final/novels/Book2_The_Chamber_of_Secrets.txt',
  'Harry Potter And The Chamber of Secrets'),
 (3,
  '/Users/michaelhammer/Desktop/ETA_Final/novels/Book3_The_Prisoner_of_Azkaban.txt',
  'Harry Potter And The Prisoner of Azkaban'),
 (4,
  '/Users/michaelhammer/Desktop/ETA_Final/novels/Book4_The_Goblet_of_Fire.txt',
  'Harry Potter And The Goblet of Fire'),
 (5,
  '/Users/michaelhammer/Desktop/ETA_Final/novels/Book5_The_Order_Of_The_Phoenix.txt',
  'Harry Potter And The Order Of The Phoenix'),
 (6,
  '/Users/michaelhammer/Desktop/ETA_Final/novels/Book6_The_Half_Blood_Prince.txt',
  'Harry Potter And The Half Blood Prince'),
 (7,
  '/Users/michaelhammer/Desktop/ETA_Final/novels/Book7_The_Deathly_Hallows.txt',
  'Harry Potter And The Deathly Hallows')]

In [8]:
LIB = pd.DataFrame(book_data, columns=['book_id','source_file_path','title'])\
    .set_index('book_id').sort_index()
LIB

Unnamed: 0_level_0,source_file_path,title
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Sorcerer's Stone
2,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Chamber of Secrets
3,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Prisoner of Azkaban
4,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Goblet of Fire
5,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Order Of The Phoenix
6,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Half Blood Prince
7,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Deathly Hallows


In [9]:
books = []
for pat in ohco_pat_list:
    
    book_id, chap_regex = pat
    print("Tokenizing", book_id, LIB.loc[book_id].title)
    ohco_pats = [('chap', chap_regex, 'm')]
    src_file_path = LIB.loc[book_id].source_file_path
    
    text = TextParser(src_file_path, ohco_pats=ohco_pats, clip_pats=clip_pats, use_nltk=True)
    text.verbose = False
    text.strip_hyphens = True
    text.strip_whitespace = True
    text.import_source().parse_tokens();
    text.TOKENS['book_id'] = book_id
    text.TOKENS = text.TOKENS.reset_index().set_index(['book_id'] + text.OHCO)
    
    books.append(text.TOKENS)

Tokenizing 1 Harry Potter And The Sorcerer's Stone
line_str chap_str
Index(['chap_str'], dtype='object')
Tokenizing 2 Harry Potter And The Chamber of Secrets
line_str chap_str
Index(['chap_str'], dtype='object')
Tokenizing 3 Harry Potter And The Prisoner of Azkaban
line_str chap_str
Index(['chap_str'], dtype='object')
Tokenizing 4 Harry Potter And The Goblet of Fire
line_str chap_str
Index(['chap_str'], dtype='object')
Tokenizing 5 Harry Potter And The Order Of The Phoenix
line_str chap_str
Index(['chap_str'], dtype='object')
Tokenizing 6 Harry Potter And The Half Blood Prince
line_str chap_str
Index(['chap_str'], dtype='object')
Tokenizing 7 Harry Potter And The Deathly Hallows
line_str chap_str
Index(['chap_str'], dtype='object')


In [10]:
CORPUS = pd.concat(books).sort_index()
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,1,0,0,"(Mr., NNP)",NNP,Mr.,mr
1,1,1,0,1,"(and, CC)",CC,and,and
1,1,1,0,2,"(Mrs., NNP)",NNP,Mrs.,mrs
1,1,1,0,3,"(Dursley,, NNP)",NNP,"Dursley,",dursley
1,1,1,0,4,"(of, IN)",IN,of,of
...,...,...,...,...,...,...,...,...
7,36,285,0,7,"(nineteen, JJ)",JJ,nineteen,nineteen
7,36,285,0,8,"(years., NN)",NN,years.,years
7,36,285,1,0,"(All, DT)",DT,All,all
7,36,285,1,1,"(was, VBD)",VBD,was,was


## Add to LIB

In [11]:
LIB['book_len'] = CORPUS.groupby('book_id').term_str.count()

In [12]:
LIB['n_chaps'] = CORPUS.reset_index()[['book_id','chap_id']]\
    .drop_duplicates()\
    .groupby('book_id').chap_id.count()

In [13]:
LIB['chap_regex'] = LIB.index.map(pd.Series({x[0]:x[1] for x in ohco_pat_list}))

In [14]:
LIB.sort_values('book_len')

Unnamed: 0_level_0,source_file_path,title,book_len,n_chaps,chap_regex
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Sorcerer's Stone,77946,17,^CHAPTER\s
2,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Chamber of Secrets,87122,18,^CHAPTER\s
3,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Prisoner of Azkaban,105599,22,^CHAPTER\s
6,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Half Blood Prince,172720,30,^CHAPTER\s
4,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Goblet of Fire,192543,37,^CHAPTER\s
7,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Deathly Hallows,201393,36,^CHAPTER\s|^EPILOGUE\s
5,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Order Of The Phoenix,263038,38,^CHAPTER\s


In [15]:
LIB['release_year'] = [1997, 1998, 1999, 2000, 2003, 2005, 2007]
LIB

Unnamed: 0_level_0,source_file_path,title,book_len,n_chaps,chap_regex,release_year
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Sorcerer's Stone,77946,17,^CHAPTER\s,1997
2,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Chamber of Secrets,87122,18,^CHAPTER\s,1998
3,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Prisoner of Azkaban,105599,22,^CHAPTER\s,1999
4,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Goblet of Fire,192543,37,^CHAPTER\s,2000
5,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Order Of The Phoenix,263038,38,^CHAPTER\s,2003
6,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Half Blood Prince,172720,30,^CHAPTER\s,2005
7,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Deathly Hallows,201393,36,^CHAPTER\s|^EPILOGUE\s,2007


## VOCAB Table

In [16]:
CORPUS[CORPUS.term_str == ''].token_str.value_counts()

token_str
—                    3751
…                    3331
—”                   1985
…”                   1507
"                     810
?”                    218
."                    187
“                     179
“—                    170
”                     113
..."                  105
“…                     90
..                     62
?"                     41
*                      27
…’                     21
"...                   19
!”                     16
—’                     12
.."                    10
"),                     7
".                      7
.'                      6
!"                      4
‘—                      2
....                    2
‘…                      2
‘                       2
.'"                     2
'"                      2
**                      2
",                      1
(?)                     1
–                       1
—————————————————       1
";                      1
/                       1
'.                      1
"'

In [17]:
CORPUS = CORPUS[CORPUS.term_str != '']

In [18]:
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)

In [19]:
VOCAB['max_pos'] = CORPUS[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)

In [20]:
CORPUS['pos_group'] = CORPUS.pos.str[:2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CORPUS['pos_group'] = CORPUS.pos.str[:2]


In [21]:
VOCAB['max_pos_group'] = CORPUS[['term_str','pos_group']].value_counts().unstack(fill_value=0).idxmax(1)

In [22]:
TPM = CORPUS[['term_str','pos']].value_counts().unstack()

In [23]:
VOCAB['n_pos'] = TPM.count(1)

In [24]:
VOCAB['cat_pos'] = CORPUS[['term_str','pos']].value_counts().to_frame('n').reset_index()\
    .groupby('term_str').pos.apply(lambda x: set(x))

In [25]:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

In [26]:
VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

In [27]:
stemmer1 = PorterStemmer()
VOCAB['porter_stem'] = VOCAB.apply(lambda x: stemmer1.stem(x.name), 1)

In [28]:
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos,cat_pos,stop,porter_stem
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,6,1,5.516460e-06,17.467826,CD,CD,2,"{JJ, CD}",0,0
1,12,1,1.103292e-05,16.467826,CD,CD,1,{CD},0,1
10,1,2,9.194100e-07,20.052788,CD,CD,1,{CD},0,10
11,4,2,3.677640e-06,18.052788,CD,CD,1,{CD},0,11
12,1,2,9.194100e-07,20.052788,CD,CD,1,{CD},0,12
...,...,...,...,...,...,...,...,...,...,...
zoological,1,10,9.194100e-07,20.052788,JJ,JJ,1,{JJ},0,zoolog
zoom,8,4,7.355280e-06,17.052788,NN,NN,2,"{NN, VB}",0,zoom
zoomed,57,6,5.240637e-05,14.219898,VBD,VB,2,"{VBN, VBD}",0,zoom
zooming,31,7,2.850171e-05,15.098592,VBG,VB,2,"{NN, VBG}",0,zoom


# Derived Tables

## BOW Table

In [29]:
def create_bow(CORPUS, bag, item_type='term_str'):
    BOW = CORPUS.groupby(bag+[item_type])[item_type].count().to_frame('n')
    return BOW

In [30]:
bag = CHAP
bag

['book_id', 'chap_id']

In [31]:
BOW = create_bow(CORPUS, bag)
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
book_id,chap_id,term_str,Unnamed: 3_level_1
1,1,a,112
1,1,able,2
1,1,about,14
1,1,above,1
1,1,across,2
...,...,...,...
7,36,your,23
7,36,youre,4
7,36,yours,3
7,36,youve,5


# DTM

In [32]:
DTM = BOW.n.unstack(fill_value=0)
DTM

Unnamed: 0_level_0,term_str,0,1,10,11,12,1230,125,1289,1296,12th,...,zombie,zone,zonko,zonkos,zoo,zoological,zoom,zoomed,zooming,éclairs
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,6,0,0,0,0,0
1,3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,2,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
7,33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,34,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,35,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TFIDF & DFIDF

In [33]:
def get_tfidf(BOW, tf_method='max', df_method='standard', item_type='term_str'):
            
    DTCM = BOW.n.unstack() # Create Doc-Term Count Matrix
    
    if tf_method == 'sum':
        TF = (DTCM.T / DTCM.T.sum()).T
    elif tf_method == 'max':
        TF = (DTCM.T / DTCM.T.max()).T
    elif tf_method == 'log':
        TF = (np.log2(DTCM.T + 1)).T
    elif tf_method == 'raw':
        TF = DTCM
    elif tf_method == 'bool':
        TF = DTCM.astype('bool').astype('int')
    else:
        raise ValueError(f"TF method {tf_method} not found.")

    DF = DTCM.count() # Assumes NULLs 
    N_docs = len(DTCM)
    
    if df_method == 'standard':
        IDF = np.log2(N_docs/DF) # This what the students were asked to use
    elif df_method == 'textbook':
        IDF = np.log2(N_docs/(DF + 1))
    elif df_method == 'sklearn':
        IDF = np.log2(N_docs/DF) + 1
    elif df_method == 'sklearn_smooth':
        IDF = np.log2((N_docs + 1)/(DF + 1)) + 1
    else:
        raise ValueError(f"DF method {df_method} not found.")
    
    TFIDF = TF * IDF
    
    DFIDF = DF * IDF
    
    TFIDF = TFIDF.fillna(0)

    return TFIDF, DFIDF

In [34]:
tf_method = 'max'

In [35]:
TFIDF, DFIDF = get_tfidf(BOW, tf_method)
TFIDF

Unnamed: 0_level_0,term_str,0,1,10,11,12,1230,125,1289,1296,12th,...,zombie,zone,zonko,zonkos,zoo,zoological,zoom,zoomed,zooming,éclairs
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
1,2,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.187645,0.0,0.0,0.000000,0.000000,0.0
1,3,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
1,4,0.00000,0.035675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
1,5,0.04109,0.032551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,32,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.006542,0.008202,0.0
7,33,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
7,34,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
7,35,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0


In [36]:
VOCAB['dfidf'] = DFIDF

In [37]:
VOCAB.dfidf.sort_values(ascending=False).head(20)

term_str
information    105.08584
fall           105.08584
itll           105.08584
whisper        105.08584
twice          105.08584
lupin          105.08584
send           105.08584
living         105.08584
fat            105.08584
pressed        105.08584
forget         105.08584
join           105.08584
pretty         105.08584
distant        105.08584
forest         105.08584
letter         105.08584
touch          105.08584
hoping         105.08584
sideways       105.08584
definitely     105.08584
Name: dfidf, dtype: float64

In [38]:
VOCAB['df'] = DTM.astype('bool').sum()
VOCAB['idf'] = np.log2(len(DTM) / VOCAB.df)

In [40]:
TFIDF

Unnamed: 0_level_0,term_str,0,1,10,11,12,1230,125,1289,1296,12th,...,zombie,zone,zonko,zonkos,zoo,zoological,zoom,zoomed,zooming,éclairs
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
1,2,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.187645,0.0,0.0,0.000000,0.000000,0.0
1,3,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
1,4,0.00000,0.035675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
1,5,0.04109,0.032551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,32,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.006542,0.008202,0.0
7,33,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
7,34,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0
7,35,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0


# TFIDF - L2 Normalized

In [41]:
vocab_filter = 'dfidf'
n_terms = 1000
pos_list = "NN NNS VB VBD VBG VBN VBP VBZ JJ JJR JJS RB RBR RBS".split() # Open categories with no proper nouns

In [42]:
VIDX = VOCAB.loc[VOCAB.max_pos.isin(pos_list)]\
    .sort_values(vocab_filter, ascending=False)\
    .head(n_terms).index

In [43]:
M = TFIDF[VIDX].fillna(0).groupby('book_id').mean()

In [44]:
M

term_str,letter,send,pressed,join,living,whisper,itll,information,distant,fall,...,search,nowhere,yell,armchair,children,cracked,including,staggered,lap,chosen
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.019019,0.008422,0.002558,0.002356,0.004988,0.003336,0.003313,0.00058,0.001271,0.004616,...,0.002039,0.001867,0.002982,0.001492,0.000982,0.001379,0.000528,0.000604,0.002366,0.002406
2,0.007485,0.001634,0.000782,0.002395,0.002263,0.003063,0.006667,0.001377,0.002681,0.001981,...,0.000378,0.00088,0.000971,0.001104,0.000482,0.00258,0.001047,0.0,0.001901,0.000522
3,0.006874,0.003288,0.002105,0.00348,0.00329,0.002821,0.002422,0.002608,0.002667,0.004231,...,0.003385,0.002356,0.001664,0.003321,0.003593,0.00099,0.001619,0.00238,0.002657,0.000659
4,0.010839,0.006021,0.00302,0.003217,0.006293,0.001807,0.001162,0.004602,0.001963,0.003213,...,0.001724,0.001961,0.002039,0.003009,0.002432,0.001667,0.003525,0.002928,0.002149,0.002261
5,0.008324,0.001913,0.00236,0.00401,0.002553,0.002872,0.002747,0.004314,0.002559,0.001947,...,0.001384,0.002234,0.002054,0.003365,0.002405,0.001909,0.003215,0.001525,0.002345,0.002005
6,0.002074,0.003219,0.004269,0.003694,0.003694,0.003694,0.002705,0.005259,0.003687,0.002719,...,0.00217,0.002196,0.002339,0.005359,0.00353,0.002615,0.003553,0.004453,0.003407,0.012424
7,0.004277,0.002216,0.003091,0.002781,0.007333,0.002011,0.002707,0.003564,0.003309,0.002062,...,0.002898,0.003015,0.001887,0.001616,0.003675,0.004461,0.00169,0.002643,0.001519,0.001756


In [45]:
L2 = M.apply(lambda x: x / norm(x), 1)
L2

term_str,letter,send,pressed,join,living,whisper,itll,information,distant,fall,...,search,nowhere,yell,armchair,children,cracked,including,staggered,lap,chosen
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.119315,0.052836,0.016048,0.014781,0.03129,0.020927,0.020783,0.003639,0.007975,0.028958,...,0.012793,0.011713,0.01871,0.009358,0.006162,0.008649,0.003315,0.003787,0.014846,0.015092
2,0.059842,0.013064,0.006255,0.019148,0.018095,0.024488,0.053297,0.011008,0.021437,0.015833,...,0.00302,0.007036,0.007763,0.008826,0.003851,0.020627,0.008366,0.0,0.015201,0.004175
3,0.054056,0.025854,0.016552,0.027367,0.02587,0.022182,0.019049,0.020508,0.020969,0.033272,...,0.02662,0.018525,0.013083,0.026116,0.028253,0.007782,0.012734,0.018712,0.020896,0.005181
4,0.098786,0.054873,0.027526,0.02932,0.057349,0.016468,0.010589,0.041939,0.017893,0.029286,...,0.015715,0.01787,0.018582,0.027421,0.022162,0.015189,0.032124,0.026685,0.019583,0.020602
5,0.072017,0.016548,0.020416,0.034698,0.022088,0.024852,0.02377,0.037323,0.022141,0.016843,...,0.011978,0.019331,0.017771,0.029115,0.020808,0.016518,0.027818,0.013195,0.020285,0.017344
6,0.017177,0.026664,0.035364,0.030599,0.030599,0.030599,0.022406,0.043562,0.030541,0.022524,...,0.017977,0.018188,0.019372,0.044391,0.029242,0.021658,0.029428,0.036888,0.028218,0.102916
7,0.043499,0.022537,0.03144,0.028281,0.074571,0.020454,0.027531,0.036248,0.033648,0.020975,...,0.029476,0.030663,0.019188,0.016434,0.037377,0.045369,0.017192,0.026876,0.015449,0.017858


In [46]:
VOCAB['mean_tfidf'] = TFIDF.mean()

## Save Tables

In [61]:
LIB.head(10)

Unnamed: 0_level_0,source_file_path,title,book_len,n_chaps,chap_regex,release_year
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Sorcerer's Stone,77946,17,^CHAPTER\s,1997
2,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Chamber of Secrets,87122,18,^CHAPTER\s,1998
3,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Prisoner of Azkaban,105599,22,^CHAPTER\s,1999
4,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Goblet of Fire,192543,37,^CHAPTER\s,2000
5,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Order Of The Phoenix,263038,38,^CHAPTER\s,2003
6,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Half Blood Prince,172720,30,^CHAPTER\s,2005
7,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Deathly Hallows,201393,36,^CHAPTER\s|^EPILOGUE\s,2007


In [48]:
LIB.to_csv(f"{output_dir}LIB.csv", index=True, header=True, sep='|')

In [49]:
CORPUS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,1,0,0,"(Mr., NNP)",NNP,Mr.,mr,NN
1,1,1,0,1,"(and, CC)",CC,and,and,CC
1,1,1,0,2,"(Mrs., NNP)",NNP,Mrs.,mrs,NN
1,1,1,0,3,"(Dursley,, NNP)",NNP,"Dursley,",dursley,NN
1,1,1,0,4,"(of, IN)",IN,of,of,IN


In [50]:
CORPUS.to_csv(f"{output_dir}CORPUS.csv", index=True, header=True, sep='|')

In [51]:
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos,cat_pos,stop,porter_stem,dfidf,df,idf,mean_tfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,6,1,5.51646e-06,17.467826,CD,CD,2,"{JJ, CD}",0,0,22.517426,4,5.629357,0.0007
1,12,1,1.103292e-05,16.467826,CD,CD,1,{CD},0,1,40.134885,9,4.459432,0.001128
10,1,2,9.1941e-07,20.052788,CD,CD,1,{CD},0,10,7.629357,1,7.629357,0.000155
11,4,2,3.67764e-06,18.052788,CD,CD,1,{CD},0,11,22.517426,4,5.629357,0.000483
12,1,2,9.1941e-07,20.052788,CD,CD,1,{CD},0,12,7.629357,1,7.629357,0.000142


In [52]:
VOCAB.to_csv(f"{output_dir}VOCAB.csv", index=True, header=True, sep='|')

In [53]:
BOW.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
book_id,chap_id,term_str,Unnamed: 3_level_1
1,1,a,112
1,1,able,2
1,1,about,14
1,1,above,1
1,1,across,2


In [54]:
BOW.to_csv(f"{output_dir}BOW.csv", index=True, header=True, sep='|')

In [55]:
DTM.head()

Unnamed: 0_level_0,term_str,0,1,10,11,12,1230,125,1289,1296,12th,...,zombie,zone,zonko,zonkos,zoo,zoological,zoom,zoomed,zooming,éclairs
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,6,0,0,0,0,0
1,3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,2,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
DTM.to_csv(f"{output_dir}DTM.csv", index=True, header=True, sep='|')

In [57]:
TFIDF.head()

Unnamed: 0_level_0,term_str,0,1,10,11,12,1230,125,1289,1296,12th,...,zombie,zone,zonko,zonkos,zoo,zoological,zoom,zoomed,zooming,éclairs
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.187645,0.0,0.0,0.0,0.0,0.0
1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,0.0,0.035675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0.04109,0.032551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
TFIDF.to_csv(f"{output_dir}TFIDF.csv", index=True, header=True, sep='|')

In [59]:
L2.head()

term_str,letter,send,pressed,join,living,whisper,itll,information,distant,fall,...,search,nowhere,yell,armchair,children,cracked,including,staggered,lap,chosen
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.119315,0.052836,0.016048,0.014781,0.03129,0.020927,0.020783,0.003639,0.007975,0.028958,...,0.012793,0.011713,0.01871,0.009358,0.006162,0.008649,0.003315,0.003787,0.014846,0.015092
2,0.059842,0.013064,0.006255,0.019148,0.018095,0.024488,0.053297,0.011008,0.021437,0.015833,...,0.00302,0.007036,0.007763,0.008826,0.003851,0.020627,0.008366,0.0,0.015201,0.004175
3,0.054056,0.025854,0.016552,0.027367,0.02587,0.022182,0.019049,0.020508,0.020969,0.033272,...,0.02662,0.018525,0.013083,0.026116,0.028253,0.007782,0.012734,0.018712,0.020896,0.005181
4,0.098786,0.054873,0.027526,0.02932,0.057349,0.016468,0.010589,0.041939,0.017893,0.029286,...,0.015715,0.01787,0.018582,0.027421,0.022162,0.015189,0.032124,0.026685,0.019583,0.020602
5,0.072017,0.016548,0.020416,0.034698,0.022088,0.024852,0.02377,0.037323,0.022141,0.016843,...,0.011978,0.019331,0.017771,0.029115,0.020808,0.016518,0.027818,0.013195,0.020285,0.017344


In [60]:
L2.to_csv(f"{output_dir}L2.csv", index=True, header=True, sep='|')

# Models

## PCA Components

In [65]:
from sklearn.decomposition import PCA
import plotly_express as px
import seaborn as sns

In [82]:
pca = PCA(n_components=5)
pca.fit(L2)

In [81]:
component_terms = pd.DataFrame(pca.components_[0], index=L2.columns, columns=['weight'])
component_terms.nlargest(5, 'weight')


Unnamed: 0_level_0,weight
term_str,Unnamed: 1_level_1
yeh,0.38775
ter,0.353332
points,0.145003
broom,0.140199
dog,0.123506


In [83]:
component_terms = pd.DataFrame(pca.components_[1], index=L2.columns, columns=['weight'])
component_terms.nsmallest(5, 'weight')

Unnamed: 0_level_0,weight
term_str,Unnamed: 1_level_1
sir,-0.451375
team,-0.152378
class,-0.14398
potion,-0.122926
trunk,-0.10262


In [91]:
components = pd.DataFrame(pca.components_)
components.to_csv(f"{output_dir}components.csv", index=False, header=True, sep='|')


## PCA DCM

In [109]:
pca_engine = PCA(n_components=5)

In [110]:
DCM = pd.DataFrame(pca_engine.fit_transform(L2.fillna(0)), index=L2.index)
DCM.columns = ['PC{}'.format(i) for i in DCM.columns]
DCM = DCM.join(LIB[['title', 'release_year']], on='book_id')
DCM

Unnamed: 0_level_0,PC0,PC1,PC2,PC3,PC4,title,release_year
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.502736,0.215225,-0.15708,0.025435,-0.033474,Harry Potter And The Sorcerer's Stone,1997
2,0.020576,-0.310681,-0.182649,-0.241561,-0.041807,Harry Potter And The Chamber of Secrets,1998
3,0.136628,-0.097139,0.39211,-0.025286,-0.135177,Harry Potter And The Prisoner of Azkaban,1999
4,-0.097535,0.003704,0.020198,-0.037642,0.299841,Harry Potter And The Goblet of Fire,2000
5,-0.01718,0.011485,0.086805,0.118388,0.161176,Harry Potter And The Order Of The Phoenix,2003
6,-0.185404,-0.138432,-0.141807,0.297056,-0.126046,Harry Potter And The Half Blood Prince,2005
7,-0.359819,0.315837,-0.017577,-0.136391,-0.124513,Harry Potter And The Deathly Hallows,2007


In [105]:
DCM.to_csv(f"{output_dir}DCM.csv", index=False, header=True, sep='|')

## PCA Loadings

In [112]:
LOADINGS = pd.DataFrame(pca_engine.components_.T * np.sqrt(pca_engine.explained_variance_))
LOADINGS.columns = ["PC{}".format(i) for i in LOADINGS.columns]
LOADINGS.index = L2.columns
LOADINGS.index.name = 'term_str'
LOADINGS

Unnamed: 0_level_0,PC0,PC1,PC2,PC3,PC4
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
letter,0.023789,0.011278,-0.002857,-0.008433,0.019740
send,0.007141,0.006757,-0.001817,0.002177,0.007106
pressed,-0.006390,0.004133,-0.000344,0.005817,0.000135
join,-0.004735,-0.000175,0.003066,0.003154,0.002001
living,-0.011524,0.014407,-0.000542,-0.004646,0.002301
...,...,...,...,...,...
cracked,-0.009951,0.004917,-0.003501,-0.003187,-0.003349
including,-0.007141,-0.001041,0.001277,0.005956,0.006052
staggered,-0.009101,0.002061,0.002269,0.007153,-0.000522
lap,-0.001479,-0.001786,0.000807,0.003941,-0.000295


In [95]:
LOADINGS.to_csv(f"{output_dir}LOADINGS.csv", index=False, header=True, sep='|')

## PCA Viz 1

In [96]:
def vis_pcs(M, a, b, label='author', hover_name='label', symbol=None, size=None):
    return px.scatter(M, f"PC{a}", f"PC{b}", color=label, hover_name=hover_name, 
                     symbol=symbol, size=size,
                     marginal_x='box', height=800)

In [129]:
def vis_loadings(a=0, b=1, hover_name='term_str'):
    X = LOADINGS.join(VOCAB)
    return px.scatter(X.reset_index(), f"PC{a}", f"PC{b}", 
                      text='term_str', size='i', color='max_pos_group', 
                      marginal_x='box', height=800)

In [117]:
vis_pcs(DCM, 0, 1, label = 'release_year', hover_name='title')

In [135]:
vis_loadings(0,1, hover_name = 'release_year')

## PCA Viz 2

In [134]:
vis_pcs(DCM, 2, 3, label = 'release_year', hover_name='title')

In [136]:
vis_loadings(2,3, hover_name = 'release_year')

## LDA Topic

In [137]:
DTM

Unnamed: 0_level_0,term_str,0,1,10,11,12,1230,125,1289,1296,12th,...,zombie,zone,zonko,zonkos,zoo,zoological,zoom,zoomed,zooming,éclairs
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,6,0,0,0,0,0
1,3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,2,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
7,33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,34,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,35,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [143]:
DOCS = pd.DataFrame(index=TFIDF.index).join(LIB)

In [144]:
VOCAB['doc_count'] = DTM.astype('bool').astype('int').sum()
DOCS['term_count'] = DTM.sum(1)

In [145]:
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos,cat_pos,stop,porter_stem,dfidf,df,idf,mean_tfidf,doc_count
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,6,1,5.516460e-06,17.467826,CD,CD,2,"{JJ, CD}",0,0,22.517426,4,5.629357,0.000700,4
1,12,1,1.103292e-05,16.467826,CD,CD,1,{CD},0,1,40.134885,9,4.459432,0.001128,9
10,1,2,9.194100e-07,20.052788,CD,CD,1,{CD},0,10,7.629357,1,7.629357,0.000155,1
11,4,2,3.677640e-06,18.052788,CD,CD,1,{CD},0,11,22.517426,4,5.629357,0.000483,4
12,1,2,9.194100e-07,20.052788,CD,CD,1,{CD},0,12,7.629357,1,7.629357,0.000142,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zoological,1,10,9.194100e-07,20.052788,JJ,JJ,1,{JJ},0,zoolog,7.629357,1,7.629357,0.000166,1
zoom,8,4,7.355280e-06,17.052788,NN,NN,2,"{NN, VB}",0,zoom,37.034853,8,4.629357,0.000598,8
zoomed,57,6,5.240637e-05,14.219898,VBD,VB,2,"{VBN, VBD}",0,zoom,90.494306,38,2.381429,0.002594,38
zooming,31,7,2.850171e-05,15.098592,VBG,VB,2,"{NN, VBG}",0,zoom,74.637511,25,2.985500,0.001740,25


In [146]:
ngram_range = (1, 2)
n_terms = 4000
n_topics = 40
max_iter = 20
n_top_terms = 9

In [147]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [148]:
lda_engine = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=50., random_state=0)

In [156]:
TOKENS = CORPUS.copy()
labels = ['source_file_path', 'title', 'book_len', 'n_chaps', 'chap_regex', 'release_year']

In [157]:
import pandas as pd # Put here again in case we copy into a separate file
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import plotly_express as px

class TopicExplorer:
    
    n_features = 4000
    stopwords = 'english'
    lda_num_topics = 20
    lda_max_iter = 5
    lda_n_top_terms = 7
    
    def __init__(self, tokens_df, lib_df, bag, labels=[]):
        self.TOKENS = tokens_df
        self.LIB = lib_df
        self.bag = bag
        self.labels = labels
        
    def generate_tables(self):
        print("BAG:", self.bag[-1])
        print("LABELS:", self.labels)
        print("Getting DOCS")
        self._get_docs()
        print("Getting TERMS")
        self._get_count_model()
        print("Getting THETA, PHI")
        self._get_topic_model()
        print("Getting TOPICS")
        self._get_topics()
        print('Binding LIB labels to THETA')
        self._bind_labels()
        print("Done.")
        return self
        
    def _get_docs(self, pos_remove_pat=r'^NNS?$'):
        self.DOCS = self.TOKENS[self.TOKENS.pos.str.match(pos_remove_pat)]\
            .groupby(self.bag).term_str\
            .apply(lambda x: ' '.join(x))\
            .to_frame()\
            .rename(columns={'term_str':'doc_str'})
        
    def _get_count_model(self):
        self.count_engine = CountVectorizer(max_features=self.n_features, 
                                            stop_words=self.stopwords)
        self.count_model = self.count_engine.fit_transform(self.DOCS.doc_str)
        self.TERMS = self.count_engine.get_feature_names_out()
        
    def _get_topic_model(self):
        self.lda_engine = LDA(n_components=self.lda_num_topics, 
                              max_iter=self.lda_max_iter, 
                              learning_offset=50., 
                              random_state=0)
        self.THETA = pd.DataFrame(self.lda_engine.fit_transform(self.count_model), 
                                  index=self.DOCS.index)
        self.THETA.columns.name = 'topic_id'
        self.PHI = pd.DataFrame(self.lda_engine.components_, columns=self.TERMS)
        self.PHI.index.name = 'topic_id'
        self.PHI.columns.name = 'term_str'
        
    def _get_topics(self, n_terms=10):
        self.TOPICS = self.PHI.stack().to_frame('weight')\
            .groupby('topic_id')\
            .apply(lambda x: x.weight.sort_values(ascending=False)\
               .head(self.lda_n_top_terms)\
               .reset_index()\
               .drop('topic_id', axis=1)\
               .term_str)
        self.TOPICS['label'] = self.TOPICS[[t for t in range(self.lda_n_top_terms)]]\
            .apply(lambda x: str(x.name)\
                   .zfill(len(str(self.lda_num_topics))) + ' ' + ' '.join(x), axis=1)
        self.TOPICS['doc_weight_sum'] = self.THETA.sum()
        self.topic_cols = [t for t in range(self.lda_num_topics)]
        
    def _bind_labels(self):
        self.LABELS = {}
        self.LABEL_VALUES = {}
        for label in self.labels:
            self.THETA[label] = self.THETA\
                .apply(lambda x: self.LIB.loc[x.name[0], label], axis=1)
            self.LABELS[label] = self.THETA.groupby(label)[self.topic_cols].mean().T  
            self.THETA = self.THETA.drop(label, axis=1) # Don't keep the column
            self.LABELS[label].index.name = 'topic_id'
            self.LABELS[label]['label'] = self.TOPICS['label']
            self.LABEL_VALUES[label] = sorted(list(set(self.LIB[label])))
            
    def show_dominant_label_topic(self, label):
        X = self.LABELS[label][self.LABEL_VALUES[label]].idxmax()
        return X.to_frame('topic_id').topic_id.map(self.TOPICS.label)
            
    def show_label_values(self):
        for label in self.LABEL_VALUES:
            print(label, ": ", self.LABEL_VALUES[label])
        
    def show_topic_bar(self):
        fig_height = self.lda_num_topics / 3
        self.TOPICS.sort_values('doc_weight_sum', ascending=True)\
            .plot.barh(y='doc_weight_sum', x='label', figsize=(5, fig_height));
        
    def show_topic_label_heatmap(self, label):
        return MP.LABELS[label][MP.LABEL_VALUES[label]].style.background_gradient()
        
    def show_label_comparison_plot(self, label, label_value_x, label_value_y):
        px.scatter(self.LABELS[label].reset_index(), label_value_x, label_value_y, 
                   hover_name='label', text='topic_id', width=800, height=600)\
            .update_traces(mode='text').show()        

In [158]:
MC = TopicExplorer(TOKENS, LIB, CHAP, labels).generate_tables()

BAG: chap_id
LABELS: ['source_file_path', 'title', 'book_len', 'n_chaps', 'chap_regex', 'release_year']
Getting DOCS
Getting TERMS
Getting THETA, PHI
Getting TOPICS
Binding LIB labels to THETA
Done.


## PHI

In [167]:
PHI = MC.PHI
PHI

term_str,aback,abilities,ability,abou,abruptly,absence,access,accident,accidents,accio,...,young,youre,youth,youve,zat,ze,zey,zis,zoo,zoom
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.05,0.05,1.110792,1.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
1,0.05,0.05,0.05,0.05,2.50378,0.05,0.05,1.045749,0.05,0.05,...,0.05,2.559411,0.05,4.049422,0.05,0.05,0.05,0.05,1.005102,0.05
2,1.679892,0.05,0.05,0.050001,0.05,1.67497,1.048089,1.161381,1.061535,1.05,...,0.05,2.072247,1.041918,5.469692,0.104726,0.060328,0.050008,0.991598,0.05,1.040148
3,0.05,0.05,0.05,0.05,0.05,0.05,1.032329,0.05,0.05,0.05,...,0.05,0.05,0.05,0.852217,0.05,0.05,0.05,0.05,0.05,0.05
4,0.530783,0.05,0.05,0.05,3.08131,0.053438,0.05,1.137543,0.05,0.05,...,0.05,31.890048,0.05,22.303199,0.05,1.05,0.05,0.05,0.05,0.05
5,3.527404,0.05,0.082224,0.05,1.016903,0.05,0.368683,0.684304,0.05,0.05,...,2.05,11.554047,0.965783,14.617696,0.05,0.05,0.05,0.05,0.05,0.05
6,6.189189,5.472347,10.291282,0.050121,3.788826,11.559217,2.067671,5.533421,3.318802,0.056908,...,3.05,120.24146,3.630099,64.20612,5.979366,3.87724,3.05,0.05,0.05,0.05
7,0.942032,0.05,0.05,0.05,0.05,0.05,0.051911,0.05,0.05,0.05,...,1.05,1.213687,0.05,0.050001,0.05,0.22276,1.05,0.05,0.05,0.05
8,0.05,0.05,0.05,0.052361,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,1.447408,1.05,3.988939,0.05,0.05,0.05,0.05,0.05,0.05
9,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05


In [168]:
PHI.to_csv(f"{output_dir}PHI.csv", index=True, header=True, sep='|')

## THETA

In [166]:
PHI.columns

RangeIndex(start=0, stop=20, step=1, name='topic_id')