In [71]:
import pandas as pd
from textparser import TextParser
import os
import re
import numpy as np
import nltk
from glob import glob
from nltk.stem.porter import PorterStemmer
from numpy.linalg import norm

In [2]:
os.getcwd()

'/Users/michaelhammer/Desktop/ETA_Final'

In [3]:
source_files = '/Users/michaelhammer/Desktop/ETA_Final/novels'
output_dir = '/Users/michaelhammer/Desktop/ETA_Final/output'

In [35]:
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']
SENT = OHCO[:4]
PARA = OHCO[:3]
CHAP = OHCO[:2]
BOOK = OHCO[:1]

In [5]:
clip_pats = [
    r"\s*START OF BOOK",
    r"\s*THE END"
]

ohco_pat_list = [
    (1,  rf"^CHAPTER\s"),
    (2,   rf"^CHAPTER\s"),
    (3,   rf"^CHAPTER\s"),
    (4,   rf"^CHAPTER\s")
]

In [6]:
source_file_list = sorted(glob(f"{source_files}/*.*"))
source_file_list

["/Users/michaelhammer/Desktop/ETA_Final/novels/Book1_The_Sorcerer's_Stone.txt",
 '/Users/michaelhammer/Desktop/ETA_Final/novels/Book2_The_Chamber_of_Secrets.txt',
 '/Users/michaelhammer/Desktop/ETA_Final/novels/Book3_The_Prisoner_of_Azkaban.txt',
 '/Users/michaelhammer/Desktop/ETA_Final/novels/Book4_The_Goblet_of_Fire.txt']

In [7]:
book_data = []
for source_file_path in source_file_list:
    book_id = int(source_file_path.split('/')[-1].split('_')[0].replace('Book',''))
    book_title = 'Harry Potter And ' + source_file_path.split('/')[-1].split('-')[0].replace('_', ' ')[6:].split('.')[0]
    book_data.append((book_id, source_file_path, book_title))
book_data 

[(1,
  "/Users/michaelhammer/Desktop/ETA_Final/novels/Book1_The_Sorcerer's_Stone.txt",
  "Harry Potter And The Sorcerer's Stone"),
 (2,
  '/Users/michaelhammer/Desktop/ETA_Final/novels/Book2_The_Chamber_of_Secrets.txt',
  'Harry Potter And The Chamber of Secrets'),
 (3,
  '/Users/michaelhammer/Desktop/ETA_Final/novels/Book3_The_Prisoner_of_Azkaban.txt',
  'Harry Potter And The Prisoner of Azkaban'),
 (4,
  '/Users/michaelhammer/Desktop/ETA_Final/novels/Book4_The_Goblet_of_Fire.txt',
  'Harry Potter And The Goblet of Fire')]

In [8]:
LIB = pd.DataFrame(book_data, columns=['book_id','source_file_path','title'])\
    .set_index('book_id').sort_index()
LIB

Unnamed: 0_level_0,source_file_path,title
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Sorcerer's Stone
2,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Chamber of Secrets
3,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Prisoner of Azkaban
4,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Goblet of Fire


In [9]:
books = []
for pat in ohco_pat_list:
    
    book_id, chap_regex = pat
    print("Tokenizing", book_id, LIB.loc[book_id].title)
    ohco_pats = [('chap', chap_regex, 'm')]
    src_file_path = LIB.loc[book_id].source_file_path
    
    text = TextParser(src_file_path, ohco_pats=ohco_pats, clip_pats=clip_pats, use_nltk=True)
    text.verbose = False
    text.strip_hyphens = True
    text.strip_whitespace = True
    text.import_source().parse_tokens();
    text.TOKENS['book_id'] = book_id
    text.TOKENS = text.TOKENS.reset_index().set_index(['book_id'] + text.OHCO)
    
    books.append(text.TOKENS)

Tokenizing 1 Harry Potter And The Sorcerer's Stone
line_str chap_str
Index(['chap_str'], dtype='object')
Tokenizing 2 Harry Potter And The Chamber of Secrets
line_str chap_str
Index(['chap_str'], dtype='object')
Tokenizing 3 Harry Potter And The Prisoner of Azkaban
line_str chap_str
Index(['chap_str'], dtype='object')
Tokenizing 4 Harry Potter And The Goblet of Fire
line_str chap_str
Index(['chap_str'], dtype='object')


In [10]:
CORPUS = pd.concat(books).sort_index()
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,1,0,0,"(Mr., NNP)",NNP,Mr.,mr
1,1,1,0,1,"(and, CC)",CC,and,and
1,1,1,0,2,"(Mrs., NNP)",NNP,Mrs.,mrs
1,1,1,0,3,"(Dursley,, NNP)",NNP,"Dursley,",dursley
1,1,1,0,4,"(of, IN)",IN,of,of
...,...,...,...,...,...,...,...,...
4,37,5,11,15,"(meet, VB)",VB,meet,meet
4,37,5,11,16,"(it, PRP)",PRP,it,it
4,37,5,11,17,"(when, WRB)",WRB,when,when
4,37,5,11,18,"(it, PRP)",PRP,it,it


## Add to LIB

In [11]:
LIB['book_len'] = CORPUS.groupby('book_id').term_str.count()

In [12]:
LIB['n_chaps'] = CORPUS.reset_index()[['book_id','chap_id']]\
    .drop_duplicates()\
    .groupby('book_id').chap_id.count()

In [13]:
LIB['chap_regex'] = LIB.index.map(pd.Series({x[0]:x[1] for x in ohco_pat_list}))

In [14]:
LIB.sort_values('book_len')

Unnamed: 0_level_0,source_file_path,title,book_len,n_chaps,chap_regex
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Sorcerer's Stone,77946,17,^CHAPTER\s
2,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Chamber of Secrets,87122,18,^CHAPTER\s
3,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Prisoner of Azkaban,105599,22,^CHAPTER\s
4,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Goblet of Fire,192543,37,^CHAPTER\s


In [15]:
LIB['release_year'] = [1997, 1998, 1999, 2000]
LIB

Unnamed: 0_level_0,source_file_path,title,book_len,n_chaps,chap_regex,release_year
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Sorcerer's Stone,77946,17,^CHAPTER\s,1997
2,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Chamber of Secrets,87122,18,^CHAPTER\s,1998
3,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Prisoner of Azkaban,105599,22,^CHAPTER\s,1999
4,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Goblet of Fire,192543,37,^CHAPTER\s,2000


## VOCAB Table

In [16]:
CORPUS[CORPUS.term_str == ''].token_str.value_counts()

token_str
"       810
—       646
—”      356
…       197
…”      189
."      187
..."    105
..       49
?"       41
"...     19
“        18
?”       12
*        12
“—       12
“…       11
.."      10
”         9
"),       7
".        7
.'        6
!"        4
.'"       2
....      2
'"        2
**        2
",        1
";        1
").       1
—’        1
),        1
…”)       1
.,        1
_"        1
");       1
"'.       1
/         1
'.        1
Name: count, dtype: int64

In [17]:
CORPUS = CORPUS[CORPUS.term_str != '']

In [18]:
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)

In [19]:
VOCAB['max_pos'] = CORPUS[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)

In [20]:
CORPUS['pos_group'] = CORPUS.pos.str[:2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CORPUS['pos_group'] = CORPUS.pos.str[:2]


In [21]:
VOCAB['max_pos_group'] = CORPUS[['term_str','pos_group']].value_counts().unstack(fill_value=0).idxmax(1)

In [22]:
TPM = CORPUS[['term_str','pos']].value_counts().unstack()

In [23]:
VOCAB['n_pos'] = TPM.count(1)

In [24]:
VOCAB['cat_pos'] = CORPUS[['term_str','pos']].value_counts().to_frame('n').reset_index()\
    .groupby('term_str').pos.apply(lambda x: set(x))

In [25]:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

In [26]:
VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

In [28]:
stemmer1 = PorterStemmer()
VOCAB['porter_stem'] = VOCAB.apply(lambda x: stemmer1.stem(x.name), 1)

In [29]:
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos,cat_pos,stop,porter_stem
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,6,1,0.000013,16.227826,CD,CD,2,"{JJ, CD}",0,0
1,7,1,0.000015,16.005433,CD,CD,1,{CD},0,1
11,4,2,0.000009,16.812788,CD,CD,1,{CD},0,11
1230,1,4,0.000002,18.812788,CD,CD,1,{CD},0,1230
125,1,3,0.000002,18.812788,CD,CD,1,{CD},0,125
...,...,...,...,...,...,...,...,...,...,...
zoo,9,3,0.000020,15.642863,NN,NN,1,{NN},0,zoo
zoological,1,10,0.000002,18.812788,JJ,JJ,1,{JJ},0,zoolog
zoom,5,4,0.000011,16.490860,NN,NN,1,{NN},0,zoom
zoomed,21,6,0.000046,14.420471,VBD,VB,2,"{VBN, VBD}",0,zoom


## BOW Table

In [30]:
def create_bow(CORPUS, bag, item_type='term_str'):
    BOW = CORPUS.groupby(bag+[item_type])[item_type].count().to_frame('n')
    return BOW

In [36]:
bag = CHAP
bag

['book_id', 'chap_id']

In [37]:
BOW = create_bow(CORPUS, bag)
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
book_id,chap_id,term_str,Unnamed: 3_level_1
1,1,a,112
1,1,able,2
1,1,about,14
1,1,above,1
1,1,across,2
...,...,...,...
4,37,your,9
4,37,youre,3
4,37,yours,1
4,37,yourself,2


# DTM

In [38]:
DTM = BOW.n.unstack(fill_value=0)
DTM

Unnamed: 0_level_0,term_str,0,1,11,1230,125,1289,1296,13,1473,1492,...,zis,zograf,zombie,zonko,zonkos,zoo,zoological,zoom,zoomed,zooming
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,6,0,0,0,0
1,3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,2,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,34,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,35,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TFIDF & DFIDF

In [39]:
def get_tfidf(BOW, tf_method='max', df_method='standard', item_type='term_str'):
            
    DTCM = BOW.n.unstack() # Create Doc-Term Count Matrix
    
    if tf_method == 'sum':
        TF = (DTCM.T / DTCM.T.sum()).T
    elif tf_method == 'max':
        TF = (DTCM.T / DTCM.T.max()).T
    elif tf_method == 'log':
        TF = (np.log2(DTCM.T + 1)).T
    elif tf_method == 'raw':
        TF = DTCM
    elif tf_method == 'bool':
        TF = DTCM.astype('bool').astype('int')
    else:
        raise ValueError(f"TF method {tf_method} not found.")

    DF = DTCM.count() # Assumes NULLs 
    N_docs = len(DTCM)
    
    if df_method == 'standard':
        IDF = np.log2(N_docs/DF) # This what the students were asked to use
    elif df_method == 'textbook':
        IDF = np.log2(N_docs/(DF + 1))
    elif df_method == 'sklearn':
        IDF = np.log2(N_docs/DF) + 1
    elif df_method == 'sklearn_smooth':
        IDF = np.log2((N_docs + 1)/(DF + 1)) + 1
    else:
        raise ValueError(f"DF method {df_method} not found.")
    
    TFIDF = TF * IDF
    
    DFIDF = DF * IDF
    
    TFIDF = TFIDF.fillna(0)

    return TFIDF, DFIDF

In [40]:
tf_method = 'max'

In [41]:
TFIDF, DFIDF = get_tfidf(BOW, tf_method)
TFIDF

Unnamed: 0_level_0,term_str,0,1,11,1230,125,1289,1296,13,1473,1492,...,zis,zograf,zombie,zonko,zonkos,zoo,zoological,zoom,zoomed,zooming
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
1,2,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.15182,0.0,0.0,0.0,0.0
1,3,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
1,4,0.000000,0.033861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
1,5,0.033245,0.030895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,33,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
4,34,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
4,35,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
4,36,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0


In [42]:
VOCAB['dfidf'] = DFIDF

In [59]:
VOCAB.dfidf.sort_values(ascending=False).head(20)

term_str
pain          49.885704
itself        49.885704
hello         49.885704
touch         49.885704
throat        49.885704
thousand      49.885704
marble        49.885704
mum           49.885704
definitely    49.885704
flat          49.885704
theyve        49.885704
clutching     49.885704
glad          49.885704
size          49.885704
swung         49.885704
fight         49.885704
scarlet       49.885704
playing       49.885704
tall          49.885704
terrible      49.885704
Name: dfidf, dtype: float64

In [52]:
VOCAB['df'] = DTM.astype('bool').sum()
VOCAB['idf'] = np.log2(len(DTM) / VOCAB.df)

Unnamed: 0_level_0,term_str,0,1,11,1230,125,1289,1296,13,1473,1492,...,zis,zograf,zombie,zonko,zonkos,zoo,zoological,zoom,zoomed,zooming
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
1,2,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.15182,0.0,0.0,0.0,0.0
1,3,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
1,4,0.000000,0.033861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
1,5,0.033245,0.030895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,33,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
4,34,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
4,35,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
4,36,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0


In [54]:

BOW = BOW.drop(columns=['tfidf'])
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
book_id,chap_id,term_str,Unnamed: 3_level_1
1,1,a,112
1,1,able,2
1,1,about,14
1,1,above,1
1,1,across,2
...,...,...,...
4,37,your,9
4,37,youre,3
4,37,yours,1
4,37,yourself,2


In [65]:
TFIDF

Unnamed: 0_level_0,term_str,0,1,11,1230,125,1289,1296,13,1473,1492,...,zis,zograf,zombie,zonko,zonkos,zoo,zoological,zoom,zoomed,zooming
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
1,2,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.15182,0.0,0.0,0.0,0.0
1,3,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
1,4,0.000000,0.033861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
1,5,0.033245,0.030895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,33,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
4,34,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
4,35,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0
4,36,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0


# TFIDF - L2 Normalized

In [67]:
vocab_filter = 'dfidf'
n_terms = 1000
pos_list = "NN NNS VB VBD VBG VBN VBP VBZ JJ JJR JJS RB RBR RBS".split() # Open categories with no proper nouns

In [68]:
VIDX = VOCAB.loc[VOCAB.max_pos.isin(pos_list)]\
    .sort_values(vocab_filter, ascending=False)\
    .head(n_terms).index

In [69]:
M = TFIDF[VIDX].fillna(0).groupby('book_id').mean()

In [70]:
M

term_str,asleep,glad,food,fight,clutching,question,marble,color,swung,tall,...,car,colored,wiped,wished,wishing,tip,thoughtfully,oak,thud,bother
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.007879,0.004382,0.003551,0.003444,0.002542,0.004112,0.002921,0.001103,0.005299,0.004309,...,0.014845,0.001683,0.002274,0.005335,0.002408,0.0,0.001466,0.002882,0.001189,0.001463
2,0.002326,0.002641,0.003233,0.00167,0.003723,0.00313,0.003644,0.00286,0.002954,0.001559,...,0.041325,0.001784,0.001781,0.003264,0.003549,0.002494,0.004287,0.002977,0.003524,0.001492
3,0.003983,0.002634,0.001791,0.002788,0.002866,0.003285,0.002983,0.00366,0.001382,0.002526,...,0.007655,0.002243,0.003706,0.001338,0.002127,0.001231,0.003742,0.001667,0.003054,0.001347
4,0.002432,0.00324,0.00634,0.0036,0.00398,0.002166,0.004263,0.003012,0.002757,0.003729,...,0.002647,0.004488,0.00188,0.002016,0.001371,0.006921,0.00217,0.00148,0.001666,0.00318


In [72]:
L2 = M.apply(lambda x: x / norm(x), 1)
L2

term_str,asleep,glad,food,fight,clutching,question,marble,color,swung,tall,...,car,colored,wiped,wished,wishing,tip,thoughtfully,oak,thud,bother
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.049754,0.027669,0.022422,0.021745,0.016051,0.025964,0.018443,0.006968,0.033462,0.027208,...,0.09374,0.010629,0.014361,0.033686,0.015205,0.0,0.009259,0.018196,0.007506,0.009236
2,0.016983,0.019284,0.023608,0.012192,0.027187,0.022856,0.026609,0.020883,0.021568,0.011382,...,0.301756,0.013028,0.013003,0.023832,0.025914,0.018215,0.031303,0.021737,0.025729,0.010898
3,0.029613,0.019585,0.013315,0.020728,0.021308,0.024425,0.02218,0.027212,0.010272,0.018782,...,0.056912,0.016676,0.027555,0.009945,0.015812,0.009148,0.02782,0.012392,0.022702,0.010014
4,0.020264,0.026993,0.052824,0.030001,0.033164,0.018046,0.035522,0.025098,0.022973,0.031075,...,0.022053,0.037392,0.015664,0.0168,0.011422,0.057669,0.018081,0.012329,0.013881,0.026495


In [77]:
VOCAB['mean_tfidf'] = TFIDF.mean()

## Save Tables

In [78]:
LIB.head()

Unnamed: 0_level_0,source_file_path,title,book_len,n_chaps,chap_regex,release_year
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Sorcerer's Stone,77946,17,^CHAPTER\s,1997
2,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Chamber of Secrets,87122,18,^CHAPTER\s,1998
3,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Prisoner of Azkaban,105599,22,^CHAPTER\s,1999
4,/Users/michaelhammer/Desktop/ETA_Final/novels/...,Harry Potter And The Goblet of Fire,192543,37,^CHAPTER\s,2000


In [87]:
LIB.to_csv(f"{output_dir}LIB.csv", index=True, header=True, sep='|')

In [80]:
CORPUS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,1,0,0,"(Mr., NNP)",NNP,Mr.,mr,NN
1,1,1,0,1,"(and, CC)",CC,and,and,CC
1,1,1,0,2,"(Mrs., NNP)",NNP,Mrs.,mrs,NN
1,1,1,0,3,"(Dursley,, NNP)",NNP,"Dursley,",dursley,NN
1,1,1,0,4,"(of, IN)",IN,of,of,IN


In [88]:
CORPUS.to_csv(f"{output_dir}CORPUS.csv", index=True, header=True, sep='|')

In [82]:
VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos,cat_pos,stop,porter_stem,dfidf,df,idf,mean_tfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,6,1,1.3e-05,16.227826,CD,CD,2,"{JJ, CD}",0,0,18.218355,4,4.554589,0.001192
1,7,1,1.5e-05,16.005433,CD,CD,1,{CD},0,1,21.163304,5,4.232661,0.00154
11,4,2,9e-06,16.812788,CD,CD,1,{CD},0,11,18.218355,4,4.554589,0.000823
1230,1,4,2e-06,18.812788,CD,CD,1,{CD},0,1230,6.554589,1,6.554589,0.00027
125,1,3,2e-06,18.812788,CD,CD,1,{CD},0,125,6.554589,1,6.554589,0.000242


In [89]:
VOCAB.to_csv(f"{output_dir}VOCAB.csv", index=True, header=True, sep='|')

In [84]:
BOW.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
book_id,chap_id,term_str,Unnamed: 3_level_1
1,1,a,112
1,1,able,2
1,1,about,14
1,1,above,1
1,1,across,2


In [90]:
BOW.to_csv(f"{output_dir}BOW.csv", index=True, header=True, sep='|')

In [86]:
DTM.head()

Unnamed: 0_level_0,term_str,0,1,11,1230,125,1289,1296,13,1473,1492,...,zis,zograf,zombie,zonko,zonkos,zoo,zoological,zoom,zoomed,zooming
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,6,0,0,0,0
1,3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,2,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
DTM.to_csv(f"{output_dir}DTM.csv", index=True, header=True, sep='|')

In [92]:
TFIDF.head()

Unnamed: 0_level_0,term_str,0,1,11,1230,125,1289,1296,13,1473,1492,...,zis,zograf,zombie,zonko,zonkos,zoo,zoological,zoom,zoomed,zooming
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.15182,0.0,0.0,0.0,0.0
1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,0.0,0.033861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0.033245,0.030895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
TFIDF.to_csv(f"{output_dir}TFIDF.csv", index=True, header=True, sep='|')

In [95]:
L2.head()

term_str,asleep,glad,food,fight,clutching,question,marble,color,swung,tall,...,car,colored,wiped,wished,wishing,tip,thoughtfully,oak,thud,bother
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.049754,0.027669,0.022422,0.021745,0.016051,0.025964,0.018443,0.006968,0.033462,0.027208,...,0.09374,0.010629,0.014361,0.033686,0.015205,0.0,0.009259,0.018196,0.007506,0.009236
2,0.016983,0.019284,0.023608,0.012192,0.027187,0.022856,0.026609,0.020883,0.021568,0.011382,...,0.301756,0.013028,0.013003,0.023832,0.025914,0.018215,0.031303,0.021737,0.025729,0.010898
3,0.029613,0.019585,0.013315,0.020728,0.021308,0.024425,0.02218,0.027212,0.010272,0.018782,...,0.056912,0.016676,0.027555,0.009945,0.015812,0.009148,0.02782,0.012392,0.022702,0.010014
4,0.020264,0.026993,0.052824,0.030001,0.033164,0.018046,0.035522,0.025098,0.022973,0.031075,...,0.022053,0.037392,0.015664,0.0168,0.011422,0.057669,0.018081,0.012329,0.013881,0.026495


In [96]:
L2.to_csv(f"{output_dir}L2.csv", index=True, header=True, sep='|')

## 