# F2-F4 Table Setup 

Annie Williams (maw3as@virginia.edu)  
DS 5001  
2 May 2021  


This notebook contains the code to create the initial F2-F4 `DOC`, `LIB`, `TOKEN`, and `VOCAB` tables. 

In [1]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
from nltk.stem.porter import PorterStemmer

In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('tagsets')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/annewilliams/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/annewilliams/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/annewilliams/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/annewilliams/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [3]:
data_in = './source_files'
data_out = './F2-F4_tables'

In [4]:
# doc_id:    the unique id for each document, 
# book_num:  each book within the iliad / odyssey 

OHCO = ['doc_id', 'book_num', 'stanza_num', 'line_num', 'token_num']
LINES = OHCO[:4]
STANZAS = OHCO[:3]
BOOKS = OHCO[:2]
DOCS = OHCO[:1]

## Set up initial F2 `LIB` and `DOC` tables
* Code based on Module 4 Pipeline notebook

In [5]:
roman = '[IVXLCM]+'
caps = "[A-Z';, -]+"
chap_pats = {
    100: {
        'start_line': 373,
        'end_line': 10838,
        'book': re.compile(r'^BOOK\s+{}.$'.format(roman))
    },
    200: {
        'start_line': 76,
        'end_line': 14568,
        'book': re.compile(r'^BOOK\s+{}.$'.format(roman))
    },
    300: {
        'start_line': 955,
        'end_line': 16699,
        'book': re.compile(r"^THE\s+\w+\-?\w+?\s+BOOK OF HOMER’S ILIADS$")
    }, 
    400: {
        'start_line': 423,
        'end_line': 18344,
        'book': re.compile(r'^THE\s+\w+\-?\w+?\s+BOOK OF HOMER’S ODYSSEYS')
    }
}

In [6]:
def acquire_epubs(epub_list, chap_pats, OHCO=OHCO):
    
    my_lib = []
    my_doc = []

    for epub_file in epub_list:
        
        # Get ID from filename
        doc_id = int(epub_file.split('/')[-1].split('-')[0])
                      
        print("BOOK ID", doc_id)
        
        # Import file as lines
        lines = open(epub_file, 'r', encoding='utf-8-sig').readlines()
        df = pd.DataFrame(lines, columns=['line_str'])
        df.index.name = 'line_num'
        df['doc_id'] = doc_id
        
        # FIX CHARACTERS TO IMPROVE TOKENIZATION
        df.line_str = df.line_str.str.replace('—', ' — ')
        df.line_str = df.line_str.str.replace('-', ' - ')
        
        # Get book title and put into LIB table -- note problems, though
        book_title = re.sub(r"The Project Gutenberg eBook( of|,) ", "", df.loc[0].line_str, flags=re.IGNORECASE)
        book_title = re.sub(r"Project Gutenberg's ", "", book_title, flags=re.IGNORECASE)
        
        # Remove cruft
        a = chap_pats[doc_id]['start_line'] - 1
        b = chap_pats[doc_id]['end_line'] + 1
        df = df.iloc[a:b]
        
        # Chunk by book
        chap_lines = df.line_str.str.match(chap_pats[doc_id]['book'])
        book_nums = [i+1 for i in range(df.loc[chap_lines].shape[0])]
        df.loc[chap_lines, 'book_num'] = book_nums
        df.book_num = df.book_num.ffill()

        # Clean up
        df = df.dropna(subset=['book_num']) # Remove everything before book 1
        df = df.loc[~chap_lines] # Remove book heading lines
        df['book_num'] = df['book_num'].astype('int')
        
        # Group -- Note that we exclude the book level in the OHCO at this point
        df = df.groupby(OHCO[1:2]).line_str.apply(lambda x: '\n'.join(x)).to_frame() # Make big string
        
        # Split into paragrpahs
        df = df['line_str'].str.split(r'\n\n+', expand=True).stack().to_frame().rename(columns={0:'stanza_str'})
        df.index.names = OHCO[1:3] # MAY NOT BE NECESSARY UNTIL THE END
        df['stanza_str'] = df['stanza_str'].str.replace(r'\n', ' ', regex=True).str.strip()
        df = df[~df['stanza_str'].str.match(r'^\s*$')] # Remove empty paragraphs
        
        # Set index
        df['doc_id'] = doc_id
        df = df.reset_index().set_index(OHCO[:3])

        # Register
        my_lib.append((doc_id, book_title, epub_file))
        my_doc.append(df)

    docs = pd.concat(my_doc)
    library = pd.DataFrame(my_lib, columns=['doc_id', 'book_title', 'book_file']).set_index('doc_id')
    print("Done.")
    return library, docs

In [7]:
epubs = [epub for epub in sorted(glob('{}/*.txt'.format(data_in)))]
LIB, DOC = acquire_epubs(epubs, chap_pats)

BOOK ID 100
BOOK ID 200
BOOK ID 300
BOOK ID 400
Done.


In [36]:
# manually add some features to the LIB table because there are so few documents 
LIB["translator"] = ['Butler', 'Butler', 'Chapman', 'Chapman']

In [40]:
LIB

Unnamed: 0_level_0,book_title,book_file,translator
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,"The Iliad, by Homer\n",./data_in/100-iliad-butler.txt,Butler
200,"The Odyssey, by Homer\n",./data_in/200-odyssey-butler.txt,Butler
300,"The Iliads of Homer, by Homer\n",./data_in/300-iliad-chapman.txt,Chapman
400,"The Odysseys of Homer, by Homer\n",./data_in/400-odyssey-chapman.txt,Chapman


In [41]:
DOC.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,stanza_str
doc_id,book_num,stanza_num,Unnamed: 3_level_1
100,1,1,"Jove sends a lying dream to Agamemnon, who the..."
100,1,2,"chiefs in assembly, and proposes to sound the ..."
100,1,3,the end they march to fight — Catalogue of the...
100,1,4,forces.
100,1,5,Now the other gods and the armed warriors on t...


## Create F2 `TOKEN` table, and add F3 `pos` attribute

We use NLTK this time. Note that this process takes some time, mainly because the NLTK functions are not optimized for dataframes.

In [11]:
def tokenize(doc_df, OHCO=OHCO, remove_pos_tuple=False, ws=False):
    
    # Paragraphs to Sentences
    df = doc_df.stanza_str\
        .apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
        .stack()\
        .to_frame()\
        .rename(columns={0:'sent_str'})
    
    # Sentences to Tokens
    # Local function to pick tokenizer
    def word_tokenize(x):
        if ws:
            s = pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x)))
        else:
            s = pd.Series(nltk.pos_tag(nltk.word_tokenize(x)))
        return s
            
    df = df.sent_str\
        .apply(word_tokenize)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'pos_tuple'})
    
    # Grab info from tuple
    df['pos'] = df.pos_tuple.apply(lambda x: x[1])
    df['token_str'] = df.pos_tuple.apply(lambda x: x[0])
    if remove_pos_tuple:
        df = df.drop('pos_tuple', 1)
    
    # Add index
    df.index.names = OHCO
    
    return df

In [12]:
TOKEN = tokenize(DOC, ws=True)

In [13]:
TOKEN.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str
doc_id,book_num,stanza_num,line_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100,1,1,0,0,"(Jove, NNP)",NNP,Jove
100,1,1,0,1,"(sends, VBZ)",VBZ,sends
100,1,1,0,2,"(a, DT)",DT,a
100,1,1,0,3,"(lying, JJ)",JJ,lying
100,1,1,0,4,"(dream, NN)",NN,dream


## Create `VOCAB` table

Extract a vocabulary from the TOKEN table

In [15]:
TOKEN['term_str'] = TOKEN['token_str'].str.lower().str.replace('[\W_]', '', regex=True)

In [16]:
VOCAB = TOKEN.term_str.value_counts().to_frame().rename(columns={'index':'term_str', 'term_str':'n'})\
    .sort_index().reset_index().rename(columns={'index':'term_str'})
VOCAB.index.name = 'term_id'

In [17]:
VOCAB['num'] = VOCAB.term_str.str.match("\d+").astype('int')

In [18]:
VOCAB

Unnamed: 0_level_0,term_str,n,num
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,4733,0
1,1,48,1
2,10,8,1
3,100,2,1
4,1000,1,1
...,...,...,...
19728,ῥα,1,0
19729,ῥοτὸς,1,0
19730,ῥοᾓ,1,0
19731,ῥέω,1,0


In [19]:
# end of F2 corpus, begin F3 corpus ? 

## Continue building on F3 by annotating `VOCAB` table 

I use NLTK's built in stopword list for English. 

The following code chunks add stop words, stems, and `pos_max`.

### 1. Add Stopwords

In [20]:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

In [39]:
sw.sample(10)

Unnamed: 0_level_0,dummy
term_str,Unnamed: 1_level_1
themselves,1
mustn't,1
did,1
just,1
them,1
the,1
that'll,1
any,1
no,1
mustn,1


In [22]:
VOCAB['stop'] = VOCAB.term_str.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

In [23]:
VOCAB[VOCAB.stop == 1].sample(10)

Unnamed: 0_level_0,term_str,n,num,stop
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1799,be,2010,0,1
17160,they,3050,0,1
675,after,418,0,1
18874,when,2302,0,1
11488,now,1622,0,1
17125,there,1274,0,1
8098,haven,19,0,1
17403,to,13209,0,1
1973,below,22,0,1
11177,myself,219,0,1


### 2. Add Stems

In [24]:
stemmer = PorterStemmer()
VOCAB['p_stem'] = VOCAB.term_str.apply(stemmer.stem)

In [25]:
VOCAB.sample(10)

Unnamed: 0_level_0,term_str,n,num,stop,p_stem
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
747,ait,1,0,0,ait
2482,brine,8,0,0,brine
17184,think,180,0,0,think
10902,misbehaved,1,0,0,misbehav
2024,bessa,2,0,0,bessa
13867,reasons,3,0,0,reason
14398,rhetoric,2,0,0,rhetor
2207,blushd,2,0,0,blushd
12940,polyæmons,1,0,0,polyæmon
14386,rew,2,0,0,rew


### 3. Add pos_max

In [26]:
M = TOKEN.groupby(['term_str','pos']).pos.count().unstack(fill_value=0)

In [27]:
VOCAB = VOCAB.reset_index().set_index('term_str')
VOCAB['pos_max'] = M.idxmax(1)
VOCAB = VOCAB.reset_index().set_index('term_id')

In [28]:
VOCAB[VOCAB.pos_max == 'NN'].sort_values('n', ascending=False).head(20)

Unnamed: 0_level_0,term_str,n,num,stop,p_stem,pos_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
15789,son,1762,0,0,son,NN
10456,man,1263,0,0,man,NN
17224,thou,854,0,0,thou,NN
7970,hand,769,0,0,hand,NN
8560,house,745,0,0,hous,NN
9663,king,689,0,0,king,NN
6705,fight,681,0,0,fight,NN
17105,thee,667,0,0,thee,NN
18756,way,632,0,0,way,NN
17349,till,583,0,0,till,NN


## Add F4 vector space representations of the `TOKEN` data

The following code chunks create the vector space representations of the `TOKEN` data, including relevant statistics and TFIDF scores. 

* Code taken from module 5 guide and homework

In [29]:
N_vocab = VOCAB.shape[0]
U_vocab = 1/N_vocab

In [30]:
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()  # Probability
VOCAB['s'] = 1 / VOCAB.p              # Surprise
VOCAB['i'] = np.log2(VOCAB.s)         # Information
VOCAB['h'] = VOCAB.p * VOCAB.i        # Entropy

In [31]:
VOCAB['wlen'] = VOCAB.term_str.str.len()

In [32]:
VOCAB = VOCAB.set_index('term_str')

In [33]:
def get_tfidf(tokens, vocab, bag, tf_type='n', item_type='term_str', alpha=.4, new_col_suffix=''):
    
    # Create BOW
    BOW = tokens.groupby(bag+[item_type])[item_type].count()\
        .to_frame('n')
    BOW['c'] = 1
    
    # Compute TF
    D = BOW.groupby(bag).n
    if tf_type == 'n':
        BOW['tf'] = BOW.n
    elif tf_type == 'sum':
        BOW['tf'] = D.apply(lambda x: x / x.sum()) # cp = P(w|d)
    elif tf_type == 'l2':
        BOW['tf'] = D.apply(lambda x: x / np.sqrt((x**2).sum()))
    elif tf_type == 'max':
        BOW['tf'] = D.apply(lambda x: alpha + (1-alpha) * (x / x.max()))
    elif tf_type == 'log':
        BOW['tf'] = D.apply(lambda x: np.log2(1 + x))
    elif tf_type == 'sub':
        BOW['tf'] = D.apply(lambda x: 1 + np.log2(x))
    elif tf_type == 'bool':
        BOW['tf'] = BOW.c
    elif tf_type == 'bool2':
        BOW['tf'] = D.apply(lambda x: 1 / len(x))
    
    # Normalize TF
    
    # Compute IDF
    vocab['df'] = BOW.groupby('term_str').n.count()
    N_docs = len(D.groups)
    vocab['idf'] = np.log2(N_docs/vocab.df)
    
    # Compute TFIDF
    BOW['tfidf'] = BOW.tf * vocab.idf
    
    # Compute aggregate TFIDF
    col = 'tfidf_sum' + new_col_suffix
    vocab[col] = BOW.groupby(item_type)['tfidf'].sum()
    
    return vocab

In [34]:
VOCAB = get_tfidf(TOKEN, VOCAB, bag=DOCS, tf_type='max', new_col_suffix='_doc_max', alpha=0)

In [35]:
VOCAB

Unnamed: 0_level_0,n,num,stop,p_stem,pos_max,p,s,i,h,wlen,df,idf,tfidf_sum_doc_max
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
,4733,0,0,,:,0.008712,114.788506,6.842834,0.059613,0,4,0.000000,0.000000
1,48,1,0,1,NN,0.000088,11318.625000,13.466411,0.001190,1,3,0.415037,0.003287
10,8,1,0,10,JJ,0.000015,67911.750000,16.051374,0.000236,2,3,0.415037,0.000573
100,2,1,0,100,CD,0.000004,271647.000000,18.051374,0.000066,3,1,2.000000,0.000598
1000,1,1,0,1000,CD,0.000002,543294.000000,19.051374,0.000035,4,1,2.000000,0.000299
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ῥα,1,0,0,ῥα,NN,0.000002,543294.000000,19.051374,0.000035,2,1,2.000000,0.000299
ῥοτὸς,1,0,0,ῥοτὸς,NNP,0.000002,543294.000000,19.051374,0.000035,5,1,2.000000,0.000361
ῥοᾓ,1,0,0,ῥοᾓ,NNP,0.000002,543294.000000,19.051374,0.000035,3,1,2.000000,0.000361
ῥέω,1,0,0,ῥέω,NNP,0.000002,543294.000000,19.051374,0.000035,3,1,2.000000,0.000361


In [44]:
DOC.to_csv('{}/DOC.csv'.format(data_out))
LIB.to_csv('{}/LIB.csv'.format(data_out))
VOCAB.to_csv('{}/VOCAB.csv'.format(data_out))
TOKEN.to_csv('{}/TOKEN.csv'.format(data_out))