Author: Pavan Kumar Bondalapati<br>
Email: pb7ak@virginia.edu<br>
Class: DS 5001<br>
Date: May 11, 2021<br>

In [1]:
# Code is adapted from Raf Alvarado
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk

In [2]:
# Downloaded from Gutenberg
books1 = "data_in/21765-0.txt" 
books2 = "data_in/26073-0.txt"
OHCO = ['book_num', 'fable_num', 'para_num', 'sent_num', 'token_num']

In [3]:
def prepare(file, end, book_id, b=0):
        
    # Import file into a dataframe
    df = open(file, 'r', encoding='utf-8-sig').readlines()
    df = pd.DataFrame(df, columns=['line_str'])
    df.index.name = 'line_num'
    df.line_str = df.line_str.str.strip()
    
    title = df.loc[0].line_str.replace('Project Gutenberg\'s ', '')
    df['book_title'] = title
    
    # Remove Gutenberg's front and back matter
    a = df.line_str.str.match(r"\*\*\*\s*START OF (THE|THIS) PROJECT")
    an = df.loc[a].index[0]
    df = df.loc[an + 1 : end] # Hard coding back matter removal
    
    # Assign numbers to books
    book_lines = df.line_str.str.match(r"^BOOK THE \w+.$", case=True)
    book_nums = [b+i+1 for i in range(df.loc[book_lines].shape[0])]
    df.loc[book_lines, 'book_num'] = book_nums
    df.book_num = df.book_num.ffill()
    
    df = df.loc[~df.book_num.isna()] # Remove book heading lines
    df = df.loc[~book_lines] # Remove everything before Book 1
    df.book_num = df.book_num.astype('int') # Convert book_num from float to int
    
    # Assign numbers to fables
    fable_lines = df.line_str.str.contains(r"FABLE", case=True)
    book_nums, fable_count = np.unique(df.book_num[fable_lines].values, 
                                       return_counts=True)
    fable_nums = [i+1 for b, f in zip(book_nums, fable_count) 
                  for i in range(f)]
    df.loc[fable_lines, 'fable_num'] = fable_nums
    df.fable_num = df.fable_num.ffill()
    
    df = df.loc[~df.fable_num.isna()] # Remove fable heading lines
    df = df.loc[~fable_lines] # Remove everything before Fable 1
    df.fable_num = df.fable_num.astype('int') # Convert fable_num from float to int
    
    # Create LIB table
    book_title, author = title.split(', by ')
    LIB = pd.DataFrame({'book_num':book_nums})
    LIB['book_title'] = title
    LIB['book_id'] = book_id
    LIB['book_file'] = './'+file
    LIB['author'] = author
    LIB['title'] = book_title
    
    return df, LIB

In [4]:
df1, LIB1 = prepare(books1, end=12550, book_id=21765)
df2, LIB2 = prepare(books2, end=13832, book_id=26073, b=7)
LIB = pd.concat([LIB1, LIB2])
LIB = LIB.set_index(['book_num'])
df = pd.concat([df1, df2])
df

Unnamed: 0_level_0,line_str,book_title,book_num,fable_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
528,,"The Metamorphoses of Ovid, by Publius Ovidius ...",1,1
529,God reduces Chaos into order. He separates the...,"The Metamorphoses of Ovid, by Publius Ovidius ...",1,1
530,"disposes the several bodies, of which the univ...","The Metamorphoses of Ovid, by Publius Ovidius ...",1,1
531,their proper situations.,"The Metamorphoses of Ovid, by Publius Ovidius ...",1,1
532,,"The Metamorphoses of Ovid, by Publius Ovidius ...",1,1
...,...,...,...,...
13828,"vows. He adds, that he was made a Divinity by ...","The Metamorphoses of Ovid, by Publius Ovidius ...",15,5
13829,he does not say at what time.,"The Metamorphoses of Ovid, by Publius Ovidius ...",15,5
13830,,"The Metamorphoses of Ovid, by Publius Ovidius ...",15,5
13831,,"The Metamorphoses of Ovid, by Publius Ovidius ...",15,5


In [5]:
LIB

Unnamed: 0_level_0,book_title,book_id,book_file,author,title
book_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"The Metamorphoses of Ovid, by Publius Ovidius ...",21765,./data_in/21765-0.txt,Publius Ovidius Naso,The Metamorphoses of Ovid
2,"The Metamorphoses of Ovid, by Publius Ovidius ...",21765,./data_in/21765-0.txt,Publius Ovidius Naso,The Metamorphoses of Ovid
3,"The Metamorphoses of Ovid, by Publius Ovidius ...",21765,./data_in/21765-0.txt,Publius Ovidius Naso,The Metamorphoses of Ovid
4,"The Metamorphoses of Ovid, by Publius Ovidius ...",21765,./data_in/21765-0.txt,Publius Ovidius Naso,The Metamorphoses of Ovid
5,"The Metamorphoses of Ovid, by Publius Ovidius ...",21765,./data_in/21765-0.txt,Publius Ovidius Naso,The Metamorphoses of Ovid
6,"The Metamorphoses of Ovid, by Publius Ovidius ...",21765,./data_in/21765-0.txt,Publius Ovidius Naso,The Metamorphoses of Ovid
7,"The Metamorphoses of Ovid, by Publius Ovidius ...",21765,./data_in/21765-0.txt,Publius Ovidius Naso,The Metamorphoses of Ovid
8,"The Metamorphoses of Ovid, by Publius Ovidius ...",26073,./data_in/26073-0.txt,Publius Ovidius Naso,The Metamorphoses of Ovid
9,"The Metamorphoses of Ovid, by Publius Ovidius ...",26073,./data_in/26073-0.txt,Publius Ovidius Naso,The Metamorphoses of Ovid
10,"The Metamorphoses of Ovid, by Publius Ovidius ...",26073,./data_in/26073-0.txt,Publius Ovidius Naso,The Metamorphoses of Ovid


In [6]:
# Group lines by fable number
dff = df.groupby(OHCO[:2]).line_str.apply(lambda x: '\n'.join(x))\
    .to_frame() # Make big string

In [7]:
# Split into paragraphs
dfp = dff['line_str'].str.split(r'\n\n+', expand=True).stack()\
    .to_frame().rename(columns={0:'para_str'})
dfp.index.names = OHCO[:3]
dfp['para_str'] = dfp['para_str'].str.replace(r'\n', ' ', regex=True).str.strip()
dfp = dfp[~dfp['para_str'].str.match(r'^\s*$')] # Remove empty paragraphs

In [8]:
# Remove footnote labels
footnote_regex = r'^\[Footnote \d+: _.*--(Ver)?(\.)?\s?\d*([-\d]*)\.'
dfp['para_str'] = dfp['para_str'].str.replace(footnote_regex, '', regex=True).str.strip()
dfp

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,para_str
book_num,fable_num,para_num,Unnamed: 3_level_1
1,1,0,God reduces Chaos into order. He separates the...
1,1,1,"At first, the sea, the earth, and the heaven, ..."
1,1,2,To this discord God and bounteous Nature[8] pu...
1,1,3,This is very similar to the words of the Scrip...
1,1,4,"Titan. The Sun is so called, on account of his..."
...,...,...,...
15,5,18,EXPLANATION.
15,5,19,"The Poet having fulfilled his promise, and hav..."
15,5,20,"The Romans, who deduced their origin from Ænea..."
15,5,21,The sorrow of the Gods and of nature at the un...


In [9]:
# Split into sentences
dfs = dfp['para_str'].str.split(r'[.?!;:"“”]+', expand=True).stack()\
    .to_frame().rename(columns={0:'sent_str'})
dfs.index.names = OHCO[:4]
dfs = dfs[~dfs['sent_str'].str.match(r'^\s*$')] # Remove empty sentences
dfs['sent_str'] = dfs['sent_str'].str.replace(r'[{}]', '', regex=True) # Remove intext braces

dfs['sent_str'] = dfs['sent_str'].str.replace(r'\[\d+\]', '', regex=True) # Remove footnote indexes
dfs['sent_str'] = dfs['sent_str'].str.replace(r'[\[\]]', '', regex=True) # Remove intext brackets
dfs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sent_str
book_num,fable_num,para_num,sent_num,Unnamed: 4_level_1
1,1,0,0,God reduces Chaos into order
1,1,0,1,"He separates the four elements, and disposes ..."
1,1,1,0,"At first, the sea, the earth, and the heaven, ..."
1,1,1,1,"a rude and undigested mass, and nothing more ..."
1,1,1,2,No Sun as yet gave light to the world
...,...,...,...,...
15,5,21,6,"but the priests would not permit it, and had ..."
15,5,21,7,"Dio Cassius says, that the Roman people raise..."
15,5,21,8,"Suetonius says, that a pillar was also erecte..."
15,5,21,9,"He adds, that he was made a Divinity by a pub..."


In [10]:
dfs.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sent_str
book_num,fable_num,para_num,sent_num,Unnamed: 4_level_1
3,8,34,2,The first was the son of Jupiter and Proserpina
1,2,18,1,
15,4,4,2,It is sensible of the weight of the God
7,7,6,1,Dost thou inquire what was the fortune of the ...
2,1,7,21,"or if thou hast a mind capable of change, mak..."
4,1,7,20,"and imprinting kisses on his cold features, s..."
3,7,2,0,"While he is drinking, being attracted with the..."
8,4,6,4,Their very multitude is a hindrance to those ...
3,8,4,28,"This Libys, this the yellow-haired Melanthus,..."
8,1,4,12,Would that the Gods would grant I might be wi...


In [11]:
%%time
df = dfs.sent_str.apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x)), dtype=object))\
    .stack().to_frame().rename(columns={0:'pos_tuple'})
df['pos'] = df.pos_tuple.apply(lambda x: x[1])
df['token_str'] = df.pos_tuple.apply(lambda x: x[0])
df.index.names = OHCO

Wall time: 18.9 s


In [12]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str
book_num,fable_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,0,0,0,"(God, NNP)",NNP,God
1,1,0,0,1,"(reduces, VBZ)",VBZ,reduces
1,1,0,0,2,"(Chaos, NNP)",NNP,Chaos
1,1,0,0,3,"(into, IN)",IN,into
1,1,0,0,4,"(order, NN)",NN,order
...,...,...,...,...,...,...,...
15,5,21,9,19,"(at, IN)",IN,at
15,5,21,9,20,"(what, WP)",WP,what
15,5,21,9,21,"(time, NN)",NN,time
15,5,22,0,0,"(THE, DT)",DT,THE


In [13]:
# Save TOKEN table
df.to_csv('data_out/TOKEN.csv')
LIB.to_csv('data_out/LIB.csv')

In [14]:
# Extract vocabulary from TOKEN table
TOKEN = pd.read_csv('data_out/TOKEN.csv', index_col=OHCO)
TOKEN['term_str'] = TOKEN['token_str'].str.lower().str.replace('[\W_]', '', regex=True)
VOCAB = TOKEN.term_str.value_counts().to_frame()\
    .rename(columns={'index':'term_str', 'term_str':'n'})\
    .sort_index().reset_index().rename(columns={'index':'term_str'})
VOCAB.index.name = 'term_id'
VOCAB['num'] = VOCAB.term_str.str.match("\d+").astype('int')
VOCAB

Unnamed: 0_level_0,term_str,n,num
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,31063,0
1,1,2,1
2,1000,1,1
3,109,1,1
4,11,3,1
...,...,...,...
14907,ὤπς,1,0
14908,ὦ,1,0
14909,ὦτα,1,0
14910,ὦτον,1,0


In [15]:
# Annotate VOCAB table
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1
VOCAB['stop'] = VOCAB.term_str.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

In [16]:
# Add Stems
from nltk.stem.porter import PorterStemmer
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.term_str.apply(stemmer1.stem)

from nltk.stem.snowball import SnowballStemmer
stemmer2 = SnowballStemmer("english")
VOCAB['stem_snowball'] = VOCAB.term_str.apply(stemmer2.stem)

from nltk.stem.lancaster import LancasterStemmer
stemmer3 = LancasterStemmer()
VOCAB['stem_lancaster'] = VOCAB.term_str.apply(stemmer3.stem)

In [17]:
VOCAB.sample(10)

Unnamed: 0_level_0,term_str,n,num,stop,stem_porter,stem_snowball,stem_lancaster
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2823,conscious,10,0,0,consciou,conscious,conscy
5101,fed,5,0,0,fed,fed,fed
11648,shaped,1,0,0,shape,shape,shap
9754,platonic,1,0,0,platon,platon,platon
6720,incontinence,1,0,0,incontin,incontin,incontin
3535,demisso,1,0,0,demisso,demisso,demisso
14027,vermin,1,0,0,vermin,vermin,vermin
14895,ὁ,1,0,0,ὁ,ὁ,ὁ
13659,unaccustomed,2,0,0,unaccustom,unaccustom,unaccustom
1231,avows,1,0,0,avow,avow,avow


In [18]:
# Add pos_max
M = TOKEN.groupby(['term_str','pos']).pos.count().unstack(fill_value=0)
VOCAB = VOCAB.reset_index().set_index('term_str')
VOCAB['pos_max'] = M.idxmax(1)
VOCAB = VOCAB.reset_index().set_index('term_id')

In [19]:
# Save TOKEN, VOCAB tables
TOKEN.to_csv('data_out/TOKEN.csv')
VOCAB.to_csv('data_out/VOCAB.csv')