## Set Up

## Import libraries

In [56]:
import pandas as pd


### Import Config

In [57]:
import configparser

# Machine-specific directories are kept in an INI file outside the notebook.
config_path = "../../../env.ini"
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files it
# actually parsed, so an empty result means the settings below would be missing.
if not config.read(config_path):
    raise FileNotFoundError(f"Could not read config file: {config_path}")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

In [58]:
data_home, output_dir

('/Users/jamessiegener/MSDS/DS5001/data',
 '/Users/jamessiegener/MSDS/DS5001/output')

In [59]:
# Input: the Project Gutenberg plain-text file (pg161) parsed by the cells below.
text_file = f"{data_home}/gutenberg/pg161.txt"
# Output: destination of the combined token table written at the end of the notebook.
csv_file = f"{output_dir}/austen-combo-TOKENS.csv"

In [60]:
# Index level names used for the text tables, ordered from the coarsest unit (chapter) to the finest (token).
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

## Import file into a dataframe

In [61]:
# Read the raw text, one row per physical line. 'utf-8-sig' strips a leading BOM if present.
# The context manager closes the file handle once the lines are read.
with open(text_file, 'r', encoding='utf-8-sig') as src:
    LINES = pd.DataFrame(src.readlines(), columns=['line_str'])
LINES.index.name = 'line_num'
# Replace line endings with a space and trim, so blank lines become empty strings.
LINES.line_str = LINES.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()
LINES.sample(20)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
40,
6082,"""Exert yourself, dear Marianne,"" she cried, ""i..."
3867,too.-- I assure you it was a great compliment ...
4064,
72,"property, could be but small. Their mother ha..."
7530,people of common prudence will do THAT; and wh...
754,
11312,"brought, by their united request, to consider ..."
11107,her own.
5523,"Colonel Brandon, who had a general invitation ..."


## Extract Title

In [62]:
# Row 0 holds the Gutenberg banner; removing the fixed prefix leaves the title text.
title = LINES.at[0, 'line_str'].replace('The Project Gutenberg EBook of ', '')
title

'Sense and Sensibility, by Jane Austen'

## Clip the Cruft

In [63]:
# Regexes for the marker lines that bracket the book text:
# element 0 matches "*** START OF THE/THIS PROJECT ...", element 1 matches "*** END OF THE/THIS PROJECT ...".
clip_pats = [
    r"\*\*\*\s*START OF (?:THE|THIS) PROJECT",
    r"\*\*\*\s*END OF (?:THE|THIS) PROJECT"
]

In [64]:
# Boolean masks over LINES: True where a line begins with the START / END marker respectively.
pat_a, pat_b = (LINES.line_str.str.match(marker) for marker in clip_pats)

In [65]:
# line_a: first line after the START marker; line_b: last line before the END marker.
# NOTE(review): the "End of the Project Gutenberg EBook of ..." line sits above the
# "*** END" marker, so it survives this clip and ends up in the last chapter.
line_a = LINES.index[pat_a.to_numpy()][0] + 1
line_b = LINES.index[pat_b.to_numpy()][0] - 1

In [66]:
line_a, line_b

(20, 12666)

In [67]:
# Keep only the rows between the two markers (.loc label slicing includes both end points).
# Take an explicit copy: a chap_num column is assigned into this frame later, and writing
# into an un-copied slice of the raw table triggers pandas' SettingWithCopyWarning.
LINES = LINES.loc[line_a: line_b].copy()

In [68]:
LINES.head(10)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,


In [69]:
LINES.tail(10)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
12657,
12658,
12659,
12660,
12661,
12662,
12663,
12664,
12665,End of the Project Gutenberg EBook of Sense an...
12666,


## Chunk by Chapter

### Find all chapter headers

In [70]:
# A chapter heading: optional leading whitespace, the word "chapter" or "letter", whitespace, then digits.
chap_pat = r"^\s*(?:chapter|letter)\s+\d+"

In [71]:
# Boolean mask over LINES marking heading lines; case=False lets the lowercase pattern match "CHAPTER 1".
chap_lines = LINES.line_str.str.match(chap_pat, case=False)

In [72]:
LINES.loc[chap_lines] 

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
42,CHAPTER 1
196,CHAPTER 2
399,CHAPTER 3
561,CHAPTER 4
756,CHAPTER 5
858,CHAPTER 6
986,CHAPTER 7
1112,CHAPTER 8
1244,CHAPTER 9
1448,CHAPTER 10


### Assign numbers to chapters

In [73]:
# Number the heading rows 1..N in document order; all other rows stay NaN for now.
LINES.loc[chap_lines, 'chap_num'] = list(range(1, int(chap_lines.sum()) + 1))

In [74]:
LINES.loc[chap_lines]

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
42,CHAPTER 1,1.0
196,CHAPTER 2,2.0
399,CHAPTER 3,3.0
561,CHAPTER 4,4.0
756,CHAPTER 5,5.0
858,CHAPTER 6,6.0
986,CHAPTER 7,7.0
1112,CHAPTER 8,8.0
1244,CHAPTER 9,9.0
1448,CHAPTER 10,10.0


In [75]:
LINES.sample(10)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
3003,"""About eighteen hundred or two thousand a year...",
4780,"""Offended me! How could you suppose so? Believ...",
9696,"himself, he thinks that nobody else can marry ...",
4930,"more for the happiness of both of you,' I shou...",
7025,she could not bring herself to speak of what s...,
10597,me before she ought to have done it. But she ...,
10599,,
597,the world of his goodness and sense. I think ...,
3995,"whatever her ladyship was doing, if she happen...",
9089,,


In [76]:
# Carry each heading's number down to the lines beneath it, until the next heading.
LINES['chap_num'] = LINES['chap_num'].ffill()

In [77]:
LINES.sample(10)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
9568,,40.0
5900,"longer than while she spoke, and was immediate...",29.0
12533,marrying privately without his mother's consen...,50.0
8828,"you have a regard for, Mrs. Jennings. We all ...",37.0
7994,it was so. They all looked exceedingly foolis...,35.0
8740,"affair, and bring them news of his wife.",37.0
10312,"servant with a message to Mr. Harris, and an o...",43.0
11450,"expose me to""--",46.0
92,"all the value of all the attention which, for ...",1.0
3619,,19.0


In [78]:
LINES.head(20)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
20,,
21,,
22,,
23,,
24,,
25,,
26,,
27,,
28,,
29,,


In [79]:
# 1) drop the front matter (rows before the first heading have no chapter number),
# 2) drop the heading rows themselves,
# 3) store the chapter number as an integer now that no NaN remains.
LINES = (
    LINES
    .dropna(subset=['chap_num'])
    .loc[~chap_lines]
    .astype({'chap_num': 'int'})
)

In [80]:
LINES.sample(10)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
9894,"left to improve her acquaintance with Robert, ...",41
9722,"own all their obligation to her, and openly de...",41
11861,place in which so much conspired to give her a...,48
12177,,49
8906,"consolation, beyond the consciousness of doing...",38
10240,"attended her every day, still talked boldly of...",43
5933,"Willoughby, which appeared to her a very good ...",29
3283,alone--and tomorrow you must absolutely dine w...,18
4937,"much for an indifferent person.""",24
8050,should be checked by Lucy's unwelcome presence...,35


In [81]:
# Rebuild one string per chapter by joining its lines with newlines;
# blank source lines therefore show up as runs of consecutive newlines.
CHAPS = (
    LINES
    .groupby(OHCO[:1])['line_str']
    .apply(lambda chapter_lines: '\n'.join(chapter_lines))
    .to_frame('chap_str')
)

In [82]:
CHAPS.head(10)

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,\n\nThe family of Dashwood had long been settl...
2,\n\nMrs. John Dashwood now installed herself m...
3,\n\nMrs. Dashwood remained at Norland several ...
4,"\n\n""What a pity it is, Elinor,"" said Marianne..."
5,"\n\nNo sooner was her answer dispatched, than ..."
6,\n\nThe first part of their journey was perfor...
7,\n\nBarton Park was about half a mile from the...
8,\n\nMrs. Jennings was a widow with an ample jo...
9,\n\nThe Dashwoods were now settled at Barton w...
10,"\n\nMarianne's preserver, as Margaret, with mo..."


In [83]:
# Trim the leading/trailing newlines left over from the blank lines around each heading.
CHAPS = CHAPS.assign(chap_str=CHAPS['chap_str'].str.strip())

In [84]:
CHAPS

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,The family of Dashwood had long been settled i...
2,Mrs. John Dashwood now installed herself mistr...
3,Mrs. Dashwood remained at Norland several mont...
4,"""What a pity it is, Elinor,"" said Marianne, ""t..."
5,"No sooner was her answer dispatched, than Mrs...."
6,The first part of their journey was performed ...
7,Barton Park was about half a mile from the cot...
8,Mrs. Jennings was a widow with an ample jointu...
9,The Dashwoods were now settled at Barton with ...
10,"Marianne's preserver, as Margaret, with more e..."


## Split chapters into paragraphs

In [85]:
# Paragraph boundary: two or more consecutive newlines.
para_pat = r'\n\n+'

In [86]:
# Split every chapter into paragraphs. split(expand=True) yields one column per paragraph
# position and pads shorter chapters with missing values; stack() turns the columns into a
# second index level. The explicit dropna() removes the padding even on pandas versions
# whose stack() keeps missing values, and is a no-op where stack() already drops them.
PARAS = (
    CHAPS['chap_str']
    .str.split(para_pat, expand=True)
    .stack()
    .dropna()
    .to_frame('para_str')
    .sort_index()
)
PARAS.index.names = OHCO[:2]

In [87]:
PARAS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,The family of Dashwood had long been settled i...
1,1,"By a former marriage, Mr. Henry Dashwood had o..."
1,2,"The old gentleman died: his will was read, and..."
1,3,"Mr. Dashwood's disappointment was, at first, s..."
1,4,His son was sent for as soon as his danger was...


In [88]:
# Unwrap hard line breaks inside each paragraph and trim the result ...
PARAS['para_str'] = PARAS['para_str'].str.replace(r'\n', ' ', regex=True).str.strip()
# ... then discard paragraphs that are empty or whitespace-only.
PARAS = PARAS.loc[~PARAS['para_str'].str.match(r'^\s*$')]

In [89]:
PARAS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,The family of Dashwood had long been settled i...
1,1,"By a former marriage, Mr. Henry Dashwood had o..."
1,2,"The old gentleman died: his will was read, and..."
1,3,"Mr. Dashwood's disappointment was, at first, s..."
1,4,His son was sent for as soon as his danger was...


## Split paragraphs into sentences


In [90]:
# Sentence boundary: any run of . ? ! ; or :
# NOTE(review): this also splits after abbreviations, e.g. "Mr." and "Mrs." end up as
# separate fragments ("The constant attention of Mr" / "and Mrs").
sent_pat = r'[.?!;:]+'
# One row per sentence. split(expand=True) pads shorter paragraphs with missing values;
# dropna() removes that padding even where stack() keeps it (no-op where it does not).
SENTS = (
    PARAS['para_str']
    .str.split(sent_pat, expand=True)
    .stack()
    .dropna()
    .to_frame('sent_str')
)
SENTS.index.names = OHCO[:3]

In [91]:
# Discard empty / whitespace-only sentence fragments, then trim the ones that remain.
SENTS = (
    SENTS
    .loc[~SENTS['sent_str'].str.match(r'^\s*$')]
    .assign(sent_str=lambda kept: kept['sent_str'].str.strip())
)

In [92]:
SENTS.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,0,0,The family of Dashwood had long been settled i...
1,0,1,"Their estate was large, and their residence wa..."
1,0,2,The late owner of this estate was a single man...
1,0,3,"But her death, which happened ten years before..."
1,0,4,"for to supply her loss, he invited and receive..."
1,0,5,"Henry Dashwood, the legal inheritor of the Nor..."
1,0,6,"In the society of his nephew and niece, and th..."
1,0,7,His attachment to them all increased
1,0,8,The constant attention of Mr
1,0,9,and Mrs


In [93]:
SENTS.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
41,19,5,Ferrars can never forget that Edward is her son
13,10,1,"It came from town, and is merely a letter of b..."
28,16,4,"Absence might have weakened his regard, and co..."
33,40,1,Jennings
37,20,1,""""
27,11,1,he came to look at Marianne and talk to Elinor...
22,34,2,"She returned it almost instantly, acknowledgin..."
37,26,3,"Now, I can think and speak of it with little e..."
40,50,1,""" cried Mrs"
29,68,4,""""


## Split sentences into tokens

In [113]:
# Token boundary: any run of whitespace, apostrophes, commas or hyphens.
token_pat = r"[\s',-]+"
# One row per token. split(expand=True) pads shorter sentences with missing values;
# dropna() guarantees the padding never reaches the token table (where it would be counted
# as extra rows), including on pandas versions whose stack() keeps missing values.
TOKENS_ss = (
    SENTS['sent_str']
    .str.split(token_pat, expand=True)
    .stack()
    .dropna()
    .to_frame('token_str')
)
TOKENS_ss

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0,0,The
1,0,0,1,family
1,0,0,2,of
1,0,0,3,Dashwood
1,0,0,4,had
...,...,...,...,...
50,22,0,8,and
50,22,0,9,Sensibility
50,22,0,10,by
50,22,0,11,Jane


In [95]:
# Name all four index levels, including the token position level that stack() left unnamed.
TOKENS_ss.index = TOKENS_ss.index.set_names(OHCO[:4])

In [96]:
TOKENS_ss

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,0,0,0,The
1,0,0,1,family
1,0,0,2,of
1,0,0,3,Dashwood
1,0,0,4,had
...,...,...,...,...
50,22,0,8,and
50,22,0,9,Sensibility
50,22,0,10,by
50,22,0,11,Jane


## Import Persuasion and Combine 

In [97]:
# Load the already-tokenized Persuasion table and rebuild its four-level OHCO index from the CSV columns.
# NOTE(review): read_csv's default NA handling turns empty fields and strings such as "NA" or "null"
# into NaN, so some token_str values may load as missing — TODO confirm whether that is intended.
TOKENS_p = pd.read_csv(f"{data_home}/austen-persuasion.csv", index_col=OHCO)
TOKENS_p

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str,term_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,0,0,Sir,sir
1,0,0,1,Walter,walter
1,0,0,2,Elliot,elliot
1,0,0,3,of,of
1,0,0,4,Kellynch,kellynch
...,...,...,...,...,...
24,13,0,6,of,of
24,13,0,7,Persuasion,persuasion
24,13,0,8,by,by
24,13,0,9,Jane,jane


In [98]:
# Stack both token tables and prepend a 'book' index level:
# key 0 = Sense and Sensibility, key 1 = Persuasion.
TOKENS = pd.concat(
    [TOKENS_ss, TOKENS_p],
    keys=[0, 1],
    names=['book'],
    axis=0,
)

###  1. How many raw tokens are in the combined data frame?

In [99]:
# (rows, columns) of the combined table; the first element is the number of token rows.
# NOTE(review): this counts every row, including any whose token_str is NaN, whereas the
# per-chapter and per-paragraph averages further down use count(), which skips NaN —
# confirm that both answers use the same definition of a token.
TOKENS.shape

(207896, 2)

There are 207,896 raw tokens in the combined data frame.

### 2. How many distinct terms are there in the combined data frame (i.e. how big is the vocabulary)?

### Extract Vocabulary

In [100]:
# Normalise each token into a term: delete every run of non-word characters/underscores, then lowercase.
# NOTE(review): a token made only of punctuation (e.g. a lone quotation mark) becomes the
# empty string, which value_counts() then counts as a term — confirm that is intended.
TOKENS['term_str'] = TOKENS['token_str'].replace(r'[\W_]+', '', regex=True).str.lower()
# One row per distinct term with its frequency n, most frequent first; the row position is the term_id.
VOCAB = (
    TOKENS['term_str']
    .value_counts()
    .to_frame('n')
    .reset_index()
    .rename(columns={'index': 'term_str'})
    .rename_axis('term_id')
)

In [101]:
VOCAB

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,the,7435
1,to,6923
2,and,6290
3,of,6146
4,her,3747
...,...,...
8234,unconquerable,1
8235,outgrown,1
8236,prosperously,1
8237,nominal,1


In [102]:
VOCAB.shape

(8239, 2)

There are 8,239 distinct terms in the combined data frame.

### 3. How many more terms does the vocabulary of Sense and Sensibility have than that of Persuasion?

In [103]:
# Vocabulary of Sense and Sensibility alone, built the same way as the combined vocabulary:
# strip non-word characters/underscores from each token, lowercase, then count each term.
TOKENS_ss['term_str'] = TOKENS_ss['token_str'].replace(r'[\W_]+', '', regex=True).str.lower()
VOCAB_ss = (
    TOKENS_ss['term_str']
    .value_counts()
    .to_frame('n')
    .reset_index()
    .rename(columns={'index': 'term_str'})
    .rename_axis('term_id')
)
VOCAB_ss

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,to,4115
1,the,4105
2,of,3574
3,and,3490
4,her,2543
...,...,...
6275,prefer,1
6276,dissolving,1
6277,beset,1
6278,effectually,1


In [104]:
# Vocabulary of Persuasion alone. term_str is recomputed from token_str (overwriting the
# column loaded from the CSV) so both books are normalised by exactly the same rule.
TOKENS_p['term_str'] = TOKENS_p['token_str'].replace(r'[\W_]+', '', regex=True).str.lower()
VOCAB_p = (
    TOKENS_p['term_str']
    .value_counts()
    .to_frame('n')
    .reset_index()
    .rename(columns={'index': 'term_str'})
    .rename_axis('term_id')
)
VOCAB_p

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,the,3330
1,to,2808
2,and,2800
3,of,2572
4,a,1595
...,...,...
5755,reins,1
5756,judiciously,1
5757,rut,1
5758,dung,1


In [105]:
# Difference in vocabulary size (distinct terms): Sense and Sensibility minus Persuasion.
len(VOCAB_ss) - len(VOCAB_p)

520

Sense and Sensibility has 520 more unique terms than Persuasion.

### 4. What is the average number of tokens, rounded to an integer, per chapter in the corpus?

In [106]:
def gather(ohco_level, tokens=None, ohco=None):
    """Re-assemble token strings into one string per unit at the given OHCO depth.

    Parameters
    ----------
    ohco_level : int
        How many OHCO levels to group by (1 = chapter, 2 = paragraph, ...).
    tokens : pandas.DataFrame, optional
        Token table with a ``token_str`` column and the OHCO levels in its index.
        Defaults to the notebook-level ``TOKENS``.
    ohco : list of str, optional
        Ordered OHCO level names. Defaults to the notebook-level ``OHCO``.

    Returns
    -------
    pandas.DataFrame
        One row per group with a single column named ``<level>_str``
        (e.g. ``chap_str``) holding the group's tokens joined by single spaces.
    """
    # Reading a module-level name needs no `global` statement; the fall-backs keep
    # the original one-argument call working unchanged.
    if tokens is None:
        tokens = TOKENS
    if ohco is None:
        ohco = OHCO
    level_name = ohco[ohco_level - 1].split('_')[0]
    # Index levels outside the OHCO (such as 'book' in the combined table) must be part
    # of the grouping key; otherwise chapter 1 of one book is merged with chapter 1 of another.
    extra_levels = [
        name for name in tokens.index.names
        if name is not None and name not in ohco
    ]
    group_keys = extra_levels + list(ohco[:ohco_level])
    return (
        tokens.groupby(group_keys)
        .token_str.apply(lambda x: x.str.cat(sep=' '))
        .to_frame(f"{level_name}_str")
    )

In [107]:
# Non-null values per (book, chapter) for each column, averaged over all chapters of both books.
# NOTE(review): count() skips NaN. 2807.918919 x 74 chapters (50 + 24, assuming every chapter
# number is present) is about 207,786, i.e. 110 fewer than the 207,896 rows reported by
# TOKENS.shape above; a row-based mean would be about 2809.4. Confirm which definition of
# "token" the answers should use.
TOKENS.groupby(['book','chap_num']).count().mean()

token_str    2807.918919
term_str     2807.918919
dtype: float64

The average number of tokens per chapter in the corpus is 2,808.

### 5. What is the average number of tokens, rounded to an integer, per paragraph in the corpus?

In [108]:
# Non-null values per (book, chapter, paragraph) for each column, averaged over all paragraphs
# of both books; count() skips NaN values.
TOKENS.groupby(['book','chap_num','para_num']).count().mean()

token_str    73.709117
term_str     73.709117
dtype: float64

The average number of tokens per paragraph in the corpus is 74.

In [109]:
# Persist the combined two-book token table; the index levels are written as the leading columns.
TOKENS.to_csv(csv_file)

In [110]:
# Persist the Sense and Sensibility tokens on their own, in the same output directory as the combined file.
TOKENS_ss.to_csv(f"{output_dir}/austen-sense-and-sensibility.csv")