In [1]:
import pandas as pd
import numpy as np

In [2]:
OHCO = ['book','chap_num', 'para_num', 'sent_num', 'token_num', 'term_str']

In [3]:
# data_home = "/Users/omertoker/DS 5001"
# text_file = f"{data_home}/Module 2/pg161.txt"
# csv_file = f"{data_home}/Module 2/ds5001-omer.csv"

In [4]:
text_file = "../data/gutenberg/pg161.txt"
csv_file = "omer.csv"

Import File Into a DataFrame

In [5]:
lines = pd.DataFrame(open(text_file, 'r', encoding = 'utf-8-sig').readlines(), columns = ['line_str'])
lines.index.name = 'line_num'
lines.line_str = lines.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()

In [6]:
lines.sample(20)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
8346,"down at any time, and collect a few friends ab..."
7338,attentive.
6296,
7180,
279,about three thousand pounds on their mother's ...
3414,"every thing."""
4967,for the sake of seeing Edward. He will be the...
7332,to be her brother.
3119,"""My judgment,"" he returned, ""is all on your si..."
10136,entirely escaped the latter lady's observation...


Extract Title

In [7]:
title = lines.loc[0].line_str.replace('The Project Gutenberg EBook of ', '')

In [8]:
print(title)

﻿Sense and Sensibility, by Jane Austen


Clip Cruft

In [9]:
clip_pats = [
    r"\*\*\*\s*START OF (?:THE|THIS) PROJECT", 
    r"\*\*\*\s*END OF (?:THE|THIS) PROJECT"
]

In [10]:
pat_a = lines.line_str.str.match(clip_pats[0])
pat_b = lines.line_str.str.match(clip_pats[1])

In [11]:
line_a = lines.loc[pat_a].index[0] + 1
line_b = lines.loc[pat_b].index[0] - 1

In [12]:
line_a, line_b

(19, 12667)

In [13]:
lines = lines.loc[line_a : line_b]

In [14]:
lines.head(10)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
19,
20,Special thanks are due to Sharon Partridge for...
21,proofreading and correction of this etext.
22,
23,
24,
25,
26,
27,
28,


In [15]:
lines.tail(10)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
12658,
12659,
12660,
12661,
12662,
12663,
12664,
12665,
12666,End of the Project Gutenberg EBook of Sense an...
12667,


In [16]:
lines

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
19,
20,Special thanks are due to Sharon Partridge for...
21,proofreading and correction of this etext.
22,
23,
...,...
12663,
12664,
12665,
12666,End of the Project Gutenberg EBook of Sense an...


Chunk by Chapter

Finding All Chapter Headers

In [17]:
chap_pat = r"^\s*(?:chapter|letter)\s+\d+"

In [18]:
chap_lines = lines.line_str.str.match(chap_pat, case = False)

In [19]:
lines.loc[chap_lines]

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
42,CHAPTER 1
196,CHAPTER 2
399,CHAPTER 3
562,CHAPTER 4
757,CHAPTER 5
859,CHAPTER 6
987,CHAPTER 7
1113,CHAPTER 8
1245,CHAPTER 9
1449,CHAPTER 10


Assigning Numbers to Chapters

In [20]:
lines.loc[chap_lines, 'chap_num'] = [i+1 for i in range(lines.loc[chap_lines].shape[0])]

In [21]:
lines.loc[chap_lines]

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
42,CHAPTER 1,1.0
196,CHAPTER 2,2.0
399,CHAPTER 3,3.0
562,CHAPTER 4,4.0
757,CHAPTER 5,5.0
859,CHAPTER 6,6.0
987,CHAPTER 7,7.0
1113,CHAPTER 8,8.0
1245,CHAPTER 9,9.0
1449,CHAPTER 10,10.0


In [22]:
lines.sample(10)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
12011,of the happiest of men. His situation indeed ...,
4654,,
853,"of the pleasure or the regret you occasion, an...",
8056,"her eyes were filled with tears as she spoke, ...",
8414,"Fanny, rejoicing in her escape, and proud of t...",
10234,She knew not that she had been the means of se...,
9493,"""And so YOU are forced to do it. Well THAT is...",
8925,"still fancied present exertion impossible, and...",
8067,"The sight of you, Edward, is the only comfort ...",
9230,"When she told Marianne what she had done, howe...",


Forward-Fill Chapter Numbers to Following Text Lines

In [23]:
lines.chap_num = lines.chap_num.ffill()

In [24]:
lines.sample(10)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
8435,"Middleton, and made an entry into the close he...",36.0
4498,accidentally obtained; it might not have been ...,22.0
9125,"she felt assured that Lucy, for the sake of he...",38.0
3126,"""Marianne has not shyness to excuse any inatte...",17.0
5111,Marianne's countenance sunk.,25.0
711,,4.0
11394,borrow of Colonel Brandon. By reading only si...,46.0
10858,"If you CAN pity me, Miss Dashwood, pity my sit...",44.0
1948,"""Oh! pray, Miss Margaret, let us know all abou...",12.0
1949,"Jennings. ""What is the gentleman's name?""",12.0


In [25]:
lines.head(20)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
19,,
20,Special thanks are due to Sharon Partridge for...,
21,proofreading and correction of this etext.,
22,,
23,,
24,,
25,,
26,,
27,,
28,,


In [26]:
lines = lines.dropna(subset = ['chap_num'])
lines = lines.loc[~chap_lines]
lines.chap_num = lines.chap_num.astype('int')

In [27]:
lines.sample(10)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
11678,,47
4763,"safely, under the shelter of its noise, introd...",23
368,my engagement by such acts of assistance and k...,2
290,"while she lives, rather than for them--somethi...",2
111,family; but he was affected by a recommendatio...,1
6436,"""Dear ma'am, this kindness is quite unnecessar...",30
6999,"from her sister, he put an end to his visit, r...",31
8780,"make him put an end to the engagement, assiste...",37
419,"daughters' sake with satisfaction, though as f...",3
11718,"prohibited a subject, but conclude him to be s...",47


In [28]:
lines.head(20)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
43,,1
44,,1
45,The family of Dashwood had long been settled i...,1
46,"was large, and their residence was at Norland ...",1
47,"their property, where, for many generations, t...",1
48,respectable a manner as to engage the general ...,1
49,surrounding acquaintance. The late owner of t...,1
50,"man, who lived to a very advanced age, and who...",1
51,"life, had a constant companion and housekeeper...",1
52,"death, which happened ten years before his own...",1


Group Lines Into Chapters

In [29]:
OHCO[:1]

['book']

In [30]:
lines['book'] = "Sense and Sensibility"

In [31]:
chaps = lines.groupby(OHCO[:2]).line_str.apply(lambda x: '\n'.join(x)).to_frame('chap_str')

In [32]:
chaps.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,chap_str
book,chap_num,Unnamed: 2_level_1
Sense and Sensibility,1,\n\nThe family of Dashwood had long been settl...
Sense and Sensibility,2,\n\nMrs. John Dashwood now installed herself m...
Sense and Sensibility,3,\n\nMrs. Dashwood remained at Norland several ...
Sense and Sensibility,4,"\n\n""What a pity it is, Elinor,"" said Marianne..."
Sense and Sensibility,5,"\n\nNo sooner was her answer dispatched, than ..."
Sense and Sensibility,6,\n\nThe first part of their journey was perfor...
Sense and Sensibility,7,\n\nBarton Park was about half a mile from the...
Sense and Sensibility,8,\n\nMrs. Jennings was a widow with an ample jo...
Sense and Sensibility,9,\n\nThe Dashwoods were now settled at Barton w...
Sense and Sensibility,10,"\n\nMarianne's preserver, as Margaret, with mo..."


Chaps has all chapters grouped into a single row

In [33]:
chaps['chap_str'] = chaps.chap_str.str.strip() #Cleaning the whitespace in chaps

In [34]:
chaps.head(10) 

Unnamed: 0_level_0,Unnamed: 1_level_0,chap_str
book,chap_num,Unnamed: 2_level_1
Sense and Sensibility,1,The family of Dashwood had long been settled i...
Sense and Sensibility,2,Mrs. John Dashwood now installed herself mistr...
Sense and Sensibility,3,Mrs. Dashwood remained at Norland several mont...
Sense and Sensibility,4,"""What a pity it is, Elinor,"" said Marianne, ""t..."
Sense and Sensibility,5,"No sooner was her answer dispatched, than Mrs...."
Sense and Sensibility,6,The first part of their journey was performed ...
Sense and Sensibility,7,Barton Park was about half a mile from the cot...
Sense and Sensibility,8,Mrs. Jennings was a widow with an ample jointu...
Sense and Sensibility,9,The Dashwoods were now settled at Barton with ...
Sense and Sensibility,10,"Marianne's preserver, as Margaret, with more e..."


Split Chapters Into Parapraphs

In [35]:
para_pat = r'\n\n+'

In [36]:
paras = chaps['chap_str'].str.split(para_pat, expand=True).stack()\
    .to_frame('para_str').sort_index()
paras.index.names = OHCO[:3]

In [37]:
paras.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,para_str
book,chap_num,para_num,Unnamed: 3_level_1
Sense and Sensibility,1,0,The family of Dashwood had long been settled i...
Sense and Sensibility,1,1,"By a former marriage, Mr. Henry Dashwood had o..."
Sense and Sensibility,1,2,"The old gentleman died: his will was read, and..."
Sense and Sensibility,1,3,"Mr. Dashwood's disappointment was, at first, s..."
Sense and Sensibility,1,4,His son was sent for as soon as his danger was...


In [38]:
paras['para_str'] = paras['para_str'].str.replace(r'\n', ' ', regex=True)
paras['para_str'] = paras['para_str'].str.strip()
paras = paras[~paras['para_str'].str.match(r'^\s*$')]
#Removing empty paragraphs

In [39]:
paras.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,para_str
book,chap_num,para_num,Unnamed: 3_level_1
Sense and Sensibility,1,0,The family of Dashwood had long been settled i...
Sense and Sensibility,1,1,"By a former marriage, Mr. Henry Dashwood had o..."
Sense and Sensibility,1,2,"The old gentleman died: his will was read, and..."
Sense and Sensibility,1,3,"Mr. Dashwood's disappointment was, at first, s..."
Sense and Sensibility,1,4,His son was sent for as soon as his danger was...


In [40]:
paras

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,para_str
book,chap_num,para_num,Unnamed: 3_level_1
Sense and Sensibility,1,0,The family of Dashwood had long been settled i...
Sense and Sensibility,1,1,"By a former marriage, Mr. Henry Dashwood had o..."
Sense and Sensibility,1,2,"The old gentleman died: his will was read, and..."
Sense and Sensibility,1,3,"Mr. Dashwood's disappointment was, at first, s..."
Sense and Sensibility,1,4,His son was sent for as soon as his danger was...
Sense and Sensibility,...,...,...
Sense and Sensibility,50,18,"For Marianne, however--in spite of his incivil..."
Sense and Sensibility,50,19,Mrs. Dashwood was prudent enough to remain at ...
Sense and Sensibility,50,20,"Between Barton and Delaford, there was that co..."
Sense and Sensibility,50,21,THE END


Split Paragraphs Into Sentences

In [41]:
sent_pat = r'[.?!;:]+'

In [42]:
sents = paras['para_str'].str.split(sent_pat, expand = True).stack()\
    .to_frame('sent_str')

In [43]:
sents.sent_str.str.match(r"^\s*$").sum()

774

In [44]:
sents.index.names = OHCO[:4]

In [45]:
sents.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sent_str
book,chap_num,para_num,sent_num,Unnamed: 4_level_1
Sense and Sensibility,1,0,0,The family of Dashwood had long been settled i...
Sense and Sensibility,1,0,1,"Their estate was large, and their residence ..."
Sense and Sensibility,1,0,2,The late owner of this estate was a single m...
Sense and Sensibility,1,0,3,"But her death, which happened ten years befo..."
Sense and Sensibility,1,0,4,"for to supply her loss, he invited and receiv..."


In [46]:
sents

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sent_str
book,chap_num,para_num,sent_num,Unnamed: 4_level_1
Sense and Sensibility,1,0,0,The family of Dashwood had long been settled i...
Sense and Sensibility,1,0,1,"Their estate was large, and their residence ..."
Sense and Sensibility,1,0,2,The late owner of this estate was a single m...
Sense and Sensibility,1,0,3,"But her death, which happened ten years befo..."
Sense and Sensibility,1,0,4,"for to supply her loss, he invited and receiv..."
Sense and Sensibility,...,...,...,...
Sense and Sensibility,50,20,0,"Between Barton and Delaford, there was that co..."
Sense and Sensibility,50,20,1,--and among the merits and the happiness of El...
Sense and Sensibility,50,20,2,
Sense and Sensibility,50,21,0,THE END


Splitting Sentences Into Tokens

In [47]:
token_pat = r"[\s',-]+"

In [48]:
tokens = sents['sent_str'].str.split(token_pat, expand=True).stack()\
    .to_frame('token_str')

In [49]:
tokens.index.names = OHCO[:5]

In [50]:
tokens

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str
book,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1
Sense and Sensibility,1,0,0,0,The
Sense and Sensibility,1,0,0,1,family
Sense and Sensibility,1,0,0,2,of
Sense and Sensibility,1,0,0,3,Dashwood
Sense and Sensibility,1,0,0,4,had
Sense and Sensibility,...,...,...,...,...
Sense and Sensibility,50,22,0,8,and
Sense and Sensibility,50,22,0,9,Sensibility
Sense and Sensibility,50,22,0,10,by
Sense and Sensibility,50,22,0,11,Jane


In [51]:
tokens['term_str']  = tokens['token_str'].str.lower()

In [52]:
# RCA2T
tokens.query("term_str == ''")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str
book,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
Sense and Sensibility,1,0,1,0,,
Sense and Sensibility,1,0,2,0,,
Sense and Sensibility,1,0,3,0,,
Sense and Sensibility,1,0,4,0,,
Sense and Sensibility,1,0,5,0,,
Sense and Sensibility,...,...,...,...,...,...
Sense and Sensibility,50,19,2,0,,
Sense and Sensibility,50,19,3,0,,
Sense and Sensibility,50,19,4,0,,
Sense and Sensibility,50,20,1,0,,


Importing CSV into Pandas

In [53]:
# persuasion = pd.read_csv (r'/Users/omertoker/DS 5001/Module 2/austen-persuasion.csv')

In [54]:
persuasion = pd.read_csv('../data/output/austen-persuasion.csv')

In [55]:
persuasion

Unnamed: 0,chap_num,para_num,sent_num,token_num,token_str,term_str
0,1,0,0,0,Sir,sir
1,1,0,0,1,Walter,walter
2,1,0,0,2,Elliot,elliot
3,1,0,0,3,of,of
4,1,0,0,4,Kellynch,kellynch
...,...,...,...,...,...,...
85009,24,13,0,6,of,of
85010,24,13,0,7,Persuasion,persuasion
85011,24,13,0,8,by,by
85012,24,13,0,9,Jane,jane


Combining DataFrames

In [56]:
persuasion['book'] = 'persuasion'

In [57]:
persuasion

Unnamed: 0,chap_num,para_num,sent_num,token_num,token_str,term_str,book
0,1,0,0,0,Sir,sir,persuasion
1,1,0,0,1,Walter,walter,persuasion
2,1,0,0,2,Elliot,elliot,persuasion
3,1,0,0,3,of,of,persuasion
4,1,0,0,4,Kellynch,kellynch,persuasion
...,...,...,...,...,...,...,...
85009,24,13,0,6,of,of,persuasion
85010,24,13,0,7,Persuasion,persuasion,persuasion
85011,24,13,0,8,by,by,persuasion
85012,24,13,0,9,Jane,jane,persuasion


In [58]:
tokens.set_index

<bound method DataFrame.set_index of                                                               token_str  \
book                  chap_num para_num sent_num token_num                
Sense and Sensibility 1        0        0        0                  The   
                                                 1               family   
                                                 2                   of   
                                                 3             Dashwood   
                                                 4                  had   
...                                                                 ...   
                      50       22       0        8                  and   
                                                 9          Sensibility   
                                                 10                  by   
                                                 11                Jane   
                                                 12            

In [59]:
tokens

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str
book,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
Sense and Sensibility,1,0,0,0,The,the
Sense and Sensibility,1,0,0,1,family,family
Sense and Sensibility,1,0,0,2,of,of
Sense and Sensibility,1,0,0,3,Dashwood,dashwood
Sense and Sensibility,1,0,0,4,had,had
Sense and Sensibility,...,...,...,...,...,...
Sense and Sensibility,50,22,0,8,and,and
Sense and Sensibility,50,22,0,9,Sensibility,sensibility
Sense and Sensibility,50,22,0,10,by,by
Sense and Sensibility,50,22,0,11,Jane,jane


In [60]:
persuasion = persuasion.reindex(columns = OHCO)

In [61]:
persuasion

Unnamed: 0,book,chap_num,para_num,sent_num,token_num,term_str
0,persuasion,1,0,0,0,sir
1,persuasion,1,0,0,1,walter
2,persuasion,1,0,0,2,elliot
3,persuasion,1,0,0,3,of
4,persuasion,1,0,0,4,kellynch
...,...,...,...,...,...,...
85009,persuasion,24,13,0,6,of
85010,persuasion,24,13,0,7,persuasion
85011,persuasion,24,13,0,8,by
85012,persuasion,24,13,0,9,jane


In [62]:
tokens['index'] = np.arange(len(tokens))

In [63]:
tokens

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str,index
book,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Sense and Sensibility,1,0,0,0,The,the,0
Sense and Sensibility,1,0,0,1,family,family,1
Sense and Sensibility,1,0,0,2,of,of,2
Sense and Sensibility,1,0,0,3,Dashwood,dashwood,3
Sense and Sensibility,1,0,0,4,had,had,4
Sense and Sensibility,...,...,...,...,...,...,...
Sense and Sensibility,50,22,0,8,and,and,128760
Sense and Sensibility,50,22,0,9,Sensibility,sensibility,128761
Sense and Sensibility,50,22,0,10,by,by,128762
Sense and Sensibility,50,22,0,11,Jane,jane,128763


In [64]:
persuasion['index'] = np.arange(len(persuasion))

In [65]:
persuasion.set_index('index')

Unnamed: 0_level_0,book,chap_num,para_num,sent_num,token_num,term_str
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,persuasion,1,0,0,0,sir
1,persuasion,1,0,0,1,walter
2,persuasion,1,0,0,2,elliot
3,persuasion,1,0,0,3,of
4,persuasion,1,0,0,4,kellynch
...,...,...,...,...,...,...
85009,persuasion,24,13,0,6,of
85010,persuasion,24,13,0,7,persuasion
85011,persuasion,24,13,0,8,by
85012,persuasion,24,13,0,9,jane


In [66]:
tokens

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str,index
book,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Sense and Sensibility,1,0,0,0,The,the,0
Sense and Sensibility,1,0,0,1,family,family,1
Sense and Sensibility,1,0,0,2,of,of,2
Sense and Sensibility,1,0,0,3,Dashwood,dashwood,3
Sense and Sensibility,1,0,0,4,had,had,4
Sense and Sensibility,...,...,...,...,...,...,...
Sense and Sensibility,50,22,0,8,and,and,128760
Sense and Sensibility,50,22,0,9,Sensibility,sensibility,128761
Sense and Sensibility,50,22,0,10,by,by,128762
Sense and Sensibility,50,22,0,11,Jane,jane,128763


In [67]:
OHCO_2 = ['index','book','chap_num', 'para_num', 'sent_num', 'token_num', 'term_str']

In [68]:
persuasion.reindex(columns = OHCO_2)

Unnamed: 0,index,book,chap_num,para_num,sent_num,token_num,term_str
0,0,persuasion,1,0,0,0,sir
1,1,persuasion,1,0,0,1,walter
2,2,persuasion,1,0,0,2,elliot
3,3,persuasion,1,0,0,3,of
4,4,persuasion,1,0,0,4,kellynch
...,...,...,...,...,...,...,...
85009,85009,persuasion,24,13,0,6,of
85010,85010,persuasion,24,13,0,7,persuasion
85011,85011,persuasion,24,13,0,8,by
85012,85012,persuasion,24,13,0,9,jane


In [69]:
persuasion

Unnamed: 0,book,chap_num,para_num,sent_num,token_num,term_str,index
0,persuasion,1,0,0,0,sir,0
1,persuasion,1,0,0,1,walter,1
2,persuasion,1,0,0,2,elliot,2
3,persuasion,1,0,0,3,of,3
4,persuasion,1,0,0,4,kellynch,4
...,...,...,...,...,...,...,...
85009,persuasion,24,13,0,6,of,85009
85010,persuasion,24,13,0,7,persuasion,85010
85011,persuasion,24,13,0,8,by,85011
85012,persuasion,24,13,0,9,jane,85012


In [70]:
tokens

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str,index
book,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Sense and Sensibility,1,0,0,0,The,the,0
Sense and Sensibility,1,0,0,1,family,family,1
Sense and Sensibility,1,0,0,2,of,of,2
Sense and Sensibility,1,0,0,3,Dashwood,dashwood,3
Sense and Sensibility,1,0,0,4,had,had,4
Sense and Sensibility,...,...,...,...,...,...,...
Sense and Sensibility,50,22,0,8,and,and,128760
Sense and Sensibility,50,22,0,9,Sensibility,sensibility,128761
Sense and Sensibility,50,22,0,10,by,by,128762
Sense and Sensibility,50,22,0,11,Jane,jane,128763


In [73]:
tokens.to_csv(csv_file)

In [74]:
sands = pd.read_csv(csv_file)

In [75]:
sands

Unnamed: 0,book,chap_num,para_num,sent_num,token_num,token_str,term_str,index
0,Sense and Sensibility,1,0,0,0,The,the,0
1,Sense and Sensibility,1,0,0,1,family,family,1
2,Sense and Sensibility,1,0,0,2,of,of,2
3,Sense and Sensibility,1,0,0,3,Dashwood,dashwood,3
4,Sense and Sensibility,1,0,0,4,had,had,4
...,...,...,...,...,...,...,...,...
128760,Sense and Sensibility,50,22,0,8,and,and,128760
128761,Sense and Sensibility,50,22,0,9,Sensibility,sensibility,128761
128762,Sense and Sensibility,50,22,0,10,by,by,128762
128763,Sense and Sensibility,50,22,0,11,Jane,jane,128763


In [78]:
final = pd.concat([persuasion, sands])

In [79]:
final

Unnamed: 0,book,chap_num,para_num,sent_num,token_num,term_str,index,token_str
0,persuasion,1,0,0,0,sir,0,
1,persuasion,1,0,0,1,walter,1,
2,persuasion,1,0,0,2,elliot,2,
3,persuasion,1,0,0,3,of,3,
4,persuasion,1,0,0,4,kellynch,4,
...,...,...,...,...,...,...,...,...
128760,Sense and Sensibility,50,22,0,8,and,128760,and
128761,Sense and Sensibility,50,22,0,9,sensibility,128761,Sensibility
128762,Sense and Sensibility,50,22,0,10,by,128762,by
128763,Sense and Sensibility,50,22,0,11,jane,128763,Jane


In [80]:
final.shape

(213779, 8)

In [81]:
final = final.drop(columns="token_str")

In [82]:
final = final.drop(columns="index")

In [83]:
final

Unnamed: 0,book,chap_num,para_num,sent_num,token_num,term_str
0,persuasion,1,0,0,0,sir
1,persuasion,1,0,0,1,walter
2,persuasion,1,0,0,2,elliot
3,persuasion,1,0,0,3,of
4,persuasion,1,0,0,4,kellynch
...,...,...,...,...,...,...
128760,Sense and Sensibility,50,22,0,8,and
128761,Sense and Sensibility,50,22,0,9,sensibility
128762,Sense and Sensibility,50,22,0,10,by
128763,Sense and Sensibility,50,22,0,11,jane


In [84]:
vocab_final = final.term_str.value_counts().to_frame('n').reset_index().rename(columns={'index':'term_str'})

In [85]:
vocab_final.index.name = 'term_id'

In [86]:
vocab_final

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,the,7424
1,to,6901
2,and,6224
3,of,6139
4,her,3742
...,...,...
8495,errors,1
8496,confide,1
8497,undesignedly,1
8498,pecuniary,1


In [87]:
vocab_persuasion = persuasion.term_str.value_counts().to_frame('n').reset_index().rename(columns={'index':'term_str'})

In [88]:
vocab_persuasion.index.name = 'term_id'

In [89]:
vocab_persuasion

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,the,3330
1,to,2808
2,and,2800
3,of,2572
4,a,1595
...,...,...
5754,idly,1
5755,humbler,1
5756,profit,1
5757,torn,1


In [90]:
vocab_sands = sands.term_str.value_counts().to_frame('n').reset_index().rename(columns={'index':'term_str'})

In [91]:
vocab_sands.index.name = 'term_id'

In [92]:
vocab_sands

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,the,4094
1,to,4093
2,of,3567
3,and,3424
4,her,2538
...,...,...
6535,showers,1
6536,basis,1
6537,captivate,1
6538,dagger,1


In [93]:
6540 - 5759

781

In [94]:
final['chap_num'].value_counts(sort=False).mean()

4275.58

In [95]:
final['para_num'].value_counts(sort=False).mean()

2095.872549019608