# Metadata

```
Course:   DS 5001 
Module:   02 Homework KEY
```

# Set Up

In [1]:
import pandas as pd
import seaborn as sns

In [2]:
sns.set()

In [3]:
ls ../../notebooks/labs/data

ls: ../../notebooks/labs/data: No such file or directory


In [4]:
data_home = "../data"
# data_home = '../../notebooks/labs/data'

In [5]:
text_file = f"{data_home}/gutenberg/pg161.txt" 
csv_file1 = f"{data_home}/output/austen-sense.csv" # To be created
csv_file2 = f"{data_home}/output/austen-persuasion.csv" # Already created
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

# Import file into a dataframe

In [6]:
LINES = pd.DataFrame(open(text_file, 'r', encoding='utf-8-sig').readlines(), columns=['line_str'])
LINES.index.name = 'line_num'
LINES.line_str = LINES.line_str.str.strip()

In [7]:
LINES.sample(10)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
1796,
13003,
3922,unsuitableness which often existed between hus...
5346,Charlotte do? I warrant you she is a fine siz...
4067,came in without any eclat. She merely observe...
2445,two of her daughters went with her; but Marian...
1587,
5320,"any enquiry after his rival; and at length, by..."
12762,
3123,that I must have been intended by nature to be...


# Extract title of work from first line

In [8]:
title = LINES.loc[0].line_str.replace('The Project Gutenberg EBook of ', '')

In [9]:
title

'\ufeffSense and Sensibility, by Jane Austen'

In [10]:
LINES.head()

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
0,﻿The Project Gutenberg EBook of Sense and Sens...
1,
2,This eBook is for the use of anyone anywhere a...
3,almost no restrictions whatsoever. You may co...
4,re-use it under the terms of the Project Guten...


# Remove Gutenberg's front and back matter

In [11]:
a = LINES.line_str.str.match(r"\*\*\*\s*START OF (THE|THIS) PROJECT")
b = LINES.line_str.str.match(r"\*\*\*\s*END OF (THE|THIS) PROJECT")

In [12]:
an = LINES.loc[a].index[0]
bn = LINES.loc[b].index[0]

In [13]:
LINES = LINES.loc[an + 1 : bn - 2]

In [14]:
LINES

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
19,
20,Special thanks are due to Sharon Partridge for...
21,proofreading and correction of this etext.
22,
23,
...,...
12662,
12663,
12664,
12665,


# Chunk by chapter

## Find all chapter headers

In [15]:
chap_lines = LINES.line_str.str.match(r"^\s*(chapter|letter)\s+(\d+)", case=False)

In [16]:
LINES.loc[chap_lines]

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
42,CHAPTER 1
196,CHAPTER 2
399,CHAPTER 3
562,CHAPTER 4
757,CHAPTER 5
859,CHAPTER 6
987,CHAPTER 7
1113,CHAPTER 8
1245,CHAPTER 9
1449,CHAPTER 10


## Assign numbers to chapters

In [17]:
chap_nums = [i+1 for i in range(LINES.loc[chap_lines].shape[0])]

In [18]:
LINES.loc[chap_lines, 'chap_num'] = chap_nums

In [19]:
LINES.loc[chap_lines, 'chap_num'] = [i+1 for i in range(LINES.loc[chap_lines].shape[0])]

## Forward-fill chapter numbers to following text lines

In [20]:
LINES.chap_num = LINES.chap_num.ffill()

## Clean up

**DID NOT REMOVE EMPTY LINES**

In [21]:
# LINES = LINES.loc[~LINES.chap_num.isna()] # Remove chapter heading lines
LINES = LINES.dropna(subset=['chap_num'])
LINES = LINES.loc[~chap_lines] # Remove everything before Chapter 1
LINES.chap_num = LINES.chap_num.astype('int') # Convert chap_num from float to int

In [22]:
LINES.sample(10)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
9420,and I feel the goodness of Colonel Brandon mos...,40
8834,,37
185,could receive her sister-in-law on her arrival...,1
94,"and, as a mark of his affection for the three ...",1
8927,,38
2381,Elinor.,14
11560,"minute in every particular of speech and look,...",46
9385,he did not suppose it possible that Delaford l...,39
7883,"attractions. You would not think it perhaps, ...",34
7585,appearance to think his acquaintance worth hav...,33


## Group lines by chapter num 

In [23]:
CHAPS = LINES.groupby(OHCO[:1]).line_str.apply(lambda x: '\n'.join(x)).to_frame('chap_str')

In [24]:
CHAPS.head()

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,\n\nThe family of Dashwood had long been settl...
2,\n\nMrs. John Dashwood now installed herself m...
3,\n\nMrs. Dashwood remained at Norland several ...
4,"\n\n""What a pity it is, Elinor,"" said Marianne..."
5,"\n\nNo sooner was her answer dispatched, than ..."


# Split into paragraphs 

In [25]:
PARAS = CHAPS['chap_str'].str.split(r'\n\n+', expand=True).stack()\
    .to_frame('para_str')
PARAS.index.names = OHCO[:2] 

In [26]:
PARAS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,
1,1,The family of Dashwood had long been settled i...
1,2,"By a former marriage, Mr. Henry Dashwood had o..."
1,3,"The old gentleman died: his will was read, and..."
1,4,"Mr. Dashwood's disappointment was, at first, s..."


In [27]:
PARAS['para_str'] = PARAS['para_str'].str.replace(r'\n', ' ', regex=True).str.strip()
PARAS = PARAS[~PARAS['para_str'].str.match(r'^\s*$')] # Remove empty paragraphs

In [28]:
PARAS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,1,The family of Dashwood had long been settled i...
1,2,"By a former marriage, Mr. Henry Dashwood had o..."
1,3,"The old gentleman died: his will was read, and..."
1,4,"Mr. Dashwood's disappointment was, at first, s..."
1,5,His son was sent for as soon as his danger was...


In [29]:
PARAS

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,1,The family of Dashwood had long been settled i...
1,2,"By a former marriage, Mr. Henry Dashwood had o..."
1,3,"The old gentleman died: his will was read, and..."
1,4,"Mr. Dashwood's disappointment was, at first, s..."
1,5,His son was sent for as soon as his danger was...
...,...,...
50,19,"For Marianne, however--in spite of his incivil..."
50,20,Mrs. Dashwood was prudent enough to remain at ...
50,21,"Between Barton and Delaford, there was that co..."
50,22,THE END


# Split into sentences

NOTE: ADDED `"` to regex in `split()`

In [30]:
SENTS = PARAS['para_str'].str.split(r'[.?!;:"]+', expand=True).stack()\
    .to_frame().rename(columns={0:'sent_str'})
SENTS.index.names = OHCO[:3]
SENTS = SENTS[~SENTS['sent_str'].str.match(r'^\s*$')] # Remove empty paragraphs
SENTS.sent_str = SENTS.sent_str.str.strip()

In [31]:
SENTS.sent_str.str.match(r"^\s*$").sum()

0

In [32]:
SENTS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,1,0,The family of Dashwood had long been settled i...
1,1,1,"Their estate was large, and their residence wa..."
1,1,2,The late owner of this estate was a single man...
1,1,3,"But her death, which happened ten years before..."
1,1,4,"for to supply her loss, he invited and receive..."


In [33]:
SENTS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,1,0,The family of Dashwood had long been settled i...
1,1,1,"Their estate was large, and their residence wa..."
1,1,2,The late owner of this estate was a single man...
1,1,3,"But her death, which happened ten years before..."
1,1,4,"for to supply her loss, he invited and receive..."
...,...,...,...
50,20,3,"Jennings, when Marianne was taken from them, M..."
50,21,0,"Between Barton and Delaford, there was that co..."
50,21,1,--and among the merits and the happiness of El...
50,22,0,THE END


# Split into tokens

In [34]:
TOKENS = SENTS['sent_str'].str.split(r"[\s',-]+", expand=True).stack()\
    .to_frame('token_str')
TOKENS.index.names = OHCO[:4]

In [35]:
TOKENS['term_str'] = TOKENS.token_str.str.replace(r"[\W_]+", '', regex=True).str.lower()

In [36]:
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str,term_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,0,0,The,the
1,1,0,1,family,family
1,1,0,2,of,of
1,1,0,3,Dashwood,dashwood
1,1,0,4,had,had
...,...,...,...,...,...
50,23,0,8,and,and
50,23,0,9,Sensibility,sensibility
50,23,0,10,by,by
50,23,0,11,Jane,jane


# Save work to CSV

In [37]:
TOKENS.to_csv(csv_file1)

# Combine the two into a Corpus

In [38]:
csv_file2 = f"{data_home}/output/austen-persuasion.csv"

In [39]:
df1 = pd.read_csv(csv_file1)
df2 = pd.read_csv(csv_file2)

In [40]:
len(df1), len(df2)

(122257, 85014)

In [41]:
df1['book_id'] = 1 # They may use the string for the titles here
df2['book_id'] = 2

In [42]:
LIB = {1:'Sense & Sensibility', 2:'Persuasion'}

In [43]:
CORPUS = pd.concat([df1, df2])

In [44]:
OHCO2 = ['book_id'] + OHCO

In [45]:
CORPUS = CORPUS.set_index(OHCO2)

In [46]:
# CORPUS.sample(10)

In [47]:
len(CORPUS), CORPUS.shape[0], CORPUS.token_str.count()

(207271, 207271, 205601)

# Extract a vocabulary $V$

In [48]:
CORPUS['term_str'] = CORPUS.token_str.str.replace(r"\W+", "", regex=True).str.lower()
V = CORPUS.term_str.value_counts().to_frame('n')
V.index.name = 'term_str'
V['n_chars'] = V.index.str.len()

In [49]:
len(V)

8237

In [50]:
V.n_chars.mean()

7.553842418356198

# Save Combo

In [51]:
CORPUS.to_csv(f'{data_home}/output/austen-combo.csv')

# Q1

How many tokens in the CORPUS?

In [52]:
CORPUS.shape[0]

207271

# Q2

How many distinct terms are there in the combined data frame (i.e. how big is the vocabulary)?  

In [53]:
V.shape[0]

8237

In [54]:
V

Unnamed: 0_level_0,n,n_chars
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
the,7436,3
to,6924,2
and,6290,3
of,6145,2
her,3747,3
...,...,...
publicly,1,8
distractedly,1,12
remembrances,1,12
varnished,1,9


# Q3

How many more terms does the vocabulary of Sense and Sensibility have than that of Persuasion?  

In [55]:
V['in_1'] = CORPUS.loc[1].term_str.value_counts()
V['in_2'] = CORPUS.loc[2].term_str.value_counts()

In [56]:
LIB

{1: 'Sense & Sensibility', 2: 'Persuasion'}

In [57]:
V.count()

n          8237
n_chars    8237
in_1       6278
in_2       5760
dtype: int64

In [58]:
N = V.count().to_frame('n').T

In [59]:
N['in_1'] - N['in_2']

n    518
dtype: int64

In [60]:
OHCO2[:2]

['book_id', 'chap_num']

# Q4 

What is the average number of tokens, rounded to an integer, per chapter in the corpus?  

In [65]:
CORPUS.groupby(OHCO2[:2]).term_str.count().mean()

2778.391891891892

In [76]:
import numpy as np

In [74]:
new = CORPUS.reset_index().groupby(OHCO2[:2]).agg({"token_num":"count"})

In [78]:
new = CORPUS.reset_index().groupby(OHCO2[:2]).agg({"token_str":"count"})

In [80]:
np.mean(new.token_str)

2778.391891891892

# Q5
... per paragraph in the corpus?  

In [66]:
CORPUS.groupby(OHCO2[:4]).term_str.count().mean()

14.5630400906644