# Module 2: Code Exercise

* DS 5001: Exploratory Text Analytics
* Raf Alvarado

# Set Up

In [10]:
import pandas as pd

In [11]:
import requests

In [14]:
# Download the Project Gutenberg plain-text file and save a local copy.
url = 'http://www.gutenberg.org/cache/epub/161/pg161.txt'
r = requests.get(url, timeout=30)  # Don't hang forever if the server is unresponsive
r.raise_for_status()  # Fail loudly instead of saving an HTTP error page as the text
# The decoded body can begin with a byte-order mark ('\ufeff'). The 'utf-8-sig'
# codec used below writes its own BOM, so drop any existing one to avoid ending
# up with a stray BOM character at the start of the first line.
text = r.text.lstrip('\ufeff')
with open('pg161.txt', 'w', encoding='utf-8-sig') as outfile:
    outfile.write(text)

In [15]:
# Path of the local text file to parse.
# (Alternative kept for reference: "../MOD01--SetUp/pg105.txt")
epub_file = 'pg161.txt'

# Index level names, from the largest text unit down to the smallest.
# Later cells use prefixes of this list (OHCO[:1], OHCO[:2], ...) to label
# the index as the text is split into successively finer units.
OHCO = [
    'chap_num',
    'para_num',
    'sent_num',
    'token_num',
]

# Import file into a dataframe

In [16]:
# Read the file into a list of lines. The context manager closes the handle;
# the 'utf-8-sig' codec drops a leading byte-order mark if one is present.
with open(epub_file, 'r', encoding='utf-8-sig') as epub_fh:
    epub = epub_fh.readlines()

In [17]:
# One row per line of the file, held in a single 'line_str' column
df = pd.DataFrame({'line_str': epub})

In [18]:
# Label the row index so it displays as the line number
df.index = df.index.rename('line_num')

In [19]:
# Trim leading/trailing whitespace (including the line ending) from every line
df['line_str'] = df['line_str'].str.strip()

In [20]:
# Spot-check ten random lines; the fixed seed makes the same rows appear on every run
df.sample(10, random_state=0)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
566,"taste for drawing."""
6200,it be told as soon as possible. My feelings a...
11394,borrow of Colonel Brandon. By reading only si...
3156,
9878,"have been beyond comparison,' she said, 'the l..."
3925,
2065,What a blow upon them all was this!
11182,"made me acquainted with his earnest, tender, c..."
9529,"knowing her to be acquainted with it; which, w..."
6648,endure it. Her heart was hardened against the...


# Extract title of work from first line

In [21]:
# Line 0 is Gutenberg's banner: "The Project Gutenberg EBook of <title>".
# A byte-order mark can survive as a leading '\ufeff' character, because
# str.strip() does not treat it as whitespace. Remove it explicitly before
# cutting off the boilerplate prefix.
first_line = df.loc[0, 'line_str'].lstrip('\ufeff')
title = first_line.replace('The Project Gutenberg EBook of ', '')
df['title'] = title  # Same title string repeated on every row

In [22]:
# Show the title extracted from the first line
print(title)

﻿Sense and Sensibility, by Jane Austen


In [23]:
# Preview the first rows to confirm the new 'title' column
df.head()

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,﻿The Project Gutenberg EBook of Sense and Sens...,"﻿Sense and Sensibility, by Jane Austen"
1,,"﻿Sense and Sensibility, by Jane Austen"
2,This eBook is for the use of anyone anywhere a...,"﻿Sense and Sensibility, by Jane Austen"
3,almost no restrictions whatsoever. You may co...,"﻿Sense and Sensibility, by Jane Austen"
4,re-use it under the terms of the Project Guten...,"﻿Sense and Sensibility, by Jane Austen"


# Remove Gutenberg's front and back matter

In [24]:
# Boolean masks flagging the "*** START OF ..." and "*** END OF ..." marker lines
start_pat = r"\*\*\*\s*START OF (THE|THIS) PROJECT"
end_pat = r"\*\*\*\s*END OF (THE|THIS) PROJECT"
a = df['line_str'].str.match(start_pat)
b = df['line_str'].str.match(end_pat)

In [25]:
# Line number of the first START marker and of the first END marker
an = df.index[a][0]
bn = df.index[b][0]

In [26]:
# Keep only the lines between the two markers. Label-based .loc slicing includes
# both endpoints, so this spans line an+1 through line bn-2.
# Take an explicit copy: a later cell adds a 'chap_num' column to this frame, and
# assigning into a slice of another DataFrame can trigger SettingWithCopyWarning.
df = df.loc[an + 1 : bn - 2].copy()

In [27]:
# Inspect the frame after slicing between the START and END markers
df

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
19,,"﻿Sense and Sensibility, by Jane Austen"
20,Special thanks are due to Sharon Partridge for...,"﻿Sense and Sensibility, by Jane Austen"
21,proofreading and correction of this etext.,"﻿Sense and Sensibility, by Jane Austen"
22,,"﻿Sense and Sensibility, by Jane Austen"
23,,"﻿Sense and Sensibility, by Jane Austen"
24,,"﻿Sense and Sensibility, by Jane Austen"
25,,"﻿Sense and Sensibility, by Jane Austen"
26,,"﻿Sense and Sensibility, by Jane Austen"
27,,"﻿Sense and Sensibility, by Jane Austen"
28,,"﻿Sense and Sensibility, by Jane Austen"


# Chunk by chapter

## Find all chapter headers

In [28]:
# Flag chapter headers: optional leading whitespace, the word "chapter" or
# "letter" (any letter case), whitespace, then a number
chap_pat = r"^\s*(chapter|letter)\s+(\d+)"
chap_lines = df['line_str'].str.match(chap_pat, case=False)

In [29]:
# Show the lines that were flagged as chapter headers
df.loc[chap_lines]

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
42,CHAPTER 1,"﻿Sense and Sensibility, by Jane Austen"
196,CHAPTER 2,"﻿Sense and Sensibility, by Jane Austen"
399,CHAPTER 3,"﻿Sense and Sensibility, by Jane Austen"
562,CHAPTER 4,"﻿Sense and Sensibility, by Jane Austen"
757,CHAPTER 5,"﻿Sense and Sensibility, by Jane Austen"
859,CHAPTER 6,"﻿Sense and Sensibility, by Jane Austen"
987,CHAPTER 7,"﻿Sense and Sensibility, by Jane Austen"
1113,CHAPTER 8,"﻿Sense and Sensibility, by Jane Austen"
1245,CHAPTER 9,"﻿Sense and Sensibility, by Jane Austen"
1449,CHAPTER 10,"﻿Sense and Sensibility, by Jane Austen"


## Assign numbers to chapters

In [30]:
# Number the detected headers 1..N in the order they appear
n_chapters = len(df.loc[chap_lines])
chap_nums = list(range(1, n_chapters + 1))

In [31]:
# Write the sequential chapter number onto each header row. This creates the
# 'chap_num' column; rows that are not headers are left as NaN for now.
df.loc[chap_lines, 'chap_num'] = chap_nums

In [32]:
# df

## Forward-fill chapter numbers to following text lines

In [33]:
# Carry each header's chapter number down to the lines that follow it
df['chap_num'] = df['chap_num'].ffill()

In [34]:
# df.head(100)

## Clean up

In [35]:
# Rows before the first chapter header never received a chapter number
df = df.loc[~df.chap_num.isna()] # Remove everything before Chapter 1
# Copy after the final filter so the column assignment below acts on an
# independent frame instead of a slice (avoids SettingWithCopyWarning)
df = df.loc[~chap_lines].copy() # Remove chapter heading lines
df['chap_num'] = df['chap_num'].astype('int') # Convert chap_num from float to int

In [36]:
# Spot-check ten random lines with their chapter numbers; the fixed seed makes
# the same rows appear on every run
df.sample(10, random_state=0)

Unnamed: 0_level_0,line_str,title,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4663,Middleton's head; and therefore very little le...,"﻿Sense and Sensibility, by Jane Austen",23
6632,From a night of more sleep than she had expect...,"﻿Sense and Sensibility, by Jane Austen",31
3823,besides it is such a way off. I know why you ...,"﻿Sense and Sensibility, by Jane Austen",20
8069,,"﻿Sense and Sensibility, by Jane Austen",35
7954,,"﻿Sense and Sensibility, by Jane Austen",35
7593,"visiting HER, which, to say the truth, has bee...","﻿Sense and Sensibility, by Jane Austen",33
9013,"out, how he had been sent for Wednesday to Har...","﻿Sense and Sensibility, by Jane Austen",38
11915,"Mrs. Dashwood, however, conforming, as she tru...","﻿Sense and Sensibility, by Jane Austen",48
7846,,"﻿Sense and Sensibility, by Jane Austen",34
7383,"sisters to see her. His manners to THEM, thou...","﻿Sense and Sensibility, by Jane Austen",33


## Group by chapter num and reset dataframe 

In [37]:
# Collapse each chapter's lines into one newline-joined string (one row per chapter)
chapter_lines = df.groupby(OHCO[:1])['line_str']
df = chapter_lines.apply('\n'.join).to_frame()

In [38]:
# Preview the chapter-level frame: one big string per chapter
df.head()

Unnamed: 0_level_0,line_str
chap_num,Unnamed: 1_level_1
1,\n\nThe family of Dashwood had long been settl...
2,\n\nMrs. John Dashwood now installed herself m...
3,\n\nMrs. Dashwood remained at Norland several ...
4,"\n\n""What a pity it is, Elinor,"" said Marianne..."
5,"\n\nNo sooner was her answer dispatched, than ..."


# Split into paragraphs 

In [39]:
# Split each chapter on runs of blank lines, then stack the resulting columns
# into a second index level so there is one row per paragraph
para_cols = df['line_str'].str.split(r'\n\n+', expand=True)
df = para_cols.stack().to_frame('para_str')

In [40]:
# Preview the paragraph rows (the new second index level has no name yet)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,
1,1,The family of Dashwood had long been settled i...
1,2,"By a former marriage, Mr. Henry Dashwood had o..."
1,3,"The old gentleman died: his will was read, and..."
1,4,"Mr. Dashwood's disappointment was, at first, s..."


In [41]:
# Label the two index levels with the first two OHCO names
df.index = df.index.set_names(OHCO[:2])

In [42]:
# Preview again to confirm the index level names
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,
1,1,The family of Dashwood had long been settled i...
1,2,"By a former marriage, Mr. Henry Dashwood had o..."
1,3,"The old gentleman died: his will was read, and..."
1,4,"Mr. Dashwood's disappointment was, at first, s..."


In [43]:
# Join each paragraph's wrapped lines into a single line of text.
# The pattern is a real newline character ('\n'), matched literally. In
# pandas >= 2.0, str.replace() treats the pattern as a literal string by default,
# so the raw string r'\n' (a backslash followed by "n") would match nothing there.
df['para_str'] = df['para_str'].str.replace('\n', ' ', regex=False).str.strip()
df = df[~df['para_str'].str.match(r'^\s*$')] # Remove empty paragraphs

In [44]:
# Preview the paragraphs after empty rows have been dropped
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,1,The family of Dashwood had long been settled i...
1,2,"By a former marriage, Mr. Henry Dashwood had o..."
1,3,"The old gentleman died: his will was read, and..."
1,4,"Mr. Dashwood's disappointment was, at first, s..."
1,5,His son was sent for as soon as his danger was...


# Split into sentences

In [45]:
# Split each paragraph at every '.', '?', '!' or ';' (the delimiter itself is
# dropped), then stack the pieces into a third index level: one row per sentence
sent_cols = df['para_str'].str.split(r'[.?!;]', expand=True)
df = sent_cols.stack().to_frame('sent_str')

In [46]:
# Label the three index levels with the first three OHCO names
df.index = df.index.set_names(OHCO[:3])

In [47]:
# Drop sentence rows that are empty or contain only whitespace
blank_sents = df['sent_str'].str.match(r'^\s*$')
df = df[~blank_sents]

In [48]:
# Preview the sentence rows
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,1,0,The family of Dashwood had long been settled i...
1,1,1,"Their estate was large, and their residence ..."
1,1,2,The late owner of this estate was a single m...
1,1,3,"But her death, which happened ten years befo..."
1,1,4,"for to supply her loss, he invited and receiv..."


# Split into tokens

In [49]:
# Trim each sentence before splitting on whitespace. Pieces produced by the
# punctuation split usually begin with a space, which would otherwise yield an
# empty-string token at position 0 of the sentence.
df = df['sent_str'].str.strip().str.split(r'\s+', expand=True).stack()\
    .to_frame().rename(columns={0:'token_str'})

In [50]:
# Label all four index levels with the full OHCO list
df.index = df.index.set_names(OHCO[:4])

In [51]:
# Inspect the final token table, indexed by the four OHCO levels
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,1,0,0,The
1,1,0,1,family
1,1,0,2,of
1,1,0,3,Dashwood
1,1,0,4,had
1,1,0,5,long
1,1,0,6,been
1,1,0,7,settled
1,1,0,8,in
1,1,0,9,Sussex
