# Module 2: Text into Data: Importing a Text

* DS 5001: Exploratory Text Analytics
* Raf Alvarado

# Set Up

In [1]:
import pandas as pd

In [2]:
# Relative path to the plain-text source file.
epub_file = "../MOD01--SetUp/pg105.txt"

# Index level names, ordered from the coarsest unit to the finest one.
OHCO = [
    'chap_num',
    'para_num',
    'sent_num',
    'token_num',
]

# Import file into a dataframe

In [3]:
# Read every line of the source text.  The context manager closes the file
# handle as soon as the read finishes; 'utf-8-sig' drops a leading BOM if present.
with open(epub_file, 'r', encoding='utf-8-sig') as epub_fh:
    epub = epub_fh.readlines()

In [4]:
# One row per line of the source file, held in a single 'line_str' column.
df = pd.DataFrame({'line_str': epub})

In [5]:
# Label the row index so each position reads as a line number.
df = df.rename_axis('line_num')

In [6]:
# Drop leading/trailing whitespace (including the newline kept by readlines).
df['line_str'] = df['line_str'].str.strip()

In [7]:
df.sample(10)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
3927,had little difficulty; it was soon determined ...
2824,"say? No, I have no idea of being so easily pe..."
7408,without her. Mrs Musgrove interposed.
8211,Who can be in doubt of what followed? When an...
2,This eBook is for the use of anyone anywhere a...
5733,his own arch significance as he named her; but...
2971,"though becoming attached to another, still he ..."
1624,
3043,now could do any good; and were Lady Russell t...
4273,"""Yes, that he will!"" exclaimed Mary, tauntingl..."


# Extract title of work from first line

In [8]:
# Line 0 of the file reads "The Project Gutenberg EBook of <title>";
# dropping that boilerplate prefix leaves the title text.
first_line = df.loc[0, 'line_str']
title = first_line.replace('The Project Gutenberg EBook of ', '')
df['title'] = title  # same value repeated on every row

In [9]:
print(title)

Persuasion, by Jane Austen


In [10]:
df.head()

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"The Project Gutenberg EBook of Persuasion, by ...","Persuasion, by Jane Austen"
1,,"Persuasion, by Jane Austen"
2,This eBook is for the use of anyone anywhere a...,"Persuasion, by Jane Austen"
3,almost no restrictions whatsoever. You may co...,"Persuasion, by Jane Austen"
4,re-use it under the terms of the Project Guten...,"Persuasion, by Jane Austen"


# Remove Gutenberg's front and back matter

In [11]:
# Boolean masks flagging Project Gutenberg's boilerplate boundary lines.
a = df['line_str'].str.match(r"\*\*\*\s*START OF (THE|THIS) PROJECT")  # start marker
b = df['line_str'].str.match(r"\*\*\*\s*END OF (THE|THIS) PROJECT")  # end marker

In [12]:
# Line number of the first row flagged by each marker mask.
an = a[a].index[0]
bn = b[b].index[0]

In [13]:
# Keep only the lines between the START and END markers.  Label-based .loc
# slicing includes both endpoints, so `an + 1` skips the START line itself and
# `bn - 2` makes the last kept line the one two before the END line.
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not raise SettingWithCopyWarning.
df = df.loc[an + 1 : bn - 2].copy()

In [15]:
df

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
19,,"Persuasion, by Jane Austen"
20,,"Persuasion, by Jane Austen"
21,,"Persuasion, by Jane Austen"
22,,"Persuasion, by Jane Austen"
23,Produced by Sharon Partridge and Martin Ward. ...,"Persuasion, by Jane Austen"
24,by Al Haines.,"Persuasion, by Jane Austen"
25,,"Persuasion, by Jane Austen"
26,,"Persuasion, by Jane Austen"
27,,"Persuasion, by Jane Austen"
28,,"Persuasion, by Jane Austen"


# Chunk by chapter

## Find all chapter headers

In [16]:
# Flag heading lines: "chapter" or "letter" (any letter case) followed by a number.
chap_lines = df['line_str'].str.match(
    r"^\s*(chapter|letter)\s+(\d+)",
    case=False,
)

In [17]:
df.loc[chap_lines]

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
47,Chapter 1,"Persuasion, by Jane Austen"
306,Chapter 2,"Persuasion, by Jane Austen"
500,Chapter 3,"Persuasion, by Jane Austen"
786,Chapter 4,"Persuasion, by Jane Austen"
959,Chapter 5,"Persuasion, by Jane Austen"
1297,Chapter 6,"Persuasion, by Jane Austen"
1657,Chapter 7,"Persuasion, by Jane Austen"
1992,Chapter 8,"Persuasion, by Jane Austen"
2346,Chapter 9,"Persuasion, by Jane Austen"
2632,Chapter 10,"Persuasion, by Jane Austen"


## Assign numbers to chapters

In [18]:
# Consecutive numbers 1..N, one for each flagged heading line.
n_chapters = int(chap_lines.sum())
chap_nums = list(range(1, n_chapters + 1))

In [21]:
# Write the running chapter number onto each chapter-heading row; every other
# row is left without a value in the new 'chap_num' column.
df.loc[chap_lines, 'chap_num'] = chap_nums

In [24]:
# df

## Forward-fill chapter numbers to following text lines

In [25]:
# Carry each heading's chapter number down to the lines that follow it.
df['chap_num'] = df['chap_num'].ffill()

In [28]:
# df.head(100)

## Clean up

In [29]:
# Rows with no chapter number come before the first chapter heading.
has_chapter = df.chap_num.notna()

# Keep only body text: drop everything before Chapter 1 and drop the chapter
# heading lines themselves.  .assign() builds a new frame, so converting
# chap_num from float to int does not trigger SettingWithCopyWarning.
df = (
    df.loc[has_chapter & ~chap_lines]
    .assign(chap_num=lambda body: body.chap_num.astype('int'))
)

In [30]:
df.sample(10)

Unnamed: 0_level_0,line_str,title,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7327,began talking very warmly about the family fea...,"Persuasion, by Jane Austen",22
3476,,"Persuasion, by Jane Austen",12
8155,"and Mrs Croft, everything of peculiar cordiali...","Persuasion, by Jane Austen",23
2744,fraught with the apt analogy of the declining ...,"Persuasion, by Jane Austen",10
5014,ill; and she had been particularly fortunate i...,"Persuasion, by Jane Austen",17
7803,,"Persuasion, by Jane Austen",23
796,"extremely pretty girl, with gentleness, modest...","Persuasion, by Jane Austen",4
7276,"rather claimed as part of the family; and, in ...","Persuasion, by Jane Austen",22
7373,"so forgetful?""","Persuasion, by Jane Austen",22
168,"as ever, amidst the wreck of the good looks of...","Persuasion, by Jane Austen",1


## Group by chapter num and reset dataframe 

In [38]:
# Collapse each chapter's lines into one newline-delimited string, keyed by chapter.
chapter_lines = df.groupby(OHCO[:1])['line_str']
df = chapter_lines.apply('\n'.join).to_frame()

In [39]:
df.head()

Unnamed: 0_level_0,line_str
chap_num,Unnamed: 1_level_1
1,"\n\nSir Walter Elliot, of Kellynch Hall, in So..."
2,"\n\nMr Shepherd, a civil, cautious lawyer, who..."
3,"\n\n""I must take leave to observe, Sir Walter,..."
4,"\n\nHe was not Mr Wentworth, the former curate..."
5,\n\nOn the morning appointed for Admiral and M...


# Split into paragraphs 

In [40]:
# A run of two or more newlines marks a paragraph break.  Splitting spreads the
# pieces across columns; stacking folds them back into rows under a second
# index level.
para_pieces = df['line_str'].str.split(r'\n\n+', expand=True)
df = para_pieces.stack().to_frame(name='para_str')

In [42]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,
1,1,"Sir Walter Elliot, of Kellynch Hall, in Somers..."
1,2,"""ELLIOT OF KELLYNCH HALL."
1,3,"""Walter Elliot, born March 1, 1760, married, J..."
1,4,Precisely such had the paragraph originally st...


In [43]:
# Name the two index levels: chapter, then paragraph.
df = df.rename_axis(index=OHCO[:2])

In [44]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,
1,1,"Sir Walter Elliot, of Kellynch Hall, in Somers..."
1,2,"""ELLIOT OF KELLYNCH HALL."
1,3,"""Walter Elliot, born March 1, 1760, married, J..."
1,4,Precisely such had the paragraph originally st...


In [45]:
# Join each paragraph's wrapped lines into a single line.  The newline is
# passed as a literal with regex=False so the result does not depend on the
# pandas version: since pandas 2.0 the default is regex=False, under which the
# raw pattern r'\n' (backslash + "n") would match nothing.
df['para_str'] = df['para_str'].str.replace('\n', ' ', regex=False).str.strip()
df = df[~df['para_str'].str.match(r'^\s*$')] # Remove empty paragraphs

In [46]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,1,"Sir Walter Elliot, of Kellynch Hall, in Somers..."
1,2,"""ELLIOT OF KELLYNCH HALL."
1,3,"""Walter Elliot, born March 1, 1760, married, J..."
1,4,Precisely such had the paragraph originally st...
1,5,Then followed the history and rise of the anci...


# Split into sentences

In [None]:
# Break each paragraph at '.', '?', '!' or ';' (the delimiter itself is
# discarded), then stack the pieces into rows under a third index level.
sent_pieces = df['para_str'].str.split(r'[.?!;]', expand=True)
df = sent_pieces.stack().to_frame(name='sent_str')

In [52]:
# Name the three index levels: chapter, paragraph, sentence.
df = df.rename_axis(index=OHCO[:3])

In [53]:
# Remove empty sentences (pieces that are blank or whitespace-only).
is_blank = df['sent_str'].str.match(r'^\s*$')
df = df.loc[~is_blank]

In [54]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,1,0,"Sir Walter Elliot, of Kellynch Hall, in Somers..."
1,1,1,"there he found occupation for an idle hour, a..."
1,1,2,there his faculties were roused into admirati...
1,1,3,"there any unwelcome sensations, arising from ..."
1,1,4,"and there, if every other leaf were powerless..."


# Split into tokens

In [56]:
# Split each sentence on runs of whitespace and stack the pieces into rows
# under a fourth index level.  Sentences are stripped first: a fragment that
# followed a delimiter still begins with the space that came after it, and
# splitting an unstripped string on r'\s+' would produce an empty first token.
df = df['sent_str'].str.strip().str.split(r'\s+', expand=True).stack()\
    .to_frame().rename(columns={0:'token_str'})

In [58]:
# Name all four index levels: chapter, paragraph, sentence, token.
df = df.rename_axis(index=OHCO[:4])

In [59]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,1,0,0,Sir
1,1,0,1,Walter
1,1,0,2,"Elliot,"
1,1,0,3,of
1,1,0,4,Kellynch
1,1,0,5,"Hall,"
1,1,0,6,in
1,1,0,7,"Somersetshire,"
1,1,0,8,was
1,1,0,9,a
