# Module 1: Text into Data: Importing a Text

* DS 5001: Exploratory Text Analytics
* Raf Alvarado

# Set Up

In [57]:
import pandas as pd

In [58]:
# Location of the plain-text Project Gutenberg edition that this notebook imports.
epub_file = '/home/rca2t/Public/ETA/data/epubs/AUSTEN/AUSTEN_JANE_PERSUASION-pg105.txt'

# Index level names, outermost first; later cells name their index levels
# with prefixes of this list (OHCO[:1], OHCO[:2], OHCO[:3]).
OHCO = [
    'chap_num',
    'para_num',
    'sent_num',
    'token_num',
]

# Import file into a dataframe

In [59]:
# Read the source text as a list of lines. The context manager closes the file
# handle as soon as the lines are in memory; 'utf-8-sig' drops a leading BOM
# if the file has one.
with open(epub_file, 'r', encoding='utf-8-sig') as epub_fh:
    epub = epub_fh.readlines()

# One row per physical line of the file, indexed by its 0-based line number.
df = pd.DataFrame(epub, columns=['line_str'])
df.index.name = 'line_num'

# Strip surrounding whitespace, including the newline that readlines() keeps.
df['line_str'] = df['line_str'].str.strip()

In [60]:
# Preview the first rows of the line-level frame (one row per line of the file).
df.head()

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
0,"The Project Gutenberg EBook of Persuasion, by ..."
1,
2,This eBook is for the use of anyone anywhere a...
3,almost no restrictions whatsoever. You may co...
4,re-use it under the terms of the Project Guten...


# Extract title of work from first line

In [61]:
# The first line of the file reads 'The Project Gutenberg EBook of <title>';
# removing that boilerplate prefix leaves the title string.
first_line = df.loc[0, 'line_str']
title = first_line.replace('The Project Gutenberg EBook of ', '')

# Store the title on every row of the frame.
df['title'] = title

In [62]:
# Show the extracted title as plain text (no quotes).
print(title)

Persuasion, by Jane Austen


# Remove Gutenberg's front and back matter

In [63]:
# Boolean masks over the lines: True where a line begins with Gutenberg's
# '*** START OF THE/THIS PROJECT' marker (a) or its '*** END OF ...' counterpart (b).
a = df['line_str'].str.match(r"\*\*\*\s*START OF (THE|THIS) PROJECT")
b = df['line_str'].str.match(r"\*\*\*\s*END OF (THE|THIS) PROJECT")

In [65]:
# Line numbers of the first start marker and the first end marker.
an = df.loc[a].index[0]
bn = df.loc[b].index[0]

# Keep the lines strictly after the start marker. The slice also leaves out the
# two lines immediately before the end marker.
# NOTE(review): presumably those two lines are trailing boilerplate; confirm.
# Positional slicing with line-number labels is valid here because the index is
# still the default 0..n-1 range at this point.
# .copy() detaches the result from the original frame, so the column
# assignments in later cells do not write into a slice (SettingWithCopyWarning).
df = df.iloc[an + 1 : bn - 2].copy()

# Chunk by chapter

## Find all chapter headers

In [67]:
# Flag every line that begins (after optional whitespace) with "chapter" or
# "letter", in any letter case.
# NOTE(review): an ordinary text line that happens to start with either word
# would be flagged too; confirm against the source text.
chap_lines = df['line_str'].str.match(r"\s*(chapter|letter)", case=False)

## Assign numbers to chapters

In [67]:
# Number the flagged heading lines 1..N in file order and write each number
# onto its heading row; every other row gets no value in 'chap_num' yet.
n_chaps = int(chap_lines.sum())
chap_nums = list(range(1, n_chaps + 1))
df.loc[chap_lines, 'chap_num'] = chap_nums

## Forward-fill chapter numbers to following text lines

In [67]:
# Carry each heading's chapter number down to the lines that follow it.
df['chap_num'] = df['chap_num'].ffill()

## Clean up

In [67]:
# Remove everything before Chapter 1: those rows never received a chapter
# number, so chap_num is still missing after the forward fill.
df = df.loc[df['chap_num'].notna()]

# Remove the chapter heading lines themselves. chap_lines still carries the
# pre-filter index; .loc aligns it to the rows that remain. .copy() makes the
# result an independent frame so the assignment below does not write into a
# slice (SettingWithCopyWarning).
df = df.loc[~chap_lines].copy()

# Convert chap_num from float to int (it became float while it held NaN).
df['chap_num'] = df['chap_num'].astype('int')

## Group by chapter number and rebuild the dataframe with one row per chapter

In [69]:
# Collapse the line-level frame to one row per chapter: all lines of a chapter
# are joined with newlines into one big string (blank lines survive as
# consecutive newlines).
df = (
    df.groupby(OHCO[:1])['line_str']
    .apply(lambda chapter_lines: '\n'.join(chapter_lines))
    .to_frame()
)

In [70]:
# Preview the chapter-level frame (one joined string per chap_num).
df.head()

Unnamed: 0_level_0,line_str
chap_num,Unnamed: 1_level_1
1,"\n\nSir Walter Elliot, of Kellynch Hall, in So..."
2,"\n\nMr Shepherd, a civil, cautious lawyer, who..."
3,"\n\n""I must take leave to observe, Sir Walter,..."
4,"\n\nHe was not Mr Wentworth, the former curate..."
5,\n\nOn the morning appointed for Admiral and M...


# Split into paragraphs 

In [71]:
# Split each chapter string into paragraphs on runs of two or more newlines,
# then stack the resulting wide table so every paragraph gets its own row
# under a (chap_num, para_num) index.
df = df['line_str'].str.split(r'\n\n+', expand=True).stack()\
    .to_frame().rename(columns={0:'para_str'})
df.index.names = OHCO[:2]

# Join the hard-wrapped lines inside each paragraph with spaces.
# The pattern is a real newline character and regex=False is explicit: the
# previous raw string r'\n' only matched a newline when str.replace treated
# the pattern as a regex, which is no longer the default from pandas 2.0 on.
df['para_str'] = df['para_str'].str.replace('\n', ' ', regex=False).str.strip()
df = df[~df['para_str'].str.match(r'^\s*$')] # Remove empty paragraphs

In [73]:
# Preview the paragraph-level frame.
# NOTE(review): the saved output under this cell shows 'sent_str' rows with a
# sent_num level, so it was captured after the sentence split had already run;
# re-run the notebook top to bottom to refresh it.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,1,0,"Sir Walter Elliot, of Kellynch Hall, in Somers..."
1,1,1,"there he found occupation for an idle hour, a..."
1,1,2,there his faculties were roused into admirati...
1,1,3,"there any unwelcome sensations, arising from ..."
1,1,4,"and there, if every other leaf were powerless..."


# Split into sentences

In [72]:
# Break every paragraph at '.', '?', '!' or ';' (the delimiter itself is
# dropped by the split), giving one column per sentence fragment.
sent_wide = df['para_str'].str.split(r'[.?!;]', expand=True)

# Stack the wide table into one row per sentence under a
# (chap_num, para_num, sent_num) index.
df = sent_wide.stack().to_frame().rename(columns={0: 'sent_str'})
df.index.names = OHCO[:3]

# Remove empty sentences (fragments that are blank or whitespace only).
blank_sent = df['sent_str'].str.match(r'^\s*$')
df = df[~blank_sent]

In [74]:
# Preview the sentence-level frame (index: chap_num, para_num, sent_num).
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,1,0,"Sir Walter Elliot, of Kellynch Hall, in Somers..."
1,1,1,"there he found occupation for an idle hour, a..."
1,1,2,there his faculties were roused into admirati...
1,1,3,"there any unwelcome sensations, arising from ..."
1,1,4,"and there, if every other leaf were powerless..."


In [81]:
# Sentence strings as a 2-D NumPy object array, one single-element row per sentence.
X_sk = df.to_numpy()

In [83]:
# The same sentence data as nested Python lists (one single-element list per sentence).
X_gs = df.to_numpy().tolist()

In [84]:
# Display the array form; NumPy abbreviates the middle rows in the repr.
X_sk

array([['Sir Walter Elliot, of Kellynch Hall, in Somersetshire, was a man who, for his own amusement, never took up any book but the Baronetage'],
       [' there he found occupation for an idle hour, and consolation in a distressed one'],
       [' there his faculties were roused into admiration and respect, by contemplating the limited remnant of the earliest patents'],
       ...,
       ['  His profession was all that could ever make her friends wish that tenderness less, the dread of a future war all that could dim her sunshine'],
       ["  She gloried in being a sailor's wife, but she must pay the tax of quick alarm for belonging to that profession which is, if possible, more distinguished in its domestic virtues than in its national importance"],
       ['Finis']], dtype=object)

# Regrouping