# Module 2: Text into Data: Importing a Text

* DS 5001: Exploratory Text Analytics
* Raf Alvarado

# Set Up

In [1]:
import pandas as pd

In [2]:
# Path to the source text that is opened and read line by line below.
# NOTE(review): despite the name, this points at a plain-text (.txt) file, not an EPUB.
epub_file = "../MOD01--SetUp/pg105.txt"
# Ordered level names for the hierarchical index built up in later cells:
# OHCO[:1] is the group-by key for chapters, OHCO[:2] names the paragraph-level
# index and OHCO[:3] the sentence-level index. 'token_num' is not used in this section.
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

# Import file into a dataframe

In [3]:
# Read the whole text as a list of lines (line endings kept).
# 'utf-8-sig' strips a leading byte-order mark if the file has one.
# The context manager closes the file handle once the lines are read,
# instead of leaving it open for the life of the kernel.
with open(epub_file, 'r', encoding='utf-8-sig') as epub_fh:
    epub = epub_fh.readlines()

In [4]:
# One row per line of the file, held in a single text column
df = pd.DataFrame({'line_str': epub})

In [5]:
# The row index is the line's position in the file; give it a name
df = df.rename_axis('line_num')

In [6]:
# Trim leading/trailing whitespace (including the line ending) from every line
df['line_str'] = df['line_str'].str.strip()

In [7]:
df.sample(10)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
7865,
4715,If Elizabeth could but have heard this! Such ...
3367,"anything! I am afraid of her, as I have told ..."
7310,misconstructions of the most mischievous kind.
1771,to be about the child. My being the mother is...
2023,"in unison, no countenances so beloved. Now th..."
8157,"Wentworth, some moments of communications cont..."
3777,Captain Wentworth now hurried off to get every...
8594,defect in this electronic work within 90 days ...
7004,"affected carelessness, ""but he gave so many hi..."


# Extract title of work from first line

In [8]:
# Line 0 carries the Gutenberg banner followed by the title; drop the banner prefix
title = df.at[0, 'line_str'].replace('The Project Gutenberg EBook of ', '')
# Broadcast the title to every row
df['title'] = title

In [9]:
print(title)

In [11]:
df.head()

Unnamed: 0_level_0,line_str,title
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"The Project Gutenberg EBook of Persuasion, by ...","Persuasion, by Jane Austen"
1,,"Persuasion, by Jane Austen"
2,This eBook is for the use of anyone anywhere a...,"Persuasion, by Jane Austen"
3,almost no restrictions whatsoever. You may co...,"Persuasion, by Jane Austen"
4,re-use it under the terms of the Project Guten...,"Persuasion, by Jane Austen"


# Remove Gutenberg's front and back matter

In [16]:
# Boolean masks over the lines: True where a line begins with the Gutenberg
# "*** START OF THE/THIS PROJECT" / "*** END OF THE/THIS PROJECT" marker
a = df['line_str'].str.match(r"\*\*\*\s*START OF (THE|THIS) PROJECT")
b = df['line_str'].str.match(r"\*\*\*\s*END OF (THE|THIS) PROJECT")

In [17]:
# Line numbers (index labels) of the first START and first END marker.
# Indexing [0] raises IndexError if a marker was not found.
an = a[a].index[0]
bn = b[b].index[0]

In [20]:
# Keep only the lines strictly after the START marker and up to three lines
# before the END marker (presumably trailing filler before the marker -- confirm
# against the text).
# `an` and `bn` are index labels (line numbers), so slice by label with .loc:
# label slices include both endpoints, hence `bn - 3`. On the freshly loaded
# frame this selects the same rows as the positional slice [an + 1 : bn - 2],
# but it stays correct if the cell is re-run after df has already been trimmed.
# .copy() detaches the result from the full frame so that adding columns later
# does not raise SettingWithCopyWarning.
df = df.loc[an + 1 : bn - 3].copy()

# Chunk by chapter

## Find all chapter headers

In [31]:
# Boolean mask: True for lines that start (after optional whitespace) with
# "chapter" or "letter", in any letter case, followed by a number
chap_lines = df['line_str'].str.match(r"^\s*(chapter|letter)\s+(\d+)", case=False)

In [33]:
# df.loc[chap_lines]

## Assign numbers to chapters

In [35]:
# Sequential chapter numbers 1..N, where N is the number of header lines found
chap_nums = list(range(1, int(chap_lines.sum()) + 1))

In [37]:
# Write each chapter's sequence number onto its header row. This creates the
# 'chap_num' column; every non-header row is left as NaN (hence the float dtype).
df.loc[chap_lines, 'chap_num'] = chap_nums

In [39]:
# df.loc[chap_lines]

Unnamed: 0_level_0,line_str,title,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
47,Chapter 1,"Persuasion, by Jane Austen",1.0
306,Chapter 2,"Persuasion, by Jane Austen",2.0
500,Chapter 3,"Persuasion, by Jane Austen",3.0
786,Chapter 4,"Persuasion, by Jane Austen",4.0
959,Chapter 5,"Persuasion, by Jane Austen",5.0
1297,Chapter 6,"Persuasion, by Jane Austen",6.0
1657,Chapter 7,"Persuasion, by Jane Austen",7.0
1992,Chapter 8,"Persuasion, by Jane Austen",8.0
2346,Chapter 9,"Persuasion, by Jane Austen",9.0
2632,Chapter 10,"Persuasion, by Jane Austen",10.0


## Forward-fill chapter numbers to following text lines

In [40]:
# Carry each header's chapter number down to the lines that follow it;
# lines before the first header have nothing to inherit and stay NaN
df['chap_num'] = df['chap_num'].ffill()

In [42]:
df.head()

Unnamed: 0_level_0,line_str,title,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
19,,"Persuasion, by Jane Austen",
20,,"Persuasion, by Jane Austen",
21,,"Persuasion, by Jane Austen",
22,,"Persuasion, by Jane Austen",
23,Produced by Sharon Partridge and Martin Ward. ...,"Persuasion, by Jane Austen",


## Clean up

In [43]:
# Remove everything before Chapter 1: those lines precede the first header,
# so the forward fill left their chap_num as NaN.
df = df.loc[df.chap_num.notna()]
# Remove the chapter heading lines themselves. chap_lines was computed before the
# filter above and covers more rows than df now has; .loc aligns it on line_num.
# .copy() gives an independent frame so the column assignment below does not
# raise SettingWithCopyWarning.
df = df.loc[~chap_lines].copy()
# Convert chap_num from float (a side effect of the NaN placeholders) to int
df['chap_num'] = df['chap_num'].astype('int')

In [45]:
df.sample(10)

Unnamed: 0_level_0,line_str,title,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6226,Room was strikingly great. Why was it? She t...,"Persuasion, by Jane Austen",20
478,Elizabeth deserved it. She had never received...,"Persuasion, by Jane Austen",2
645,"It seemed as if Mr Shepherd, in this anxiety t...","Persuasion, by Jane Austen",3
8145,"and happiness, and more generally admired than...","Persuasion, by Jane Austen",23
6520,been doubting and considering as to what I oug...,"Persuasion, by Jane Austen",21
7825,,"Persuasion, by Jane Austen",23
5467,"ordered to walk to keep off the gout, and Mrs ...","Persuasion, by Jane Austen",18
7228,"horses, and with her own complete independence...","Persuasion, by Jane Austen",22
5747,understand that Lady Dalrymple was calling to ...,"Persuasion, by Jane Austen",19
1393,quite different creatures with you! But to be...,"Persuasion, by Jane Austen",6


## Group by chapter num and reset dataframe 

In [None]:
# Collapse each chapter's lines into one newline-joined string: one row per chapter
df = df.groupby(OHCO[:1])['line_str'].apply('\n'.join).to_frame()

In [None]:
df.head()

# Split into paragraphs 

In [None]:
# Break each chapter string at runs of two or more newlines (blank lines separate
# paragraphs); stack() turns the resulting columns into one row per paragraph.
df = df['line_str'].str.split(r'\n\n+', expand=True).stack()\
    .to_frame().rename(columns={0:'para_str'})
df.index.names = OHCO[:2]
# Join the wrapped lines of a paragraph into a single line.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal by default, and the raw string r'\n' would then be searched for as a
# backslash followed by "n" -- leaving every newline in place.
df['para_str'] = df['para_str'].str.replace(r'\n', ' ', regex=True).str.strip()
df = df[~df['para_str'].str.match(r'^\s*$')] # Remove empty paragraphs

In [None]:
df.head()

# Split into sentences

In [None]:
# Cut each paragraph at sentence-ending punctuation (. ? ! ;) -- the punctuation
# itself is consumed by the split -- and stack into one row per sentence
df = df['para_str'].str.split(r'[.?!;]', expand=True).stack().to_frame('sent_str')
df.index.names = OHCO[:3]
# Remove empty (whitespace-only) sentences
df = df.loc[~df['sent_str'].str.match(r'^\s*$')]

In [None]:
df.head()

In [None]:
# Sentence table as a 2-D NumPy array (one row per sentence, one column)
X_sk = df.to_numpy()

In [None]:
# Same data as nested Python lists: one single-item list per sentence
X_gs = df.to_numpy().tolist()

In [None]:
X_sk

# Regrouping