# Synpsis

Use case: Import source text and save in F3 form.

# Configuration

In [1]:
extra_stopwords = """
us rest went least would much must long one like much say well without though yet might still upon
done every rather particular made many previous always never thy thou go first oh thee ere ye came
almost could may sometimes seem called among another also however nevertheless even way one two three
ever put
""".strip().split()

In [2]:
set(extra_stopwords)

{'almost',
 'also',
 'always',
 'among',
 'another',
 'called',
 'came',
 'could',
 'done',
 'ere',
 'even',
 'ever',
 'every',
 'first',
 'go',
 'however',
 'least',
 'like',
 'long',
 'made',
 'many',
 'may',
 'might',
 'much',
 'must',
 'never',
 'nevertheless',
 'oh',
 'one',
 'particular',
 'previous',
 'put',
 'rather',
 'rest',
 'say',
 'seem',
 'sometimes',
 'still',
 'thee',
 'thou',
 'though',
 'three',
 'thy',
 'two',
 'upon',
 'us',
 'way',
 'well',
 'went',
 'without',
 'would',
 'ye',
 'yet'}

In [1]:
OHCO = ['book_num','chap_num','para_num', 'sent_num', 'token_num']
BOOKS = OHCO[:1]
CHAPS = OHCO[:2]
PARAS = OHCO[:3]
SENTS = OHCO[:4]
BOOKS = OHCO[:5]

# Libraries

In [2]:
import re
import os
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('tagsets')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lukek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lukek\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lukek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\lukek\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lukek\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
#pd.options.display.max_rows = 4000

# Pragmas

In [6]:
%matplotlib inline

# Process

We pause to look at the revised form of our text import function. The parsing function has been replaced with NLTK, which has improved the results of POS tagging. However, this has required some added string manipulation to produce better tokens.

In [7]:
db_file = 'HarryPotter.db'
body_start_all = [4, 6, 5, 134, 38, 33, 90]
body_end_all = [10700, 6874, 14652, 6508, 9329, 5895, 17726]
para_pat_all = [r'\n\n+',r'\n\n+',r'\n\n+',r'\n\n+',r'\n',r'\n',r'\n\n+']
src_file_name_all = ['1Sorcerers Stone.txt','2The Chamber of Secrets.txt', '3Prisoner of Azkaban.txt', '4The Goblet of Fire.txt','5The Order of the Phoenix.txt','6The Half Blood Prince.txt','7The Deathly Hallows.txt']

# Chapters

In [8]:
book = pd.DataFrame()
for i in range(len(body_start_all)):
    body_start = body_start_all[i]
    body_end = body_end_all[i]
    para_pat = para_pat_all[i]
    src_file_name = src_file_name_all[i]
    chap_pat = r'^\s*(?:Chapter|ETYMOLOGY|Epilogue|C H|CHAPTER).*$'
    sent_pat = r'([.;?!"“”]+)'
    token_pat = r'([\W_]+)'
    #
    lines = open(src_file_name, 'r', encoding='utf-8', errors='ignore').readlines()
    lines = lines[body_start - 1 : body_end + 1]
    df = pd.DataFrame({'line_str':lines})
    df.index.name = 'line_id'
    del(lines)
    df.line_str = df.line_str.str.replace('—', ' — ')
    df.line_str = df.line_str.str.replace('-', ' - ')
    #
    chap_mask = df.line_str.str.match(chap_pat)
    df.loc[chap_mask, 'chap_id'] = df.apply(lambda x: x.name, 1)
    df.chap_id = df.chap_id.ffill().astype('int')
    chap_ids = df.chap_id.unique().tolist()
    #
    df['chap_num'] = df.chap_id.apply(lambda x: chap_ids.index(x))
    chaps = df.groupby('chap_num')\
    .apply(lambda x: ''.join(x.line_str))\
    .to_frame()\
    .rename(columns={0:'chap_str'})
    del(df)
    book = book.append(chaps)
    del(chaps)

## Text to lines

In [9]:
book

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
0,CHAPTER ONE\n\nTHE BOY WHO LIVED\n\nMr. and Mr...
1,CHAPTER TWO\n\nTHE VANISHING GLASS\n\nNearly t...
2,CHAPTER THREE\n\nTHE LETTERS FROM NO ONE\n\nTh...
3,CHAPTER FOUR\n\nTHE KEEPER OF THE KEYS\n\nBOOM...
4,CHAPTER FIVE\n\nDIAGON ALLEY\n\nHarry woke ear...
5,CHAPTER SIX\n\nTHE JOURNEY FROM PLATFORM NINE ...
6,CHAPTER SEVEN\n\nTHE SORTING HAT\n\nThe door s...
7,"CHAPTER EIGHT\n\nTHE POTIONS MASTER\n\nThere, ..."
8,CHAPTER NINE\n\nTHE MIDNIGHT DUEL\n\nHarry had...
9,CHAPTER TEN\n\nHALLOWEEN\n\nMalfoy couldn't be...


In [10]:
book = book.reset_index()

In [11]:
book.head()

Unnamed: 0,chap_num,chap_str
0,0,CHAPTER ONE\n\nTHE BOY WHO LIVED\n\nMr. and Mr...
1,1,CHAPTER TWO\n\nTHE VANISHING GLASS\n\nNearly t...
2,2,CHAPTER THREE\n\nTHE LETTERS FROM NO ONE\n\nTh...
3,3,CHAPTER FOUR\n\nTHE KEEPER OF THE KEYS\n\nBOOM...
4,4,CHAPTER FIVE\n\nDIAGON ALLEY\n\nHarry woke ear...


# Add book_num

In [12]:
book_mask = book.chap_num == 0

In [13]:
book.loc[book_mask, 'book_id'] = book.apply(lambda x: x.name, 1)
book.book_id = book.book_id.ffill().astype('int')
book_ids = book.book_id.unique().tolist()
book['book_num'] = book.book_id.apply(lambda x: book_ids.index(x))
book = book.groupby(['book_num','chap_num'])\
    .apply(lambda x: ''.join(x.chap_str))\
    .to_frame()\
    .rename(columns={0:'chap_str'})

In [14]:
book.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,chap_str
book_num,chap_num,Unnamed: 2_level_1
0,0,CHAPTER ONE\n\nTHE BOY WHO LIVED\n\nMr. and Mr...
0,1,CHAPTER TWO\n\nTHE VANISHING GLASS\n\nNearly t...
0,2,CHAPTER THREE\n\nTHE LETTERS FROM NO ONE\n\nTh...
0,3,CHAPTER FOUR\n\nTHE KEEPER OF THE KEYS\n\nBOOM...
0,4,CHAPTER FIVE\n\nDIAGON ALLEY\n\nHarry woke ear...


## Chapters to Paragraphs

In [15]:
chaps = book.copy()

In [16]:
paras = chaps.chap_str.str.split(para_pat, expand=True)\
    .stack()\
    .to_frame()\
    .rename(columns={0:'para_str'})
paras.index.names = PARAS
paras.para_str = paras.para_str.str.strip()
paras.para_str = paras.para_str.str.replace(r'\n', ' ')
paras.para_str = paras.para_str.str.replace(r'\s+', ' ')
paras = paras[~paras.para_str.str.match(r'^\s*$')]
paras = paras[~paras.para_str.str.match('.[0-9]+.')]

In [17]:
paras.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,para_str
book_num,chap_num,para_num,Unnamed: 3_level_1
0,0,0,CHAPTER ONE
0,0,1,THE BOY WHO LIVED
0,0,2,"Mr. and Mrs. Dursley, of number four, Privet D..."
0,0,3,Mr. Dursley was the director of a firm called ...
0,0,4,"The Dursleys had everything they wanted, but t..."


## Paragraphs to Sentences

In [18]:
#     sents = paras.para_str.str.split(sent_pat, expand=True)\
sents = paras.para_str\
    .apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
    .stack()\
    .to_frame()\
    .rename(columns={0:'sent_str'})
sents.index.names = SENTS
# del(paras)

In [19]:
sents.shape

(88439, 1)

In [20]:
sents.tail(100)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sent_str
book_num,chap_num,para_num,sent_num,Unnamed: 4_level_1
6,36,2,31,Draco Malfoy was standing there with his wife ...
6,36,2,32,"His hair was receding somewhat, which emphasiz..."
6,36,2,33,The new boy resembled Draco as much as Albus r...
6,36,2,34,"Draco caught sight of Harry, Ron, Hermione, an..."
6,36,2,35,"""So that's little Scorpius,"" said Ron under hi..."
6,36,2,36,"""Make sure you beat him in every test, Rosie."
6,36,2,37,"Thank God you inherited your mother's brains."""
6,36,2,38,"""Ron, for heaven's sake,"" said Hermione, half ..."
6,36,2,39,"""Don't try to turn them against each other bef..."
6,36,2,40,"""You're right, sorry,"" said Ron, but unable to..."


## Sentences to Tokens with POS tagging

In [21]:
#     tokens = sents.sent_str.str.split(token_pat, expand=True)\
tokens = sents.sent_str\
    .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))\
    .stack()\
    .to_frame()\
    .rename(columns={0:'pos_tuple'})
tokens.index.names = OHCO
tokens['pos'] = tokens.pos_tuple.apply(lambda x: x[1])
tokens['token_str'] = tokens.pos_tuple.apply(lambda x: x[0])
tokens = tokens.drop('pos_tuple', 1)
# del(sents)

In [22]:
tokens.shape

(1395395, 2)

In [23]:
tokens

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,token_str
book_num,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,0,0,NN,CHAPTER
0,0,0,0,1,CD,ONE
0,0,1,0,0,DT,THE
0,0,1,0,1,NNP,BOY
0,0,1,0,2,NNP,WHO
0,0,1,0,3,NNP,LIVED
0,0,2,0,0,NNP,Mr.
0,0,2,0,1,CC,and
0,0,2,0,2,NNP,Mrs.
0,0,2,0,3,NNP,Dursley


## Tag punctuation and numbers

In [24]:
tokens['punc'] = tokens.token_str.str.match(r'^[\W_]*$').astype('int')
tokens['num'] = tokens.token_str.str.match(r'^.*\d.*$').astype('int')

## Extract vocab with minimal normalization

In [25]:
WORDS = (tokens.punc == 0) & (tokens.num == 0)
tokens.loc[WORDS, 'term_str'] = tokens.token_str.str.lower()\
    .str.replace(r'["_*.]', '')
vocab = tokens[tokens.punc == 0].term_str.value_counts().to_frame()\
    .reset_index()\
    .rename(columns={'index':'term_str', 'term_str':'n'})
vocab = vocab.sort_values('term_str').reset_index(drop=True)
vocab.index.name = 'term_id'

In [26]:
vocab.shape

(23716, 2)

## Get priors for Vocab

In [27]:
vocab['p'] = vocab.n / vocab.n.sum()

## Add stems

In [28]:
stemmer = nltk.stem.porter.PorterStemmer()
vocab['port_stem'] = vocab.term_str.apply(lambda x: stemmer.stem(x))

## Define stopwords

In [29]:
stopwords = set(nltk.corpus.stopwords.words('english') + extra_stopwords)

In [30]:
# stopwords

In [31]:
sw = pd.DataFrame({'x':1}, index=stopwords)
vocab['stop'] = vocab.term_str.map(sw.x).fillna(0).astype('int')
# del(sw)

## Add term_ids to Tokens 

In [32]:
tokens['term_id'] = tokens['term_str'].map(vocab.reset_index()\
    .set_index('term_str').term_id).fillna(-1).astype('int')

# Save

In [33]:
with sqlite3.connect(db_file) as db:
    tokens.to_sql('token', db, if_exists='replace', index=True)
    vocab.to_sql('vocab', db, if_exists='replace', index=True)

In [34]:
# END