# Overview

We annotate our model, thus converting it from F2 to F3.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

# Import NLTK and download resources

If you need to install NLTK, see the [instructions here](https://www.nltk.org/install.html). You can also install this with Anaconda, like so:

`conda install nltk`

Once you have installed NLTK, you will need to download resources, which will happen when you run the following cell. If the interactive window opens, you may need to set your NLTK Data Directory, as described in the [instructions here](https://www.nltk.org/data.html). To set the directory, click on the File menu and select Change Download Directory. For central installation, set this to `C:\nltk_data` (Windows),`/usr/local/share/nltk_data` (Mac), or `/usr/share/nltk_data` (Unix). 

> If you did not install the data to one of the above central locations, you will need to set the NLTK_DATA environment variable to specify the location of the data. (On a Windows machine, right click on “My Computer” then select Properties > Advanced > Environment Variables > User Variables > New...)

In [2]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('tagsets')

[nltk_data] Downloading package punkt to /Users/rca2t/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rca2t/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/rca2t/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package tagsets to /Users/rca2t/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

# Import Tokens

In [3]:
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

def text_to_tokens(src_file,
                   body_start=0, 
                   body_end=-1, 
                   chap_pat=r'^\s*Chapter.*$', 
                   para_pat=r'\n\n+', 
                   sent_pat=r'([.;?!"“”]+)', 
                   token_pat=r'([\W_]+)'):

    # Text to lines
    lines = open(src_file, 'r', encoding='utf-8').readlines()
    lines = lines[body_start - 1 : body_end + 1]
    df = pd.DataFrame({'line_str':lines})
    df.index.name = 'line_id'
    del(lines)

    # Lines to Chapters
    mask = df.line_str.str.match(chap_pat)
    df.loc[mask, 'chap_id'] = df.apply(lambda x: x.name, 1)
    df.chap_id = df.chap_id.ffill().astype('int')
    chap_ids = df.chap_id.unique().tolist()
    df['chap_num'] = df.chap_id.apply(lambda x: chap_ids.index(x))
    chaps = df.groupby('chap_num')\
        .apply(lambda x: ''.join(x.line_str))\
        .to_frame()\
        .rename(columns={0:'chap_str'})
    del(df)

    # Chapters to Paragraphs
    paras = chaps.chap_str.str.split(para_pat, expand=True)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'para_str'})
    paras.index.names = OHCO[:2]
    paras.para_str = paras.para_str.str.strip()
    paras.para_str = paras.para_str.str.replace(r'\n', ' ')
    paras.para_str = paras.para_str.str.replace(r'\s+', ' ')
    paras = paras[~paras.para_str.str.match(r'^\s*$')]
    del(chaps)

    # Paragraphs to Sentences
    sents = paras.para_str.str.split(sent_pat, expand=True)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'sent_str'})
    sents.index.names = OHCO[:3]
    del(paras)

    # Sentences to Tokens
    tokens = sents.sent_str.str.split(token_pat, expand=True)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'token_str'})
    tokens.index.names = OHCO
    del(sents)

    # Tag punction
    tokens['punc'] = tokens.token_str.str.match(r'^[\W_]*$').astype('int')
    tokens['num'] = tokens.token_str.str.match(r'\d').astype('int')
    
    # Extract vocab
    WORDS = (tokens.punc == 0) & (tokens.num == 0)
    tokens.loc[WORDS, 'term_str'] = tokens.token_str.str.lower()
    vocab = tokens[tokens.punc == 0].term_str.value_counts().to_frame()\
        .reset_index()\
        .rename(columns={'index':'term_str', 'term_str':'n'})
    vocab = vocab.sort_values('term_str').reset_index()
    vocab.index.name = 'term_id'
    vocab = vocab.drop('index', 1)
        
    # Add term_ids to tokens 
    tokens['term_id'] = tokens['term_str'].map(vocab.reset_index()\
        .set_index('term_str').term_id).fillna(-1).astype('int')

    return tokens, vocab

def get_docs(tokens, div_names, doc_str = 'term_id', sep='', flatten=False, 
             index_only=False):
    
    if not index_only:
        docs = tokens.groupby(div_names)[doc_str]\
          .apply(lambda x: x.str.cat(sep=sep))
        docs.columns = ['doc_content']
    else:
        docs = tokens.groupby(div_names)[doc_str].apply(lambda x: x.tolist())

    if flatten:
        docs = docs.reset_index().drop(div_names, 1)
    
    return docs

def get_term_id(vocab, term_str):
    return vocab[vocab.term_str == term_str].index[0]

def get_term_str(vocab, term_id):
    return vocab.loc[term_id].term_str

In [4]:
src_file = '2701-0.txt'

In [5]:
import os
if not os.path.exists(src_file):
  import requests
  with open(src_file, 'w', encoding='utf-8') as src_file_on_disk:
    src_file_url = 'https://www.gutenberg.org/files/2701/2701-0.txt'
    src_file = requests.get(src_file_url).text
    src_file_on_disk.write(src_file)

In [6]:
cfg = dict(
    src_file = src_file,
    body_start = 341,
    body_end = 21964,
    chap_pat = r'^\s*(?:CHAPTER|ETYMOLOGY|Epilogue).*$'
)
K, V = text_to_tokens(**cfg)

In [7]:
V.head()

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,a,4737
1,aback,2
2,abaft,2
3,abandon,3
4,abandoned,7


In [8]:
K.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str,punc,num,term_str,term_id
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
119,3,6,7,,1,0,,-1
22,15,0,16,to,0,0,to,15073
107,4,2,53,-,1,0,,-1
133,18,4,3,,1,0,,-1
34,5,48,5,,1,0,,-1


# Add Stopwords to Vocab

We use NLTK's built in stopword list for English. Note that we can add and subtract from this list, or just create our own list and keep it in our data model.

In [9]:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

In [10]:
sw

Unnamed: 0_level_0,dummy
term_str,Unnamed: 1_level_1
i,1
me,1
my,1
myself,1
we,1
our,1
ours,1
ourselves,1
you,1
you're,1


In [11]:
V['stop'] = V.term_str.map(sw.dummy)
V['stop'] = V['stop'].fillna(0).astype('int')

In [12]:
V[V.stop == 1].sample(10)

Unnamed: 0_level_0,term_str,n,stop
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
14848,them,474,1
16917,your,249,1
4547,each,128,1
9926,o,44,1
4299,do,307,1
12532,s,1801,1
6950,him,1067,1
6880,her,332,1
7437,in,4173,1
755,as,1741,1


# Add Stems to Vocab

In [13]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
V['p_stem'] = V.term_str.apply(stemmer.stem)

In [14]:
V.sample(10)

Unnamed: 0_level_0,term_str,n,stop,p_stem
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15433,tusk,2,0,tusk
6267,gleig,1,0,gleig
559,antiquity,5,0,antiqu
10199,outblown,1,0,outblown
1034,baltic,3,0,baltic
14662,talks,1,0,talk
5115,executors,2,0,executor
146,acute,1,0,acut
12169,resolved,13,0,resolv
3803,deliciousness,2,0,delici


# Add POS to Tokens

This a token-level feature -- not a vocab featue

In [15]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [19]:
def add_pos(row):
    sent = list(row)
    if sent[0] == '':
        sent[0] = '~'
    try:
        pos_list = nltk.pos_tag(sent)
    except IndexError:
        pos_list = [('~','~')]
    pos = pd.Series(pos_list).to_frame().stack().reset_index()
    return pos

In [20]:
POS = K.groupby(OHCO[:3]).token_str.apply(lambda x: add_pos(x)).rename(columns={0:'pos'}).drop(['level_0','level_1'],1)

In [21]:
POS.index.names = OHCO

In [22]:
POS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
0,0,0,0,"(ETYMOLOGY, NN)"
0,0,1,0,"(~, ~)"
0,0,2,0,"(~, NN)"
0,1,0,0,"(~, NN)"
0,1,0,1,"((, ()"


In [23]:
K.loc[K.punc==0, 'pos_tuple'] = POS['pos']
K.loc[K.punc==0, 'pos'] = POS['pos'].apply(lambda x: x[1])

In [24]:
K.loc[K.punc==0].sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str,punc,num,term_str,term_id,pos_tuple,pos
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
106,6,6,30,idleness,0,0,idleness,7276,"(idleness, NN)",NN
20,3,4,14,of,0,0,of,10034,"(of, IN)",IN
42,3,14,36,imperial,0,0,imperial,7378,"(imperial, JJ)",JJ
36,33,36,46,rolls,0,0,rolls,12412,"(rolls, NNS)",NNS
135,38,12,4,he,0,0,he,6765,"(he, PRP)",PRP
42,28,4,18,prairies,0,0,prairies,11165,"(prairies, NNS)",NNS
27,9,32,36,continent,0,0,continent,3124,"(continent, NN)",NN
70,5,0,14,was,0,0,was,16389,"(was, VBD)",VBD
87,11,6,54,of,0,0,of,10034,"(of, IN)",IN
3,6,2,32,cheating,0,0,cheating,2373,"(cheating, NN)",NN


# See Results

In [25]:
V.head()

Unnamed: 0_level_0,term_str,n,stop,p_stem
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,a,4737,1,a
1,aback,2,0,aback
2,abaft,2,0,abaft
3,abandon,3,0,abandon
4,abandoned,7,0,abandon


In [26]:
K.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str,punc,num,term_str,term_id,pos_tuple,pos
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,0,ETYMOLOGY,0,0,etymology,5005,"(ETYMOLOGY, NN)",NN
0,0,1,0,,1,0,,-1,,
0,0,1,1,.,1,0,,-1,,
0,0,1,2,,1,0,,-1,,
0,0,2,0,,1,0,,-1,,
