# Overview

We now create a series of langage models and evaluate them.

# Define Functions

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

In [0]:
pd.__version__

'0.22.0'

In [0]:
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

def text_to_tokens(src_file,
                   body_start=0, 
                   body_end=-1, 
                   chap_pat=r'^\s*Chapter.*$', 
                   para_pat=r'\n\n+', 
                   sent_pat=r'([.;?!"“”]+)', 
                   token_pat=r'([\W_]+)'):

    # Text to lines
    lines = open(src_file, 'r', encoding='utf-8').readlines()
    lines = lines[body_start - 1 : body_end + 1]
    df = pd.DataFrame({'line_str':lines})
    df.index.name = 'line_id'
    del(lines)

    # Lines to Chapters
    mask = df.line_str.str.match(chap_pat)
    df.loc[mask, 'chap_id'] = df.apply(lambda x: x.name, 1)
    df.chap_id = df.chap_id.ffill().astype('int')
    chap_ids = df.chap_id.unique().tolist()
    df['chap_num'] = df.chap_id.apply(lambda x: chap_ids.index(x))
    chaps = df.groupby('chap_num')\
        .apply(lambda x: ''.join(x.line_str))\
        .to_frame()\
        .rename(columns={0:'chap_str'})
    del(df)

    # Chapters to Paragraphs
    paras = chaps.chap_str.str.split(para_pat, expand=True)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'para_str'})
    paras.index.names = OHCO[:2] #['chap_num', 'para_num']
    paras.para_str = paras.para_str.str.strip()
    paras.para_str = paras.para_str.str.replace(r'\n', ' ')
    paras.para_str = paras.para_str.str.replace(r'\s+', ' ')
    paras = paras[~paras.para_str.str.match(r'^\s*$')]
    del(chaps)

    # Paragraphs to Sentences
    sents = paras.para_str.str.split(sent_pat, expand=True)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'sent_str'})
    sents.index.names = OHCO[:3] #['chap_num', 'para_num', 'sent_num']
    del(paras)

    # Sentences to Tokens
    tokens = sents.sent_str.str.split(token_pat, expand=True)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'token_str'})
    tokens.index.names = OHCO #['chap_num', 'para_num', 'sent_num', 'token_num']
    del(sents)

    # Tag punction
    tokens['punc'] = tokens.token_str.str.match(r'^[\W_]*$').astype('int')
    tokens['num'] = tokens.token_str.str.match(r'\d').astype('int')
    
    # Extract vocab
    WORDS = (tokens.punc == 0) & (tokens.num == 0)
    tokens.loc[WORDS, 'term_str'] = tokens.token_str.str.lower()
    vocab = tokens[tokens.punc == 0].term_str.value_counts().to_frame()\
        .reset_index()\
        .rename(columns={'index':'term_str', 'term_str':'n'})
    vocab = vocab.sort_values('term_str').reset_index()
    vocab.index.name = 'term_id'
    vocab = vocab.drop('index', 1)
        
    # Add term_ids to tokens 
    tokens['term_id'] = tokens['term_str'].map(vocab.reset_index()\
        .set_index('term_str').term_id).fillna(-1).astype('int')

    return tokens, vocab

def get_docs(tokens, div_names, doc_str = 'term_id', sep='', flatten=False, 
             index_only=False):
    
    if not index_only:
        docs = tokens.groupby(div_names)[doc_str]\
          .apply(lambda x: x.str.cat(sep=sep))
        docs.columns = ['doc_content']
    else:
        docs = tokens.groupby(div_names)[doc_str].apply(lambda x: x.tolist())

    if flatten:
        docs = docs.reset_index().drop(div_names, 1)
    
    return docs

def get_term_id(vocab, term_str):
    return vocab[vocab.term_str == term_str].index[0]

def get_term_str(vocab, term_id):
    return vocab.loc[term_id].term_str

# Import Tokens

In [0]:
src_file = '2701-0.txt'

In [0]:
import os
if not os.path.exists(src_file):
  import requests
  with open(src_file, 'w', encoding='utf-8') as src_file_on_disk:
    src_file_url = 'https://www.gutenberg.org/files/2701/2701-0.txt'
    src_file = requests.get(src_file_url).text
    src_file_on_disk.write(src_file)

In [0]:
cfg = dict(
    src_file = src_file,
    body_start = 341,
    body_end = 21964,
    chap_pat = r'^\s*(?:CHAPTER|ETYMOLOGY|Epilogue).*$'
)
K, V = text_to_tokens(**cfg)

In [0]:
V.head()

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,a,4737
1,aback,2
2,abaft,2
3,abandon,3
4,abandoned,7


In [0]:
K.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str,punc,num,term_str,term_id
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
130,9,18,31,,1,0,,-1
12,7,4,20,own,0,0,own,10299
109,18,12,27,,1,0,,-1
0,77,4,29,,1,0,,-1
8,4,8,21,,1,0,,-1


# Buld N-Gram models

## Create training and test sets from K

In [0]:
G = K.groupby(OHCO[:3])\
  .apply(lambda x: np.random.choice(['train', 'test'], p=[.8, .2]))\
  .to_frame().rename(columns={0:'group'})

In [0]:
K = pd.merge(K.reset_index(), G.reset_index(), on=OHCO[:3], how='left')
K = K.set_index(OHCO, drop=True)

In [0]:
K.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str,punc,num,term_str,term_id,group
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,0,0,ETYMOLOGY,0,0,etymology,5005,train
0,0,1,0,,1,0,,-1,test
0,0,1,1,.,1,0,,-1,test
0,0,1,2,,1,0,,-1,test
0,0,2,0,,1,0,,-1,train


In [0]:
TRAIN = K.groupby('group').get_group('train')
TEST = K.groupby('group').get_group('test')

## Create n-gram tables

### Define function

In [0]:
def get_ngrams(tokens, n=2):
    
    # Create list to store copies of tokens table
    X = []
    
    # Add tokens without punc to list
    # Note: we assume that tokens has an OHC) multiindex
    X.append(tokens.loc[tokens.punc==0, 'term_str'].reset_index())
    
    # Normalize the sequence number for token numbers for offsetting operation
    # Note: we assume that punc removal leaves a number series with regular gaps
    X[0]['token_num'] = (X[0]['token_num'] / 2) 
    X[0]['token_num'] = X[0]['token_num'].astype('int')
    
    # Create copies of token table for each level of ngram, offset by 1, and 
    # merge with previous
    IDX = ['chap_num', 'para_num', 'sent_num', 'token_num'] 
    for i in range(1, n):
        X.append(X[0].copy())
        X[i]['token_num'] = X[i]['token_num'] + i
        X[i] = X[i].merge(X[i-1], on=IDX, how='left', sort=True).fillna('<s>')
        
    # Compress tables to unique ngrams with counts
    for i in range(0, n):
        X[i] = X[i].drop(IDX, 1)
        cols = X[i].columns.tolist()
        X[i]['n'] = 0
        X[i] = X[i].groupby(cols).n.apply(lambda x: x.count()).to_frame()
        X[i].index.names = ['w{}'.format(j) for j in range(i+1)]
    
    # Return just the ngram tables
    return X

### Apply function to training and test sets

In [0]:
UGM, BGM, TGM = get_ngrams(TRAIN, n=3)
UGT, BGT, TGT = get_ngrams(TEST, n=3)

### Align training and test tables

Here we make sure that the traing and test tables have the same ngrams, and add 1 to cases where a value was absent.

In [0]:
def align_model(ngm, ngt):
  idx = ngm.index.names
  ngm = pd.merge(ngm.reset_index(), ngt.reset_index(), on=idx, how='outer').fillna(1).set_index(idx)
  ngm = ngm.rename(columns={'n_x':'n'})
  ngm = ngm.drop('n_y', 1)
  return ngm

In [0]:
UGM = align_model(UGM, UGT)

In [0]:

BGM = align_model(BGM, BGT)

In [0]:

TGM = align_model(TGM, TGT)

## Infer probabilities for training set

### Define function 

In [0]:
def infer_probs(ngm):
    if len(ngm.index.names) > 1:
        ngm['p'] = ngm.groupby(ngm.index.names[:-1]).n\
            .apply(lambda x: x / x.sum())\
            .to_frame().rename(columns={'n':'p'})
    else:
        ngm['p'] = ngm['n'] / ngm['n'].sum()
    ngm['logp'] = np.log2(ngm['p'])
    ngm['h'] = ngm.logp * ngm.p * -1
    return ngm

### Apply function

In [0]:
UGM = infer_probs(UGM)

In [0]:
BGM = infer_probs(BGM)

In [0]:
TGM = infer_probs(TGM)

### View results

In [0]:
TGT.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
w0,w1,w2,Unnamed: 3_level_1
<s>,<s>,<s>,23
<s>,<s>,but,1
<s>,<s>,elephants,1
<s>,<s>,firkins,1
<s>,<s>,have,1


In [0]:
TGM.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,p,logp,h
w0,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
<s>,<s>,<s>,146.0,0.895706,-0.158904,0.142331
<s>,<s>,barrels,2.0,0.01227,-6.348728,0.077899
<s>,<s>,but,1.0,0.006135,-7.348728,0.045084
<s>,<s>,elephants,1.0,0.006135,-7.348728,0.045084
<s>,<s>,firkins,1.0,0.006135,-7.348728,0.045084


## Compute performance of models

### Define function

We use the following formula for perplexity, where ***b*** = 2. 

![alt text](http://ontoligent.com/images/perplexity-formula.png)

In [0]:
def perplexity(ngm, ngt):
    pp = np.exp2(-(ngm['logp'] * ngt['n']).sum() / ngt['n'].sum())
    return round(pp, 2)

### Apply function

In [0]:
ppu = perplexity(UGM, UGT)
ppb = perplexity(BGM, BGT)
ppt = perplexity(TGM, TGT)

### View results

In [0]:
ppu, ppb, ppt

(1041.61, 51.81, 3.33)

# Generate Text

In [0]:
test = ''
n = 500

TGM = TGM.sort_index()

idx = TGM.index.names

tg = TGM.sample().reset_index()[idx].values.tolist()[0]
test += ' '.join(tg) + ' ...'

for i in range(n):
    key = tuple(tg[1:])
    weights = TGM.loc[key, 'p']
    w2 = TGM.loc[key].sample(weights=weights)\
        .reset_index()[idx[-1]].values.tolist()[0]
    
    if w2 == '<s>':
        continue
    
    tg = tg[1:] + [w2]

    if i % 10 == 1:
        test += '\n'
    else:
        test += ' '
    test += w2
    
print(test)

brown and burnt ... making
his white ash breeze
