# Overview

We now create a series of langage models and evaluate them.

# Define Functions

In [10]:
import pandas as pd
import numpy as np
import sqlite3
import textman as tx
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

In [11]:
pd.__version__

'0.23.4'

In [12]:
OHCO = ['book_num','chap_num', 'para_num', 'sent_num', 'token_num']
print([OHCO[:4]])

[['book_num', 'chap_num', 'para_num', 'sent_num']]


In [13]:
def text_to_tokens(src_file,
                   body_start=0, 
                   body_end=-1, 
                   chap_pat=r'^\s*Chapter.*$', 
                   para_pat=r'\n\n+', 
                   sent_pat=r'([.;?!"“”]+)', 
                   token_pat=r'([\W_]+)'):

    # Text to lines
    lines = open(src_file, 'r', encoding='utf-8').readlines()
    lines = lines[body_start - 1 : body_end + 1]
    df = pd.DataFrame({'line_str':lines})
    df.index.name = 'line_id'
    del(lines)

    # Lines to Chapters
    mask = df.line_str.str.match(chap_pat)
    df.loc[mask, 'chap_id'] = df.apply(lambda x: x.name, 1)
    df.chap_id = df.chap_id.ffill().astype('int')
    chap_ids = df.chap_id.unique().tolist()
    df['chap_num'] = df.chap_id.apply(lambda x: chap_ids.index(x))
    chaps = df.groupby('chap_num')\
        .apply(lambda x: ''.join(x.line_str))\
        .to_frame()\
        .rename(columns={0:'chap_str'})
    del(df)

    # Chapters to Paragraphs
    paras = chaps.chap_str.str.split(para_pat, expand=True)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'para_str'})
    paras.index.names = OHCO[:2] #['chap_num', 'para_num']
    paras.para_str = paras.para_str.str.strip()
    paras.para_str = paras.para_str.str.replace(r'\n', ' ')
    paras.para_str = paras.para_str.str.replace(r'\s+', ' ')
    paras = paras[~paras.para_str.str.match(r'^\s*$')]
    del(chaps)

    # Paragraphs to Sentences
    sents = paras.para_str.str.split(sent_pat, expand=True)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'sent_str'})
    sents.index.names = OHCO[:3] #['chap_num', 'para_num', 'sent_num']
    del(paras)

    # Sentences to Tokens
    tokens = sents.sent_str.str.split(token_pat, expand=True)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'token_str'})
    tokens.index.names = OHCO #['chap_num', 'para_num', 'sent_num', 'token_num']
    del(sents)

    # Tag punction
    tokens['punc'] = tokens.token_str.str.match(r'^[\W_]*$').astype('int')
    tokens['num'] = tokens.token_str.str.match(r'\d').astype('int')
    
    # Extract vocab
    WORDS = (tokens.punc == 0) & (tokens.num == 0)
    tokens.loc[WORDS, 'term_str'] = tokens.token_str.str.lower()
    vocab = tokens[tokens.punc == 0].term_str.value_counts().to_frame()\
        .reset_index()\
        .rename(columns={'index':'term_str', 'term_str':'n'})
    vocab = vocab.sort_values('term_str').reset_index()
    vocab.index.name = 'term_id'
    vocab = vocab.drop('index', 1)
        
    # Add term_ids to tokens 
    tokens['term_id'] = tokens['term_str'].map(vocab.reset_index()\
        .set_index('term_str').term_id).fillna(-1).astype('int')

    return tokens, vocab

def get_docs(tokens, div_names, doc_str = 'term_id', sep='', flatten=False, 
             index_only=False):
    
    if not index_only:
        docs = tokens.groupby(div_names)[doc_str]\
          .apply(lambda x: x.str.cat(sep=sep))
        docs.columns = ['doc_content']
    else:
        docs = tokens.groupby(div_names)[doc_str].apply(lambda x: x.tolist())

    if flatten:
        docs = docs.reset_index().drop(div_names, 1)
    
    return docs

def get_term_id(vocab, term_str):
    return vocab[vocab.term_str == term_str].index[0]

def get_term_str(vocab, term_id):
    return vocab.loc[term_id].term_str

# Import Tokens

In [14]:
max_words = 10000000
corpus_db = "/sfs/qumulo/qhome/sk5be/DS5559/HarryPotter.db"
sql = """
SELECT * FROM token 
WHERE term_id IN (
    SELECT term_id FROM vocab 
    WHERE stop = 0 
    AND term_str NOT IN ('said')
)
AND (pos NOT LIKE 'NNP%')
""".format(max_words)
sql_vocab = """
SELECT * FROM vocab """.format(max_words)

In [15]:
with sqlite3.connect(corpus_db) as db:
    tokens = pd.read_sql(sql,db)

In [16]:
with sqlite3.connect(corpus_db) as db:
    vocab = pd.read_sql(sql_vocab,db)

In [17]:
V = vocab.copy()
K = tokens.copy()

In [19]:
V.head()

Unnamed: 0,term_id,term_str,n,p,port_stem,stop,df,tf_sum,tf_mean,tf_max,tfidf_sum,tfidf_mean,tfidf_max,tfth_sum,tfth_mean,tfth_max,th_sum,th_mean,th_max,idf
0,0,''just,1,8.912283e-07,''just,0,1,0.000246,1e-06,0.000246,2.298853,0.011552,2.298853,7.236781e-07,3.636573e-09,7.234508e-07,0.002947,1.5e-05,0.002944,2.298853
1,1,''professor,1,8.912283e-07,''professor,0,1,0.000332,2e-06,0.000332,2.298853,0.011552,2.298853,1.273337e-06,6.398678e-09,1.273041e-06,0.003837,1.9e-05,0.003835,2.298853
2,2,''was,1,8.912283e-07,''wa,0,1,0.000465,2e-06,0.000464,2.298853,0.011552,2.298853,2.390064e-06,1.201037e-08,2.389668e-06,0.005145,2.6e-05,0.005143,2.298853
3,3,'a,52,4.634387e-05,'a,0,30,0.015691,7.9e-05,0.002221,42.730055,0.214724,4.108659,0.00261353,1.313332e-05,0.0003699704,0.166562,0.000837,0.019579,0.821732
4,4,'aaaaaah,1,8.912283e-07,'aaaaaah,0,1,0.000496,2e-06,0.000496,2.298853,0.011552,2.298853,2.697485e-06,1.35552e-08,2.697066e-06,0.005443,2.7e-05,0.00544,2.298853


In [20]:
K.sample(5)

Unnamed: 0,index,book_num,chap_num,para_num,sent_num,token_num,pos,token_str,punc,num,term_str,term_id,author,genre
434506,1087478,6,30,15,8,17,VB,speak,0,0,speak,19354,J.K.Rowling,Fantasy
87429,222993,2,10,67,4,2,NN,fellowship,0,0,fellowship,8205,J.K.Rowling,Fantasy
58635,150199,1,14,3,5,31,JJ,irritated,0,0,irritated,11536,J.K.Rowling,Fantasy
181343,457681,3,34,1,57,2,VBD,wanted,0,0,wanted,22828,J.K.Rowling,Fantasy
435016,1088761,6,30,17,15,33,VBN,charred,0,0,charred,4187,J.K.Rowling,Fantasy


# Buld N-Gram models

## Create training and test sets from K

In [16]:
G = K.groupby(OHCO[:4])\
  .apply(lambda x: np.random.choice(['train', 'test'], p=[.8, .2]))\
  .to_frame().rename(columns={0:'group'})

In [17]:
K = pd.merge(K.reset_index(), G.reset_index(), on=OHCO[:4], how='left')
K = K.set_index(OHCO, drop=True)

In [18]:
K.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,index,pos,token_str,punc,num,term_str,term_id,group
book_num,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,0,0,0,0,NN,CHAPTER,0,0,chapter,4164,test
0,0,2,0,6,1,NN,number,0,0,number,14134,test
0,0,2,0,7,2,CD,four,0,0,four,8850,test
0,0,2,0,13,3,JJ,proud,0,0,proud,16076,test
0,0,2,0,19,4,RB,perfectly,0,0,perfectly,15107,test


In [43]:
TRAIN = K.groupby('group').get_group('train')
TEST = K.groupby('group').get_group('test')

In [68]:
TRAIN_1 = TRAIN.query('book_num == "0"')
TEST_1 = TEST.query('book_num == "0"')
TRAIN_1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,index,pos,token_str,punc,num,term_str,term_id,group
book_num,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,2,1,3,7,JJ,last,0,0,last,12056,train
0,0,2,1,4,8,NNS,people,0,0,people,15072,train
0,0,2,1,6,9,MD,'d,0,0,'d,218,train
0,0,2,1,7,10,VB,expect,0,0,expect,7842,train
0,0,2,1,10,11,VBN,involved,0,0,involved,11491,train


## Create n-gram tables

### Define function

In [69]:
def get_ngrams(tokens, n=2):
    
    # Create list to store copies of tokens table
    X = []
    
    # Add tokens without punc to list
    # Note: we assume that tokens has an OHC) multiindex
    X.append(tokens.loc[tokens.punc==0, 'term_str'].reset_index())
    
    # Normalize the sequence number for token numbers for offsetting operation
    # Note: we assume that punc removal leaves a number series with regular gaps
    X[0]['token_num'] = (X[0]['token_num'] / 2) 
    X[0]['token_num'] = X[0]['token_num'].astype('int')
    
    # Create copies of token table for each level of ngram, offset by 1, and 
    # merge with previous
    IDX = ['book_num','chap_num', 'para_num', 'sent_num', 'token_num'] 
    for i in range(1, n):
        X.append(X[0].copy())
        X[i]['token_num'] = X[i]['token_num'] + i
        X[i] = X[i].merge(X[i-1], on=IDX, how='left', sort=True).fillna('<s>')
        
    # Compress tables to unique ngrams with counts
    for i in range(0, n):
        X[i] = X[i].drop(IDX, 1)
        cols = X[i].columns.tolist()
        X[i]['n'] = 0
        X[i] = X[i].groupby(cols).n.apply(lambda x: x.count()).to_frame()
        X[i].index.names = ['w{}'.format(j) for j in range(i+1)]
    
    # Return just the ngram tables
    return X

### Apply function to training and test sets

In [70]:
UGM, BGM, TGM = get_ngrams(TRAIN, n=3)
UGT, BGT, TGT = get_ngrams(TEST, n=3)

### Align training and test tables

Here we make sure that the traing and test tables have the same ngrams, and add 1 to cases where a value was absent.

In [72]:
def align_model(ngm, ngt):
  idx = ngm.index.names
  ngm = pd.merge(ngm.reset_index(), ngt.reset_index(), on=idx, how='outer').fillna(1).set_index(idx)
  ngm = ngm.rename(columns={'n_x':'n'})
  ngm = ngm.drop('n_y', 1)
  return ngm

In [73]:
UGM = align_model(UGM, UGT)

In [74]:

BGM = align_model(BGM, BGT)

In [75]:

TGM = align_model(TGM, TGT)

## Infer probabilities for training set

### Define function 

In [76]:
def infer_probs(ngm):
    if len(ngm.index.names) > 1:
        ngm['p'] = ngm.groupby(ngm.index.names[:-1]).n\
            .apply(lambda x: x / x.sum())\
            .to_frame().rename(columns={'n':'p'})
    else:
        ngm['p'] = ngm['n'] / ngm['n'].sum()
    ngm['logp'] = np.log2(ngm['p'])
    ngm['h'] = ngm.logp * ngm.p * -1
    return ngm

### Apply function

In [77]:
UGM = infer_probs(UGM)

In [78]:
BGM = infer_probs(BGM)

In [79]:
TGM = infer_probs(TGM)

### View results

In [80]:
TGT.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
w0,w1,w2,Unnamed: 3_level_1
''professor,<s>,<s>,1
''was,<s>,<s>,1
'a,<s>,<s>,4
'a,people,<s>,1
'abercrombie,<s>,<s>,1


In [81]:
TGM.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,p,logp,h
w0,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
''just,yelled,<s>,1.0,1.0,0.0,-0.0
'a,<s>,<s>,24.0,1.0,0.0,-0.0
'a,addacked,bud,1.0,1.0,0.0,-0.0
'a,bold,<s>,1.0,1.0,0.0,-0.0
'a,centaur,<s>,1.0,1.0,0.0,-0.0


## Compute performance of models

### Define function

We use the following formula for perplexity, where ***b*** = 2. 

![alt text](http://ontoligent.com/images/perplexity-formula.png)

In [82]:
def perplexity(ngm, ngt):
    pp = np.exp2(-(ngm['logp'] * ngt['n']).sum() / ngt['n'].sum())
    return round(pp, 2)

### Apply function

In [83]:
ppu = perplexity(UGM, UGT)
ppb = perplexity(BGM, BGT)
ppt = perplexity(TGM, TGT)

### View results

In [84]:
ppu, ppb, ppt

(3119.59, 15.65, 1.29)

# Generate Text

In [86]:
test = ''
n = 10000

TGM = TGM.sort_index()

idx = TGM.index.names

tg = TGM.sample().reset_index()[idx].values.tolist()[0]
test += ' '.join(tg) + ' ...'

for i in range(n):
    key = tuple(tg[1:])
    weights = TGM.loc[key, 'p']
    w2 = TGM.loc[key].sample(weights=weights)\
        .reset_index()[idx[-1]].values.tolist()[0]
    
    if w2 == '<s>':
        continue
    
    tg = tg[1:] + [w2]

    if i % 10 == 1:
        test += '\n'
    else:
        test += ' '
    test += w2
    
print(test)

's good months ... telling
