# Module 3: Infer Language Models TEST

* DS 6001
* Raf Alvarado

We now create a series of language models and evaluate them.

# Set Up

## Configure

In [3]:
# Paths to the two token tables (one CSV per Austen novel), relative to this notebook.
# Presumably these files are produced by the Module 2 notebook -- confirm they exist before running.
text_file1 = '../MOD02--TextModels/austen-persuasion.csv'
text_file2 = '../MOD02--TextModels/austen-sense.csv'

In [4]:
# Columns that together identify a single token, listed from the coarsest unit (book)
# to the finest (token position). They become the index of the combined token table.
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']

## Import libraries

In [5]:
import pandas as pd
import numpy as np
%matplotlib inline

# Import and combine texts

In [6]:
# Load one token table per novel.
# NOTE(review): read_csv turns strings such as "NA", "null" or "nan" into NaN by default,
# and rows with NaN are removed later by dropna(); confirm no genuine tokens are lost that way.
text1 = pd.read_csv(text_file1)
text2 = pd.read_csv(text_file2)

In [7]:
text1.head()

Unnamed: 0,chap_num,para_num,sent_num,token_num,token_str
0,1,1,0,0,Sir
1,1,1,0,1,Walter
2,1,1,0,2,"Elliot,"
3,1,1,0,3,of
4,1,1,0,4,Kellynch


In [8]:
# Tag every row with a book identifier so the two novels stay distinguishable once combined
# (1 = the table read from text_file1, 2 = the table read from text_file2).
text1['book_id'] = 1
text2['book_id'] = 2

In [9]:
text1.head()

Unnamed: 0,chap_num,para_num,sent_num,token_num,token_str,book_id
0,1,1,0,0,Sir,1
1,1,1,0,1,Walter,1
2,1,1,0,2,"Elliot,",1
3,1,1,0,3,of,1
4,1,1,0,4,Kellynch,1


In [24]:
# Stack both books into a single table and discard every row that contains a missing value.
tokens = pd.concat([text1, text2]).dropna()

In [25]:
# Index the combined table by the OHCO columns so that each row is addressed by its
# (book, chapter, paragraph, sentence, token) position.
tokens = tokens.set_index(OHCO)

In [27]:
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1
1,1,1,0,0,Sir
1,1,1,0,1,Walter
1,1,1,0,2,"Elliot,"
1,1,1,0,3,of
1,1,1,0,4,Kellynch


In [45]:
tokens.token_str.value_counts().sort_index()

"                  1592
"'Tis                 1
"'Twill               1
"--                  22
"--As                 1
"--For                1
"--She                2
"--and                1
"--but                1
"--cried              2
"--he                 2
"--hesitatingly       1
"--in                 1
"--reciprocal         1
"--said               2
"--she                1
"--was                2
"A                   19
"About                3
"Add                  1
"Ah                  11
"Ah,                  1
"All                  3
"Almost               1
"Altered              1
"And                 56
"And--were            1
"Anne,                1
"Anne,"               1
"Another              2
                   ... 
you--all              1
you--and              1
you--sorry            1
you--what,            1
you:                  1
young               177
young,                9
younger              13
youngest              8
youngest,             1
youngest--to    

# Create a vocabulary

In [28]:
# Normalize each raw token into a term: lower-case it, then strip every non-word
# character and underscore. Tokens that consist only of punctuation become the
# empty string. `regex=True` is passed explicitly because pandas >= 2.0 treats
# the pattern as a literal string by default, which would leave all punctuation
# in place.
tokens['term_str'] = tokens['token_str'].str.lower().str.replace(r'[\W_]', '', regex=True)

In [37]:
# Build the vocabulary: one row per distinct term with its corpus frequency `n`,
# sorted alphabetically by term. The row label (`term_id`) is the term's frequency
# rank, since the labels are assigned before sorting, while rows are still in
# descending-count order.
# The index and the counts are named explicitly instead of renaming the columns
# that reset_index() happens to produce, because those default names differ
# between pandas versions (`index`/`term_str` before 2.0, `term_str`/`count` after).
vocab = tokens['term_str'].value_counts()\
    .rename_axis('term_str')\
    .rename('n')\
    .reset_index()\
    .sort_values('term_str')
vocab.index.name = 'term_id'

In [38]:
vocab.head()

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
19,,1758
3998,1.0,3
6681,15.0,1
7814,16.0,1
8700,1760.0,1


In [16]:
vocab.sample(5)

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
7596,eyeswill,1
351,else,71
8119,judicious,1
84,how,353
2340,punishment,6


# Create Unigram Model

In [17]:
# Unigram model: each term's share of all token occurrences, plus its base-2 logarithm.
n_tokens = vocab['n'].sum()
vocab['p'] = vocab['n'].div(n_tokens)
vocab['log_p'] = np.log2(vocab['p'])

In [18]:
n_tokens

203937

In [19]:
vocab.sort_values('p', ascending=False).head(10)

Unnamed: 0_level_0,term_str,n,p,log_p
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,the,7421,0.036389,-4.780366
1,to,6872,0.033697,-4.89125
2,and,6227,0.030534,-5.033442
3,of,6136,0.030088,-5.054681
4,her,3731,0.018295,-5.772417
5,a,3636,0.017829,-5.809628
6,in,3316,0.01626,-5.942536
7,was,3182,0.015603,-6.002046
8,i,3074,0.015073,-6.051863
9,it,2755,0.013509,-6.209927


In [20]:
# Probability assigned to words that are not in the vocabulary: the smallest
# probability of any known term.
smooth = vocab['p'].min()
def predict_sentence(sent_str):
    """Print the unigram-model probability of a sentence.

    The sentence is split on whitespace and each word is normalized the same
    way the vocabulary terms were (lower-cased, non-word characters and
    underscores removed), so that punctuation attached to a word does not make
    it look like an unseen term. Each word is then looked up in ``vocab``;
    words that are not found receive the probability ``smooth``.

    Parameters
    ----------
    sent_str : str
        The sentence to score.

    Returns
    -------
    None
        The joint probability (product of the word probabilities), its base-2
        log (sum of the word log-probabilities) and the per-word table are
        printed rather than returned.
    """
    # dtype=str keeps the .str accessor usable even when the sentence is empty
    terms = pd.Series(sent_str.lower().split(), dtype=str)\
        .str.replace(r'[\W_]', '', regex=True)
    # A left merge keeps the words in sentence order and leaves NaN for unknown words
    sent_tokens = terms.to_frame('term_str').merge(vocab, on='term_str', how='left')
    unseen = sent_tokens['p'].isna()
    sent_tokens.loc[unseen, ['p', 'log_p']] = [smooth, np.log2(smooth)]
    p = sent_tokens['p'].product()
    log_p = sent_tokens['log_p'].sum()
    print('-' * 80)
    print("p('{}') = {}; log2: {}".format(sent_str, p, log_p))
    print('-' * 80)
    print(sent_tokens)
    print('-' * 80)

In [21]:
# Score a handful of short sentences; 'cars' is an example of a word that does not
# occur in the corpus and therefore receives the smoothing probability.
predict_sentence('I love you')
predict_sentence('I love cars')
predict_sentence("I want to")
predict_sentence("anne said to")
predict_sentence("said to her")
predict_sentence('she said')

--------------------------------------------------------------------------------
p('I love you') = 7.602946516155411e-08; log2: -23.648866116803514
--------------------------------------------------------------------------------
  term_str     n         p      log_p
0        i  3074  0.015073  -6.051863
1     love   117  0.000574 -10.767399
2      you  1793  0.008792  -6.829604
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
p('I love cars') = 4.24034942339956e-11; log2: -34.45702588967189
--------------------------------------------------------------------------------
  term_str       n         p      log_p
0        i  3074.0  0.015073  -6.051863
1     love   117.0  0.000574 -10.767399
2     cars     NaN  0.000005 -17.637764
--------------------------------------------------------------------------------
---------------------------------------------------------------------

# Build N-Gram models

The function below builds one n-gram count table for every order from 1 up to the length specified.

In [23]:
def get_ngrams(tokens, n=2):
    """Build n-gram count tables for every order from 1 up to ``n``.

    Parameters
    ----------
    tokens : pandas.DataFrame
        Token table with a ``term_str`` column. Its index levels identify each
        token and must include a ``token_num`` level holding the token's
        position within the unit identified by the remaining levels. Two
        tokens are treated as adjacent when all other levels are equal and
        their ``token_num`` values differ by one.
    n : int, default 2
        Highest n-gram order to build.

    Returns
    -------
    list of pandas.DataFrame
        ``n`` tables. Table ``i`` holds the (i+1)-grams, indexed by the levels
        ``w0`` .. ``wi`` with a single column ``n`` containing the count.
        Positions that run past the last token of a unit are filled with the
        marker ``'<s>'``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be at least 1')

    # The key columns come from the table's own index rather than a global, so
    # the function works on any token table indexed the same way.
    keys = list(tokens.index.names)

    # Move the index into columns so that token_num can be shifted
    base = tokens['term_str'].reset_index()

    # X[i] holds the key columns plus the word columns w0..wi
    X = [base.rename(columns={'term_str': 'w0'})]

    for i in range(1, n):
        # The first word of each (i+1)-gram: every token moved i positions forward
        left = base.rename(columns={'term_str': 'w0'})
        left['token_num'] = left['token_num'] + i

        # The remaining words come from the previous order, shifted one slot to
        # the right. Naming the columns explicitly avoids the merge-suffix
        # collisions that otherwise occur from n = 4 onwards.
        shift = {'w{}'.format(j): 'w{}'.format(j + 1) for j in range(i)}
        right = X[i - 1].rename(columns=shift)

        merged = left.merge(right, on=keys, how='left')
        words = ['w{}'.format(j) for j in range(i + 1)]
        merged[words] = merged[words].fillna('<s>')
        X.append(merged)

    # Compress each table to its unique n-grams with counts
    for i in range(n):
        words = ['w{}'.format(j) for j in range(i + 1)]
        X[i] = X[i].groupby(words).size().to_frame('n')

    # Return just the ngram tables
    return X

## Generate three models

Unigram, bigram, and trigram

In [24]:
# Build the unigram (m1), bigram (m2) and trigram (m3) count tables in one call.
m1, m2, m3 = get_ngrams(tokens, n=3)

In [29]:
m3.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
w0,w1,w2,Unnamed: 3_level_1
even,old,ugly,1
superior,creature,and,1
bowing,to,the,1
ferrarss,name,by,1
exercise,which,called,1
a,reverie,of,2
these,four,months,1
lively,pain,as,1
take,my,oath,1
men,that,ever,1


## Compute joint probabilities

In [30]:
# Joint probability of each n-gram: its count as a share of all n-grams of the same order.
m1['p'] = m1['n'].div(m1['n'].sum())
m2['p'] = m2['n'].div(m2['n'].sum())
m3['p'] = m3['n'].div(m3['n'].sum())

In [31]:
m1.sort_values('p', ascending=False).head()

Unnamed: 0_level_0,n,p
w0,Unnamed: 1_level_1,Unnamed: 2_level_1
the,7421,0.036389
to,6872,0.033697
and,6227,0.030534
of,6136,0.030088
her,3731,0.018295


In [32]:
m2.sort_values('p', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p
w0,w1,Unnamed: 2_level_1,Unnamed: 3_level_1
,<s>,1388,0.006806
of,the,856,0.004197
to,be,812,0.003982
in,the,679,0.003329
mrs,<s>,529,0.002594


In [37]:
m3.sort_values('p', ascending=False).head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,p
w0,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1
,<s>,<s>,1388,0.006806
mrs,<s>,<s>,529,0.002594
it,<s>,<s>,352,0.001726
her,<s>,<s>,236,0.001157
him,<s>,<s>,216,0.001059
mr,<s>,<s>,179,0.000878
them,<s>,<s>,157,0.00077
you,<s>,<s>,151,0.00074
me,<s>,<s>,145,0.000711
i,am,sure,106,0.00052


## Compute conditional probabilities

$p(w_1|w_0) = p(w_0, w_1) / p(w_0)$

$p(w_2|w_0,w_1) = p(w_0, w_1, w_2) / p(w_0, w_1)$

In [38]:
# Conditional probabilities p(w1 | w0) as a wide table: one row per w0, one column
# per w1. Bigram counts are pivoted, unseen pairs get 0, and every row is divided
# by its own total. The division is vectorized rather than applied row by row
# with a Python lambda, which gives the same values much faster.
m2m = m2['n'].unstack().fillna(0)
m2m = m2m.div(m2m.sum(axis=1), axis=0)

In [45]:
# Conditional probabilities p(w2 | w0, w1) as a wide table: one row per (w0, w1)
# pair, one column per w2. Trigram counts are pivoted, unseen continuations get 0,
# and every row is divided by its own total. This table is far larger than the
# bigram one, so the division is vectorized rather than applied row by row with
# a Python lambda.
m3m = m3['n'].unstack().fillna(0)
m3m = m3m.div(m3m.sum(axis=1), axis=0)

# Explore

In [47]:
m2m.loc[['he','she','it','anne','wentworth'], ['is','had','was','felt','thought','looked','said','saw']].style.background_gradient(cmap='Greens')

w1,is,had,was,felt,thought,looked,said,saw
w0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
he,0.0569898,0.146615,0.121286,0.00535801,0.00487092,0.00876766,0.0165611,0.00535801
she,0.0240876,0.15,0.134672,0.0182482,0.00985401,0.00510949,0.0105839,0.0160584
it,0.0940109,0.0228675,0.177858,0.000362976,0.000362976,0.000362976,0.00145191,0.0
anne,0.00443459,0.0842572,0.104213,0.0199557,0.00443459,0.00221729,0.00443459,0.00665188
wentworth,0.0157068,0.0418848,0.0942408,0.0,0.0,0.0052356,0.0104712,0.0052356


In [48]:
m2m.loc[['he','she'],['felt','said']].style.background_gradient(cmap='Greens')

w1,felt,said
w0,Unnamed: 1_level_1,Unnamed: 2_level_1
he,0.00535801,0.0165611
she,0.0182482,0.0105839


# Generate Text

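We use a simple form of "stupid back-off" to account for missing n-grams: when the two most recent words were never seen as a trigram context, the next word is drawn from the bigram model instead, or from the unigram model when the last word is the sentence-boundary marker `<s>`.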

In [50]:
def generate_text(start_word='she', n=250, seed=None):
    """Generate and print random text from the n-gram models.

    Starting from ``start_word``, ``n`` further words are sampled one at a
    time. Each word is drawn from the trigram table ``m3m`` using the two most
    recent words as context. If that context is not in the table (or only one
    word exists so far), the bigram table ``m2m`` is used with the most recent
    word. If that word is the sentence-boundary marker ``'<s>'`` or is not a
    row of the bigram table, the word is drawn from the unigram model ``m1``.

    Parameters
    ----------
    start_word : str, default 'she'
        First word of the generated text.
    n : int, default 250
        Number of words to generate after ``start_word``.
    seed : int, optional
        Seed for a private random number generator, making the output
        reproducible. When omitted, sampling uses NumPy's global random state,
        exactly as if no seed parameter existed.

    Returns
    -------
    None
        The text is printed in upper case, with each ``' <s> <s>'`` run
        replaced by a full stop and a final full stop appended.
    """
    # One generator object shared by all draws, so successive samples differ
    rng = None if seed is None else np.random.RandomState(seed)

    def draw(dist):
        # Pick one label from a probability Series, weighted by its own values
        return dist.sample(weights=dist, random_state=rng).index.values[0]

    words = [start_word]
    for _ in range(n):
        next_word = None

        # Preferred model: trigram, conditioned on the last two words.
        # The trailing ':' makes the tuple an unambiguous row key.
        if len(words) > 1:
            try:
                next_word = draw(m3m.loc[tuple(words[-2:]), :])
            except KeyError:
                next_word = None

        # Back off to the bigram model, then to the unigram model
        if next_word is None:
            last = words[-1]
            try:
                if last == '<s>':
                    raise KeyError(last)
                next_word = draw(m2m.loc[last, :])
            except KeyError:
                next_word = draw(m1['p'])

        words.append(next_word)

    text = ' '.join(words)
    text = text.replace(' <s> <s>', '.') + '.'
    text = text.upper() # To give that telegraph message look :-)
    print(text)

In [52]:
generate_text('the')

THE CRIME BECAUSE HAD ANY PRECISE LIMITS WAS INSTANTLY DISCOVERED TO BE UNREASONABLY DISCONTENTED WHEN A GALE CAME ON TUESDAY AND EVEN TO SOURNESS IN HER OPINION OF HERSELF AND DECLARE AN AFFECTION FOR ME. THEY HAD NOT THE RECOLLECTION THAT HE NEITHER EXPECTED NOR WISHED TO THE RELATION OF HER PEN WERE PROOFS ENOUGH OF HER MOTHERS SERVANT ON HEARING LUCYS MESSAGE. OR ANY OF HER FATHER CONTRASTED WITH SOME LARGE BOOKS BEFORE HIM MR. SUCCESSIVELY AND HE WANTS TO BE SOON BRINGING THEM TOGETHER AGAIN. CAN BE OF THE WHOLE LIST OF LADY RUSSELLS DOING THAT SHE HAD A NOTION SAID LUCY RETURNING AFTER A MOMENTS PAUSE HE SAID THOUGH I HAVE AN OBJECT TO MARRY IN SIX MONTHS OR EVEN TO KEEP AWAY FROM HER LYINGIN FOR I WAS AT LEAST SO LATELY IT HAD SEEMED TO HER DISTRESS. HALF A SMILE SMILED ALL THE REST OF THE MATTER WAS INDISPOSED. ENOUGH INCREASED BY OTHER COMPANY. REMOVED HE NOW RECKONS AS NOTHING. HIS VOICE AS I CAN EASILY BELIEVE TO DOUBT. THE SAME ROOM. LEAVE THE HOUSE. HIM. WHEN HE HEARS OF. 

In [53]:
generate_text('she')

SHE IS PRESENT. WHICH THE COLONEL WILL LEAVE ME IN HIS CIRCUMSTANCES ARE NOW AT WORK THEY WERE TO LEAVE SUSSEX. EVER ATTACH HER. AND LADIES EVIDENTLY HIS ACQUAINTANCE AND THEIR MARRIAGE INSTEAD OF STAYING THREE OR FOUR VERY BROAD STARES. WISHES. FOOTING. HE COULD NOT BEAR TO HAVE IT ON VERY EASY TERMS BELONGING TO IT AGAIN. DALRYMPLE TO REQUEST HER ASSISTANCE. HAVE SUPPOSED THAT HE TALKED WELL PROFESSED GOOD OPINIONS SEEMED TO REANIMATE TOWARDS THEM. AS SOON AS THE VERY BEST TONED PIANOFORTE I EVER OWN. FOR SOME DAYS IN A LOW VOICE ABOUT HER. ARE EXORBITANT. OF LIFE AND SPIRITS GOOD. ESCAPE THE SOLITARINESS AND THE FEW OCCASIONS OF ITS BEING FARTHER AUGMENTED HEREAFTER. THAN BY HIS FRIEND HAD RECOVERED HERSELF TO SPEAK THE OCCASION. ALL EXCUSE THE LIBERTY THE QUIET POSSESSION OF HER FATIGUE AND MADE HER OFTEN DEFICIENT IN UNDERSTANDING AND HIS SONS SON A STEADY RESPECTABLE YOUNG MAN IS REFUSED TILL HE WAS TO BE VISITING IN WESTGATE BUILDINGS AS ANNE VERY MUCH FOR MARIANNE. LEST SHE MIG