# Module 3: Infer Language Models

* DS 6001
* Raf Alvarado

We now create a series of language models and evaluate them.

# Configure

In [103]:
# Index levels, outermost to innermost, used to index the token table below.
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']
# Relative paths to the two token CSVs that are read in the import step.
text_file1 = '../MOD02--TextModels/austen-persuasion.csv'
text_file2 = '../MOD02--TextModels/austen-sense.csv'

# Set Up

In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

# Import and combine texts

In [145]:
# Read each token table, indexed by every OHCO level except the first
# (book_id), which is assigned separately in the next cell.
text1 = pd.read_csv(text_file1, index_col=OHCO[1:])
text2 = pd.read_csv(text_file2, index_col=OHCO[1:])

In [146]:
# Tag every row with a numeric book id so both tables can share one index
# (1 = text_file1, 2 = text_file2).
text1['book_id'] = 1
text2['book_id'] = 2

In [147]:
# Stack both books and drop rows containing missing values, then rebuild the
# index so that book_id becomes the outermost OHCO level.
tokens = pd.concat([text1, text2]).dropna()
tokens = tokens.reset_index().set_index(OHCO)

In [148]:
# Peek at the first rows of the combined token table.
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1
1,1,1,0,0,Sir
1,1,1,0,1,Walter
1,1,1,0,2,"Elliot,"
1,1,1,0,3,of
1,1,1,0,4,Kellynch


In [150]:
# NOTE(review): the saved output of this cell includes a `term_str` column,
# which is only created in the "Create a vocabulary" section further down.
# The cell was evidently run out of order; on a fresh Restart & Run All it
# will show `token_str` only.
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,1,0,0,Sir,sir
1,1,1,0,1,Walter,walter
1,1,1,0,2,"Elliot,",elliot
1,1,1,0,3,of,of
1,1,1,0,4,Kellynch,kellynch


# Create a vocabulary

In [151]:
# Normalize tokens into terms: lowercase, then strip every non-word character
# and underscore. `regex=True` is passed explicitly because pandas >= 2.0
# treats the pattern as a literal string by default, which would silently
# leave all punctuation in place.
tokens['term_str'] = tokens['token_str'].str.lower().str.replace(r'[\W_]', '', regex=True)

In [152]:
# Build the vocabulary: one row per distinct term with its corpus count `n`.
# Naming the axis and the count column explicitly (instead of renaming the
# default 'index'/'term_str' columns) gives the same table on every pandas
# version; pandas 2.0 changed the default names produced by value_counts().
# term_id is the term's frequency rank, assigned before the alphabetical sort.
vocab = tokens['term_str'].value_counts()\
    .rename_axis('term_str')\
    .reset_index(name='n')\
    .sort_values('term_str')
vocab.index.name = 'term_id'

In [153]:
# First rows in alphabetical order. In the saved output the first entry is an
# empty term_str -- presumably tokens that consisted only of characters
# removed by the normalization step (TODO confirm).
vocab.head()

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
19,,1758
3831,1.0,3
9060,15.0,1
8188,16.0,1
6656,1760.0,1


# Create Unigram Model

In [154]:
# Unigram model: relative frequency of each term and its base-2 logarithm.
n_tokens = vocab['n'].sum()
vocab['p'] = vocab['n'].div(n_tokens)
vocab['log_p'] = np.log2(vocab['p'])

In [155]:
# Ten most probable terms under the unigram model.
vocab.sort_values('p', ascending=False).head(10)

Unnamed: 0_level_0,term_str,n,p,log_p
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,the,7421,0.036389,-4.780366
1,to,6872,0.033697,-4.89125
2,and,6227,0.030534,-5.033442
3,of,6136,0.030088,-5.054681
4,her,3731,0.018295,-5.772417
5,a,3636,0.017829,-5.809628
6,in,3316,0.01626,-5.942536
7,was,3182,0.015603,-6.002046
8,i,3074,0.015073,-6.051863
9,it,2755,0.013509,-6.209927


In [156]:
# Probability assigned to out-of-vocabulary terms: that of the rarest term
# observed in the corpus.
smooth = vocab['p'].min()
def predict_sentence(sent_str):
    """Print the unigram-model probability of a sentence.

    The sentence is split on whitespace and each word is normalized exactly
    like the vocabulary (lowercased, non-word characters and underscores
    removed), so trailing punctuation such as "you." does not turn a known
    term into an out-of-vocabulary one. Terms missing from ``vocab`` receive
    the module-level ``smooth`` probability.

    Parameters
    ----------
    sent_str : str
        The sentence to score.

    Returns
    -------
    None
        The joint probability, its log2, and the per-term table are printed.
    """
    import re  # local import keeps this cell self-contained

    terms = [re.sub(r'[\W_]', '', word) for word in sent_str.lower().split()]
    # Named `sent` rather than `tokens` to avoid shadowing the corpus table.
    sent = pd.DataFrame(terms, columns=['term_str'])
    sent = sent.merge(vocab, on='term_str', how='left')
    # Rows with no match in vocab come back as NaN from the left merge.
    unseen = sent['p'].isna()
    sent.loc[unseen, ['p', 'log_p']] = [smooth, np.log2(smooth)]
    p = sent['p'].product()
    log_p = sent['log_p'].sum()
    print('-' * 80)
    print("p('{}') = {}; log2: {}".format(sent_str, p, log_p))
    print('-' * 80)
    print(sent)
    print('-' * 80)

In [369]:
# Score a few sample sentences with the unigram model. In the saved output
# 'cars' is not in the vocabulary, so it shows the smoothing fallback.
predict_sentence('I love you')
predict_sentence('I love cars')
predict_sentence("I want to")
predict_sentence("anne said to")
predict_sentence("said to her")
predict_sentence('she said')

--------------------------------------------------------------------------------
p('I love you') = 7.602946516155411e-08; log2: -23.648866116803514
--------------------------------------------------------------------------------
  term_str     n         p      log_p
0        i  3074  0.015073  -6.051863
1     love   117  0.000574 -10.767399
2      you  1793  0.008792  -6.829604
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
p('I love cars') = 4.24034942339956e-11; log2: -34.45702588967189
--------------------------------------------------------------------------------
  term_str       n         p      log_p
0        i  3074.0  0.015073  -6.051863
1     love   117.0  0.000574 -10.767399
2     cars     NaN  0.000005 -17.637764
--------------------------------------------------------------------------------
---------------------------------------------------------------------

# Build N-Gram models

This function generates models up to the length specified.

In [159]:
def get_ngrams(tokens, n=2):
    """Build n-gram count tables for every order from 1 up to ``n``.

    Parameters
    ----------
    tokens : pandas.DataFrame
        Token table with a ``term_str`` column and an index whose named
        levels include ``token_num``. The index levels are used as the join
        keys, so the function no longer depends on a global level list.
    n : int, default 2
        Highest n-gram order to build.

    Returns
    -------
    list of pandas.DataFrame
        ``n`` tables. Table ``i`` is indexed by ``w0`` .. ``w{i}`` and has a
        single column ``n`` holding the count of that (i+1)-gram. Where no
        following token exists under the same index keys, the missing word
        is padded with ``'<s>'``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    KeyError
        If the index of ``tokens`` has no ``token_num`` level.
    """
    if n < 1:
        raise ValueError('n must be at least 1, got {}'.format(n))

    # Turn the index levels into columns so token_num can be shifted.
    base = tokens['term_str'].reset_index()
    keys = [col for col in base.columns if col != 'term_str']
    if 'token_num' not in keys:
        raise KeyError("tokens index must have a 'token_num' level")
    base = base.rename(columns={'term_str': 'w0'})

    # frames[i] holds one row per (i+1)-gram, still carrying the key columns.
    frames = [base]
    for i in range(1, n):
        # Shifting token_num by i aligns each word with the (i)-gram that
        # starts one position after it.
        shifted = base.copy()
        shifted['token_num'] = shifted['token_num'] + i
        # Explicit w0..wi names avoid the duplicate '_x'/'_y' merge suffixes
        # that made orders above 3 fail.
        following = frames[i - 1].rename(
            columns={'w{}'.format(j): 'w{}'.format(j + 1) for j in range(i)})
        frames.append(
            shifted.merge(following, on=keys, how='left', sort=True).fillna('<s>'))

    # Compress each table to unique n-grams with their counts.
    models = []
    for frame in frames:
        words = frame.drop(columns=keys)
        models.append(words.groupby(words.columns.tolist()).size().to_frame('n'))

    return models

## Generate three models

Unigram, bigram, and trigram

In [224]:
# Unpack the unigram, bigram and trigram count tables (orders 1 to 3).
m1, m2, m3 = get_ngrams(tokens, n=3)

## Compute joint probabilities

In [225]:
# Joint probability of each n-gram: its count over the total count of
# n-grams of the same order.
m1['p'] = m1['n'].div(m1['n'].sum())
m2['p'] = m2['n'].div(m2['n'].sum())
m3['p'] = m3['n'].div(m3['n'].sum())

In [226]:
# Most probable unigrams.
m1.sort_values('p', ascending=False).head()

Unnamed: 0_level_0,n,p
w0,Unnamed: 1_level_1,Unnamed: 2_level_1
the,7421,0.036389
to,6872,0.033697
and,6227,0.030534
of,6136,0.030088
her,3731,0.018295


In [227]:
# Most probable bigrams. '<s>' is the padding value get_ngrams fills in where
# no following token was found.
m2.sort_values('p', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p
w0,w1,Unnamed: 2_level_1,Unnamed: 3_level_1
,<s>,1388,0.006806
of,the,856,0.004197
to,be,812,0.003982
in,the,679,0.003329
mrs,<s>,529,0.002594


In [365]:
# Random sample of five trigrams (sorting first does not change the sampling
# distribution).
# NOTE(review): the saved output shows columns `p_w0w1` and `p_w2gw0w1`, which
# no cell visible in this notebook creates -- likely left over from a deleted
# or out-of-order cell; a fresh run will show only `n` and `p`.
m3.sort_values('p', ascending=False).sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,p,p_w0w1,p_w2gw0w1
w0,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
affection,for,marianne,4,2e-05,0.000113,0.173913
belief,of,more,1,5e-06,2.9e-05,0.166667
nerves,susceptible,to,1,5e-06,5e-06,1.0
poets,trying,to,1,5e-06,5e-06,1.0
him,a,book,1,5e-06,8.3e-05,0.058824


## Compute conditional probabilities

$p(w_1|w_0) = p(w_0, w_1) / p(w_0)$

$p(w_2|w_0,w_1) = p(w_0, w_1, w_2) / p(w_0, w_1)$

In [320]:
# Spread bigram counts into a w0-by-w1 matrix (0 where a pair never occurs)
# and divide every row by its total, giving p(w1 | w0).
m2m = m2['n'].unstack().fillna(0).pipe(lambda counts: counts.div(counts.sum(axis=1), axis=0))

In [321]:
# Spread trigram counts into a (w0, w1)-by-w2 matrix (0 where a triple never
# occurs) and divide every row by its total, giving p(w2 | w0, w1).
m3m = m3['n'].unstack().fillna(0).pipe(lambda counts: counts.div(counts.sum(axis=1), axis=0))

## Explore

In [324]:
# Compare p(w1 | w0) for a few subjects (rows) and verbs (columns), shaded
# with a green gradient.
m2m.loc[['he','she','it','anne','wentworth'], ['is','had','was','felt','thought','looked','said','saw']].style.background_gradient(cmap='Greens')

w1,is,had,was,felt,thought,looked,said,saw
w0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
he,0.0569898,0.146615,0.121286,0.00535801,0.00487092,0.00876766,0.0165611,0.00535801
she,0.0240876,0.15,0.134672,0.0182482,0.00985401,0.00510949,0.0105839,0.0160584
it,0.0940109,0.0228675,0.177858,0.000362976,0.000362976,0.000362976,0.00145191,0.0
anne,0.00443459,0.0842572,0.104213,0.0199557,0.00443459,0.00221729,0.00443459,0.00665188
wentworth,0.0157068,0.0418848,0.0942408,0.0,0.0,0.0052356,0.0104712,0.0052356


In [359]:
# Narrow the comparison to 'he'/'she' followed by 'felt'/'said'.
m2m.loc[['he','she'],['felt','said']].style.background_gradient(cmap='Greens')

w1,felt,said
w0,Unnamed: 1_level_1,Unnamed: 2_level_1
he,0.00535801,0.0165611
she,0.0182482,0.0105839


# Generate Text

We use "stupid back-off" to account for missing ngrams.

In [362]:
def _sample_next_word(history, rng=None):
    """Sample the word following ``history``, backing off trigram -> bigram -> unigram."""
    # Trigram context: needs at least two previous words.
    if len(history) >= 2:
        bigram = tuple(history[-2:])
        try:
            dist = m3m.loc[bigram]
            return dist.sample(weights=dist, random_state=rng).index.values[0]
        except KeyError:
            pass  # unseen context: back off to the bigram model

    # Bigram context. '<s>' never occurs as a first word, so skip the lookup.
    last_word = history[-1]
    if last_word != '<s>':
        try:
            dist = m2m.loc[last_word]
            return dist.sample(weights=dist, random_state=rng).index.values[0]
        except KeyError:
            pass  # unknown word: back off to the unigram model

    return m1.sample(weights=m1.p, random_state=rng).index[0]


def generate_text(start_word='she', n=250, random_state=None):
    """Print ``n`` words sampled from the n-gram models after ``start_word``.

    Each next word is drawn from the trigram distribution for the last two
    words; if that context is missing, from the bigram distribution for the
    last word; and if that is missing too (or the last word is the ``'<s>'``
    padding), from the unigram distribution. An unknown ``start_word``
    therefore backs off to the unigram model instead of raising ``KeyError``.

    Parameters
    ----------
    start_word : str, default 'she'
        First word of the generated text.
    n : int, default 250
        Number of words to sample after ``start_word``.
    random_state : int, optional
        Seed for reproducible output. When omitted, numpy's global random
        state is used, so every call produces different text.

    Returns
    -------
    None
        The upper-cased text is printed.
    """
    rng = None if random_state is None else np.random.RandomState(random_state)

    words = [start_word]
    for _ in range(n):
        words.append(_sample_next_word(words, rng))

    # A pair of padding markers denotes a sentence break.
    text = ' '.join(words).replace(' <s> <s>', '.')
    # Generation may stop right after a single padding marker; drop it.
    if text.endswith(' <s>'):
        text = text[:-len(' <s>')]
    # Avoid a doubled full stop when the text already ends on a break.
    if not text.endswith('.'):
        text += '.'
    text = text.upper() # To give that telegraph message look :-)
    print(text)

In [363]:
# Generate text of the default length starting from 'the'.
generate_text('the')

THE MOMENT. CAUGHT BY OTHER COMPANY. YOUR SIGHT AS THE NIECES OF THE PARTY. OF VERY USEFUL HAD MADE A VERY PLEASANT AND THE WARM BATH. I THINK ENDS THE RESEMBLANCE I HAVE REPEATED IT TO BE THOUGHT SO AT THE WINDOW IN HOPES AS I TALKED ONLY OF MARIANNE SHE IS OF SOME BOOKS THAT HE SEEMED RATHER SURPRISING TO HIM THOUGH LATELY ACQUIRED IS VERY UNFORTUNATE. BLUSHED OVER THE FIRE DO NOT EXPECT TO GET REAL INFORMATION IN THE ROOM ON THEIR SUMMITS WERE A DREADFUL EXTENSION OF YOUR INTENTION. SHE SHOULD SUFFER. EDWARD FERRARS WAS THE LIGHT AND WARMTH OF HER CHILDREN AND FRIENDS. NOW THAT WE HOPE TO SEE HER WITH SOME DEGREE OF SELFDENIAL WHICH HER OWN VOICE NOW SAID. FOR SOME TIME THUS SPENT IN ALL ESSENTIALS THAN ADMIRAL CROFT BID FAIR TO EQUAL HER FATHER HAD NO HOPE OF FINDING HER WAY WAS EQUALLY THE PERSUASION THAT HE SHOULD COME TO SEE THE LIKE. LIKES ME NOW TO PERCEIVE THAT HER SUCCESS WAS SPEEDY AND FOR SOME MINUTES AND THEN I HOPE THERE IS A KIND OF COLD HEARTED SELFISHNESS ON BOTH SIDE

In [364]:
# Generate text of the default length starting from 'she'.
generate_text('she')

SHE WAS SO ATTENTIVE TO ME AGAIN AND MR MUSGROVE TO KEEP OUT OF HER OWN SISTER MUST ALLOW FOR AN ANSWER AND MAKE OF HER REMOVAL WAS MORE THAN THE COTTAGE AFFORDED A GENERAL WISH TO SEE HER EQUAL. PRATT CAN GIVE OCCASION TO SUCH A MAN WHOSE PREVAILING ANXIETY WAS THE ELDEST BOYS BEING AT THAT TIME OF YEAR AND AFTER THINKING IT A BAD ONE I DO WELL AND HE LOOKED AT HER SIDE AND THOUGHTFULNESS ON HIS OWN ENJOYMENT OR HIS OWN CHOICE. HAVE BEEN RUN UP STAIRS INTO THE ROOM. IN LOVE. HAVING THUS SUPPORTED THE DIGNITY OF SIR WALTERS CONTINUING IN SINGLENESS REQUIRES EXPLANATION. DO NOT BE ALARMED AT IN REALITY THAN SHE DARED NOT LONGER LIVED. RATHER MORE PAINFULLY EXTORTED FROM HER THE SHADES OF HIS EXCEPTING FOR A LITTLE DISORDERED ALWAYS THE FIRST FORTNIGHT AND YET YOU WROTE TO HIM IN POLITICAL CONCERNS TO GET OUT OF THE MARRIAGE OF EDWARD. ESTIMATE ITS BEAUTIES AS HE FELT ALL OVER THE SHOCK AND MORTIFICATION OF FINDING HER STILL MORE ITS SWEET RETIRED BAY BACKED BY DARK CLIFFS WHERE FRAGMENT