# DS 5001 Week 3 Lab: Inferring Language Models

We now create a series of langage models and evaluate them.

## Set Up

### Configure

In [1]:
data_in = "./data_in"
data_out = './data_out'

In [2]:
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']
text_file1 = data_in + '/austen-persuasion.csv'
text_file2 = data_in + '/austen-sense.csv'

### Import libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

## Import and combine texts

In [4]:
text1 = pd.read_csv(text_file1)
text2 = pd.read_csv(text_file2)

In [5]:
text1.head(10)

Unnamed: 0,chap_num,para_num,sent_num,token_num,token_str
0,1,1,0,0,Sir
1,1,1,0,1,Walter
2,1,1,0,2,Elliot
3,1,1,0,3,of
4,1,1,0,4,Kellynch
5,1,1,0,5,Hall
6,1,1,0,6,in
7,1,1,0,7,Somersetshire
8,1,1,0,8,was
9,1,1,0,9,a


In [6]:
text1['book_id'] = 1
text2['book_id'] = 2

In [7]:
text1.head()

Unnamed: 0,chap_num,para_num,sent_num,token_num,token_str,book_id
0,1,1,0,0,Sir,1
1,1,1,0,1,Walter,1
2,1,1,0,2,Elliot,1
3,1,1,0,3,of,1
4,1,1,0,4,Kellynch,1


In [8]:
tokens = pd.concat([text1, text2]).dropna()

In [9]:
tokens = tokens.set_index(OHCO)

In [10]:
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1
1,1,1,0,0,Sir
1,1,1,0,1,Walter
1,1,1,0,2,Elliot
1,1,1,0,3,of
1,1,1,0,4,Kellynch


## Create a vocabulary

In [11]:
tokens['term_str'] = tokens['token_str'].str.lower().str.replace(r'[\W_]', '')

In [12]:
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,1,0,0,Sir,sir
1,1,1,0,1,Walter,walter
1,1,1,0,2,Elliot,elliot
1,1,1,0,3,of,of
1,1,1,0,4,Kellynch,kellynch


In [13]:
vocab = tokens['term_str'].value_counts()\
    .to_frame()\
    .reset_index()\
    .rename(columns={'term_str':'n', 'index':'term_str'})\
    .sort_values('term_str')
vocab.index.name = 'term_id'

In [14]:
vocab.head()

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
758,,29
3631,1.0,3
5949,15.0,1
6997,16.0,1
8028,1760.0,1


In [15]:
vocab.sample(5)

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3168,holds,4
681,enjoyment,33
2505,nursery,6
8104,slighter,1
7037,perpetually,1


## Simple Unigram Model

### Compute Term probabilities

In [16]:
n_tokens = vocab.n.sum()
vocab['p'] = vocab['n'] / n_tokens
vocab['i'] = np.log2(1/vocab['p'])

In [17]:
n_tokens

204833

### Compute Entropy and of the Vocabulary

Why not?

In [18]:
n_terms = vocab.shape[0]
H = (vocab.p * vocab.i).sum()
Hmax = np.log2(n_terms)
R = 1 - (H/Hmax)

In [19]:
R

0.2965421207433586

So, the redundancy of Austen's English from these two novels $R_{austen}$ is about $30\%$. Shannon estimated the redundancy of English $R_{english}$ to be $54\%$ (see Shannon 1953 in the Readings).

In [20]:
vocab.sort_values('p', ascending=False).head(10)

Unnamed: 0_level_0,term_str,n,p,i
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,the,7436,0.036303,4.783778
1,to,6924,0.033803,4.886699
2,and,6290,0.030708,5.025244
3,of,6145,0.03,5.058891
4,her,3747,0.018293,5.772568
5,a,3687,0.018,5.795857
6,in,3368,0.016443,5.926412
7,was,3198,0.015613,6.001134
8,i,3128,0.015271,6.033064
9,it,2795,0.013645,6.195456


In [21]:
smooth = vocab['p'].min()
def predict_sentence(sent_str):
    
    # Parse sentence into tokens and normalize string
    tokens = pd.DataFrame(sent_str.lower().split(), columns=['term_str'])
    
    # Link the tokens with model vocabulary
    tokens = tokens.merge(vocab, on='term_str', how='left') # Left join is key
    
    # Add minimum values where token is not in our vocabulary
    tokens.loc[tokens['p'].isna(), 'p'] = [smooth]
    
    # Compute probability of sentence by getting product of token probabilities
    p = tokens['p'].product()
        
    # Print results
    print("p('{}') = {}".format(sent_str, p))

In [22]:
predict_sentence('I love you')
predict_sentence('I love cars')
predict_sentence("I want to")
predict_sentence("anne said to")
predict_sentence("said to her")
predict_sentence('said to him')

p('I love you') = 7.878556023336425e-08
p('I love cars') = 4.3312567472987495e-11
p('I want to') = 1.8649008463478524e-07
p('anne said to') = 2.3099369325723746e-07
p('said to her') = 1.7207422835683278e-06
p('said to him') = 5.092882819528357e-07


## N-Gram models

This function generates models up to the length specified.

In [23]:
def get_ngrams(tokens, n=2):
    
    global OHCO
    
    # Create list to store copies of tokens table
    X = []
    
    # Convert the index to cols in order to change the value of token_num
    X.append(tokens['term_str'].reset_index())
        
    # Create copies of token table for each level of ngram, offset by 1, and 
    # merge with previous 
    for i in range(1, n):
        X.append(X[0].copy())
        X[i]['token_num'] = X[i]['token_num'] + i
        X[i] = X[i].merge(X[i-1], on=OHCO, how='left', sort=True).fillna('<s>')
        
    # Compress tables to unique ngrams with counts
    for i in range(0, n):
        X[i] = X[i].drop(OHCO, 1)
        cols = X[i].columns.tolist()
        X[i]['n'] = 0
        X[i] = X[i].groupby(cols).n.apply(lambda x: x.count()).to_frame()
        X[i].index.names = ['w{}'.format(j) for j in range(i+1)]
            
    # Return just the ngram tables
    return X

### Generate three models

Unigram, bigram, and trigram

In [24]:
m1, m2, m3 = get_ngrams(tokens, n=3)

In [25]:
# m3.sort_values('n', ascending=False).head(10)

### Compute joint probabilities

In [26]:
m1['p'] = m1['n'] / m1['n'].sum()
m2['p'] = m2['n'] / m2['n'].sum()
m3['p'] = m3['n'] / m3['n'].sum()

In [27]:
m1.sort_values('p', ascending=False).head()

Unnamed: 0_level_0,n,p
w0,Unnamed: 1_level_1,Unnamed: 2_level_1
the,7436,0.036303
to,6924,0.033803
and,6290,0.030708
of,6145,0.03
her,3747,0.018293


In [28]:
m2.sort_values('p', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p
w0,w1,Unnamed: 2_level_1,Unnamed: 3_level_1
of,the,857,0.004184
to,be,814,0.003974
in,the,683,0.003334
mrs,<s>,530,0.002587
it,was,498,0.002431


In [29]:
m3.sort_values('p', ascending=False).head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,p
w0,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1
mrs,<s>,<s>,530,0.002587
it,<s>,<s>,369,0.001801
her,<s>,<s>,244,0.001191
him,<s>,<s>,229,0.001118
mr,<s>,<s>,179,0.000874
you,<s>,<s>,172,0.00084
them,<s>,<s>,163,0.000796
me,<s>,<s>,162,0.000791
elinor,<s>,<s>,119,0.000581
i,am,sure,107,0.000522


### Compute conditional probabilities

$p(w_1|w_0) = p(w_0, w_1) / p(w_0)$

$p(w_2|w_0,w_1) = p(w_0, w_1, w_2) / p(w_0, w_1)$

In [30]:
m2x = m2.groupby('w0')[['n']].apply(lambda x: x.n.sum())
m3x = m3.groupby(['w0','w1'])[['n']].apply(lambda x: x.n.sum())

In [31]:
m2m = (m2.n / m2x).to_frame('p').sort_index()

In [32]:
m3m = (m3.n / m3x).sort_values().to_frame('p').sort_index()

In [33]:
# There are inefficient and produce huge files
# m2m = m2.n.unstack().fillna(0).apply(lambda x: x / x.sum(), 1)
# m3m = m3.n.unstack().fillna(0).apply(lambda x: x / x.sum(), 1)

## Predict Sentences

In [34]:
def predict_sentence2(sent_str, n=2):
    
    # Pick appropriate model
    global m1, m2, m3
    if n == 1:
        M = m1
    elif n == 2:
        M = m2
    elif n == 3:
        M = m3
    else:
        return False
    
    # Get smoothing 
    smooth = M.p.min()
    
    # Add sentence padding (Hacky)
    padded_sent_str = sent_str + (' <s>' * (n-1))
    
    # Parse sentence into tokens and normalize string
    tokens = pd.DataFrame(padded_sent_str.lower().split(), columns=['term_str'])
    
    # Generate ngram keys 
    ngrams = []
    offset = n - 1
    for i in range(offset, tokens.shape[0]):
        ngram = []
        w = tokens.iloc[i].term_str
        for j in range(n):
            ngram.append(tokens.iloc[i-j].term_str)
        ngram.reverse()
        ngrams.append(ngram)
        
    # Compute the probability of the sentence
    L = 0
    for ngram in ngrams:
        try:
            p_ngram = M.loc[tuple(ngram)].p
        except KeyError:
            p_ngram = smooth
        L += np.log2(p_ngram)
    P = np.exp(L)
    
    print(sent_str, P)

In [35]:
predict_sentence2('I love you', 1)
predict_sentence2('I love cars', 1)
predict_sentence2("I want to", 1)
predict_sentence2("anne said to", 1)
predict_sentence2("said to her", 1)
predict_sentence2('said to him', 1)

I love you 5.645972739472476e-11
I love cars 1.118907816687782e-15
I want to 1.9570792682414204e-10
anne said to 2.6650097828995353e-10
said to her 4.829429322644128e-09
said to him 8.338111808245719e-10


In [36]:
predict_sentence2('I love you', 2)
predict_sentence2('I love cars', 2)
predict_sentence2("I want to", 2)
predict_sentence2("anne said to", 2)
predict_sentence2("said to her", 2)
predict_sentence2('said to him', 2)

I love you 1.6912924832811006e-18
I love cars 2.0639180372517065e-22
I want to 2.0994247126049545e-19
anne said to 7.112019880991409e-20
said to her 7.131778675619001e-15
said to him 1.2983543385819788e-15


In [37]:
predict_sentence2('I love you', 3)
predict_sentence2('I love cars', 3)
predict_sentence2("I want to", 2)
predict_sentence2("anne said to", 3)
predict_sentence2("said to her", 3)
predict_sentence2('said to him', 3)

I love you 1.725817247418853e-20
I love cars 1.0275642842631827e-23
I want to 2.0994247126049545e-19
anne said to 1.1935219350244338e-21
said to her 6.065124721977218e-18
said to him 9.586541118024097e-18


## Explore

In [38]:
def explore_pairs(list1, list2):
    global m2m
    test_pairs = []
    for x in list1:
        for y in list2:
            pair = (x, y)
            try:
                m2m.loc[pair]
                test_pairs.append((x, y))
            except:
                pass
    return m2m.loc[test_pairs].unstack(fill_value=0).style.background_gradient(cmap='Greens')

In [39]:
explore_pairs(['he','she','it','anne','wentworth'], ['is','had','was','did','felt','thought','looked','said','saw'])

Unnamed: 0_level_0,p,p,p,p,p,p,p,p,p
w1,is,had,was,did,felt,thought,looked,said,saw
w0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
he,0.057803,0.14499,0.120906,0.030829,0.005299,0.004817,0.008671,0.016378,0.005299
she,0.024284,0.148967,0.135194,0.021385,0.018123,0.009786,0.005074,0.010511,0.015948
it,0.096959,0.02254,0.178175,0.007871,0.000358,0.000358,0.000358,0.000358,0.0
anne,0.003976,0.075547,0.089463,0.015905,0.017893,0.003976,0.001988,0.001988,0.005964
wentworth,0.013761,0.036697,0.082569,0.004587,0.0,0.0,0.004587,0.009174,0.004587


In [40]:
explore_pairs(['he', 'she', 'it'], ['said','felt'])

Unnamed: 0_level_0,p,p
w1,said,felt
w0,Unnamed: 1_level_2,Unnamed: 2_level_2
he,0.016378,0.005299
she,0.010511,0.018123
it,0.000358,0.000358


## Generate Text

We use back-off to account for missing ngrams.

In [41]:
def generate_text(start_word='she', n=250):
    words = [start_word]
    for i in range(n):
        if len(words) == 1:
            w = m2m.loc[start_word].p
            next_word = m2m.loc[start_word].sample(weights=w).index.values[0]
        elif len(words) > 1:
            bg = tuple(words[-2:])
            try:
                w = m3m.loc[bg].p
                next_word = m3m.loc[bg].sample(weights=w).index.values[0]
            except KeyError:
                ug = bg[1]
                if ug == '<s>':
                    next_word = m1.sample(weights=m1.p).index[0]
                else:
                    w = m2m.loc[ug].p
                    next_word = m2m.loc[ug].sample(weights=w).index.values[0]
        words.append(next_word)
    text = ' '.join(words)
    text = text.replace(' <s> <s>', '.') + '.'
    text = text.upper() # To give that telegraph message look :-)
    print(text)

In [42]:
generate_text('the')

THE STREIGHTS AND NEVER WAS. KNEW. FULL SENSATION CHARLES MARY ANNE HENRIETTA LOUISA AND NOT VERY NEAR. AND MRS. WITH HER SISTER S ENTREATIES AND PROMISE TO HIS LARGE FISHING NET AT ONE TIME ATTACHED TO WOMAN THAN POOR BENWICK HAD BEEN IN MY BEING ACQUAINTED WITH THE SIZE AND MENTAL ALACRITY DID NOT QUITE UNCONNECTED IN THIS PART OF ENGLAND ACCOMPANYING HER HUSBAND. ADMIRE HER MORE COMFORTABLE AND COMPACT. IT IS TO BECOME OF HIM. WORLDLY MAN WHO NOT LONG BE SO IMPROVIDENT IN A SPUNGING HOUSE WHERE THE DEVIATION IS NECESSARY TO KEEP OFF THE SUBJECT. TO MAKE ONE SON. WHY CANNOT I CANNOT SUPPOSE IT POSSIBLE THAT SHE MIGHT THINK NECESSARY FOR ME. TO THE ADVANCED AGE OF BLUSHING. MADE NO RESISTANCE THAT WAS GIVEN AND SIR WALTER COULD MATERIALLY ALTER HIS STYLE OF EQUAL SOLICITUDE ON TOPICS WHICH HAD ALWAYS ADMITTED A HOPE WHILE EDWARD WAS ALLOWED TO STIR AND TRIED TO BE SPARED FROM THE STILES. COTTAGE TO TELL HIM WILL DO EVERYTHING. AND TO OFFER SOME KIND OF A FAR MORE INCURABLE NATURE. DEL

In [43]:
generate_text('she')

SHE COULD NOT BE DECEIVED IN THAT FLOW OF THE EVENT OF HIS GOOD HUMOURED ACQUIESCENCE. PART BY OUR LONG VERY LONG ABSENCE SINCE WE PARTED IF THAT S ALL. ADAPTED BY RESEMBLANCE OF DISPOSITION. I HAD FULLY INTENDED TO MARRY JAMES BENWICK IS VERY FINE OBJECT FROM MANY PARTS OF THEM. HE IS IN SUCH MOMENTS OF COMMUNICATIONS CONTINUALLY OCCURRING AND ALWAYS THE HOPE OF EXCITING. EXCUSE THE LIBERTY I TAKE IT SO. I OWED TO THEM. SHE WAS SURE THAT HE SOMETIMES TOOK OUT HER HAND WHICH SHE COULD SAY TO HER DAUGHTER. THAT THEY HAD LEFT THE DINING ROOM AND SAID. THINGS ON THE GROUND. HEIGHTENED COLOUR AND AN OFFICER WHOM HE HAD NEVER WITNESSED IN HIM I AM AFRAID THERE HAD BEEN A GOOD EXCUSE AND HE TOLD ME SO BUSY HAVE HAD TO ENQUIRE AFTER MARIANNE WAS IN THE HAPPINESS WHICH MADE HER DAILY COMPLAINT. TO HIS MOTHER IN LAW S CONCERNS COULD NOT HEAR OF OUR ACQUAINTANCE FIRST BEGAN. DO BETWEEN YOU ABOUT BUT NEVER HAD SHE BEEN CONSCIOUS OF NOT MARRYING TILL EVERY THING EQUAL TO ANY OTHER WAY. HAD BEEN OR

## Save

In [44]:
vocab.to_csv("{}/austen-VOCAB.csv".format(data_out))
tokens.to_csv("{}/austen-TOKENS.csv".format(data_out))
m1.to_csv("{}/austen-M1.csv".format(data_out))
m2.to_csv("{}/austen-M2.csv".format(data_out))
m3.to_csv("{}/austen-M3.csv".format(data_out))
m2m.to_csv("{}/austen-M2M.csv".format(data_out))
m3m.to_csv("{}/austen-M3M.csv".format(data_out))