# DS 5001 Week 3 Lab: Inferring Language Models

We now create a series of language models and evaluate them.

## Set Up

### Configure

In [None]:
# Index levels of the token table, from book down to the individual token.
OHCO = [
    'book_id',
    'chap_num',
    'para_num',
    'sent_num',
    'token_num',
]

# CSV files holding the tokenized source texts.
text_file1, text_file2 = 'austen-persuasion.csv', 'austen-sense.csv'

### Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

## Import and combine texts

In [None]:
# Load both token tables from their CSV files.
text1, text2 = (pd.read_csv(path) for path in (text_file1, text_file2))

In [None]:
# Preview the first ten rows of the first text.
text1.head(10)

In [None]:
# Tag every row with a numeric id for the book it came from.
text1 = text1.assign(book_id=1)
text2 = text2.assign(book_id=2)

In [None]:
# Preview the first text again now that `book_id` has been set.
text1.head()

In [None]:
# Stack both texts into one table and drop every row that has a missing value.
tokens = pd.concat([text1, text2]).dropna()

In [None]:
# Index the token table by the OHCO columns (book, chapter, paragraph,
# sentence, token); get_ngrams() relies on these being index levels.
tokens = tokens.set_index(OHCO)

In [None]:
# Preview the combined, OHCO-indexed token table.
tokens.head()

## Create a vocabulary

In [None]:
# Normalize each token into a term: lower-case it, then strip every non-word
# character and underscore. `regex=True` is passed explicitly because
# pandas >= 2.0 treats the pattern as a literal string by default, which
# would silently leave all punctuation in place.
tokens['term_str'] = tokens['token_str'].str.lower()\
    .str.replace(r'[\W_]', '', regex=True)

In [None]:
# Preview the token table with the new `term_str` column.
tokens.head()

In [None]:
# Build the vocabulary: one row per distinct term with its corpus count `n`,
# sorted alphabetically by term. The index axis and the count column are
# named explicitly so the result has the columns ['term_str', 'n'] regardless
# of how the installed pandas version labels the output of value_counts().
vocab = tokens['term_str'].value_counts()\
    .rename_axis('term_str')\
    .reset_index(name='n')\
    .sort_values('term_str')
vocab.index.name = 'term_id'

In [None]:
# Preview the first rows of the vocabulary.
vocab.head()

In [None]:
# Show five randomly chosen vocabulary rows.
vocab.sample(5)

## Simple Unigram Model

In [None]:
# Total number of tokens in the corpus (sum of all term counts).
n_tokens = vocab['n'].sum()

# Relative frequency of each term, and its base-2 logarithm.
vocab['p'] = vocab['n'].div(n_tokens)
vocab['log_p'] = np.log2(vocab['p'])

In [None]:
# Display the total token count.
n_tokens

In [None]:
# Show the ten most probable terms.
vocab.sort_values('p', ascending=False).head(10)

In [None]:
# Fallback probability for words that are not in the vocabulary: the smallest
# unigram probability observed. Read as a global by predict_sentence().
smooth = vocab['p'].min()
def predict_sentence(sent_str):
    """Print the unigram-model probability of a sentence.

    The sentence is lower-cased and split on whitespace, each word is looked
    up in the global ``vocab`` table, and the word probabilities are
    multiplied together. Words that are not in ``vocab`` receive the global
    ``smooth`` probability instead.

    Parameters
    ----------
    sent_str : str
        The sentence to score.

    Returns
    -------
    None
        The result is printed as ``p('<sentence>') = <probability>``.
    """
    # Parse sentence into tokens and normalize string. A distinct local name
    # avoids shadowing the notebook-level `tokens` table.
    sent_tokens = pd.DataFrame(sent_str.lower().split(), columns=['term_str'])

    # Link the tokens with model vocabulary. The left join keeps every word
    # of the sentence, with NaN probabilities for unknown words.
    sent_tokens = sent_tokens.merge(vocab, on='term_str', how='left')

    # Substitute the fallback probability for unknown words. fillna works for
    # any number of unknown words, including none or several.
    word_probs = sent_tokens['p'].fillna(smooth)

    # Compute probability of sentence by getting product of token probabilities
    p = word_probs.product()

    # Print results
    print("p('{}') = {}".format(sent_str, p))

In [None]:
# Score a handful of sample phrases with the simple unigram model.
for sample_sentence in (
    'I love you',
    'I love cars',
    "I want to",
    "anne said to",
    "said to her",
    'said to him',
):
    predict_sentence(sample_sentence)

## N-Gram models

This function builds n-gram count tables for every order from 1 (unigrams) up to the specified length `n`.

In [None]:
def get_ngrams(tokens, n=2):
    """Build n-gram count tables of every order from 1 up to ``n``.

    Parameters
    ----------
    tokens : pandas.DataFrame
        Token table with a ``term_str`` column. Its index levels identify a
        token's position and must include ``token_num``, the token's position
        within its sentence (the notebook indexes the table by ``OHCO``).
    n : int, default 2
        Highest n-gram order to build. Must be at least 1.

    Returns
    -------
    list of pandas.DataFrame
        ``n`` tables; table ``i`` holds the (i+1)-grams. Each table has a
        single count column ``n`` and is indexed by the words of the n-gram,
        with levels named ``w0`` .. ``w{i}``. Positions that run past the last
        token sharing the same index keys are filled with ``'<s>'``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer')

    # The index levels become ordinary columns so that token_num can be
    # shifted and used as a join key.
    key_cols = list(tokens.index.names)
    base = tokens['term_str'].reset_index().rename(columns={'term_str': 'w0'})

    # aligned[i] has one row per (i+1)-gram, keyed by the position of the
    # n-gram's LAST word, with the words in columns w0..w{i}.
    aligned = [base]
    for i in range(1, n):
        # Move every token i positions to the right: it becomes the first
        # word (w0) of the n-gram that ends at the shifted position.
        shifted = base.copy()
        shifted['token_num'] = shifted['token_num'] + i

        # The previous order supplies the remaining words; shift its column
        # names up by one so they never collide, whatever the value of n.
        following = aligned[i - 1].rename(
            columns={'w{}'.format(j): 'w{}'.format(j + 1) for j in range(i)})

        table = shifted.merge(following, on=key_cols, how='left', sort=True)

        # Rows with no partner on the right run past the end: pad with '<s>'.
        word_cols = ['w{}'.format(j) for j in range(i + 1)]
        table[word_cols] = table[word_cols].fillna('<s>')
        aligned.append(table)

    # Compress tables to unique ngrams with counts
    models = []
    for i, table in enumerate(aligned):
        word_cols = ['w{}'.format(j) for j in range(i + 1)]
        counts = table.drop(columns=key_cols).groupby(word_cols).size()
        models.append(counts.to_frame('n'))

    # Return just the ngram tables
    return models

### Generate three models

Unigram, bigram, and trigram

In [None]:
# Build the unigram (m1), bigram (m2) and trigram (m3) count tables.
m1, m2, m3 = get_ngrams(tokens, n=3)

In [None]:
# m3.sort_values('n', ascending=False).head(10)

### Compute joint probabilities

In [None]:
# Joint probability of each n-gram: its count divided by the total count of
# all n-grams in the same table.
m1['p'] = m1['n'].div(m1['n'].sum())
m2['p'] = m2['n'].div(m2['n'].sum())
m3['p'] = m3['n'].div(m3['n'].sum())

In [None]:
# Most probable unigrams.
m1.sort_values('p', ascending=False).head()

In [None]:
# Most probable bigrams.
m2.sort_values('p', ascending=False).head()

In [None]:
# Fifteen most probable trigrams.
m3.sort_values('p', ascending=False).head(15)

### Compute conditional probabilities

$p(w_1|w_0) = p(w_0, w_1) / p(w_0)$

$p(w_2|w_0,w_1) = p(w_0, w_1, w_2) / p(w_0, w_1)$

In [None]:
# Conditional bigram matrix p(w1 | w0): rows are w0, columns are w1.
# Pivot the counts into a matrix (absent bigrams count as 0), then divide
# every row by its own total. The vectorised div() replaces a row-by-row
# Python apply and yields the same values.
m2m = m2['n'].unstack().fillna(0)
m2m = m2m.div(m2m.sum(axis=1), axis=0)

In [None]:
# Conditional trigram matrix p(w2 | w0, w1): rows are (w0, w1) pairs,
# columns are w2. Pivot the counts into a matrix (absent trigrams count
# as 0), then divide every row by its own total. The vectorised div()
# replaces a row-by-row Python apply and yields the same values.
m3m = m3['n'].unstack().fillna(0)
m3m = m3m.div(m3m.sum(axis=1), axis=0)

## Predict Sentences

In [None]:
def predict_sentence2(sent_str, n=2):
    """Print the probability of a sentence under the n-gram model of order n.

    The sentence is padded at the end with ``n - 1`` ``<s>`` markers,
    lower-cased and split on whitespace. Every run of ``n`` consecutive words
    is looked up in the ``p`` column of the matching global model (``m1``,
    ``m2`` or ``m3``); n-grams missing from the model receive the smallest
    probability in that model. The n-gram probabilities are multiplied
    together (accumulated as base-2 logs).

    Parameters
    ----------
    sent_str : str
        The sentence to score.
    n : int, default 2
        Model order: 1, 2 or 3.

    Returns
    -------
    None or bool
        ``False`` when ``n`` is not 1, 2 or 3 (nothing is printed); otherwise
        ``None`` after printing the sentence followed by its probability.
    """
    # Pick appropriate model
    if n == 1:
        model = m1
    elif n == 2:
        model = m2
    elif n == 3:
        model = m3
    else:
        return False

    # Look probabilities up in the `p` column directly, so a tuple key can
    # only ever be read as an index key, never as a (row, column) pair.
    probs = model['p']

    # Get smoothing
    smooth = probs.min()

    # Add sentence padding (Hacky), then parse into normalized words
    words = (sent_str + (' <s>' * (n - 1))).lower().split()

    # Generate ngram keys: every window of n consecutive words
    ngrams = [words[i - n + 1:i + 1] for i in range(n - 1, len(words))]

    # Compute the probability of the sentence in log space
    log2_p = 0.0
    for ngram in ngrams:
        # The unigram table has a flat index, the others a MultiIndex.
        key = ngram[0] if n == 1 else tuple(ngram)
        try:
            p_ngram = probs.loc[key]
        except KeyError:
            p_ngram = smooth
        log2_p += np.log2(p_ngram)

    # The logs were taken in base 2, so they must be inverted in base 2.
    P = np.exp2(log2_p)

    print(sent_str, P)

In [None]:
# Score the sample phrases with the unigram model (n=1).
for sample_sentence in (
    'I love you',
    'I love cars',
    "I want to",
    "anne said to",
    "said to her",
    'said to him',
):
    predict_sentence2(sample_sentence, 1)

In [None]:
# Score the sample phrases with the bigram model (n=2).
for sample_sentence in (
    'I love you',
    'I love cars',
    "I want to",
    "anne said to",
    "said to her",
    'said to him',
):
    predict_sentence2(sample_sentence, 2)

In [None]:
# Score the sample phrases with the trigram model (n=3). Every phrase uses
# n=3 so the results are comparable with the unigram and bigram runs.
predict_sentence2('I love you', 3)
predict_sentence2('I love cars', 3)
predict_sentence2("I want to", 3)
predict_sentence2("anne said to", 3)
predict_sentence2("said to her", 3)
predict_sentence2('said to him', 3)

## Explore

In [None]:
# Heat map of the conditional bigram probabilities p(second word | first word)
# for a few subject words (rows) followed by a few verbs (columns).
first_words = ['he','she','it','anne','wentworth']
second_words = ['is','had','was','felt','thought','looked','said','saw']
m2m.loc[first_words, second_words].style.background_gradient(cmap='Greens')

In [None]:
# Same heat map, narrowed to 'he'/'she' followed by 'felt'/'said'.
m2m.loc[['he','she'], ['felt','said']]\
    .style.background_gradient(cmap='Greens')

## Generate Text

We use back-off to account for missing ngrams.

In [None]:
def generate_text(start_word='she', n=250):
    """Print randomly generated text from the n-gram models, with back-off.

    Starting from ``start_word``, ``n`` further words are sampled one at a
    time. Each word is drawn from the most specific model that knows the
    current context:

    1. the trigram matrix ``m3m``, when the last two words form a known row;
    2. otherwise the bigram matrix ``m2m``, when the last word is a known row
       and is not the ``<s>`` marker;
    3. otherwise the unigram table ``m1``, weighted by its ``p`` column.

    Parameters
    ----------
    start_word : str, default 'she'
        First word of the generated text. A word unknown to the bigram model
        is accepted; the following word is then drawn from the unigram model.
    n : int, default 250
        Number of words to generate after ``start_word``.

    Returns
    -------
    None
        The text is printed in upper case; each ``<s> <s>`` pair is rendered
        as a full stop and a final full stop is appended.
    """
    def draw(distribution):
        # Sample one label from a row of probabilities, weighted by them.
        return distribution.sample(weights=distribution).index[0]

    words = [start_word]
    for _ in range(n):
        last_word = words[-1]
        context = tuple(words[-2:])

        # Membership tests select the back-off level explicitly instead of
        # relying on a KeyError raised by a tuple lookup on a DataFrame.
        if len(words) > 1 and context in m3m.index:
            next_word = draw(m3m.loc[context])
        elif last_word != '<s>' and last_word in m2m.index:
            next_word = draw(m2m.loc[last_word])
        else:
            next_word = m1.sample(weights=m1['p']).index[0]
        words.append(next_word)

    text = ' '.join(words)
    text = text.replace(' <s> <s>', '.') + '.'
    text = text.upper() # To give that telegraph message look :-)
    print(text)

In [None]:
# Generate text starting from the word 'the'.
generate_text('the')

In [None]:
# Generate text starting from the word 'she'.
generate_text('she')