# DS 5001 Week 3 Lab: Baby Babel

Baby Babel implements the Library of Babel with a small symbol set and a short message length.

## Set Up

In [None]:
import re
import math
import pandas as pd

In [None]:
%matplotlib inline

## Method 1: Basic Python

### Define the Symbol Set

In [None]:
letters = ['a', 'b', 't', ' ']

### Generate all possible messages of length 6

This is a clumsy but visually effective way to demonstrate how the Library of Babel might have been constructed. It is essentially the Cartesian product of the alphabet with itself, taken once for each position in the message.

In [None]:
# Enumerate every 6-symbol string over `letters`: one nested loop per message
# position, with the outermost loop (L1) varying slowest.
books = []
for L1 in letters:
    for L2 in letters:
        for L3 in letters:
            for L4 in letters:
                for L5 in letters:
                    for L6 in letters:
                        # Join the six chosen symbols into one message.
                        books.append(''.join((L1,L2,L3,L4,L5,L6)))

### See how many messages

In [None]:
n = len(books)
n

This should equal $a^L$, where $a$ is the size of the symbol set and $L$ is the length of the message.

In [None]:
len(letters) ** 6

### Look at a sample set of messages (books)

In [None]:
books[:10]

### Get the probability of a book

In [None]:
p_book = 1/n

In [None]:
p_book

### Get the entropy of this source

In [None]:
# Entropy of a uniform source: each of the n books has probability p_book, so
# sum(p * log(1/p)) collapses to n * p_book * log(1/p_book), i.e. log(n).
# The log base is the symbol-set size, so H is measured in symbols per message
# and stays correct if `letters` is changed.
H = (p_book * math.log(1/p_book, len(letters))) * n
# Same quantity in bits; math.log2 is more accurate than math.log(x, 2).
H2 = (p_book * math.log2(1/p_book)) * n

In [None]:
H, H2

### Find the string `at bat`

In [None]:
# Index of the first book containing 'at bat'. next() stops at the first hit
# instead of scanning the whole library and building a list of every match.
result = next(i for i, book in enumerate(books) if re.search(r'at bat', book))
result

In [None]:
books[result]

## Method 2: Pandas

### Generate permutation space as MultiIndex

In [None]:
L = 6

In [None]:
X = [letters for i in range(L)]

In [None]:
X

In [None]:
library = pd.DataFrame(index=pd.MultiIndex.from_product(X)).reset_index()

In [None]:
library.head()

In [None]:
# Concatenate the symbol columns of each row into one message string.
# Dropping any existing 'book' column first keeps the cell idempotent:
# re-running it would otherwise append the previous result to every message.
library['book'] = (
    library.drop(columns='book', errors='ignore')
    .apply(lambda row: row.str.cat(), axis=1)
)

In [None]:
library.book.head(10)

### Find string `at bat`

In [None]:
library[library.book.str.match(r'^at bat$')].index.values[0]

### Generate sample messages

In [None]:
text = ' '.join(library.book.sample(100).values)

In [None]:
text = re.sub(r'\s+', ' ', text) # Collapse spaces

In [None]:
text

### Generate messages from alphabet sampling

In [None]:
B = pd.Series(letters)

In [None]:
B.sample(5, replace=True).str.cat()

## Create Big Babel

In [None]:
# Big Babel alphabet: the space character followed by the 26 lowercase letters.
alpha = pd.Series(list(' abcdefghijklmnopqrstuvwxyz'))

In [None]:
# alpha

In [None]:
def get_message(m_len = 10, weights=None):
    """Sample a random message from the global alphabet ``alpha``.

    Parameters
    ----------
    m_len : int
        Number of characters to draw (with replacement).
    weights : sequence of float, optional
        One sampling weight per symbol of ``alpha``. ``None`` or an empty
        sequence means every symbol is equally likely.

    Returns
    -------
    str
        The sampled characters concatenated into one string.
    """
    # None replaces a mutable ``[]`` default; an empty sequence is still
    # accepted so callers that pass ``[]`` keep working. len() is used instead
    # of truthiness because weights may be a NumPy array.
    if weights is None or len(weights) == 0:
        n_symbols = alpha.shape[0]
        weights = [1 / n_symbols] * n_symbols
    return alpha.sample(m_len, replace=True, weights=weights).str.cat()

def print_page(n_pages = 1, weights=None, lines_per_page=40, line_len=80):
    """Print pages of random text produced by ``get_message``.

    Parameters
    ----------
    n_pages : int
        Number of pages to print.
    weights : sequence of float, optional
        Sampling weights forwarded to ``get_message``; ``None`` or an empty
        sequence selects uniform sampling.
    lines_per_page : int
        Lines printed per page (default 40).
    line_len : int
        Characters per line and width of the page separator (default 80).
    """
    if weights is None:
        # get_message treats an empty sequence as "use uniform weights".
        weights = []
    # Distinct throwaway names: the inner loop no longer shadows the outer one.
    for _page in range(n_pages):
        for _line in range(lines_per_page):
            print(get_message(line_len, weights))
        # Page separator: blank line, dashed rule, blank line.
        print()
        print('-' * line_len)
        print()

In [None]:
print_page(2)

## Add a Language Model from a Novel

### Parse text into characters

In [None]:
# NOTE: despite the variable name, this path points at a CSV of tokens.
epub_file = '../2020-01-23/austen-persuasion.csv'
# keep_default_na=False stops read_csv from converting tokens such as "None",
# "null" or "NA" into NaN. Otherwise astype('str') turns each NaN into the
# literal text 'nan', which would then be counted in the character model.
# NOTE(review): whether this file actually contains such tokens is unverified.
text = (
    pd.read_csv(epub_file, keep_default_na=False)
    .token_str.astype('str')
    .str.lower()
)

In [None]:
# text

In [None]:
chars = pd.Series(list(text.str.cat(sep=' ')))

In [None]:
# chars

In [None]:
chars = chars[chars.isin(alpha)]

In [None]:
# chars

### Create Unigram character language model

In [None]:
# Count how often each character occurs. to_frame('n') names the column
# explicitly rather than relying on the default label, which differs between
# pandas versions (0 before 2.0, 'count' from 2.0 onwards).
LM = chars.value_counts().to_frame('n')
LM.index.name = 'char'

In [None]:
LM

In [None]:
LM['p'] = LM['n'] / LM['n'].sum()
LM = LM.sort_index()

In [None]:
LM.sort_values('p', ascending=False)

In [None]:
LM.p.sort_values().plot(kind='barh', figsize=(10,15));

#### Generate text

In [None]:
# Align the unigram probabilities with the order of `alpha`, giving weight 0
# to any symbol that never occurred, so there is exactly one weight per symbol
# regardless of how LM happens to be sorted.
weights = LM.p.reindex(alpha.values).fillna(0).tolist()

In [None]:
print_page(1, weights)

### Create Bigram Model

In [None]:
chars1 = chars.reset_index().rename(columns={'index':'offset', 0:'char'}).copy()

In [None]:
# chars1

In [None]:
chars1['offset_1'] = chars1.offset + 1

In [None]:
# chars1

In [None]:
bigrams = chars1.merge(chars1, how='inner', right_on='offset', left_on='offset_1')[['char_x', 'char_y']].fillna(' ')

In [None]:
bigrams

In [None]:
bigrams.head(10)

In [None]:
bigram_model = bigrams.groupby(['char_x', 'char_y']).char_y.count().to_frame().rename(columns={'char_y':'n'})

In [None]:
bigram_model

In [None]:
bigram_model.loc['q']

In [None]:
bigram_model.loc['t'].sort_values('n', ascending=False)

In [None]:
# Conditional probability P(char_y | char_x): each bigram count divided by the
# total count for its first character. transform('sum') returns the group
# total aligned to the original rows, so the result keeps bigram_model's index
# on every pandas version and the cell can be re-run safely.
bigram_model['p'] = (
    bigram_model['n'] / bigram_model.groupby('char_x')['n'].transform('sum')
)

In [None]:
bigram_model.loc['q']

In [None]:
# Wide bigram matrix: rows are the preceding character, columns the following.
# Reindexing on `alpha` guarantees one row and one column per symbol, in the
# same order that the samplers use for their weights.
BGM = (
    bigram_model.p.unstack()
    .reindex(index=alpha.values, columns=alpha.values)
    .fillna(0.00001)  # smoothing: unseen bigrams get a tiny non-zero mass
)
# Renormalize so each row is again a probability distribution (sums to 1)
# after smoothing; the entropy computed from these rows assumes that.
BGM = BGM.div(BGM.sum(axis=1), axis=0)
BGM.index.name = 'if'
BGM.columns.name = 'then'

In [None]:
BGM

In [None]:
BGM.loc['q'].plot(kind='bar');

In [None]:
BGM

In [None]:
BGM.loc['q'].idxmax()

### Get entropy of characters as antecedents

In [None]:
import numpy as np

In [None]:
BGM.apply(lambda x: -sum(x * np.log2(x)), 1).sort_values(ascending=False).plot(kind='barh', figsize=(10,10));

#### Define function to sample a single letter

In [None]:
def get_letter(weights=None):
    """Sample a single character from the global alphabet ``alpha``.

    Parameters
    ----------
    weights : sequence of float, optional
        One sampling weight per symbol of ``alpha``. ``None`` or an empty
        sequence means every symbol is equally likely.

    Returns
    -------
    str
        The sampled character.
    """
    # None replaces a mutable ``[]`` default. len() is used instead of
    # truthiness because weights may be a NumPy array.
    if weights is None or len(weights) == 0:
        n_symbols = alpha.shape[0]
        weights = [1 / n_symbols] * n_symbols
    return alpha.sample(1, replace=True, weights=weights).values[0]

In [None]:
# Test
get_letter(weights=BGM.loc['q'].values)

#### Generate text

In [None]:
def print_page2(n_pages=1):
    """Print pages of text generated from the bigram model ``BGM``.

    The first letter is drawn with ``get_letter()`` and no weights; every
    following letter is drawn using the ``BGM`` row of the preceding letter as
    weights. Each page is 40 lines of 80 characters followed by a blank line,
    a dashed rule and another blank line.

    Parameters
    ----------
    n_pages : int
        Number of pages to print; nothing is printed when it is 0 or less.
    """
    line_len, lines_per_page = 80, 40
    n = line_len * lines_per_page * n_pages
    if n <= 0:
        return
    ltrs = [get_letter()]
    # Generate exactly n letters in total (the seed letter counts as one), so
    # no stray one-character line is left over after the last page.
    for _ in range(n - 1):
        ltrs.append(get_letter(weights=BGM.loc[ltrs[-1]].values))
    txt = ''.join(ltrs)
    for line_no, start in enumerate(range(0, len(txt), line_len), start=1):
        print(txt[start:start + line_len])
        # Close the page right after its last line, not after the first line
        # of the following page.
        if line_no % lines_per_page == 0:
            print()
            print('-' * line_len)
            print()

In [None]:
print_page2(2)