## Set Up

## Import libraries

In [2]:
import pandas as pd
import numpy as np

### Import Config

In [3]:
import configparser

config = configparser.ConfigParser()
config.read("../../../env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

In [4]:
data_home, output_dir

('/Users/jamessiegener/MSDS/DS5001/data',
 '/Users/jamessiegener/MSDS/DS5001/output')

In [5]:
text_file = f"{data_home}/gutenberg/pg42324.txt"
csv_file = f"{output_dir}/austen-combo-TOKENS.csv"

In [6]:
OHCO = ['book_id','chap_num', 'para_num', 'sent_num', 'token_num']
ngrams = 3
widx = [f"w{i}" for i in range(ngrams)]

## Import file into a dataframe

In [7]:
LINES = pd.DataFrame(open(text_file, 'r', encoding='utf-8-sig').readlines(), columns=['line_str'])
LINES.index.name = 'line_num'
LINES.line_str = LINES.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()
LINES.sample(20)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
831,
7123,"exhausted, and I should soon have sunk under m..."
3231,We rest; a dream has power to poison sleep.
5262,"banks of this divine river, that I never befor..."
1336,"before the mind can persuade itself that she, ..."
5866,stranger; and I was also disconcerted on perce...
1828,"saw Elizabeth, in the bloom of health, walking..."
809,listener's countenance. At first I perceived t...
5145,"him, and put an end to my slavery for ever."
5525,"penury. The thatch had fallen in, the walls we..."


## Extract Title

In [8]:
title = LINES.loc[0].line_str.replace('The Project Gutenberg EBook of ', '')
title

'Frankenstein, by Mary W. Shelley'

## Clip the Cruft

In [9]:
clip_pats = [
    r"\*\*\*\s*START OF (?:THE|THIS) PROJECT",
    r"\*\*\*\s*END OF (?:THE|THIS) PROJECT"
]

In [10]:
pat_a = LINES.line_str.str.match(clip_pats[0])
pat_b = LINES.line_str.str.match(clip_pats[1])

In [11]:
line_a = LINES.loc[pat_a].index[0] + 1
line_b = LINES.loc[pat_b].index[0] - 1
line_a, line_b

(19, 7671)

In [12]:
LINES = LINES.loc[line_a: line_b]

In [13]:
LINES.head(10)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
19,
20,
21,
22,
23,"Produced by Greg Weeks, Mary Meehan and the On..."
24,Distributed Proofreading Team at http://www.pg...
25,
26,
27,
28,


In [14]:
LINES.tail(10)

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
7662,[Transcriber's Note: Possible printer errors c...
7663,"Line 2863: ""I do no not fear to die"" to ""I do ..."
7664,"Line 6375: ""fulfil the wishes of you parents"" ..."
7665,
7666,
7667,
7668,
7669,
7670,End of the Project Gutenberg EBook of Frankens...
7671,


## Chunk by Chapter

### Find all chapter headers

In [15]:
chap_pat = r"^\s*(?:chapter|letter)\s+[IVXLCDM]+"

In [16]:
chap_lines = (LINES.line_str.str.match(chap_pat, case=False) | LINES.line_str.str.contains(r"^\s*PREFACE.\s*$", case=False))

In [17]:
LINES.loc[chap_lines]

Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
271,PREFACE.
343,LETTER I.
467,LETTER II.
594,LETTER III.
636,LETTER IV.
918,CHAPTER I.
1085,CHAPTER II.
1299,CHAPTER III.
1555,CHAPTER IV.
1789,CHAPTER V.


### Assign numbers to chapters

In [18]:
LINES.loc[chap_lines, 'chap_num'] = [i + 1 for i in range(LINES.loc[chap_lines].shape[0])]

In [19]:
LINES.loc[chap_lines]

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
271,PREFACE.,1.0
343,LETTER I.,2.0
467,LETTER II.,3.0
594,LETTER III.,4.0
636,LETTER IV.,5.0
918,CHAPTER I.,6.0
1085,CHAPTER II.,7.0
1299,CHAPTER III.,8.0
1555,CHAPTER IV.,9.0
1789,CHAPTER V.,10.0


In [20]:
LINES.sample(10)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
6422,"crimes by my death. Well, be it so; a deadly s...",
6912,aid and conduct me in my work. Let the cursed ...,
2433,these preliminary circumstances; but they were...,
6619,"shore, enjoying the transitory light, and then...",
2785,but it was excited by her generous interferenc...,
6833,,
6989,,
5507,"Having parted from my friend, I determined to ...",
1659,"my story, and you will easily perceive why I a...",
1319,mother sickened; her fever was accompanied by ...,


In [21]:
LINES.chap_num = LINES.chap_num.ffill()

In [22]:
LINES.sample(10)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
795,"deck, apparently watching for the sledge that ...",5.0
3782,"easily perceived that, although I eagerly long...",17.0
3399,or sight; but I now found that I could wander ...,16.0
5621,tore to pieces the thing on which I was engage...,25.0
5395,the whole nation had forsaken his cause to joi...,24.0
3783,"the cottagers, I ought not to make the attempt...",17.0
6502,"threat returned: nor can you wonder, that, omn...",27.0
7193,themselves who visit him from the regions of a...,29.0
6459,"until then, I conjure you, do not mention or a...",27.0
6374,"""You well know, Victor, that our union had bee...",27.0


In [23]:
LINES.head(20)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
19,,
20,,
21,,
22,,
23,"Produced by Greg Weeks, Mary Meehan and the On...",
24,Distributed Proofreading Team at http://www.pg...,
25,,
26,,
27,,
28,,


In [24]:
LINES = LINES.dropna(subset=['chap_num'])
LINES = LINES.loc[~chap_lines]
LINES.chap_num = LINES.chap_num.astype('int')

In [25]:
LINES.sample(10)

Unnamed: 0_level_0,line_str,chap_num
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1
2752,"innocence.""",13
4914,"true, we shall be monsters, cut off from all t...",22
3741,,17
5929,"spot, and endeavoured, but in vain, to restore...",26
6350,,27
6528,everlasting regret.,27
6321,"wonder, ""My dearest Victor, what infatuation i...",27
5103,gives me more pleasure than I have for some ti...,23
1583,of science. In other studies you go as far as ...,9
6098,"As Mr. Kirwin said this, notwithstanding the a...",26


In [26]:
CHAPS = LINES.groupby(OHCO[1:2]) \
    .line_str.apply(lambda x: '\n'.join(x)) \
    .to_frame('chap_str')

In [27]:
CHAPS.head(10)

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,\n\nThe event on which this fiction is founded...
2,"\n\n_To Mrs. Saville, England._\n\nSt. Petersb..."
3,"\n\n_To Mrs. Saville, England._\n\nArchangel, ..."
4,"\n\n_To Mrs. Saville, England._\n\nMY DEAR SIS..."
5,"\n\n_To Mrs. Saville, England._\n\nAugust 5th,..."
6,\n\nI am by birth a Genevese; and my family is...
7,\n\nWe were brought up together; there was not...
8,"\n\nWhen I had attained the age of seventeen, ..."
9,"\n\nFrom this day natural philosophy, and part..."
10,"\n\nIt was on a dreary night of November, that..."


In [28]:
CHAPS['chap_str'] = CHAPS.chap_str.str.strip()

In [29]:
CHAPS

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,"The event on which this fiction is founded, ha..."
2,"_To Mrs. Saville, England._\n\nSt. Petersburgh..."
3,"_To Mrs. Saville, England._\n\nArchangel, 28th..."
4,"_To Mrs. Saville, England._\n\nMY DEAR SISTER,..."
5,"_To Mrs. Saville, England._\n\nAugust 5th, 17-..."
6,I am by birth a Genevese; and my family is one...
7,We were brought up together; there was not qui...
8,"When I had attained the age of seventeen, my p..."
9,"From this day natural philosophy, and particul..."
10,"It was on a dreary night of November, that I b..."


## Split chapters into paragraphs

In [30]:
para_pat = r'\n\n+'
PARAS = CHAPS['chap_str'].str.split(para_pat, expand=True).stack() \
    .to_frame('para_str').sort_index()
PARAS.index.names = OHCO[1:3]
PARAS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,"The event on which this fiction is founded, ha..."
1,1,I have thus endeavoured to preserve the truth ...
1,2,The circumstance on which my story rests was s...
1,3,It is a subject also of additional interest to...
1,4,"The weather, however, suddenly became serene; ..."


In [31]:
PARAS['para_str'] = PARAS['para_str'].str.replace(r'\n', ' ', regex=True)
PARAS['para_str'] = PARAS['para_str'].str.strip()
PARAS = PARAS[~PARAS['para_str'].str.match(r'^\s*$')]
PARAS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,0,"The event on which this fiction is founded, ha..."
1,1,I have thus endeavoured to preserve the truth ...
1,2,The circumstance on which my story rests was s...
1,3,It is a subject also of additional interest to...
1,4,"The weather, however, suddenly became serene; ..."


## Split paragraphs into sentences

In [32]:
sent_pat = r'[.?!;:]+'
SENTS = PARAS['para_str'].str.split(sent_pat, expand=True).stack() \
    .to_frame('sent_str')
SENTS.index.names = OHCO[1:4]
SENTS = SENTS[~SENTS['sent_str'].str.match(r'^\s*$')]
SENTS.sent_str = SENTS.sent_str.str.strip()

In [33]:
SENTS.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,0,0,"The event on which this fiction is founded, ha..."
1,0,1,"Darwin, and some of the physiological writers ..."
1,0,2,I shall not be supposed as according the remot...
1,0,3,"yet, in assuming it as the basis of a work of ..."
1,0,4,The event on which the interest of the story d...
1,0,5,It was recommended by the novelty of the situa...
1,0,6,"and, however impossible as a physical fact, af..."
1,1,0,I have thus endeavoured to preserve the truth ...
1,1,1,"The Iliad, the tragic poetry of Greece,--Shaks..."
1,1,2,"and the most humble novelist, who seeks to con..."


In [34]:
SENTS.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
15,4,4,"The surface is very uneven, rising like the wa..."
29,49,2,Are you then so easily turned from your design
19,6,1,"for I found means, during my residence in the ..."
13,23,10,no
13,20,2,"I believed you guiltless, notwithstanding ever..."
27,24,3,"When reason returned, she would remonstrate, a..."
22,18,12,"if ye really pity me, crush sensation and memory"
19,18,4,"She hesitated some time, but at length she for..."
25,12,8,You can blast my other passions
15,2,7,I looked on the valley beneath


## Split sentences into tokens

In [35]:
token_pat = r"[\s',-]+"
TOKENS = SENTS['sent_str'].str.split(token_pat, expand=True).stack() \
    .to_frame('token_str')
TOKENS.index.names = OHCO[1:5]
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1
1,0,0,0,The
1,0,0,1,event
1,0,0,2,on
1,0,0,3,which
1,0,0,4,this
...,...,...,...,...
29,86,0,7,Frankenstein
29,86,0,8,by
29,86,0,9,Mary
29,86,0,10,W


In [36]:
TOKENS['term_str'] = TOKENS.token_str.replace(r'[\W_]+', '', regex=True).str.lower()
VOCAB = TOKENS.term_str.value_counts().to_frame('n').reset_index().rename(columns={'index':'term_str'})
VOCAB.index.name = 'term_id'

In [37]:
VOCAB.head()

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,the,4252
1,and,2993
2,i,2861
3,of,2687
4,to,2123


In [38]:
VOCAB['n_chars'] = VOCAB.term_str.str.len()
VOCAB['modified_term_str'] = VOCAB.term_str
VOCAB.loc[(VOCAB.n == 1) & (VOCAB.n_chars < 3), 'modified_term_str'] = "<UNK>"

In [39]:
VOCAB

Unnamed: 0_level_0,term_str,n,n_chars,modified_term_str
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,the,4252,3,the
1,and,2993,3,and
2,i,2861,1,i
3,of,2687,2,of
4,to,2123,2,to
...,...,...,...,...
7034,peeping,1,7,peeping
7035,pyramids,1,8,pyramids
7036,towered,1,7,towered
7037,bridge,1,6,bridge


In [40]:
TOKENS['modified_term_str'] = TOKENS['term_str'].map(VOCAB.set_index('term_str')['modified_term_str'])

In [41]:
TOKENS 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str,term_str,modified_term_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,0,The,the,the
1,0,0,1,event,event,event
1,0,0,2,on,on,on
1,0,0,3,which,which,which
1,0,0,4,this,this,this
...,...,...,...,...,...,...
29,86,0,7,Frankenstein,frankenstein,frankenstein
29,86,0,8,by,by,by
29,86,0,9,Mary,mary,mary
29,86,0,10,W,w,w


In [42]:
TOKENS[TOKENS.modified_term_str == '<UNK>'].sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str,term_str,modified_term_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
23,20,0,1,3,3,<UNK>
15,3,8,5,er,er,<UNK>
26,45,5,6,du,du,<UNK>
5,34,0,1,19,19,<UNK>
15,3,8,4,ne,ne,<UNK>


In [43]:
TOKENS['book_id'] = 0
TOKENS.reset_index(inplace=True)
TOKENS.set_index(['book_id','chap_num', 'para_num', 'sent_num', 'token_num'], inplace=True)
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str,modified_term_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,0,0,0,The,the,the
0,1,0,0,1,event,event,event
0,1,0,0,2,on,on,on
0,1,0,0,3,which,which,which
0,1,0,0,4,this,this,this
0,...,...,...,...,...,...,...
0,29,86,0,7,Frankenstein,frankenstein,frankenstein
0,29,86,0,8,by,by,by
0,29,86,0,9,Mary,mary,mary
0,29,86,0,10,W,w,w


In [44]:
def token_to_padded(token, grouper=['sent_num'], term_str='term_str'):
    ohco = token.index.names # We preserve these since they get lost in the shuffle
    padded = token.groupby(grouper)\
        .apply(lambda x: '<s> ' + ' '.join(x[term_str]) + ' </s>')\
        .apply(lambda x: pd.Series(x.split()))\
        .stack().to_frame('term_str')
    padded.index.names = ohco
    return padded

In [45]:
PADDED = token_to_padded(TOKENS, grouper=OHCO[:4], term_str='modified_term_str')

In [46]:
PADDED

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,term_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1
0,1,0,0,0,<s>
0,1,0,0,1,the
0,1,0,0,2,event
0,1,0,0,3,on
0,1,0,0,4,which
0,...,...,...,...,...
0,29,86,0,11,w
0,29,86,0,12,</s>
0,29,86,1,0,<s>
0,29,86,1,1,shelley


In [47]:
def padded_to_ngrams(padded, grouper=['sent_num'], n=2):
    
    ohco = padded.index.names
    ngrams = padded.groupby(grouper)\
        .apply(lambda x: pd.concat([x.shift(0-i) for i in range(n)], axis=1))\
        .reset_index(drop=True)
    ngrams.index = padded.index
    ngrams.columns = widx

    # ngrams = pd.concat([padded.shift(0-i) for i in range(n)], axis=1)
    # ngrams.index.name = 'ngram_num'
    # ngrams.columns = widx
    # ngrams = ngrams.fillna('<EOF>')
    
    return ngrams

In [48]:
NGRAMS = padded_to_ngrams(PADDED, OHCO[:4], ngrams)

In [49]:
NGRAMS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,w0,w1,w2
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,0,0,0,<s>,the,event
0,1,0,0,1,the,event,on
0,1,0,0,2,event,on,which
0,1,0,0,3,on,which,this
0,1,0,0,4,which,this,fiction
0,...,...,...,...,...,...,...
0,29,86,0,11,w,</s>,
0,29,86,0,12,</s>,,
0,29,86,1,0,<s>,shelley,</s>
0,29,86,1,1,shelley,</s>,


In [50]:
def ngrams_to_models(ngrams):
    global widx
    n = len(ngrams.columns)
    model = [None for i in range(n)]
    for i in range(n):
        if i == 0:
            model[i] = ngrams.value_counts('w0').to_frame('n')
            model[i]['p'] = model[i].n / model[i].n.sum()
            model[i]['i'] = np.log2(1/model[i].p)
        else:
            model[i] = ngrams.value_counts(widx[:i+1]).to_frame('n')    
            model[i]['cp'] = model[i].n / model[i-1].n
            model[i]['i'] = np.log2(1/model[i].cp)
        model[i] = model[i].sort_index()
    return model

In [51]:
M = ngrams_to_models(NGRAMS)

In [52]:
M[2]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,cp,i
w0,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11th,17,</s>,1,1.0,0.0
11th,the,passage,1,1.0,0.0
12th,17,</s>,1,1.0,0.0
13th,17,</s>,1,1.0,0.0
1816,in,the,1,1.0,0.0
...,...,...,...,...,...
youthful,lovers,have,1,0.5,1.0
youthful,lovers,while,1,0.5,1.0
zeal,modern,philosophers,1,1.0,0.0
zeal,of,felix,1,0.5,1.0


## Question 1: List six words that precede the word "monster," excluding stop words (and sentence boundary markers). Stop words include 'a', 'an', 'the', 'this', 'that', etc.

In [53]:
stop_words = {'a', 'an', 'the', 'this', 'that', 'some'}

monster_rows = M[2].xs('monster', level='w2')

w1_words = monster_rows.index.get_level_values('w1')

# Filter out stop words
preceding_words = pd.DataFrame({'w1': w1_words})

filtered_words = preceding_words[~preceding_words['w1'].isin(stop_words)]

# Get the top 6 results
top_6_words = filtered_words.head(6)
top_6_words

Unnamed: 0,w1
0,abhorred
1,hideous
3,detestable
4,gigantic
5,hellish
20,miserable


The Top 6 words that precede monster are "abhorred," "hideous," "detestable," "gigantic," "hellish," and "miserable."

## Question 2: List the following sentences in ascending order of bigram perplexity according to the language model generated from the text

In [54]:
def sentence_to_token(sent_list, file=True):
    
    # Convert list of sentences to dataframe
    if file:
        S = pd.read_csv("test_sentences.txt", header=None, names=['sent_str'])
    else:
        S = pd.DataFrame(sent_list, columns=['sent_str'])
    S.index.name = 'sent_num'
    
    # Convert dataframe of sentences to TOKEN with normalized terms
    K = S.sent_str.apply(lambda x: pd.Series(x.split())).stack().to_frame('token_str')
    K['term_str'] = K.token_str.str.replace(r"[\W_]+", "", regex=True).str.lower()
    K.index.names = ['sent_num', 'token_num']
    
    return S, K

In [55]:
test_sent, test_token  = sentence_to_token(["The monster is on the ice.",
"Flowers are happy things.",
"I have never seen the aurora borealis.",
"He never knew the love of a family."], file = False)

In [56]:
test_sent

Unnamed: 0_level_0,sent_str
sent_num,Unnamed: 1_level_1
0,The monster is on the ice.
1,Flowers are happy things.
2,I have never seen the aurora borealis.
3,He never knew the love of a family.


In [57]:
test_token

Unnamed: 0_level_0,Unnamed: 1_level_0,token_str,term_str
sent_num,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,The,the
0,1,monster,monster
0,2,is,is
0,3,on,on
0,4,the,the
0,5,ice.,ice
1,0,Flowers,flowers
1,1,are,are
1,2,happy,happy
1,3,things.,things


In [58]:
test_token.loc[~test_token.term_str.isin(M[0].index), 'term_str'] = "<UNK>"

In [59]:
test_token[test_token.term_str == '<UNK>'].value_counts('token_str')

token_str
aurora       1
borealis.    1
Name: count, dtype: int64

In [60]:
TEST_PADDED = token_to_padded(test_token)

In [61]:
TEST_PADDED.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,term_str
sent_num,token_num,Unnamed: 2_level_1
0,0,<s>
0,1,the
0,2,monster
0,3,is
0,4,on


In [62]:
TEST_NGRAMS = padded_to_ngrams(TEST_PADDED, 'sent_num', ngrams)

In [63]:
TEST_NGRAMS.loc[0]

Unnamed: 0_level_0,w0,w1,w2
token_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,<s>,the,monster
1,the,monster,is
2,monster,is,on
3,is,on,the
4,on,the,ice
5,the,ice,</s>
6,ice,</s>,
7,</s>,,


In [64]:
def test_model(model, ngrams, sents):
    
    global widx
    
    assert len(model) == len(ngrams.columns)
    
    n = len(model)
    ohco = ngrams.index.names
    
    R = []
    for i in range(n):
        T = ngrams.merge(M[i], on=widx[:i+1], how='left')
        T.index = ngrams.index
        T = T.reset_index().set_index(ohco + widx).i #.to_frame(f"i{i}")
        
        # This how we handle unseen combos
        T[T.isna()] = T.max()
        R.append(T.to_frame(f"i{i}"))
                
    return pd.concat(R, axis=1)

In [65]:
R = test_model(M,TEST_NGRAMS, test_sent)

In [66]:
def compute_perplexity(results, test_sents, n=3):
    for i in range(n):
        test_sents[f"pp{i}"] = np.exp2(results.groupby('sent_num')[f"i{i}"].mean())
    return test_sents

In [67]:
PP = compute_perplexity(R, test_sent)

In [68]:
PP

Unnamed: 0_level_0,sent_str,pp0,pp1,pp2
sent_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,The monster is on the ice.,115.958058,81.30231,70.158484
1,Flowers are happy things.,589.070304,538.786632,187.5
2,I have never seen the aurora borealis.,300.050936,38.891328,82.84911
3,He never knew the love of a family.,171.248897,137.673025,66.085222


In order of ascending bigram perplexity the sentences are as follows: "I have never seen the aurora borealis," "The monster is on the ice," "He never knew the love of a family," "Flowers are happy things."

## Question 3: Using the bigram model represented as a matrix, explore the relationship between bigram pairs using the following lists.

In [86]:
bigram_matrix = M[1]['n'].unstack(level='w1')

first_words = ['he', 'she']

second_words = ['said', 'heard']

selected_bigrams = bigram_matrix.loc[first_words, second_words]
selected_bigrams

w1,said,heard
w0,Unnamed: 1_level_1,Unnamed: 2_level_1
he,21.0,5.0
she,3.0,3.0


### This seems to indicate that male characters are doing more speaking and listening than female characters. While female characters speak and listen an equal amount, male characters are more likely to be speaking than listening. 

## Question 4: Generate 20 sentences using the generate_text() function. Display the results.

In [71]:
def generate_text(M, n=250):
    
    if len(M) < 3:
        raise ValueError("Must have trigram model generated.")
    
    # Start list of words
    first_word = M[1].loc['<s>'].sample(weights='cp').index[0]
    
    words = ['<s>', first_word]
    
    for i in range(n):
        
        bg = tuple(words[-2:])

        # Try trigram model
        try:
            next_word = M[2].loc[bg].sample(weights='cp').index[0]

        # If not found in model, back off ...
        except KeyError as e1:
            try:
                # Get the last word in the bigram
                ug = bg[1]
                next_word = M[1].loc[ug].sample(weights='cp').index[0]
            
            except KeyError as e2:
                next_word = M[0].sample(weights='p').index[0]
                
        words.append(next_word)
    
    
    text = ' '.join(words[2:])
    print('\n\n'.join([str(i+1) + ' ' + line.replace('<s>','')\
        .strip().upper() for i, line in enumerate(text.split('</s>'))]))

In [72]:
generate_text(M)

1 MILDNESS OF HIS SISTER THE SICKENING OPPRESSION OF THE FOLDS OF THE ENQUIRER SEEMED TO SET A CROWN OF DISTINCTION ON HER MOTHER S HOUSE

2 AFTER MY ARRIVAL THEY HAD EVER BEEN IRKSOME TO ME

3 CONSCIENCE AND HEEDED THE STINGS OF REMORSE POISON THE AIR WAS COLD AND THE SEA BECAME FREE FROM BREAKERS

4 OF THE GRAVE WORMS CRAWLING IN THE MELANCHOLY OF HIS THREATS

5 REMEMBERED THE EFFECT THAT THE LETTERS WERE OFTEN PROLONGED BY THE STRANGE SYSTEM OF HUMAN FEELING HAVE RESULTED IN THE GREATEST DELIGHT TO SEE ME AND MY PULSE BEAT WITH A SENSATION OF HELPLESSNESS SO OFTEN PRESENT TO MY OTHER SENSATIONS

6 CONTINUED M

7 YEARS HE HAD FORMED AND ENDUED WITH ANIMATION COULD NOT SLEEP

8 TO FORM ANOTHER BEING OF WHOSE DISPOSITIONS I WAS FIRMLY CONVINCED IN MY OWN FOOD AND BROUGHT THE BODY OF MY VISIT

9 

10 KIND OF CANOPY WHILE WE ENJOYED

11 OR STILL LINGERED IN THE TOWN WERE ALREADY SHUT

12 ONLY THE APPARENTLY INNOCENT ONE OF THESE MEN WERE MOVED

13 

14 DEATH AND IGNOMINY

15 WHICH WAS IN

## Question 5: Compute the redundancy R for each of the n-gram models using the MLE of the joint probability of each ngram type. 

In [82]:
def compute_redundancy(M, n, V):
    
    N = V ** (n + 1)

    H_max = np.log2(N)

    M["p"] = M.n / M.n.sum()

    H = -(M["p"] * np.log2(M["p"])).sum()

    # Compute redundancy
    R = 1 - (H / H_max)

    return R

In [83]:
V = len(M[0].index)

In [84]:
redundancies = {n: compute_redundancy(M[n], n, V) for n in range(3)}

In [85]:
redundancies

{0: 0.30901805293509155, 1: 0.44681094038512525, 2: 0.5876973150492535}

### As the choice of n-gram size increases, the model redundancy also inreases.