In [1]:
import os
import pandas as pd
import numpy as np
from language_helper.markov.model import MarkovNgrams
from language_helper.data import Processing

  from .autonotebook import tqdm as notebook_tqdm


### 1. Ingest Dataset

In [2]:
# Location of the raw dialogue corpus, assembled as <path>/<domain>/<file>.
path, domain, file = "dataset/", "chat", "dialogs.txt"
file_path = os.path.join(path, domain, file)

In [3]:
# Read the corpus into `rows`, one cleaned string per line of the file:
# tabs are replaced by a single space and the newline character is removed.
#
# The handle gets its own name (`corpus_file`) so that the `file` filename string
# defined in the path cell is not overwritten by a closed file object, and the
# encoding is pinned so decoding does not depend on the platform's locale default.
with open(file_path, mode="r", encoding="utf-8") as corpus_file:
    rows = [line.replace("\t", " ").replace("\n", "") for line in corpus_file]


In [4]:
# Preview the first ten cleaned rows.
rows[:10]

["hi, how are you doing? i'm fine. how about yourself?",
 "i'm fine. how about yourself? i'm pretty good. thanks for asking.",
 "i'm pretty good. thanks for asking. no problem. so how have you been?",
 "no problem. so how have you been? i've been great. what about you?",
 "i've been great. what about you? i've been good. i'm in school right now.",
 "i've been good. i'm in school right now. what school do you go to?",
 'what school do you go to? i go to pcc.',
 'i go to pcc. do you like it there?',
 "do you like it there? it's okay. it's a really big campus.",
 "it's okay. it's a really big campus. good luck with school."]

In [5]:
# NLTK supplies the sentence splitter used in the next cell; its Punkt models
# have to be present locally, so request them here.
import nltk
from nltk import word_tokenize

nltk.download("punkt")


[nltk_data] Downloading package punkt to /Users/mynguyen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Keep only the first sentence that the tokenizer finds in each row.
# Rows that produce no sentences at all (e.g. a blank line in the corpus) are
# skipped: indexing [0] on an empty tokenizer result would raise IndexError.
# NOTE: because of that skip, `sentences` may be shorter than `rows`.
sentences = [
    row_sentences[0]
    for row_sentences in map(nltk.sent_tokenize, rows)
    if row_sentences
]
sentences[:10]

['hi, how are you doing?',
 "i'm fine.",
 "i'm pretty good.",
 'no problem.',
 "i've been great.",
 "i've been good.",
 'what school do you go to?',
 'i go to pcc.',
 'do you like it there?',
 "it's okay."]

### 2. Markov Chain

In [7]:
# Build the Markov model from the extracted sentences using 3-grams
# (the log below shows two preceding words paired with a target word).
markov = MarkovNgrams()
markov.make_model(n_grams=3, batch_sentences=sentences)

last word should be the next word used for prediction, or target: how
prev words ('hi', ',')
last word should be the next word used for prediction, or target: are
prev words (',', 'how')
last word should be the next word used for prediction, or target: you
prev words ('how', 'are')
last word should be the next word used for prediction, or target: doing
prev words ('are', 'you')
last word should be the next word used for prediction, or target: ?
prev words ('you', 'doing')
last word should be the next word used for prediction, or target: fine
prev words ('i', "'m")
last word should be the next word used for prediction, or target: .
prev words ("'m", 'fine')
last word should be the next word used for prediction, or target: pretty
prev words ('i', "'m")
last word should be the next word used for prediction, or target: good
prev words ("'m", 'pretty')
last word should be the next word used for prediction, or target: .
prev words ('pretty', 'good')
last word should be the next word used for

In [8]:
# Word to predict a follower for. It is named `seed_word` rather than `input`
# so the built-in input() is not shadowed for the rest of the kernel session.
seed_word = "pretty"

# `input=` is the keyword name used by predict_next_word; the call returns the
# chosen next word together with a second value (presumably its probability —
# confirm against MarkovNgrams).
chosen_word, p_chosen_word = markov.predict_next_word(input=seed_word)

['pretty']
good
good
.
light
.
talented
busy
awesome
big
big
old
old
.
woman
women
woman
.
?
.
eyes
hungry
fast
simple
old


In [9]:
# Display the next word returned by predict_next_word in the previous cell.
chosen_word

'.'

In [12]:
# Inspect the candidate table held on the model. Judging by the output below,
# each candidate next word maps to [count, count / total candidates] —
# TODO confirm against the MarkovNgrams implementation.
markov.result_hash_map

{'good': [2, 0.08333333333333333],
 '.': [5, 0.20833333333333334],
 'light': [1, 0.041666666666666664],
 'talented': [1, 0.041666666666666664],
 'busy': [1, 0.041666666666666664],
 'awesome': [1, 0.041666666666666664],
 'big': [2, 0.08333333333333333],
 'old': [3, 0.125],
 'woman': [2, 0.08333333333333333],
 'women': [1, 0.041666666666666664],
 '?': [1, 0.041666666666666664],
 'eyes': [1, 0.041666666666666664],
 'hungry': [1, 0.041666666666666664],
 'fast': [1, 0.041666666666666664],
 'simple': [1, 0.041666666666666664]}