In [1]:
import os
import pandas as pd
import numpy as np
from language_helper.markov.model import MarkovNgrams
from language_helper.markov.data import Processing

  from .autonotebook import tqdm as notebook_tqdm


### 1. Ingest Dataset

In [2]:
# Location of the raw dialogue corpus, assembled as <path>/<domain>/<file>.
path, domain, file = "dataset/", "chat", "dialogs.txt"
file_path = os.path.join(path, domain, file)

In [3]:
# Read the raw dialogue file into `rows`, one cleaned string per line.
# The encoding is pinned so the result does not depend on the machine's locale,
# and the handle gets its own name so the `file` string (the file name) is not
# overwritten with a closed file object once the `with` block exits.
with open(file_path, mode="r", encoding="utf-8") as dialog_file:
    # Flatten tabs to single spaces and drop the line terminator.
    rows = [line.replace("\t", " ").replace("\n", "") for line in dialog_file]


In [4]:
# Load the chat-domain corpus through the project's Processing helper.
# NOTE(review): "chat" repeats the value of `domain` set in the path cell — keep the two in sync.
processor = Processing(data_domain="chat")
# load() returns the frame that prepare() consumes; the recorded output reports a single "data" column.
dataframe = processor.load()
# Prepare the "data" column. Later cells read processor.list_of_tokens,
# presumably populated by this call — TODO confirm against Processing.
processor.prepare(dataframe, "data")

# Optional diagnostic, left disabled:
# print(f"Size of Dataset: {len(processor.list_of_tokens)}")

[nltk_data] Downloading package punkt to /Users/mynguyen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading...<_io.TextIOWrapper name='./dataset/chat/dialogs.txt' mode='r' encoding='UTF-8'>
3725 rows and 1 columns appended
Completed: 3725 rows and 1 columns loaded
Columns Index(['data'], dtype='object')


### 2. Markov Chain

In [5]:
# Inspect the first prepared sentence: a list of lowercase tokens that ends with the '</s>' marker (see output).
processor.list_of_tokens[0]

['yes',
 'ive',
 'never',
 'been',
 'there',
 'i',
 'was',
 'there',
 'when',
 'i',
 'was',
 'a',
 'kid',
 '</s>']

In [6]:
# Build a Markov model with n_grams=3 over every tokenised sentence; the recorded
# output shows two context words paired with one target word per step.
# NOTE(review): make_model prints a debug line for each n-gram, which floods the cell output.
markov = MarkovNgrams()
markov.make_model(batch_sentences=processor.list_of_tokens, n_grams=3)

last word should be the next word used for prediction, or target: never
prev words ('yes', 'ive')
last word should be the next word used for prediction, or target: been
prev words ('ive', 'never')
last word should be the next word used for prediction, or target: there
prev words ('never', 'been')
last word should be the next word used for prediction, or target: i
prev words ('been', 'there')
last word should be the next word used for prediction, or target: was
prev words ('there', 'i')
last word should be the next word used for prediction, or target: there
prev words ('i', 'was')
last word should be the next word used for prediction, or target: when
prev words ('was', 'there')
last word should be the next word used for prediction, or target: i
prev words ('there', 'when')
last word should be the next word used for prediction, or target: was
prev words ('when', 'i')
last word should be the next word used for prediction, or target: a
prev words ('i', 'was')
last word should be the next w

In [7]:
# Seed text for the prediction. Named `seed_text` rather than `input` so the
# built-in input() is not shadowed for the rest of the notebook session.
seed_text = "pretty"
# predict_next_word returns the selected word together with a second value,
# presumably its probability — TODO confirm against MarkovNgrams.
chosen_word, p_chosen_word = markov.predict_next_word(input=seed_text)

['pretty']


In [8]:
# Show the word the model selected for the seed "pretty".
chosen_word

'good'

In [12]:
# Candidate next words held by the model; in the recorded output each value is
# [count, count / total], and the second entries sum to 1.
# NOTE(review): presumably this reflects the most recent predict_next_word call — confirm.
markov.result_hash_map

{'good': [2, 0.08333333333333333],
 '.': [5, 0.20833333333333334],
 'light': [1, 0.041666666666666664],
 'talented': [1, 0.041666666666666664],
 'busy': [1, 0.041666666666666664],
 'awesome': [1, 0.041666666666666664],
 'big': [2, 0.08333333333333333],
 'old': [3, 0.125],
 'woman': [2, 0.08333333333333333],
 'women': [1, 0.041666666666666664],
 '?': [1, 0.041666666666666664],
 'eyes': [1, 0.041666666666666664],
 'hungry': [1, 0.041666666666666664],
 'fast': [1, 0.041666666666666664],
 'simple': [1, 0.041666666666666664]}