In [1]:
# references https://www.analyticsvidhya.com/blog/2020/08/build-a-natural-language-generation-nlg-system-using-pytorch/
# https://towardsdatascience.com/next-word-prediction-with-nlp-and-deep-learning-48b9fe0a17bf
# https://www.kaggle.com/code/ashishpatel26/beginner-to-intermediate-nlp-tutorial

import os
import pandas as pd
import numpy as np

### 1. Ingest Dataset

In [2]:
# define file path to load
path = "../dataset/"
domain = "chat"
file = "dialogs.txt"
file_path = os.path.join(path,domain,file)

In [3]:
# read text data
with open(file_path, mode="r") as file:
    rows = file.readlines()
    rows = [row.replace("\t"," ").replace("\n", "") for row in rows]


In [4]:
rows[0:10]

["hi, how are you doing? i'm fine. how about yourself?",
 "i'm fine. how about yourself? i'm pretty good. thanks for asking.",
 "i'm pretty good. thanks for asking. no problem. so how have you been?",
 "no problem. so how have you been? i've been great. what about you?",
 "i've been great. what about you? i've been good. i'm in school right now.",
 "i've been good. i'm in school right now. what school do you go to?",
 'what school do you go to? i go to pcc.',
 'i go to pcc. do you like it there?',
 "do you like it there? it's okay. it's a really big campus.",
 "it's okay. it's a really big campus. good luck with school."]

### 2. Morphological Analysis

### 3. Lexical Analysis

In [5]:
import nltk
from nltk import word_tokenize
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/mynguyen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
sentences = [nltk.sent_tokenize(row)[0] for row in rows]

In [7]:
sentences[:10]

['hi, how are you doing?',
 "i'm fine.",
 "i'm pretty good.",
 'no problem.',
 "i've been great.",
 "i've been good.",
 'what school do you go to?',
 'i go to pcc.',
 'do you like it there?',
 "it's okay."]

In [8]:
# for each sentences
word_token = nltk.word_tokenize(sentences[0])



In [9]:
word_token

['hi', ',', 'how', 'are', 'you', 'doing', '?']

In [10]:
dataframe = pd.DataFrame(sentences, columns=["text"]).reset_index()

In [11]:
dataframe.value_counts()

index  text                                          
0      hi, how are you doing?                            1
2488   how much were they?                               1
2476   yeah.                                             1
2477   i want to be a baseball player when i grow up.    1
2478   me too.                                           1
                                                        ..
1246   some men do, but not me.                          1
1247   i'm watching you.                                 1
1248   i'm an open book.                                 1
1249   if i catch you, you'll be sorry.                  1
3724   but i do all my writing with my right hand.       1
Length: 3725, dtype: int64

In [12]:
tokens = word_tokenize(rows[0])
print(f"Input: {rows[0]}")
print(f"token: {tokens}")

Input: hi, how are you doing? i'm fine. how about yourself?
token: ['hi', ',', 'how', 'are', 'you', 'doing', '?', 'i', "'m", 'fine', '.', 'how', 'about', 'yourself', '?']


### 4. Syntactic Analysis
- deal with grammatical **structure** and **relation** of words

### 5. Semantic Representations

# 3. Training

## 3.1 Markov Chain

In [13]:
from nltk import ngrams
from typing import List

hash_map = {}
# for each sentence in sentences
# generate ngrams

def matching_set(ngrams_model, tokens):
  """
  TODO
  """
  match_grams = []
  count = 0
  for grams in ngrams_model:
    print(f"compare {tokens} with {grams[:len(tokens)]}")

    if grams[:len(tokens)] == tokens:
      match_grams.append(grams)
      count += 1
  return match_grams

def markov_next_word(match_grams):
    """
    TODO
    """
    chosen_word = ""
    p_chosen_word = -1
    # P(next_word | prev_word)
    for candidate in match_grams:
        next_word = candidate[-1]
        if next_word not in hash_map.keys():
            hash_map[next_word] = (1, 1/len(match_grams))
        else:
            hash_map[next_word][0] += 1
            hash_map[next_word][1] = hash_map[next_word][0]/len(match_grams)
        if hash_map[next_word][1] > p_chosen_word:
            chosen_word = next_word
            p_chosen_word = hash_map[next_word][1]
        elif hash_map[next_word][1] > p_chosen_word:
            np.random.choice([chosen_word, next_word])
        print(next_word)
    return (chosen_word, p_chosen_word)

def make_model(batch_sentences: List[str], n: int):
  # making models
  model = []
  for sentence in batch_sentences:
    n_grams = ngrams(nltk.word_tokenize(sentence), n)
    for grams in n_grams:
      model.append(grams)
      print(f"last word should be the next word used for prediction, or target: {grams[n-1]}")
      print(f"prev words {grams[:n-1]}")
  return model


In [14]:
n_grams = 3

# before getting to this stage, must clean up the data set to remove
# punctuation marks
# standardize word, for example I'm --> I am

test_sentences = ["hi how are you doing",
            "hi how the kids"]
model = make_model(test_sentences, n_grams)

last word should be the next word used for prediction, or target: are
prev words ('hi', 'how')
last word should be the next word used for prediction, or target: you
prev words ('how', 'are')
last word should be the next word used for prediction, or target: doing
prev words ('are', 'you')
last word should be the next word used for prediction, or target: the
prev words ('hi', 'how')
last word should be the next word used for prediction, or target: kids
prev words ('how', 'the')


In [15]:
model

[('hi', 'how', 'are'),
 ('how', 'are', 'you'),
 ('are', 'you', 'doing'),
 ('hi', 'how', 'the'),
 ('how', 'the', 'kids')]

In [16]:
input_string = "hello hi how"
match_grams = matching_set(model, tuple(nltk.word_tokenize(input_string))[-(n_grams-1):])

compare ('hi', 'how') with ('hi', 'how')
compare ('hi', 'how') with ('how', 'are')
compare ('hi', 'how') with ('are', 'you')
compare ('hi', 'how') with ('hi', 'how')
compare ('hi', 'how') with ('how', 'the')


In [17]:
match_grams

[('hi', 'how', 'are'), ('hi', 'how', 'the')]

In [18]:
chosen_word, p_chosen_word = markov_next_word(match_grams=match_grams)
hash_map
print("Hash Map", hash_map)
print(f"Predict next word '{chosen_word}' with  {p_chosen_word}")

are
the
Hash Map {'are': (1, 0.5), 'the': (1, 0.5)}
Predict next word 'are' with  0.5


## 3.2 Markov Language Model Evaluation (Perplexity)

## 3.3. LSTM

Note:
+ Pytorch’s LSTM expects all of its inputs to be 3D tensors
+ semantics of the axes of these tensors is important  
-> first axis is the sequence itself,  
-> the second indexes instances in the mini-batch,   
-> the third indexes elements of the input 

#### 3.3.1 Define NN model layers

In [27]:
import torch.nn as nn
import torch
from torch.autograd import Variable
from torch import FloatTensor, randn, zeros

# optimization function
import torch.optim as optim

"""
From Deep Learning for NLP and Speech basic pytorch example
http://seba1511.net/tutorials/intermediate/char_rnn_classification_tutorial.html
"""
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        # self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax()

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return Variable(torch.zeros(1, self.hidden_size))


#### 3.3.2 Training

In [28]:

dtype = FloatTensor
N, input_size, hidden_size, output_size = 64, 10, 10, 10
data = randn(input_size, hidden_size)



In [29]:
rnn = RNN(input_size=input_size)
# optimizer = optim.Adam(rnn.parameters(), lr=learning_rate, weight_decay=1e-5)

In [32]:

input = Variable(data)
hidden = Variable(zeros(1, rnn.hidden_size))

