<a href="https://colab.research.google.com/github/mr-alamdari/NLP-ChatBots/blob/main/NLP_ChatBots.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf

In [6]:
!wget https://raw.githubusercontent.com/mr-alamdari/NLP-ChatBots/main/train_qa.txt

--2022-05-02 06:13:25--  https://raw.githubusercontent.com/mr-alamdari/NLP-ChatBots/main/train_qa.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3804342 (3.6M) [application/octet-stream]
Saving to: ‘train_qa.txt’


2022-05-02 06:13:26 (51.9 MB/s) - ‘train_qa.txt’ saved [3804342/3804342]



In [5]:
!wget https://raw.githubusercontent.com/mr-alamdari/NLP-ChatBots/main/test_qa.txt

--2022-05-02 06:12:59--  https://raw.githubusercontent.com/mr-alamdari/NLP-ChatBots/main/test_qa.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 377233 (368K) [application/octet-stream]
Saving to: ‘test_qa.txt’


2022-05-02 06:12:59 (10.7 MB/s) - ‘test_qa.txt’ saved [377233/377233]



In [7]:
with open('train_qa.txt', 'rb') as f:
  train_data = pickle.load(f)

In [8]:
with open('test_qa.txt', 'rb') as f:
  test_data = pickle.load(f)

In [10]:
len(train_data)

10000

In [11]:
len(test_data)

1000

In [19]:
' '.join(train_data[10][0])

'Sandra went back to the hallway . Sandra moved to the office .'

In [20]:
' '.join(train_data[10][1])

'Is Sandra in the office ?'

In [22]:
train_data[10][2]

'yes'

In [24]:
all_data = train_data + train_data

In [29]:
vocab = set(['yes', 'no'])
for story, question, answer in all_data:
  vocab = vocab.union(set(story))
  vocab = vocab.union(set(question))

In [33]:
vocab_len = len(vocab) + 1

In [34]:
vocab_len

38

In [36]:
all_story_length = [len(data[0]) for data in all_data]

In [37]:
max_story_len = max(all_story_length)

In [38]:
max_story_len

156

In [39]:
all_question_length = [len(data[1]) for data in all_data]

In [40]:
max_question_length = max(all_question_length)

In [41]:
max_question_length

6

In [44]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(vocab)

In [45]:
tokenizer.word_index

{'apple': 1,
 'back': 22,
 'bathroom': 35,
 'bedroom': 14,
 'daniel': 24,
 'discarded': 28,
 'down': 9,
 'dropped': 11,
 'football': 16,
 'garden': 19,
 'got': 2,
 'grabbed': 4,
 'hallway': 6,
 'in': 26,
 'is': 30,
 'john': 33,
 'journeyed': 10,
 'kitchen': 31,
 'left': 27,
 'mary': 18,
 'milk': 8,
 'moved': 29,
 'no': 7,
 'office': 32,
 'picked': 34,
 'put': 3,
 'sandra': 5,
 'the': 12,
 'there': 15,
 'to': 20,
 'took': 13,
 'travelled': 25,
 'up': 17,
 'went': 23,
 'yes': 21}

In [46]:
train_story_text = []
train_question_text = []
train_answers = []

In [47]:
for story, question, answer in train_data:
  train_story_text.append(story)
  train_question_text.append(question)
  train_answers.append(answer)

In [49]:
train_story_text[20]

['Daniel',
 'got',
 'the',
 'apple',
 'there',
 '.',
 'John',
 'picked',
 'up',
 'the',
 'football',
 'there',
 '.',
 'Daniel',
 'left',
 'the',
 'apple',
 '.',
 'Daniel',
 'moved',
 'to',
 'the',
 'kitchen',
 '.']

In [50]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [51]:
train_story_seq[20]

[24, 2, 12, 1, 15, 33, 34, 17, 12, 16, 15, 24, 27, 12, 1, 24, 29, 20, 12, 31]

In [52]:
len(train_story_seq), len(train_story_text)

(10000, 10000)

In [68]:
def vectorize_texts(data, max_story_len=max_story_len, max_question_len=max_question_length, word_index=tokenizer.word_index, vocab_len=vocab_len):
  stories = []
  questions = []
  answers = []

  for story, question, answer in data:
    s = [word_index[word.lower()] for word in story if word.isalpha()]
    q = [word_index[word.lower()] for word in question if word.isalpha()]
    y = np.zeros(vocab_len)
    y[word_index[answer]] = 1

    stories.append(s)
    questions.append(q)
    answers.append(y)
  
  return (tf.keras.preprocessing.sequence.pad_sequences(stories, maxlen=max_story_len),
          tf.keras.preprocessing.sequence.pad_sequences(questions, maxlen=max_question_length),
          np.array(answers))

In [69]:
data = train_data
inputs_train, questions_train, answers_train = vectorize_texts(data)

In [70]:
data = test_data
inputs_test, questions_test, answers_test = vectorize_texts(data)

In [71]:
inputs_train

array([[ 0,  0,  0, ..., 20, 12, 14],
       [ 0,  0,  0, ..., 20, 12,  6],
       [ 0,  0,  0, ..., 20, 12, 35],
       ...,
       [ 0,  0,  0, ..., 20, 12, 14],
       [ 0,  0,  0, ..., 12,  8, 15],
       [ 0,  0,  0, ..., 12,  1, 15]], dtype=int32)

In [72]:
questions_train

array([[ 0, 30,  5, 26, 12,  6],
       [ 0, 30, 24, 26, 12, 35],
       [ 0, 30, 24, 26, 12, 32],
       ...,
       [ 0, 30,  5, 26, 12,  6],
       [ 0, 30, 18, 26, 12, 31],
       [ 0, 30, 18, 26, 12, 14]], dtype=int32)

In [73]:
answers_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [75]:
len(answers_test)

1000

In [76]:
sum(answers_test)

array([  0.,   0.,   0.,   0.,   0.,   0.,   0., 503.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 497.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.])