In [1]:

# The following application is a simple chatbot, called Friday, which is able to respond to a series of simple
# questions, using as training set a json file containing sample sentences about 19 topics.
# The pre-processing of the sentences has been done using the nltk module for natural language manipulation:
# the sentences have first been tokenized and then stemmed with the Lancaster stemmer.
# The model used is a fully connected dense neural network with 2 hidden layers: first the model is trained on
# the json file data, and then it is used to classify the sentence typed by the user and respond accordingly.
## GIACOMO PERONI



In [None]:

!pip install tflearn
import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
nltk.download('punkt')
import tensorflow as tf
import numpy as np
import tflearn
import random
import json

with open('intents.json') as file:
  data = json.load(file)

In [3]:
  ## DATA PREPROCESSING:
  ## Turn every sample pattern of data['intents'] into a bag-of-words row (`training`)
  ## paired with a one-hot row identifying its intent tag (`output`).

  words = []    # every token found in the patterns (stemmed and de-duplicated below)
  labels = []   # distinct intent tags
  docs_x = []   # tokenized patterns, one list of tokens per pattern
  docs_y = []   # tag of the pattern stored at the same position in docs_x

  for intent in data['intents']:
    for pattern in intent['patterns']:
      ## the tokenizing process breaks the sentences into single words
      tokens = nltk.word_tokenize(pattern)
      words.extend(tokens)
      docs_x.append(tokens)
      docs_y.append(intent['tag'])

    if intent['tag'] not in labels:
      labels.append(intent['tag'])

  ## the stemming process extracts the root/base form of the words e.g. goes, gone, go --> go
  ## question marks are dropped with an equality test (`w not in '?'` would be a substring test)
  words = sorted({stemmer.stem(w.lower()) for w in words if w != '?'})
  ##print(words)

  labels = sorted(labels)

  ## preparing the sets needed for the model
  training = []
  output = []

  for doc, tag in zip(docs_x, docs_y):
    ## bag of words: a vector as long as the vocabulary, holding 1 where the word at
    ## that index occurs in the sentence and 0 elsewhere.
    ## Tokens are lower-cased before stemming, exactly as done when building `words`.
    stems = {stemmer.stem(w.lower()) for w in doc}
    training.append([1 if w in stems else 0 for w in words])

    ## one hot encoding of the label: for every sentence, the category
    ## (greetings, farewells, shop etc.) it belongs to
    row = [0] * len(labels)
    row[labels.index(tag)] = 1
    output.append(row)

  training = np.array(training)
  output = np.array(output)

## MODEL DEFINITION:  


  ## this defines the model expectation about the shape and size of the input
  net = tflearn.input_data(shape=[None,len(training[0])])
## here we define the layers of the neural network
  net = tflearn.fully_connected(net,8)
  net = tflearn.fully_connected(net,8)
  net = tflearn.fully_connected(net,len(output[0]), activation='softmax')
  net = tflearn.regression(net)

  model = tflearn.DNN(net)

model.fit(training,output, n_epoch=300, batch_size=8 ,show_metric=True)


## bag-of-words encoding: 0/1 vector marking which vocabulary words occur in the sentence
def bag_of_words(s,words):
    """Encode sentence `s` as a 0/1 vector aligned with the vocabulary `words`.

    The sentence is tokenized with nltk; every token is lower-cased and stemmed
    with the module-level `stemmer`. Entry i of the result is 1 when `words[i]`
    equals one of those stems and 0 otherwise.

    Returns:
        A numpy array with one entry per element of `words`.
    """
    stems = {stemmer.stem(token.lower()) for token in nltk.word_tokenize(s)}
    return np.array([1 if w in stems else 0 for w in words])

## CHATBOT FUNCTION:
def chat(threshold=0.6):
    """Run an interactive console loop that answers the user with the trained model.

    Every line typed by the user is encoded with `bag_of_words`, classified by
    `model`, and answered with a random entry from the 'responses' of the
    predicted intent in `data['intents']`. The loop stops when the user types
    'quit' (case-insensitive, surrounding whitespace ignored).

    Args:
        threshold: the predicted probability must be strictly greater than this
            value for the classification to be accepted; otherwise the bot asks
            the user to repeat.
    """
    print('Start talking with the bot (type quit to stop)')
    while True:
        inp = input('You:  ')
        if inp.strip().lower() == 'quit':
            break

        ## PREDICTING THE LABEL OF THE USER'S SENTENCE
        ## the bag is wrapped in a list because predict takes a batch;
        ## [0] selects the scores of that single sentence
        scores = model.predict([bag_of_words(inp, words)])[0]
        result_index = int(np.argmax(scores))
        tag = labels[result_index]

        ## reset on every turn so a previous answer list can never be reused
        answers = None
        ## the prediction is considered "valid" only above the threshold
        if scores[result_index] > threshold:
            for t in data['intents']:
                if t['tag'] == tag:
                    answers = t['responses']

        if answers:
            print(random.choice(answers))
        else:
            ## low confidence, unknown tag, or an intent without responses
            print("sorry, I didn't understand, can you repeat?")

## start the interactive session; chat() keeps prompting until the user types 'quit'
chat()


## The model has some issues, such as the fact that it needs to be trained every time the notebook runs;
## this could be fixed by saving the trained model in the repository and loading it each time. Other problems
## concern the limited amount of data available and the generality of the answers, aspects
## that can be improved with solutions that are beyond the scope of this application.
      




 




Training Step: 2999  | total loss: [1m[32m0.29318[0m[0m | time: 0.044s
| Adam | epoch: 300 | loss: 0.29318 - acc: 0.9735 -- iter: 72/75
Training Step: 3000  | total loss: [1m[32m0.28420[0m[0m | time: 0.047s
| Adam | epoch: 300 | loss: 0.28420 - acc: 0.9762 -- iter: 75/75
--
Start talking with the bot (type quit to stop)
You:  quit
