### Chatbot

In [21]:
import random 

import joblib

import sys
import os


import random
random.seed(42)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve

import matplotlib.pyplot as plt

import keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, Sequential
from tensorflow.keras.layers import Input, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, concatenate, Dropout
from tensorflow.keras.models import Model

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [22]:
df = pd.read_excel(r"training_chatbot.xlsx")
df.head()

Unnamed: 0,Intent type,Sentence
0,Greeting,Hi!
1,Greeting,Hello
2,Greeting,How are you?
3,Greeting,"Hi, how is it going?"
4,Greeting,Greetings!


In [23]:
# Usar get_dummies en vez de las 3 funciones
df_oh = pd.concat([df, pd.get_dummies(df['Intent type'])], axis = 1)

In [24]:
df_oh.head()

Unnamed: 0,Intent type,Sentence,Greeting,Search,Suggestions
0,Greeting,Hi!,1,0,0
1,Greeting,Hello,1,0,0
2,Greeting,How are you?,1,0,0
3,Greeting,"Hi, how is it going?",1,0,0
4,Greeting,Greetings!,1,0,0


In [25]:
df_oh.dtypes

Intent type    object
Sentence       object
Greeting        uint8
Search          uint8
Suggestions     uint8
dtype: object

In [26]:
df_train, df_test = train_test_split(df_oh, train_size = 0.7, test_size = 0.3, random_state = 42,
                                    shuffle = True, stratify = df_oh['Intent type'])

In [27]:
df_train['Intent type'].value_counts()

Greeting       21
Search         20
Suggestions    16
Name: Intent type, dtype: int64

In [28]:
df_test['Intent type'].value_counts()

Search         9
Greeting       9
Suggestions    7
Name: Intent type, dtype: int64

In [29]:
def processing(df, pretreatment = False, Tfidf = True, cv = None, stopwords = []):
  # Normalizamos y limpiamos el corpus 
  if pretreatment == True:
    df["Sentence"] = df['Sentence'].apply(lambda x: word_treatment(x))
    print("El corpus ha sido pretratado")
    
     # Transformamos nuestro corpus a un vector Tfidf
  if Tfidf == True:

    if cv == None:
      cv = TfidfVectorizer(
        stop_words= stopwords,
        ngram_range=(1, 4),
        strip_accents='ascii',
        max_df=0.99,
        min_df=0,
        max_features=100
      )
      cv.fit(df["Sentence"])
      X = cv.transform(df["Sentence"])
      print("Se ha realizado una vectorización Tfidf")
      return df, X, cv

    else:
      X = cv.transform(df["Sentence"])
      print("Se ha realizado una vectorización Tfidf basado en el corpus suministrado por cv")
    return df, X
  else:
    return df


In [30]:
df_train, X_train, cv = processing(df_train)
df_test, X_test = processing(df_test, cv = cv)

Se ha realizado una vectorización Tfidf
Se ha realizado una vectorización Tfidf basado en el corpus suministrado por cv


In [31]:
y_train = df_train[["Greeting","Search","Suggestions"]]
y_test = df_test[["Greeting","Search","Suggestions"]]

In [32]:
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, train_size=0.7, 
test_size=0.3, random_state=42, shuffle=True, stratify=df_train["Intent type"])

In [33]:
shape= X_train.shape[1]

X_train.sort_indices()
X_validation.sort_indices()

In [34]:
def mlp_greeting(shape):
# define our MLP network
    initializer = tf.keras.initializers.RandomUniform(minval=-0.5, maxval=0.5, seed=42)
    model = Sequential()
    model.add(Dense(16, input_dim=shape, kernel_initializer = initializer, activation="relu"))
    model.add(Dropout(0.25))
    model.add(Dense(8, activation="relu"))
    model.add(Dropout(0.25))
    model.add(Dense(4, activation="relu"))
    model.add(Dropout(0.25))
    model.add(Dense(2, activation="relu"))
# check to see if the regression node should be added
    #if regress:
    model.add(Dense(1, activation="sigmoid"))
    #Compile model
    opt = tf.keras.optimizers.Adam(lr = 0.001)
    model.compile(loss='binary_crossentropy', metrics = ["accuracy"], optimizer=opt)
# return our model
    return model

In [35]:
def mlp_search(shape):
# define our MLP network
    initializer = tf.keras.initializers.RandomUniform(minval=-0.5, maxval=0.5, seed=42)
    model = Sequential()
    model.add(Dense(16, input_dim=shape, kernel_initializer = initializer, activation="relu"))
    model.add(Dropout(0.25))
    model.add(Dense(8, activation="relu"))
    model.add(Dropout(0.25))
    model.add(Dense(4, activation="relu"))
    model.add(Dropout(0.25))
    model.add(Dense(2, activation="relu"))
# check to see if the regression node should be added
    #if regress:
    model.add(Dense(1, activation="sigmoid"))
    #Compile model
    opt = tf.keras.optimizers.Adam(lr = 0.001)
    model.compile(loss='binary_crossentropy', metrics = ["accuracy"], optimizer=opt)
# return our model
    return model

In [36]:
def mlp_suggestion(shape):
# define our MLP network
    initializer = tf.keras.initializers.RandomUniform(minval=-0.5, maxval=0.5, seed=42)
    model = Sequential()
    model.add(Dense(16, input_dim=shape, kernel_initializer = initializer, activation="relu"))
    model.add(Dropout(0.25))
    model.add(Dense(8, activation="relu"))
    model.add(Dropout(0.25))
    model.add(Dense(4, activation="relu"))
    model.add(Dropout(0.25))
    model.add(Dense(2, activation="relu"))
# check to see if the regression node should be added
    #if regress:
    model.add(Dense(1, activation="sigmoid"))
    #Compile model
    opt = tf.keras.optimizers.Adam(lr = 0.001)
    model.compile(loss='binary_crossentropy', metrics = ["accuracy"], optimizer = opt)
# return our model
    return model

In [37]:
mlp_greeting = mlp_greeting(shape)
history = mlp_greeting.fit(X_train, np.asarray(y_train["Greeting"]).reshape(-1,1),
                  validation_data=(X_validation, np.asarray(y_validation["Greeting"]).reshape(-1,1)),
    epochs=200,
    workers = 2, use_multiprocessing= True, verbose = 2)

Train on 39 samples, validate on 18 samples
Epoch 1/200
 - 4s - loss: 0.6925 - acc: 0.6410 - val_loss: 0.6898 - val_acc: 0.6111
Epoch 2/200
 - 0s - loss: 0.6897 - acc: 0.6410 - val_loss: 0.6894 - val_acc: 0.6111
Epoch 3/200
 - 0s - loss: 0.6941 - acc: 0.6410 - val_loss: 0.6891 - val_acc: 0.6111
Epoch 4/200
 - 0s - loss: 0.6857 - acc: 0.6410 - val_loss: 0.6887 - val_acc: 0.6111
Epoch 5/200
 - 0s - loss: 0.6881 - acc: 0.6410 - val_loss: 0.6882 - val_acc: 0.6111
Epoch 6/200
 - 0s - loss: 0.6852 - acc: 0.6410 - val_loss: 0.6877 - val_acc: 0.6111
Epoch 7/200
 - 0s - loss: 0.6892 - acc: 0.6410 - val_loss: 0.6871 - val_acc: 0.6111
Epoch 8/200
 - 0s - loss: 0.6867 - acc: 0.6410 - val_loss: 0.6865 - val_acc: 0.6111
Epoch 9/200
 - 0s - loss: 0.6851 - acc: 0.6410 - val_loss: 0.6858 - val_acc: 0.6111
Epoch 10/200
 - 0s - loss: 0.6866 - acc: 0.6410 - val_loss: 0.6851 - val_acc: 0.6111
Epoch 11/200
 - 0s - loss: 0.6824 - acc: 0.6410 - val_loss: 0.6844 - val_acc: 0.6111
Epoch 12/200
 - 0s - loss: 0.6

Epoch 97/200
 - 0s - loss: 0.5034 - acc: 0.6410 - val_loss: 0.5064 - val_acc: 0.6111
Epoch 98/200
 - 0s - loss: 0.4312 - acc: 0.6410 - val_loss: 0.5032 - val_acc: 0.6111
Epoch 99/200
 - 0s - loss: 0.4164 - acc: 0.6410 - val_loss: 0.5000 - val_acc: 0.6111
Epoch 100/200
 - 0s - loss: 0.4298 - acc: 0.6410 - val_loss: 0.4968 - val_acc: 0.6111
Epoch 101/200
 - 0s - loss: 0.4413 - acc: 0.6410 - val_loss: 0.4936 - val_acc: 0.6111
Epoch 102/200
 - 0s - loss: 0.3654 - acc: 0.6410 - val_loss: 0.4904 - val_acc: 0.6111
Epoch 103/200
 - 0s - loss: 0.4457 - acc: 0.6410 - val_loss: 0.4872 - val_acc: 0.6111
Epoch 104/200
 - 0s - loss: 0.3893 - acc: 0.6410 - val_loss: 0.4841 - val_acc: 0.6111
Epoch 105/200
 - 0s - loss: 0.4215 - acc: 0.6410 - val_loss: 0.4809 - val_acc: 0.6111
Epoch 106/200
 - 0s - loss: 0.3930 - acc: 0.6410 - val_loss: 0.4778 - val_acc: 0.6111
Epoch 107/200
 - 0s - loss: 0.4625 - acc: 0.6410 - val_loss: 0.4748 - val_acc: 0.6111
Epoch 108/200
 - 0s - loss: 0.4132 - acc: 0.6410 - val_lo

Epoch 193/200
 - 0s - loss: 0.3440 - acc: 0.8718 - val_loss: 0.3188 - val_acc: 0.8333
Epoch 194/200
 - 0s - loss: 0.2678 - acc: 0.9744 - val_loss: 0.3174 - val_acc: 0.8333
Epoch 195/200
 - 0s - loss: 0.2603 - acc: 0.9744 - val_loss: 0.3162 - val_acc: 0.8333
Epoch 196/200
 - 0s - loss: 0.2812 - acc: 0.9744 - val_loss: 0.3152 - val_acc: 0.8333
Epoch 197/200
 - 0s - loss: 0.2732 - acc: 0.9744 - val_loss: 0.3145 - val_acc: 0.8333
Epoch 198/200
 - 0s - loss: 0.2769 - acc: 0.9744 - val_loss: 0.3140 - val_acc: 0.8333
Epoch 199/200
 - 0s - loss: 0.2789 - acc: 0.9231 - val_loss: 0.3133 - val_acc: 0.8333
Epoch 200/200
 - 0s - loss: 0.2712 - acc: 0.9487 - val_loss: 0.3125 - val_acc: 0.9444


In [38]:
mlp_search = mlp_search(shape)
history = mlp_search.fit(X_train, np.asarray(y_train["Search"]).reshape(-1,1),
                  validation_data=(X_validation, np.asarray(y_validation["Search"]).reshape(-1,1)),
    epochs=250,
    workers = 2, use_multiprocessing= True, verbose = 2)

Train on 39 samples, validate on 18 samples
Epoch 1/250
 - 1s - loss: 0.6934 - acc: 0.5641 - val_loss: 0.6935 - val_acc: 0.3889
Epoch 2/250
 - 0s - loss: 0.6932 - acc: 0.5385 - val_loss: 0.6930 - val_acc: 0.6111
Epoch 3/250
 - 0s - loss: 0.6929 - acc: 0.5128 - val_loss: 0.6926 - val_acc: 0.6111
Epoch 4/250
 - 0s - loss: 0.6927 - acc: 0.6154 - val_loss: 0.6922 - val_acc: 0.6667
Epoch 5/250
 - 0s - loss: 0.6919 - acc: 0.6410 - val_loss: 0.6917 - val_acc: 0.6667
Epoch 6/250
 - 0s - loss: 0.6915 - acc: 0.6410 - val_loss: 0.6913 - val_acc: 0.6667
Epoch 7/250
 - 0s - loss: 0.6917 - acc: 0.6410 - val_loss: 0.6909 - val_acc: 0.6667
Epoch 8/250
 - 0s - loss: 0.6917 - acc: 0.6410 - val_loss: 0.6905 - val_acc: 0.6667
Epoch 9/250
 - 0s - loss: 0.6901 - acc: 0.6410 - val_loss: 0.6901 - val_acc: 0.6667
Epoch 10/250
 - 0s - loss: 0.6907 - acc: 0.6410 - val_loss: 0.6897 - val_acc: 0.6667
Epoch 11/250
 - 0s - loss: 0.6900 - acc: 0.6410 - val_loss: 0.6893 - val_acc: 0.6667
Epoch 12/250
 - 0s - loss: 0.6

Epoch 97/250
 - 0s - loss: 0.6119 - acc: 0.6410 - val_loss: 0.6112 - val_acc: 0.6667
Epoch 98/250
 - 0s - loss: 0.6176 - acc: 0.6410 - val_loss: 0.6082 - val_acc: 0.6667
Epoch 99/250
 - 0s - loss: 0.6066 - acc: 0.6410 - val_loss: 0.6048 - val_acc: 0.6667
Epoch 100/250
 - 0s - loss: 0.6004 - acc: 0.6410 - val_loss: 0.6014 - val_acc: 0.6667
Epoch 101/250
 - 0s - loss: 0.5996 - acc: 0.6410 - val_loss: 0.5978 - val_acc: 0.6667
Epoch 102/250
 - 0s - loss: 0.5875 - acc: 0.6410 - val_loss: 0.5941 - val_acc: 0.6667
Epoch 103/250
 - 0s - loss: 0.5927 - acc: 0.6410 - val_loss: 0.5902 - val_acc: 0.6667
Epoch 104/250
 - 0s - loss: 0.6113 - acc: 0.6410 - val_loss: 0.5862 - val_acc: 0.6667
Epoch 105/250
 - 0s - loss: 0.5811 - acc: 0.6410 - val_loss: 0.5822 - val_acc: 0.6667
Epoch 106/250
 - 0s - loss: 0.5518 - acc: 0.6410 - val_loss: 0.5782 - val_acc: 0.6667
Epoch 107/250
 - 0s - loss: 0.5939 - acc: 0.6410 - val_loss: 0.5741 - val_acc: 0.6667
Epoch 108/250
 - 0s - loss: 0.5907 - acc: 0.6410 - val_lo

Epoch 193/250
 - 0s - loss: 0.3835 - acc: 0.6410 - val_loss: 0.3239 - val_acc: 0.6667
Epoch 194/250
 - 0s - loss: 0.2949 - acc: 0.6410 - val_loss: 0.3236 - val_acc: 0.6667
Epoch 195/250
 - 0s - loss: 0.3483 - acc: 0.6410 - val_loss: 0.3232 - val_acc: 0.6667
Epoch 196/250
 - 0s - loss: 0.3363 - acc: 0.6410 - val_loss: 0.3227 - val_acc: 0.6667
Epoch 197/250
 - 0s - loss: 0.3620 - acc: 0.6410 - val_loss: 0.3223 - val_acc: 0.6667
Epoch 198/250
 - 0s - loss: 0.3058 - acc: 0.6410 - val_loss: 0.3218 - val_acc: 0.6667
Epoch 199/250
 - 0s - loss: 0.3017 - acc: 0.6410 - val_loss: 0.3211 - val_acc: 0.6667
Epoch 200/250
 - 0s - loss: 0.3350 - acc: 0.6410 - val_loss: 0.3204 - val_acc: 0.6667
Epoch 201/250
 - 0s - loss: 0.2989 - acc: 0.6410 - val_loss: 0.3200 - val_acc: 0.6667
Epoch 202/250
 - 0s - loss: 0.3013 - acc: 0.6410 - val_loss: 0.3194 - val_acc: 0.6667
Epoch 203/250
 - 0s - loss: 0.3618 - acc: 0.6410 - val_loss: 0.3189 - val_acc: 0.6667
Epoch 204/250
 - 0s - loss: 0.3141 - acc: 0.6410 - val

In [39]:
mlp_suggestion = mlp_suggestion(shape)
history = mlp_suggestion.fit(X_train, np.asarray(y_train["Search"]).reshape(-1,1),
                  validation_data=(X_validation, np.asarray(y_validation["Search"]).reshape(-1,1)),
    epochs=250,
    workers = 2, use_multiprocessing= True, verbose = 2)

Train on 39 samples, validate on 18 samples
Epoch 1/250
 - 1s - loss: 0.7540 - acc: 0.3590 - val_loss: 0.6929 - val_acc: 0.3889
Epoch 2/250
 - 0s - loss: 0.6813 - acc: 0.5897 - val_loss: 0.6908 - val_acc: 0.3889
Epoch 3/250
 - 0s - loss: 0.7129 - acc: 0.4359 - val_loss: 0.6886 - val_acc: 0.4444
Epoch 4/250
 - 0s - loss: 0.7170 - acc: 0.5897 - val_loss: 0.6861 - val_acc: 0.5556
Epoch 5/250
 - 0s - loss: 0.7445 - acc: 0.5641 - val_loss: 0.6831 - val_acc: 0.5556
Epoch 6/250
 - 0s - loss: 0.7275 - acc: 0.4359 - val_loss: 0.6805 - val_acc: 0.5000
Epoch 7/250
 - 0s - loss: 0.6892 - acc: 0.5897 - val_loss: 0.6783 - val_acc: 0.6111
Epoch 8/250
 - 0s - loss: 0.6708 - acc: 0.5641 - val_loss: 0.6769 - val_acc: 0.6111
Epoch 9/250
 - 0s - loss: 0.7383 - acc: 0.4872 - val_loss: 0.6757 - val_acc: 0.6111
Epoch 10/250
 - 0s - loss: 0.6780 - acc: 0.5897 - val_loss: 0.6747 - val_acc: 0.6111
Epoch 11/250
 - 0s - loss: 0.6636 - acc: 0.6667 - val_loss: 0.6736 - val_acc: 0.7222
Epoch 12/250
 - 0s - loss: 0.7

Epoch 97/250
 - 0s - loss: 0.5858 - acc: 0.8205 - val_loss: 0.6324 - val_acc: 0.7778
Epoch 98/250
 - 0s - loss: 0.5854 - acc: 0.7692 - val_loss: 0.6313 - val_acc: 0.7778
Epoch 99/250
 - 0s - loss: 0.6001 - acc: 0.7692 - val_loss: 0.6302 - val_acc: 0.7778
Epoch 100/250
 - 0s - loss: 0.5818 - acc: 0.7949 - val_loss: 0.6290 - val_acc: 0.7778
Epoch 101/250
 - 0s - loss: 0.6353 - acc: 0.6667 - val_loss: 0.6279 - val_acc: 0.7778
Epoch 102/250
 - 0s - loss: 0.6229 - acc: 0.7179 - val_loss: 0.6269 - val_acc: 0.7778
Epoch 103/250
 - 0s - loss: 0.6149 - acc: 0.7436 - val_loss: 0.6259 - val_acc: 0.7778
Epoch 104/250
 - 0s - loss: 0.5663 - acc: 0.8718 - val_loss: 0.6250 - val_acc: 0.7778
Epoch 105/250
 - 0s - loss: 0.6188 - acc: 0.7692 - val_loss: 0.6241 - val_acc: 0.7778
Epoch 106/250
 - 0s - loss: 0.6143 - acc: 0.7949 - val_loss: 0.6232 - val_acc: 0.7778
Epoch 107/250
 - 0s - loss: 0.5754 - acc: 0.8205 - val_loss: 0.6223 - val_acc: 0.7778
Epoch 108/250
 - 0s - loss: 0.5690 - acc: 0.8462 - val_lo

Epoch 193/250
 - 0s - loss: 0.4440 - acc: 0.9231 - val_loss: 0.5612 - val_acc: 0.8333
Epoch 194/250
 - 0s - loss: 0.4635 - acc: 0.9231 - val_loss: 0.5609 - val_acc: 0.8333
Epoch 195/250
 - 0s - loss: 0.4772 - acc: 0.9487 - val_loss: 0.5609 - val_acc: 0.8889
Epoch 196/250
 - 0s - loss: 0.4807 - acc: 0.8974 - val_loss: 0.5608 - val_acc: 0.8889
Epoch 197/250
 - 0s - loss: 0.4644 - acc: 0.9231 - val_loss: 0.5608 - val_acc: 0.8889
Epoch 198/250
 - 0s - loss: 0.4472 - acc: 0.9487 - val_loss: 0.5609 - val_acc: 0.8889
Epoch 199/250
 - 0s - loss: 0.4576 - acc: 0.8974 - val_loss: 0.5610 - val_acc: 0.8889
Epoch 200/250
 - 0s - loss: 0.4289 - acc: 0.9487 - val_loss: 0.5612 - val_acc: 0.8889
Epoch 201/250
 - 0s - loss: 0.5139 - acc: 0.7949 - val_loss: 0.5612 - val_acc: 0.8889
Epoch 202/250
 - 0s - loss: 0.4550 - acc: 0.9231 - val_loss: 0.5616 - val_acc: 0.8889
Epoch 203/250
 - 0s - loss: 0.4827 - acc: 0.8718 - val_loss: 0.5619 - val_acc: 0.8889
Epoch 204/250
 - 0s - loss: 0.4431 - acc: 0.8974 - val

# CHATBOT TEST

In [47]:
def chatbot():
    
    input_text = input()
    
    test = pd.DataFrame(data = {'Sentence': [input_text]})
    df_test_proc, test_proc = processing(test, cv = cv)

    gret_prob = mlp_greeting.predict_proba(test_proc)
    search_prob = mlp_search.predict_proba(test_proc)
    sugg_prob = mlp_suggestion.predict_proba(test_proc)
    
    probs = [gret_prob, search_prob, sugg_prob]
    idx = np.argmax(probs)
    
    if idx == 0:
        print("Esto es un saludo")
    elif idx == 1:
        print("Esto es una búsqueda")
    else:
        print("Esto es una sugerencia")
        
    print('¿Hemos acertado?')
    
    respuesta = input()
    if (respuesta == 'No' or respuesta == 'no'):
        probs = np.delete(probs, idx)
        idx_2 = np.argmax(probs)
        
        if idx == 0:
            if idx_2 == 0:
                print("Esto es una búsqueda")
            else:
                print("Esto es una sugerencia")
        elif idx == 1:
            if idx_2 == 0:
                print("Esto es un saludo")
            else:
                print("Esto es una sugerencia")
        else:
            if idx_2 == 0:
                print("Esto es un saludo")
            else:
                print("Esto es una búsqueda")
    else:
        print('¡Genial! ¡Hemos acertado!')
    

In [48]:
chatbot()

Hi!
Se ha realizado una vectorización Tfidf basado en el corpus suministrado por cv
Esto es un saludo
¿Hemos acertado?
No
Esto es una sugerencia
