## Funciones para que se active el audio y transcriba cuando llamas a Mandy

In [17]:
import speech_recognition as sr
import subprocess
import os
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from word2number import w2n

# Shared recognizer used by fun_transcript() for both listening and transcription.
r = sr.Recognizer()
# Fixed energy threshold; presumably raised to ignore background noise -- TODO confirm.
r.energy_threshold = 2000
# Shared microphone that fun_transcript() opens as its audio source.
mic = sr.Microphone()

# Funciones para recoger el sonido y transformarlo:
def say(text):
    """Speak ``text`` aloud by invoking the external ``say`` program.

    The speech rate is fixed at 180 through the ``-r`` option. The call
    blocks until the program exits; its exit status is ignored.
    """
    # Pass the flag and its value as separate argv entries instead of the
    # single "-r 180" string, which depended on lenient option parsing.
    subprocess.call(['say', '-r', '180', text])

# Obtain audio from the microphone and transcribe it
def fun_transcript(t=5, p=6, l="es-ES"):
    """Record one phrase from the shared microphone and transcribe it.

    Args:
        t: value forwarded to ``r.listen`` as ``timeout``.
        p: value forwarded to ``r.listen`` as ``phrase_time_limit``.
        l: language code forwarded to ``r.recognize_google``.

    Returns:
        The transcript in lower case, or ``None`` when listening or
        recognition fails for any reason (callers test for ``None``).
    """
    try:
        with mic as source:
            audio = r.listen(source, timeout=t, phrase_time_limit=p)
        # Recognition only needs the captured audio, so the microphone is
        # released before the (network) recognition request is made.
        transcript = r.recognize_google(audio, language=l)
        return transcript.lower()
    except Exception:
        # Best effort: still swallow errors, but no longer trap
        # KeyboardInterrupt/SystemExit as the bare ``except`` did.
        return None
    
# Activate when "mandy" is heard and return [heard, transcript]
def activate(phrase='mandy'):
    """Listen until a transcript containing the wake word is heard.

    Args:
        phrase: wake word to look for; compared in lower case because
            transcripts are always lower-cased.

    Returns:
        ``[True, transcript]`` once a transcript containing ``phrase`` is
        heard, or ``[False, ""]`` when nothing could be transcribed or any
        error occurs.
    """
    try:
        wake_word = phrase.lower()
        # Loop instead of recursing: the recursive retry dropped ``phrase``
        # (falling back to the default) and grew the call stack on every
        # phrase that did not contain the wake word.
        while True:
            transcript = fun_transcript(t=60, p=8)
            if transcript is None:
                return [False, ""]
            transcript = transcript.lower()
            if wake_word in transcript:
                return [True, transcript]
    except Exception:
        return [False, ""]

## Listado de funciones con las funcionalidades de Mandy

In [2]:
# personaliza el nombre de la persona con la que habla
def name_set():
    """Ask the user for their name aloud and return what was heard.

    The question is asked once; if nothing is transcribed the user is asked
    to repeat it one more time.

    Returns:
        The transcribed name, or ``None`` when both attempts fail or any
        error occurs.
    """
    try:
        say("Hola colega, no nos conocemos, ¿cual es tu nombre?")
        n = fun_transcript(t=3, p=3)
        if n is None:
            say("Estoy sorda, ¿puedes repetirlo?")
            n = fun_transcript(t=4, p=3)
        return n
    except Exception:
        # Best effort, but without trapping KeyboardInterrupt/SystemExit.
        return None
    

# Lista de funciones de pandas
# Show the list of available CSV files and let the user choose one
def load_csv():
    """Announce the files in ``./datasets`` and load the one the user names.

    The spoken answer is used as the file name without extension
    (``./datasets/<answer>.csv``). The user is asked to repeat until
    something is transcribed.

    Returns:
        The loaded DataFrame, or ``None`` when listing, listening or
        reading fails; the failure is reported on stdout.
    """
    try:
        # List the directory once and reuse the result for print and speech.
        available = os.listdir("./datasets")
        print(available)
        say("Los dataset disponibles son: " + " ".join(available) + " Elige el que quieras que abra")
        choice = fun_transcript(t=6, p=6)
        while choice is None:
            say("Estoy sorda, ¿puedes repetirlo?")
            choice = fun_transcript(t=5, p=6)
        frame = pd.read_csv("./datasets/{}.csv".format(choice))
        print(choice)
        display(frame.head())
        return frame
    except Exception as exc:
        # Report instead of silently discarding (e.g. unknown file name).
        print("load_csv: {}".format(exc))
        return None

def header():
    """Display the first rows of the module-level DataFrame ``df``.

    Best effort: a failure (for example, no dataset loaded yet) is reported
    on stdout instead of being silently discarded.
    """
    try:
        display(df.head())
    except Exception as exc:
        print("header: {}".format(exc))

def tailer():
    """Display the last rows of the module-level DataFrame ``df``.

    Best effort: a failure (for example, no dataset loaded yet) is reported
    on stdout instead of being silently discarded.
    """
    try:
        display(df.tail())
    except Exception as exc:
        print("tailer: {}".format(exc))
    
def shape():
    """Print the shape of the module-level ``df`` and speak it aloud.

    Best effort: a failure is reported on stdout instead of being silently
    discarded.
    """
    try:
        rows, columns = df.shape[0], df.shape[1]
        print(df.shape)
        say("Este dataset tiene {} columnas y {} filas".format(columns, rows))
    except Exception as exc:
        print("shape: {}".format(exc))
    
def dftypes():
    """Print the dtype of every column of the module-level ``df``.

    A fixed sentence is spoken afterwards. Best effort: a failure is
    reported on stdout instead of being silently discarded.
    """
    try:
        print(df.dtypes)
        # The sentence has no placeholders, so the former ``.format(...)``
        # call with the DataFrame shape was a no-op and has been removed.
        say("Aquí tienes un listado de todas las columnas con sus tipos de datos")
    except Exception as exc:
        print("dftypes: {}".format(exc))

def columnas():
    """Display the column labels of the module-level ``df`` and speak them.

    Best effort: a failure is reported on stdout instead of being silently
    discarded.
    """
    try:
        display(df.columns)
        # Labels are converted to text first: ``" ".join`` raises TypeError
        # on non-string labels such as integers.
        spoken = " ".join(str(label) for label in df.columns)
        say("Las columnas son: {}".format(spoken))
    except Exception as exc:
        print("columnas: {}".format(exc))

def iloca():
    """Display the row of ``df`` at the position named last in the command.

    The last word of the module-level ``transcript[1]`` is converted to a
    number with ``w2n.word_to_num`` and used as the ``iloc`` position.
    Best effort: a failure (no number understood, position out of range,
    no dataset loaded) is reported on stdout.
    """
    try:
        n = w2n.word_to_num(transcript[1].split(" ")[-1])
        display(df.iloc[n])
    except Exception as exc:
        print("iloca: {}".format(exc))

def loca():
    """Display the rows of ``df`` whose column equals the spoken value.

    The column is the first word after "column"/"columns" and the value is
    the first word after "equal to" in the module-level ``transcript[1]``.
    Best effort: a failure is reported on stdout.
    """
    try:
        column = re.search('columns*(.*)', transcript[1]).group(1).strip().split(" ")[0]
        equal = re.search('equal to(.*)', transcript[1]).group(1).strip().split(" ")[0]
        # Build a row mask from the column's values. The original compared
        # the two strings with each other, producing a single bool that is
        # not a valid row selector.
        # NOTE(review): ``equal`` is text, so numeric columns will not match.
        display(df.loc[df[column] == equal])
    except Exception as exc:
        print("loca: {}".format(exc))

def isnullo():
    """Display the null count of every column of ``df`` that has nulls.

    Best effort: a failure is reported on stdout instead of being silently
    discarded.
    """
    try:
        # Count once and reuse the result for both the mask and the values.
        null_counts = df.isnull().sum()
        display(null_counts[null_counts > 0])
    except Exception as exc:
        print("isnullo: {}".format(exc))

def dropnulos():
    """Drop, in place, the rows of ``df`` that are null in the spoken column.

    The column is the first word after "column"/"columns" in the
    module-level ``transcript[1]``.

    Returns:
        Always ``None``; the module-level ``df`` itself is modified.
    """
    try:
        column = re.search('columns*(.*)', transcript[1]).group(1).strip().split(" ")[0]
        # ``inplace=True`` makes dropna return None, so there is nothing
        # meaningful to return here.
        df.dropna(subset=[column], inplace=True)
    except Exception as exc:
        print("dropnulos: {}".format(exc))
    return None

def fillnulos():
    """Return the spoken column of ``df`` with nulls replaced by ``""``.

    The column is the first word after "column"/"columns" in the
    module-level ``transcript[1]``.

    Returns:
        The filled Series, or ``None`` on failure (reported on stdout).
    """
    try:
        column = re.search('columns*(.*)', transcript[1]).group(1).strip().split(" ")[0]
        # NOTE(review): the fill value is a hard-coded placeholder and the
        # result is not written back to ``df`` -- confirm intended behavior.
        values = ""
        return df[column].fillna(value=values)
    except Exception as exc:
        print("fillnulos: {}".format(exc))
        return None
    
def changetype():
    """Return the spoken column of ``df`` converted with ``astype``.

    The column is the first word after "column"/"columns" in the
    module-level ``transcript[1]``.

    Returns:
        The converted Series, or ``None`` on failure (reported on stdout).
    """
    try:
        column = re.search('columns*(.*)', transcript[1]).group(1).strip().split(" ")[0]
        # NOTE(review): the target dtype is an empty placeholder, so the
        # conversion cannot succeed as written; presumably it should come
        # from the spoken command -- confirm.
        types = ""
        return df[column].astype(types)
    except Exception as exc:
        print("changetype: {}".format(exc))
        return None

def renombrar():
    """Return a copy of ``df`` with one column renamed.

    Returns:
        The renamed DataFrame, or ``None`` on failure (reported on stdout).
    """
    try:
        # NOTE(review): both names are empty placeholders, so no column is
        # actually renamed; presumably they should come from the spoken
        # command -- confirm.
        old = ""
        new = ""
        return df.rename(columns={old: new})
    except Exception as exc:
        print("renombrar: {}".format(exc))
        return None

def newindex():
    """Return ``df`` re-indexed by the spoken column.

    The column is the first word after "column"/"columns" in the
    module-level ``transcript[1]``.

    Returns:
        The re-indexed DataFrame, or ``None`` on failure (reported on
        stdout).
    """
    try:
        column = re.search('columns*(.*)', transcript[1]).group(1).strip().split(" ")[0]
        # NOTE(review): ``set_index`` returns a new frame; the module-level
        # ``df`` is left unchanged -- confirm intended behavior.
        return df.set_index(column)
    except Exception as exc:
        print("newindex: {}".format(exc))
        return None

def descrip():
    """Display ``df.describe()`` for the module-level DataFrame ``df``.

    Best effort: a failure is reported on stdout instead of being silently
    discarded.
    """
    try:
        display(df.describe())
    except Exception as exc:
        print("descrip: {}".format(exc))

def media():
    """Display the mean of every numeric column of ``df``.

    Best effort: a failure is reported on stdout instead of being silently
    discarded.
    """
    try:
        # Restrict to numeric columns explicitly: recent pandas versions
        # raise TypeError when text columns are included in the reduction.
        display(df.mean(numeric_only=True))
    except Exception as exc:
        print("media: {}".format(exc))
    
def correlacion():
    """Display the pairwise correlation of the numeric columns of ``df``.

    Best effort: a failure is reported on stdout instead of being silently
    discarded.
    """
    try:
        # Select numeric/bool columns first: recent pandas versions raise
        # on text columns instead of silently skipping them.
        numeric = df.select_dtypes(include=["number", "bool"])
        display(numeric.corr())
    except Exception as exc:
        print("correlacion: {}".format(exc))
    
def counter():
    """Display the number of non-null values in each column of ``df``.

    Best effort: a failure is reported on stdout instead of being silently
    discarded.
    """
    try:
        display(df.count())
    except Exception as exc:
        print("counter: {}".format(exc))
    
def maximus():
    """Display the highest value of each column of ``df``.

    Best effort: a failure is reported on stdout instead of being silently
    discarded.
    """
    try:
        display(df.max())
    except Exception as exc:
        print("maximus: {}".format(exc))
    
def minimas():
    """Display the lowest value of each column of ``df``.

    Best effort: a failure is reported on stdout instead of being silently
    discarded.
    """
    try:
        display(df.min())
    except Exception as exc:
        print("minimas: {}".format(exc))

def mediana():
    """Display the median of every numeric column of ``df``.

    Best effort: a failure is reported on stdout instead of being silently
    discarded.
    """
    try:
        # Restrict to numeric columns explicitly: recent pandas versions
        # raise TypeError when text columns are included in the reduction.
        display(df.median(numeric_only=True))
    except Exception as exc:
        print("mediana: {}".format(exc))


## Funciones para entrenar el modelo de machine learning de las ordenes a Mandy

In [3]:
from __future__ import print_function
from functools import reduce
import re
import tarfile
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
from nltk import word_tokenize
from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from keras import layers
from keras.layers import recurrent
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences


def tokenize(sent):
    '''Return the lower-cased tokens of a sentence, without Spanish stop words.

    The sentence is lower-cased, split with ``nltk.word_tokenize`` and every
    token found in NLTK's Spanish stop-word list is dropped. Tokens that are
    not in that list, including punctuation, are kept.
    '''
    # Load the stop words once per call and use a set for O(1) membership;
    # the original re-read the whole list for every single token.
    spanish_stopwords = set(stopwords.words("spanish"))
    return [w for w in nltk.word_tokenize(sent.lower()) if w not in spanish_stopwords]


def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbi tasks format.

    Every non-blank line starts with a numeric id followed by a space; id 1
    starts a new story. A line containing tabs is a question with three
    tab-separated fields (question, answer, supporting ids); any other line
    is a statement added to the current story.

    If only_supporting is true, only the sentences that support the answer
    are kept; otherwise every statement seen so far in the story is kept.

    Returns a list of (substory, question_tokens, answer) tuples.
    '''
    data = []
    story = []
    for line in lines:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no id and would break the unpacking below.
            continue
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            if only_supporting:
                # Only select the related substory
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]
            else:
                # Provide all the substories
                substory = [x for x in story if x]
            data.append((substory, q, a))
            # Placeholder keeps later line ids aligned with story indices.
            story.append('')
        else:
            sent = tokenize(line)
            story.append(sent)
    return data


def get_stories(f, only_supporting=False, max_length=None):
    '''Given an open file, read it, retrieve the stories,
    and then convert the sentences of each story into a single token list.

    If max_length is supplied,
    any stories with max_length tokens or more will be discarded.

    Returns a list of (story_tokens, question_tokens, answer) tuples.
    '''
    parsed = parse_stories(f.readlines(), only_supporting=only_supporting)
    stories = []
    for story, q, answer in parsed:
        # Flatten once per story. Unlike ``reduce`` without an initial
        # value, this also handles a story that has no sentences yet.
        flat = [token for sentence in story for token in sentence]
        if not max_length or len(flat) < max_length:
            stories.append((flat, q, answer))
    return stories


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    '''Turn (story, query, answer) tuples into padded index arrays.

    Story and query tokens are mapped through word_idx and padded to
    story_maxlen / query_maxlen with pad_sequences; each answer becomes a
    one-hot vector of length len(word_idx) + 1.

    Returns (padded stories, padded queries, array of answer vectors).
    '''
    story_ids, query_ids, targets = [], [], []
    # let's not forget that index 0 is reserved
    target_width = len(word_idx) + 1
    for story, query, answer in data:
        story_ids.append([word_idx[token] for token in story])
        query_ids.append([word_idx[token] for token in query])
        target = np.zeros(target_width)
        target[word_idx[answer]] = 1
        targets.append(target)
    padded_stories = pad_sequences(story_ids, maxlen=story_maxlen)
    padded_queries = pad_sequences(query_ids, maxlen=query_maxlen)
    return (padded_stories, padded_queries, np.array(targets))

# Recurrent layer class used to encode both the story and the question.
RNN = recurrent.LSTM
# Output size of the two Embedding layers.
EMBED_HIDDEN_SIZE = 50
# Units of the recurrent layer that encodes the story.
SENT_HIDDEN_SIZE = 100
# Units of the recurrent layer that encodes the question.
QUERY_HIDDEN_SIZE = 100
# Batch size used for both model.fit and model.evaluate.
BATCH_SIZE = 3
# Number of training epochs passed to model.fit.
EPOCHS = 35
print('RNN / Embed / Sent / Query = {}, {}, {}, {}'.format(RNN,
                                                           EMBED_HIDDEN_SIZE,
                                                           SENT_HIDDEN_SIZE,
                                                           QUERY_HIDDEN_SIZE))

# Fetch (or reuse the cached copy of) the training archive; any error
# propagates to the caller unchanged.
path = get_file('mandy.tar.gz', origin='')

# Template for the train/test corpus files.
challenge = 'keras/mandy_{}.txt'
# Open the corpus files in ``with`` blocks so the handles are closed once
# the stories have been read (they were previously left open).
with open(challenge.format('train')) as train_file:
    train = get_stories(train_file)
with open(challenge.format('test')) as test_file:
    test = get_stories(test_file)

# Vocabulary: every story token, question token and answer label.
vocab = set()
for story, q, answer in train + test:
    vocab |= set(story + q + [answer])
vocab = sorted(vocab)

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
# Longest story / question over both splits; used as padding lengths.
story_maxlen = max(map(len, (x for x, _, _ in train + test)))
query_maxlen = max(map(len, (x for _, x, _ in train + test)))

x, xq, y = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
tx, txq, ty = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)

print('vocab = {}'.format(vocab))
print('x.shape = {}'.format(x.shape))
print('xq.shape = {}'.format(xq.shape))
print('y.shape = {}'.format(y.shape))
print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen))

print('Build model...')

# Story branch: integer token ids -> embedding -> recurrent encoder.
sentence = layers.Input(shape=(story_maxlen,), dtype='int32')
encoded_sentence = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(sentence)
encoded_sentence = RNN(SENT_HIDDEN_SIZE)(encoded_sentence)

# Question branch: same structure, with its own embedding and encoder.
question = layers.Input(shape=(query_maxlen,), dtype='int32')
encoded_question = layers.Embedding(vocab_size, EMBED_HIDDEN_SIZE)(question)
encoded_question = RNN(QUERY_HIDDEN_SIZE)(encoded_question)

# Concatenate both encodings and apply a softmax over vocab_size outputs.
merged = layers.concatenate([encoded_sentence, encoded_question])
preds = layers.Dense(vocab_size, activation='softmax')(merged)

# Two-input model: [story, question] -> preds.
model = Model([sentence, question], preds)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

print('Training')
# Fit on the vectorized training stories; validation_split=0.05 is passed
# to Keras so part of the training data is used for validation.
model.fit([x, xq], y,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.05)

print('Evaluation')
# Evaluate on the separately vectorized test stories.
loss, acc = model.evaluate([tx, txq], ty,
                           batch_size=BATCH_SIZE)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))

# Map "<test index> <expected label>" to the label the model predicts.
dic = {}
# NOTE(review): ``df`` is also the name the pandas helper functions read as
# the loaded dataset; after this cell it holds the prediction table until
# load_csv() rebinds it -- confirm this is intended.
df = pd.DataFrame(index = vocab)
# Run the model once; it was previously re-run twice per test sample.
predictions = model.predict([tx, txq])
# ``i`` replaces the former loop variable ``x``, which overwrote the
# vectorized training stories held in ``x``.
for i, probabilities in enumerate(predictions):
    # Drop output 0 (reserved for padding) so values align with ``vocab``.
    df[i] = probabilities[1:]
    dic[str(i) + " " + test[i][2]] = df[i].idxmax()
dic

Using TensorFlow backend.


RNN / Embed / Sent / Query = <class 'keras.layers.recurrent.LSTM'>, 50, 100, 100
vocab = [',', '.', '?', 'abras', 'abre', 'abreme', 'antiguo', 'aparezca', 'borra', 'busca', 'cabecera', 'cabeza', 'cada', 'cambia', 'cambiame', 'cambiar', 'carga', 'changetypes', 'cinco', 'column', 'columna', 'columnas', 'completa', 'conocer', 'convierte', 'correlacion', 'correlaciones', 'count', 'counter', 'creame', 'csv', 'cuales', 'cuantos', 'cuenta', 'cuentas', 'cuento', 'cuentos', 'dataframe', 'dataset', 'datatime', 'datos', 'describe', 'descrip', 'descripcion', 'df', 'diferentes', 'dime', 'dimension', 'dimensiones', 'dropnulos', 'dtype', 'elimina', 'enseña', 'enseñame', 'establece', 'estadisticas', 'etiqueta', 'fila', 'filas', 'fillnulos', 'final', 'finales', 'float', 'forma', 'hacer', 'haz', 'hazme', 'header', 'igual', 'iguales', 'iloca', 'imprimas', 'imprime', 'incluye', 'index', 'indice', 'iniciales', 'int', 'int64', 'isnullo', 'json', 'lineas', 'llama', 'llena', 'load_csv', 'loca', 'localiza', 'm

{'0 dropnulos': 'dropnulos',
 '1 load_csv': 'load_csv',
 '2 header': 'header',
 '3 tailer': 'tailer',
 '4 shape': 'shape',
 '5 counter': 'counter',
 '6 dropnulos': 'dropnulos',
 '7 shape': 'shape',
 '8 newindex': 'newindex',
 '9 descrip': 'descrip',
 '10 tailer': 'tailer',
 '11 changetypes': 'changetypes',
 '12 correlacion': 'correlacion',
 '13 columnas': 'columnas',
 '14 dtype': 'dtype',
 '15 load_csv': 'load_csv',
 '16 load_csv': 'load_csv',
 '17 tailer': 'tailer',
 '18 shape': 'shape',
 '19 shape': 'shape',
 '20 columnas': 'columnas',
 '21 dtype': 'dtype',
 '22 load_csv': 'load_csv',
 '23 header': 'header',
 '24 tailer': 'tailer',
 '25 load_csv': 'load_csv',
 '26 columnas': 'columnas',
 '27 dtype': 'dtype',
 '28 load_csv': 'load_csv',
 '29 header': 'header',
 '30 tailer': 'tailer',
 '31 load_csv': 'load_csv',
 '32 columnas': 'columnas',
 '33 dtype': 'dtype',
 '34 load_csv': 'load_csv',
 '35 header': 'header',
 '36 tailer': 'tailer',
 '37 shape': 'shape',
 '38 correlacion': 'correlac

## Función que aplica el modelo a la orden dada a Mandy y devuelve la funcionalidad a utilizar

In [4]:
def traineural(tran, vocab = vocab, train = train, test = test):
    '''Return the vocabulary entry the model scores highest for a command.

    tran is tokenized, paired with the fixed question
    ['hacer', 'mandy', '?'] and run through the trained model.

    vocab labels the model outputs. train and test are accepted for
    backward compatibility only and are not used.
    '''
    # Keep only tokens the model was trained on: vectorize_stories raises
    # KeyError for any word that is missing from word_idx.
    known_tokens = [w for w in tokenize(tran) if w in word_idx]
    sample = [(known_tokens, ['hacer', 'mandy', '?'], '.')]
    tx, txq, ty = vectorize_stories(sample, word_idx, story_maxlen, query_maxlen)
    # Output 0 is reserved for padding, so it is dropped to align with vocab.
    scores = pd.Series(model.predict([tx, txq])[0][1:], index=vocab)
    return scores.idxmax()


## Función activadora

In [10]:
# Llama a la función correspondiente en función de tus palabras:
import re
from unicodedata import normalize

def pd_fun(trans):
    """Return the helper function that matches the spoken command.

    Accents are stripped from ``trans`` (the tilde of "ñ" is kept), the
    text is classified with ``traineural`` and the predicted label is
    looked up in a label -> function table.

    Returns:
        The matching function, or ``None`` when the predicted label has no
        associated function.
    """
    trans = normalize('NFC', re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1", normalize("NFD", trans), 0, re.I))
    dispatch = {
        "load_csv": load_csv,
        "header": header,
        "tailer": tailer,
        "shape": shape,
        "dftypes": dftypes,
        # The trained model emits the labels "dtype" and "changetypes"
        # (see the training output), which matched no key before.
        "dtype": dftypes,
        "columnas": columnas,
        "iloca": iloca,
        "loca": loca,
        "isnullo": isnullo,
        "dropnulos": dropnulos,
        "fillnulos": fillnulos,
        "changetype": changetype,
        "changetypes": changetype,
        "renombrar": renombrar,
        "newindex": newindex,
        "descrip": descrip,
        "media": media,
        "correlacion": correlacion,
        "counter": counter,
        "maximus": maximus,
        "minimas": minimas,
        "mediana": mediana,
    }
    # Classify once; the model was previously re-run for every table entry.
    label = traineural(trans)
    print("F: " + label)
    return dispatch.get(label)

# Main ( Enciende a Mandy )

In [16]:
# Código principal
variable = True
#name = name_set()
#say("Hola {}, que puedo hacer por ti?. Di mi nombre y lo que quieres que haga para que me active.".format(name))

# Main loop: wait for the wake word, then run the helper that matches the
# command. Saying "stop" ends the loop.
while variable:
    print("Habla")
    try:
        # ``transcript`` stays a module-level name: several helpers read
        # the command text from ``transcript[1]``.
        transcript = activate()
        if not transcript[0]:
            print("Espera")
            continue
        try:
            print("T: " + transcript[1])
            if "stop" in transcript[1]:
                variable = False
            else:
                # Resolve the command once; resolving twice ran the model
                # (and printed the label) twice.
                action = pd_fun(transcript[1])
                if action is None:
                    print("No function matches that command")
                elif action == load_csv:
                    loaded = load_csv()
                    # Keep the current dataset when loading fails.
                    if loaded is not None:
                        df = loaded
                else:
                    action()

        except sr.UnknownValueError:
            say("Vocaliza un poquito, que no hay quien te entienda")
        except sr.RequestError as e:
            print("Could not request results from Mandy service; {0}".format(e))

    except Exception as exc:
        # Keep the assistant running after an unexpected error, but report
        # it and let KeyboardInterrupt stop the loop (the bare ``except``
        # swallowed Ctrl-C).
        print("Mandy: {}".format(exc))

Habla
Espera
Habla
Espera
Habla
Espera
Habla
Espera
Habla
Espera
Habla
Espera
Habla
Espera
Habla
Espera
Habla
T: mandy
F: counter
F: counter


page_id             16376
name                16376
urlslug             16376
ID                  12606
ALIGN               13564
EYE                  6609
HAIR                12112
SEX                 15522
GSM                    90
ALIVE               16373
APPEARANCES         15280
FIRST APPEARANCE    15561
Year                15561
dtype: int64

Habla
Espera
Habla
T: mandy stop mandy
