In [10]:
import os, time, re

from audio_utils import AudioRecorder, SpeechRecognizer, TextToAudioConverter

%load_ext autoreload
%autoreload 2

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


# Machine Learning Section

In [14]:
import pandas as pd
import re, os
import numpy as np
import transformers
from transformers.modeling_tf_distilbert import TFDistilBertModel
from tokenizers import BertWordPieceTokenizer
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the labelled questions; `label1` and `label2` are the two target columns
# used by the training cells further down.
data = pd.read_csv("data.csv", sep=",")
label = data["label1"]
data

Unnamed: 0,Question,label1,label2
0,How to run RPA?,1,rpa
1,Could you run RPA?,1,rpa
2,Call RPA please?,1,rpa
3,Write email?,1,email
4,Could you send email?,1,email
5,Could you book a hotel for me?,1,hotel
6,Could you run RPA?,1,rpa
7,I would like to listen some music,1,music
8,Could you play some music?,1,music
9,Open calendar please,0,calendar


In [3]:
def clean(text):
    """Normalise a pandas Series of raw questions before tokenisation.

    Steps, in order: fill missing values with the placeholder ``"fillna"``,
    drop everything from a ``[[User`` marker onwards, drop IPv4-looking
    tokens, remove ``?``, turn apostrophes into spaces, lower-case, strip the
    filler words could/please/would/may, and strip NLTK's English stop words.

    Parameters
    ----------
    text : pandas.Series
        Raw question strings; missing values are allowed.

    Returns
    -------
    pandas.Series
        Cleaned strings on the same index as ``text``.  Removed words leave
        their surrounding whitespace behind, so results may contain leading,
        trailing or doubled spaces.

    Notes
    -----
    Requires the NLTK ``stopwords`` corpus to be available.
    """
    from nltk.corpus import stopwords

    # The result of fillna() used to be discarded, so missing values reached
    # str() below and became the literal text "nan"/"None".  Lower-casing is
    # deliberately left for later: "[[User" must be matched case-sensitively.
    text = text.fillna("fillna")
    text = text.map(lambda x: re.sub(r"\[\[User.*", '', str(x)))
    text = text.map(lambda x: re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", '', str(x)))
    # regex=False: both patterns are literals, independent of the pandas default.
    text = (text.str.replace("?", "", regex=False)
                .str.replace("'", " ", regex=False)
                .str.lower())
    # Filler words; matched as substrings (no word boundaries), as before.
    text = text.map(lambda x: re.sub("(could|please|would|may)", '', str(x)))
    # Compile the stop-word alternation once instead of once per element.
    stopword_re = re.compile(
        r"\b(" + r"|".join(re.escape(w) for w in stopwords.words("english")) + r")\b"
    )
    text = text.map(lambda x: stopword_re.sub('', str(x)))
    return text

In [4]:
# Replace the raw questions with their cleaned form and show the result.
data = data.assign(Question=clean(data["Question"]))
data

Unnamed: 0,Question,label1,label2
0,run rpa,1,rpa
1,run rpa,1,rpa
2,call rpa,1,rpa
3,write email,1,email
4,send email,1,email
5,book hotel,1,hotel
6,run rpa,1,rpa
7,like listen music,1,music
8,play music,1,music
9,open calendar,0,calendar


In [8]:
def fast_encode(texts, tokenizer, chunk_size=240, maxlen=512):
    """Encode ``texts`` into rows of token ids using a batch tokenizer.

    Parameters
    ----------
    texts : sliceable sequence of str
        A pandas Series, numpy array, list or tuple of strings.  Slices that
        expose ``.tolist()`` are converted with it; anything else goes
        through ``list()``.
    tokenizer : object
        Must provide ``enable_truncation``, ``enable_padding`` and
        ``encode_batch``; each encoding returned by ``encode_batch`` must
        have an ``ids`` attribute.
    chunk_size : int, default 240
        Number of texts handed to ``encode_batch`` per call.
    maxlen : int, default 512
        Length passed to the tokenizer's truncation and padding settings.

    Returns
    -------
    numpy.ndarray
        ``np.array`` over the collected ``ids`` lists, one row per text.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    try:
        tokenizer.enable_padding(max_length=maxlen)
    except TypeError:
        # This tokenizer's enable_padding() rejects `max_length`; retry with
        # the `length` keyword instead.
        tokenizer.enable_padding(length=maxlen)

    all_ids = []
    for start in range(0, len(texts), chunk_size):
        chunk = texts[start:start + chunk_size]
        # Plain lists/tuples have no .tolist(); previously they raised
        # AttributeError here.
        batch = chunk.tolist() if hasattr(chunk, "tolist") else list(chunk)
        all_ids.extend(enc.ids for enc in tokenizer.encode_batch(batch))

    return np.array(all_ids)

In [9]:
# Fetch the pretrained DistilBERT tokenizer and save it locally so that its
# vocab.txt can be handed to the batch-capable BertWordPieceTokenizer.
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

# NOTE(review): the directory name says "uncased" although the checkpoint is
# multilingual *cased*; the name is kept so existing saved files stay valid.
save_path = 'distilbert_base_uncased/'
# exist_ok avoids the separate check-then-create step.
os.makedirs(save_path, exist_ok=True)
tokenizer.save_pretrained(save_path)


# Build the vocab path from save_path instead of repeating the literal.
# NOTE(review): lowercase=True is used with a cased vocabulary - confirm intended.
fast_tokenizer = BertWordPieceTokenizer(os.path.join(save_path, 'vocab.txt'),
                                        lowercase=True)

In [10]:
# Every question is truncated/padded to this many token ids.
max_len = 30
x_train = fast_encode(
    texts=data["Question"].astype(str),
    tokenizer=fast_tokenizer,
    maxlen=max_len,
)

# Train part one: binary classifier on `label1`

In [12]:
def build_vnn_model(transformer, max_len, num_classes=2):
    """Build and compile a small dense classifier on frozen embeddings.

    The first weight tensor of ``transformer`` is copied into a
    non-trainable Keras ``Embedding`` layer.  Each token embedding is summed
    over its feature axis, leaving one scalar per token position, i.e. a
    ``(batch, max_len)`` tensor.  That tensor feeds Dense(128, relu), then
    Dense(64, relu), then Dense(num_classes, softmax).

    Parameters
    ----------
    transformer : object
        Model exposing ``.weights``; ``weights[0]`` is used as the embedding
        matrix.  NOTE(review): this assumes the first weight is the
        word-embedding table - confirm for the model in use.
    max_len : int
        Number of token ids per input row.
    num_classes : int, default 2
        Width of the softmax output layer.

    Returns
    -------
    tf.keras.models.Model
        Model compiled with Adam (learning rate 0.0007), categorical
        cross-entropy loss and an accuracy metric.  Targets are therefore
        expected to be one-hot encoded.
    """
    input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")

    embed = transformer.weights[0].numpy()
    embedding = tf.keras.layers.Embedding(np.shape(embed)[0], np.shape(embed)[1],
                          input_length=max_len, weights=[embed],
                          trainable=False)(input_word_ids)
    # Sum across the embedding dimension (axis 2): one scalar per token position.
    conc = K.sum(embedding, axis=2)
    conc = tf.keras.layers.Dense(128, activation='relu')(conc)
    conc = tf.keras.layers.Dense(64, activation='relu')(conc)

    conc = tf.keras.layers.Dense(num_classes, activation='softmax')(conc)
    loss = "categorical_crossentropy"

    model = tf.keras.models.Model(inputs=input_word_ids, outputs=conc)

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(tf.keras.optimizers.Adam(learning_rate=0.0007),
                  loss=loss,
                  metrics=['accuracy'])

    return model

In [15]:
# Load the pretrained DistilBERT weights; build_vnn_model takes its embedding
# matrix from them for the binary (two-class) model.
transformer_layer = TFDistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
model_vnn = build_vnn_model(transformer=transformer_layer, max_len=max_len)
model_vnn.summary()

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 30)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 30, 768)           91812096  
_________________________________________________________________
tf_op_layer_Sum (TensorFlowO [(None, 30)]              0         
_________________________________________________________________
dense (Dense)                (None, 128)               3968      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 91,824,450
Trainable params: 12,354
Non-trainable params: 91,812,096
_____________________________________

In [599]:
# Train the binary model on label1.
y_encoded = data["label1"]
# Pin the one-hot width to the model's two output units; without num_classes
# the width is inferred from the largest label present in the data.
label = tf.keras.utils.to_categorical(y_encoded, num_classes=2)
model_vnn.fit(x_train, label, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7ff1e63b1dd8>

In [604]:
# Run a few hand-written questions through the binary model.
text = clean(pd.Series([
    "could you write an email please",
    "could you run rpa",
    "could you open the window please",
    "hi there how are you?",
]))
test = fast_encode(text, fast_tokenizer, maxlen=max_len)
y_pred = model_vnn.predict(test)
print(y_pred, y_pred.argmax(axis=1))

[[0.32581374 0.6741862 ]
 [0.03763636 0.9623636 ]
 [0.5370296  0.4629704 ]
 [0.9583503  0.04164964]] [1 1 0 0]


In [605]:
# Show the cleaned questions that were fed to the model in the previous cell.
text

0      write  email 
1            run rpa
2      open  window 
3             hi    
dtype: object

In [601]:
# Persist the trained binary model to an HDF5 file in the working directory.
model_vnn.save("modelDNN1.h5")

In [602]:
# Reload the saved binary model and run sample questions through it.
model1 = tf.keras.models.load_model("modelDNN1.h5")

text = clean(pd.Series([
    "could you write an email please",
    "could you run rpa",
    "could you open the window please",
]))
test = fast_encode(text, fast_tokenizer, maxlen=max_len)
y_pred = model1.predict(test)
print(y_pred, y_pred.argmax(axis=1))

[[0.32581392 0.67418605]
 [0.03763631 0.9623637 ]
 [0.53702915 0.46297082]] [1 1 0]


# Train part two: multi-class classifier on `label2` (rows with `label1 == 1`)

In [504]:
# Second stage: only rows with label1 == 1 are used, with label2 as the target.
is_positive = data["label1"] == 1
x_train2 = x_train[is_positive]

# Map the label2 strings to integer class ids.
labelencoder2 = LabelEncoder()
y_encoded = labelencoder2.fit_transform(data.loc[is_positive, "label2"])

n_classes = len(np.unique(y_encoded))
model_vnn2 = build_vnn_model(transformer_layer, max_len=max_len, num_classes=n_classes)

label = tf.keras.utils.to_categorical(y_encoded)
model_vnn2.fit(x_train2, label, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7ff34fd65400>

In [507]:
# Classify two sample questions and map the predicted ids back to label2 names.
text = clean(pd.Series([
    "could you send an email please",
    "run rpa",
]))
test = fast_encode(text, fast_tokenizer, maxlen=max_len)
y_pred = model_vnn2.predict(test)
labelencoder2.inverse_transform(y_pred.argmax(axis=1))

array(['email', 'rpa'], dtype=object)

In [550]:
# Persist the trained label2 model to an HDF5 file in the working directory.
model_vnn2.save("modelDNN2.h5")

In [553]:
# Store the fitted label encoder so predicted class ids can be mapped back to
# label2 names after the model is reloaded.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
import joblib
joblib.dump(labelencoder2, "labelencoder.pkl")

['labelencoder.pkl']

In [556]:
# Reload the encoder written by joblib.dump; only load pickle files you trust.
labelencoder_n = joblib.load("labelencoder.pkl")

In [555]:
# Inspect the reloaded encoder. The name `labelencoder` is never defined in
# this notebook, so a fresh Restart & Run All raised NameError here.
labelencoder_n

LabelEncoder()

In [557]:
# Reload the saved label2 model and decode its predictions with the reloaded encoder.
model2 = tf.keras.models.load_model("modelDNN2.h5")

text = clean(pd.Series([
    "could you send an email please",
    "run rpa",
]))
test = fast_encode(text, fast_tokenizer, maxlen=max_len)
y_pred = model2.predict(test)
labelencoder_n.inverse_transform(y_pred.argmax(axis=1))



array(['email', 'rpa'], dtype=object)