In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training CSV and discard the columns this notebook does not use,
# then preview the first rows of what is left.
train = pd.read_csv("train_data.csv").drop(columns=['path', 'action', 'location'])
train.head()

Unnamed: 0,transcription,object
0,Turn on the kitchen lights,lights
1,Turn up the temperature,heat
2,OK now switch the main language to Chinese,Chinese
3,Turn down the bathroom temperature,heat
4,Change the language,none


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit a Keras Tokenizer on the transcriptions; the resulting `token.word_index`
# is reused later to size and fill the embedding matrix.
token = Tokenizer()
token.fit_on_texts(train['transcription'])
# Encode every transcription as a list of integer word ids (one list per row).
seq = token.texts_to_sequences(train['transcription'])

In [4]:
# Keep each row's integer word ids next to its transcription.
train['tokenized'] = seq

In [5]:
# Preview the frame with the new `tokenized` column.
train.head()

Unnamed: 0,transcription,object,tokenized
0,Turn on the kitchen lights,lights,"[2, 5, 1, 11, 4]"
1,Turn up the temperature,heat,"[2, 6, 1, 9]"
2,OK now switch the main language to Chinese,Chinese,"[49, 50, 10, 1, 51, 14, 19, 41]"
3,Turn down the bathroom temperature,heat,"[2, 7, 1, 17, 9]"
4,Change the language,none,"[54, 1, 14]"


In [6]:
transcription_tagged = []
transcription_coded = []

# Label every space-separated word of a transcription: "O"/1 when the word is
# exactly the row's `object` string, "N"/0 otherwise. The comparison is an
# exact, case-sensitive string match, so a row whose object never appears
# literally in the sentence ends up tagged all-"N".
for sentence_text, target in zip(train['transcription'], train['object']):
    is_object = [piece == target for piece in sentence_text.split(" ")]
    transcription_tagged.append(["O" if hit else "N" for hit in is_object])
    transcription_coded.append([1 if hit else 0 for hit in is_object])

In [7]:
# Attach the per-word labels to the frame: `tagged` holds the "O"/"N" strings,
# `coded` the matching 1/0 integers.
train['tagged'] = transcription_tagged
train['coded'] = transcription_coded

In [8]:
# Preview the frame with the new `tagged` and `coded` columns.
train.head()

Unnamed: 0,transcription,object,tokenized,tagged,coded
0,Turn on the kitchen lights,lights,"[2, 5, 1, 11, 4]","[N, N, N, N, O]","[0, 0, 0, 0, 1]"
1,Turn up the temperature,heat,"[2, 6, 1, 9]","[N, N, N, N]","[0, 0, 0, 0]"
2,OK now switch the main language to Chinese,Chinese,"[49, 50, 10, 1, 51, 14, 19, 41]","[N, N, N, N, N, N, N, O]","[0, 0, 0, 0, 0, 0, 0, 1]"
3,Turn down the bathroom temperature,heat,"[2, 7, 1, 17, 9]","[N, N, N, N, N]","[0, 0, 0, 0, 0]"
4,Change the language,none,"[54, 1, 14]","[N, N, N]","[0, 0, 0]"


In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [10]:
def get_pad_tags(df, maxlen=12):
    """Return the tag codes in ``df['coded']`` as a fixed-width int32 array.

    Args:
        df: DataFrame with a ``coded`` column holding one list of integer
            tag codes per row.
        maxlen: Width of every output row. Defaults to 12, the length this
            notebook uses for its model input.

    Returns:
        The array built by ``pad_sequences``. Rows shorter than ``maxlen``
        are padded at the end (``padding='post'``) with 0, the same code
        used for non-object words.

    Note:
        Rows longer than ``maxlen`` are shortened by ``pad_sequences`` using
        its default ``truncating`` setting; check the Keras documentation
        before relying on which end is dropped.
    """
    tags = df['coded'].tolist()
    return pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=0)

In [11]:
def get_pad_token(df, maxlen=12):
    """Return the word ids in ``df['tokenized']`` as a fixed-width int32 array.

    Args:
        df: DataFrame with a ``tokenized`` column holding one list of integer
            word ids per row.
        maxlen: Width of every output row. Defaults to 12, the length this
            notebook uses for its model input.

    Returns:
        The array built by ``pad_sequences``. Rows shorter than ``maxlen``
        are padded at the end (``padding='post'``) with 0.

    Note:
        Rows longer than ``maxlen`` are shortened by ``pad_sequences`` using
        its default ``truncating`` setting; check the Keras documentation
        before relying on which end is dropped.
    """
    tokens = df['tokenized'].tolist()
    return pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=0)

In [12]:
# Build the fixed-width model inputs (word ids) and targets (tag codes)
# from the training frame; the two calls are independent of each other.
pad_tokens = get_pad_token(train)
pad_tags = get_pad_tags(train)

In [13]:
# Display the padded tag-code matrix.
pad_tags

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [14]:
# Display the padded word-id matrix.
pad_tokens

array([[ 2,  5,  1, ...,  0,  0,  0],
       [ 2,  6,  1, ...,  0,  0,  0],
       [49, 50, 10, ...,  0,  0,  0],
       ...,
       [24, 36,  0, ...,  0,  0,  0],
       [15, 79,  0, ...,  0,  0,  0],
       [ 2, 12,  1, ...,  0,  0,  0]], dtype=int32)

In [15]:
# Imported in this cell by the original notebook; kept so that any cell
# relying on the name `tqdm` keeps working.
from tqdm import tqdm

# Parse the GloVe text file into a {word: float32 vector} lookup.
# Each line is "<word> <v1> <v2> ...", separated by single spaces.
embedding_vector = {}
# The context manager closes the file even if parsing fails, and the explicit
# encoding stops `open` from falling back to the platform default, which can
# fail on the non-ASCII tokens in the UTF-8 GloVe file.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        value = line.rstrip('\n').split(' ')
        word = value[0]
        coef = np.array(value[1:], dtype='float32')
        embedding_vector[word] = coef

In [16]:
# One row per tokenizer index, plus row 0, which the tokenizer never assigns
# to a word and which the padding value 0 maps to.
vocab_size = len(token.word_index) + 1

# Copy the pretrained vector of every vocabulary word into its row; words
# without a pretrained vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 300))
for vocab_word, row in token.word_index.items():
    pretrained = embedding_vector.get(vocab_word)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

In [20]:
# Per-word loss weights mirroring `coded`: words tagged 1 (the object) get
# weight 10, every other word gets weight 1.
sample_weight = [
    [10 if code == 1 else 1 for code in codes]
    for codes in train['coded']
]

In [21]:
# Keep the per-word loss weights next to the tags they were derived from.
train['weights'] = sample_weight

In [22]:
# Preview the frame with the new `weights` column.
train.head()

Unnamed: 0,transcription,object,tokenized,tagged,coded,weights
0,Turn on the kitchen lights,lights,"[2, 5, 1, 11, 4]","[N, N, N, N, O]","[0, 0, 0, 0, 1]","[1, 1, 1, 1, 10]"
1,Turn up the temperature,heat,"[2, 6, 1, 9]","[N, N, N, N]","[0, 0, 0, 0]","[1, 1, 1, 1]"
2,OK now switch the main language to Chinese,Chinese,"[49, 50, 10, 1, 51, 14, 19, 41]","[N, N, N, N, N, N, N, O]","[0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1, 1, 1, 10]"
3,Turn down the bathroom temperature,heat,"[2, 7, 1, 17, 9]","[N, N, N, N, N]","[0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]"
4,Change the language,none,"[54, 1, 14]","[N, N, N]","[0, 0, 0]","[1, 1, 1]"


In [23]:
# Pad the per-word weights to the same width as the inputs and targets.
# Padding positions are filled with 1, so they carry the same weight as
# ordinary non-object words.
weight = train['weights'].tolist()
pad_weight = pad_sequences(
    weight,
    maxlen=12,
    padding='post',
    value=1,
    dtype='int32',
)

In [24]:
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

In [25]:
# Per-word tagger: frozen pretrained embeddings -> bidirectional LSTM -> LSTM
# -> dense head that emits a 2-way softmax for each of the 12 positions.
model = Sequential([
    Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=12, trainable=False),
    Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    LSTM(units=64, dropout=0.5, return_sequences=True, recurrent_dropout=0.5),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])
# 'temporal' sample-weight mode lets `fit` accept one weight per position.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    sample_weight_mode='temporal',
)

In [26]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 12, 300)           29700     
                                                                 
 bidirectional (Bidirectiona  (None, 12, 128)          186880    
 l)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 12, 64)            49408     
                                                                 
 dropout (Dropout)           (None, 12, 64)            0         
                                                                 
 dense (Dense)               (None, 12, 128)           8320      
                                                                 
 dropout_1 (Dropout)         (None, 12, 128)           0         
                                                        

In [27]:
# Train on the padded word ids against the padded tag codes, weighting each
# position by `pad_weight` and holding out 20% of the rows for validation.
history_sample_weight = model.fit(
    x=np.array(pad_tokens),
    y=np.array(pad_tags),
    sample_weight=pad_weight,
    validation_split=0.2,
    epochs=8,
    verbose=1,
)

2023-05-08 00:57:09.407137: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [38]:
sent = 'Put on the music'

# Encode the sentence with the fitted tokenizer and pad it to the model's
# input width of 12, the same way the training inputs were padded.
pred_tokens = token.texts_to_sequences([sent])
pad_seq_pred = pad_sequences(
    pred_tokens,
    padding='post',
    maxlen=12,
)

# One row of class scores per position of the padded sentence.
prediction = model.predict(pad_seq_pred)

In [39]:
def change_to_one_hot_encode(predicted_list):
    """Turn per-word class scores into one-hot vectors of the winning class.

    Args:
        predicted_list: Iterable of sentences, each a 2-D array-like of shape
            (words, classes) holding one score per class for every word.

    Returns:
        A list with one entry per sentence. Each entry is a list of 1-D float
        arrays, one per word, with 1.0 at the highest-scoring class and 0.0
        everywhere else.
    """
    pred_one_hot_encode = []
    for sentence in predicted_list:
        scores = np.asarray(sentence)
        # Take the vector width from the scores themselves instead of
        # assuming exactly two classes, so wider outputs do not raise.
        num_classes = scores.shape[1]
        one_hot_rows = np.eye(num_classes)
        indexes = np.argmax(scores, axis=1)
        # Copy each row so the returned vectors do not share memory.
        pred_one_hot_encode.append([one_hot_rows[index].copy() for index in indexes])

    return pred_one_hot_encode

In [40]:
# Show the predicted class of every position of the padded sentence as
# one-hot vectors.
change_to_one_hot_encode(prediction)

[[array([1., 0.]),
  array([1., 0.]),
  array([1., 0.]),
  array([0., 1.]),
  array([1., 0.]),
  array([1., 0.]),
  array([1., 0.]),
  array([1., 0.]),
  array([1., 0.]),
  array([1., 0.]),
  array([1., 0.]),
  array([1., 0.])]]