In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Embedding
from keras.layers import Input, GRU, Embedding, Dense
from keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
import re

In [2]:
data = pd.read_csv('dataset/mixed_data_preprocessed_fixed.csv', encoding= 'unicode_escape')
data.shape

(18668, 2)

In [3]:
data.dropna(inplace = True)

In [5]:
for index, row in data.iterrows():
        row[0] = str(row[0]).replace('Â\xa0', ' ', 1)
        row[1] = str(row[1]).replace('Â\xa0', ' ', 1)
        row[0] = str(row[0]).replace('Â\0xc2', ' ', 1)
        row[1] = str(row[1]).replace('Â\0xc2', ' ', 1)
        row[0] = str(row[0]).replace('Â\0xc3', ' ', 1)
        row[1] = str(row[1]).replace('Â\0xc3', ' ', 1)
        row[0] = str(row[0]).replace('Â\xa0', ' ', 1)
        row[1] = str(row[1]).replace('Â\xa0', ' ', 1)
        row[0] = str(row[0]).replace(' â\x89\xa0 ', ' ', 1)
        row[1] = str(row[1]).replace(' â\x89\xa0 ', ' ', 1)

In [6]:
for index in data.index:
    data.loc[index,'Company'] = 'START ' + data.loc[index,'Company'] + ' END'
data.sample(10), data.Company[0]

(                                                 Company  \
 4538   START @143223 Jim, did you make a delayed bagg...   
 15068  START @120961 Can you try uninstalling the App...   
 13074  START @152376 Hello, would you be able to dire...   
 6315   START @138976 We're unable to give you a ticke...   
 14127  START @118023 That's not what we like to hear,...   
 7324   START @148368 We have deals on deal on deals, ...   
 15017  START @155030 Sorry for any frustration with y...   
 7736   START @125954 Hey, there. I would be happy to ...   
 4547   START @143226 You are very welcome. Enjoy your...   
 3177   START @127424 Hey! Can you DM us your account'...   
 
                                                     User  
 4538   @Delta that being said, the young lady that to...  
 15068  @sainsburys pls help. App isn't working. Check...  
 13074  @XboxSupport I'm an Xbox gold member but I can...  
 6315   @Delta @AmericanAir ok whoever gives me a plan...  
 14127  @hulu_support Canno

In [7]:
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

In [8]:
company_vectorizer = TextVectorization(max_tokens=7000, output_sequence_length=40)
company_ds = tf.data.Dataset.from_tensor_slices(train_data.Company).batch(128)
company_vectorizer.adapt(company_ds)

In [9]:
user_vectorizer = TextVectorization(max_tokens=7000, output_sequence_length=40)
user_ds = tf.data.Dataset.from_tensor_slices(train_data.User).batch(128)
user_vectorizer.adapt(user_ds)

In [10]:
print("Company length: " + str(len(company_vectorizer.get_vocabulary())))
print("User length: " + str(len(user_vectorizer.get_vocabulary())))

Company length: 7000
User length: 7000


In [11]:
company_vocabulary = company_vectorizer.get_vocabulary()
company_word_index = dict(zip(company_vocabulary, range(len(company_vocabulary))))

In [12]:
user_vocabulary = user_vectorizer.get_vocabulary()
user_word_index = dict(zip(user_vocabulary, range(len(company_vocabulary))))

In [13]:
len(user_word_index), len(company_word_index)

(7000, 7000)

In [14]:
embeddings_index = {}
with open('glove/glove.6B.50d.txt', encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [15]:
#Company GloVe embedding

company_num_tokens = len(company_vocabulary)
embedding_dim = 50
hits = 0
misses = 0

# Prepare company embedding matrix
company_embedding_matrix = np.zeros((company_num_tokens, embedding_dim))
for word, i in company_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        company_embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        #print(word)
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 4465 words (2535 misses)


In [16]:
#User GloVe embedding

user_num_tokens = len(user_vocabulary)
embedding_dim = 50
hits = 0
misses = 0

# Prepare company embedding matrix
user_embedding_matrix = np.zeros((user_num_tokens, embedding_dim))
for word, i in user_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        user_embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        #print(word)
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 6167 words (833 misses)


In [17]:
#company embedding
companny_embedding_layer = Embedding(
    company_num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(company_embedding_matrix),
    trainable=False,
)

In [18]:
#user embedding
user_embedding_layer = Embedding(
    user_num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(user_embedding_matrix),
    trainable=False,
)

In [19]:
# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedded_sequences = user_embedding_layer(encoder_inputs)
encoder_gru = GRU(embedding_dim, return_state=True)
encoder_outputs, encoder_states = encoder_gru(encoder_embedded_sequences)
#encoder_states = [state_h, state_c]

In [20]:
# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedded_sequences = companny_embedding_layer(decoder_inputs)

decoder_gru = GRU(embedding_dim, return_sequences=True, return_state=True)
decoder_outputs, _ = decoder_gru(decoder_embedded_sequences,
                                     initial_state=encoder_states)
decoder_dense = Dense(user_num_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [21]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, None, 50)     350000      ['input_1[0][0]']                
                                                                                                  
 embedding (Embedding)          (None, None, 50)     350000      ['input_2[0][0]']                
                                                                                              

In [22]:
X = user_vectorizer(np.array([[s] for s in train_data.User])).numpy()
y = company_vectorizer(np.array([[s] for s in train_data.Company])).numpy()
X.shape, y.shape

((13067, 40), (13067, 40))

In [23]:
train_y_final_output = []
for i in y:
    train_y_final_output.append(i[1:])
train_y_final_output = pad_sequences(train_y_final_output, 40, padding='post', truncating='post')

In [24]:
train_y_final_output = to_categorical(train_y_final_output)
train_y_final_output.shape

(13067, 40, 7000)

In [25]:
model.fit([X, y], train_y_final_output, epochs = 20, validation_split = 0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x211eb5fc310>

In [33]:
model.save_weights('weights/mixed_gru_glove.h5')

In [26]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
#decoder_state_input_h = Input(shape=(50,))
#decoder_state_input_c = Input(shape=(50,))
decoder_states_inputs = Input(shape=(50,))#[decoder_state_input_h, decoder_state_input_c]

dec_emb2= companny_embedding_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, decoder_states2 = decoder_gru(dec_emb2, initial_state=decoder_states_inputs)
#decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + [decoder_states_inputs],
    [decoder_outputs2] + [decoder_states2])

In [28]:
def decode_sequence(input_seq):
    input_seq = user_vectorizer(input_seq)
    #print(input_seq)
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = company_word_index['start']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    repeat = 0
    while not stop_condition:
        output_tokens, states_value = decoder_model.predict([target_seq] + [states_value])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = company_vocabulary[sampled_token_index]
        prev = decoded_sentence
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (prev.rstrip() == decoded_sentence.rstrip()):
            repeat = repeat + 1
        else:
            repeat = 0
        
        if (sampled_char == 'end' or
           len(decoded_sentence) > 40):
            stop_condition = True
        if repeat > 5:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        #states_value = [h, c]

    return decoded_sentence

In [29]:
for index, row in test_data[:200].iterrows():
    print(decode_sequence([row['User']]))

 [UNK] hi there can you dm us your accounts
 [UNK] hey there can you dm us your accounts
 [UNK] hi there can you dm us your accounts
 [UNK] hi there sorry to hear this was the
 [UNK] hey there can you dm us your accounts
 [UNK] hey there can you dm us your accounts
 [UNK] we want to help with your internet
 [UNK] hey there can you dm us your accounts
 [UNK] we want to help with your internet
 [UNK] hi there can you dm us your accounts
 [UNK] hey there can you dm us your accounts
 [UNK] hi there can you dm us your accounts
 [UNK] we can help with your internet issues
 [UNK] hi there can you dm us your accounts
 [UNK] hi [UNK] sorry to hear this is the
 [UNK] hey there can you dm us your accounts
 [UNK] hi there can you dm us your accounts
 [UNK] hey there can you dm us your accounts
 [UNK] hi there can you dm us your accounts
 [UNK] we are working to help with your internet
 [UNK] we are working to help with your internet
 [UNK] hey there can you dm us your accounts
 [UNK] hey there can

 [UNK] hi there can you dm us your accounts
 [UNK] we are able to help with your internet
 [UNK] hey there can you dm us your accounts
 [UNK] we want to help with your internet
 [UNK] hey there can you dm us your accounts
 [UNK] hi there can you dm us your accounts
 [UNK] hey there can you dm us your accounts
 [UNK] hi there can you dm us your accounts
 [UNK] we can help with your internet issues
 [UNK] we want to help with your internet
 [UNK] hey there can you dm us your accounts
 [UNK] hi there can you dm us your accounts
 [UNK] we want to help with your internet
 [UNK] we want to help with your internet
