In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Embedding
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
import re

In [8]:
#data = pd.read_csv('dataset/ask_play_station_preprocessed.csv', encoding= 'unicode_escape')
data = pd.read_csv('dataset/mixed_data_preprocessed_fixed.csv', encoding= 'unicode_escape')
data.shape

(18668, 2)

In [9]:
data.dropna(inplace = True)

In [11]:
for index, row in data.iterrows():
        row[0] = str(row[0]).replace('Â\xa0', ' ', 1)
        row[1] = str(row[1]).replace('Â\xa0', ' ', 1)
        row[0] = str(row[0]).replace('Â\0xc2', ' ', 1)
        row[1] = str(row[1]).replace('Â\0xc2', ' ', 1)
        row[0] = str(row[0]).replace('Â\0xc3', ' ', 1)
        row[1] = str(row[1]).replace('Â\0xc3', ' ', 1)
        row[0] = str(row[0]).replace('Â\xa0', ' ', 1)
        row[1] = str(row[1]).replace('Â\xa0', ' ', 1)
        row[0] = str(row[0]).replace(' â\x89\xa0 ', ' ', 1)
        row[1] = str(row[1]).replace(' â\x89\xa0 ', ' ', 1)

# #data.User = data.User.astype(str)
# #data.Company = data.Company.astype(str)
#data.Company[36]

In [12]:
for index in data.index:
    data.loc[index,'Company'] = 'START ' + data.loc[index,'Company'] + ' END'
data.sample(10), data.Company[0]

(                                                 Company  \
 18444        START @118570 Thanks for the kudos! -AC END   
 9586   START @124040 Thanks, Katy! We will share you ...   
 4411   START @138981 I encourage you to contact them ...   
 9831   START @130455 We're looking forward to having ...   
 9111   START @133466 Hi Chevy. We hope you have a gre...   
 391    START @119532 Thanks for keeping us posted. I'...   
 2663   START @130279 Riders are selected at random fo...   
 15698  START @133736 Hi Neil, could you confirm which...   
 3366   START @131002 Hmm. Can you tell us when this s...   
 15316  START @124384 You're very welcome! Please be a...   
 
                                                     User  
 18444  @3226 @ChipotleTweets Becky was a first rate h...  
 9586   @SouthwestAir Of course! It was Flight 278 CLE...  
 4411   @Delta I did. They wonât help. I did not see...  
 9831   Always awesome flying home to Denver over the ...  
 9111   About to fly to Lon

In [13]:
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

In [14]:
company_vectorizer = TextVectorization(max_tokens=7000, output_sequence_length=20)
company_ds = tf.data.Dataset.from_tensor_slices(train_data.Company).batch(128)
company_vectorizer.adapt(company_ds)

In [15]:
user_vectorizer = TextVectorization(max_tokens=7000, output_sequence_length=20)
user_ds = tf.data.Dataset.from_tensor_slices(train_data.User).batch(128)
user_vectorizer.adapt(user_ds)

In [16]:
output = user_vectorizer(["So, what's the november ps plus free game"])
output.numpy()


array([[  32,  292,    2, 1056,  625,  485,  206,  208,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int64)

In [17]:
output = company_vectorizer(["There is no info to share at the moment. Feel free to keep an eye on the PS Blog for news and updates: URL_POSITION"])
output.numpy()

array([[  36,   26,   92,   98,    5,  145,   44,    6,  295,  174,  194,
           5,  184,   80,  591,   19,    6, 1024, 2452,    8]],
      dtype=int64)

In [18]:
print("Company length: " + str(len(company_vectorizer.get_vocabulary())))
print("User length: " + str(len(user_vectorizer.get_vocabulary())))

Company length: 7000
User length: 7000


In [19]:
company_vocabulary = company_vectorizer.get_vocabulary()
company_word_index = dict(zip(company_vocabulary, range(len(company_vocabulary))))

In [20]:
user_vocabulary = user_vectorizer.get_vocabulary()
user_word_index = dict(zip(user_vocabulary, range(len(company_vocabulary))))

In [21]:
len(user_word_index), len(company_word_index)

(7000, 7000)

In [22]:
test = ["start", "november", "ps", "plus", "free", "game"]
[company_word_index[w] for w in test]

[2, 1027, 1024, 1119, 194, 411]

In [23]:
embeddings_index = {}
with open('glove/glove.6B.50d.txt', encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [24]:
embeddings_index['what']

array([ 0.45323 ,  0.059811, -0.10577 , -0.333   ,  0.72359 , -0.08717 ,
       -0.61053 , -0.037695, -0.30945 ,  0.21805 , -0.43605 ,  0.47318 ,
       -0.76866 , -0.2713  ,  1.1042  ,  0.59141 ,  0.56962 , -0.18678 ,
        0.14867 , -0.67292 , -0.34672 ,  0.52284 ,  0.22959 , -0.072014,
        0.93967 , -2.3985  , -1.3238  ,  0.28698 ,  0.75509 , -0.76522 ,
        3.3425  ,  0.17233 , -0.51803 , -0.8297  , -0.29333 , -0.50076 ,
       -0.15228 ,  0.098973,  0.18146 , -0.1742  , -0.40666 ,  0.20348 ,
       -0.011788,  0.48252 ,  0.024598,  0.34064 , -0.084724,  0.5324  ,
       -0.25103 ,  0.62546 ], dtype=float32)

In [25]:
#Company GloVe embedding

company_num_tokens = len(company_vocabulary)
embedding_dim = 50
hits = 0
misses = 0

# Prepare company embedding matrix
company_embedding_matrix = np.zeros((company_num_tokens, embedding_dim))
for word, i in company_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        company_embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        #print(word)
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))


Converted 4465 words (2535 misses)


In [26]:
#User GloVe embedding

user_num_tokens = len(user_vocabulary)
embedding_dim = 50
hits = 0
misses = 0

# Prepare company embedding matrix
user_embedding_matrix = np.zeros((user_num_tokens, embedding_dim))
for word, i in user_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        user_embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        #print(word)
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 6167 words (833 misses)


In [27]:
#check user validity
user_embedding_matrix[133]

array([ 1.48279995e-01,  1.77609995e-01,  4.23460007e-01, -3.14889997e-01,
        3.22730005e-01, -7.24129975e-01, -7.89550006e-01,  4.92139995e-01,
       -2.06929997e-01, -5.50879980e-04, -4.78769988e-01,  2.88529992e-01,
       -5.73759973e-01,  2.72170007e-01,  1.11290002e+00,  5.78079998e-01,
        6.93210006e-01, -2.86520004e-01, -5.45450002e-02, -6.18260026e-01,
        1.72270000e-01,  2.92629987e-01,  3.81839991e-01,  6.21860027e-01,
        5.54610014e-01, -1.74109995e+00, -2.88020015e-01, -1.71399996e-01,
        7.47430027e-01, -1.01349998e+00,  3.35960007e+00,  1.13699996e+00,
       -1.00279999e+00,  1.76850006e-01, -6.17949991e-03, -6.34910017e-02,
        1.90770000e-01,  4.40459996e-02,  3.82279992e-01, -4.16070014e-01,
       -5.03589988e-01, -8.38029981e-02,  1.75080001e-01,  4.04199988e-01,
        7.73240030e-02,  1.74150005e-01,  1.25410005e-01, -2.18199998e-01,
        1.29710004e-01,  3.29530001e-01])

In [28]:
#check company validity
company_embedding_matrix[193]

array([ 0.49708   ,  0.054785  ,  0.86637998,  0.46548   , -0.95643002,
        0.08187   , -0.004151  , -0.069125  , -1.70000005,  1.26129997,
        1.29079998, -0.14752001, -1.38170004,  0.083292  , -0.12346   ,
       -0.33599001, -0.44850001,  0.38988   , -1.13240004, -0.36943999,
       -0.73693001, -0.57831001, -0.61009002,  1.66980004, -0.53049999,
       -0.12488   ,  1.15690005,  0.31060001, -0.52116001,  0.33651999,
        1.81229997,  1.34099996,  0.23122001,  0.12511   ,  0.048984  ,
        0.30794999,  0.67546999,  0.66725999,  0.1531    , -0.60719001,
        2.01889992,  0.50082999, -0.73434001, -0.32253999, -0.78384   ,
        1.16390002,  0.33465001,  0.029798  ,  0.78741002, -0.48907   ])

In [29]:
#company embedding
companny_embedding_layer = Embedding(
    company_num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(company_embedding_matrix),
    trainable=False,
)

In [30]:
#user embedding
user_embedding_layer = Embedding(
    user_num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(user_embedding_matrix),
    trainable=False,
)

In [31]:
# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedded_sequences = user_embedding_layer(encoder_inputs)
encoder_lstm = LSTM(embedding_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedded_sequences)
encoder_states = [state_h, state_c]

In [32]:
# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedded_sequences = companny_embedding_layer(decoder_inputs)

decoder_lstm = LSTM(embedding_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedded_sequences,
                                     initial_state=encoder_states)
decoder_dense = Dense(user_num_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [33]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [34]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, None, 50)     350000      ['input_1[0][0]']                
                                                                                                  
 embedding (Embedding)          (None, None, 50)     350000      ['input_2[0][0]']                
                                                                                              

In [35]:
X = user_vectorizer(np.array([[s] for s in train_data.User])).numpy()
y = company_vectorizer(np.array([[s] for s in train_data.Company])).numpy()

In [36]:
X.shape, y.shape

((13067, 20), (13067, 20))

In [37]:
train_y_final_output = []
for i in y:
    train_y_final_output.append(i[1:])
train_y_final_output = pad_sequences(train_y_final_output, 20, padding='post', truncating='post')

In [38]:
len(train_y_final_output[0]), train_y_final_output[0], train_y_final_output.shape

(20,
 array([  1,  65,   5,  72,  61,   4,  11,   4,  16,  35,  14,   9,  17,
        137,  95,  18,   7,  68,  47,   0]),
 (13067, 20))

In [39]:
train_y_final_output = to_categorical(train_y_final_output)

In [40]:
train_y_final_output.shape

(13067, 20, 7000)

In [42]:
model.fit([X, y], train_y_final_output, epochs = 15, validation_split = 0.2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x1e1051647f0>

In [43]:
model.save_weights('weights/mixed_lstm_glove.h5')

In [44]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(50,))
decoder_state_input_c = Input(shape=(50,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= companny_embedding_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

In [45]:
def decode_sequence(input_seq):
    input_seq = user_vectorizer(input_seq)
    #print(input_seq)
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = company_word_index['start']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    repeat = 0
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = company_vocabulary[sampled_token_index]
        prev = decoded_sentence
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (prev.rstrip() == decoded_sentence.rstrip()):
            repeat = repeat + 1
        else:
            repeat = 0
        
        if (sampled_char == 'end' or
           len(decoded_sentence) > 20):
            stop_condition = True
        if repeat > 2:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [46]:
print(decode_sequence(["@AskPlayStation Thank you💖you are awesome"]))

 [UNK] we can help with


In [47]:
for index, row in test_data[:200].iterrows():
    print(decode_sequence([row['User']]))

 [UNK] we want to hear
 [UNK] we can help with
 [UNK] hi there sorry
 [UNK] hi there sorry
 [UNK] we can help with
 [UNK] we can help with
 [UNK] we can be a look
 [UNK] hey there we can
 [UNK] we appreciate your
 [UNK] we can help with
 [UNK] hey there we can
 [UNK] hi [UNK] sorry
 [UNK] hey there can you
 [UNK] hi there sorry
 [UNK] hi there sorry
 [UNK] hey there we can
 [UNK] hi there sorry
 [UNK] hey there we can
 [UNK] hi there sorry
 [UNK] hi there can you
 [UNK] we can help with
 [UNK] hi there can you
 [UNK] hey there we can
 [UNK] we can help with
 [UNK] we can help to
 [UNK] i apologize for
 [UNK] hi there sorry
 [UNK] hey there we can
 [UNK] i apologize for
 [UNK] hi there sorry
 [UNK] we can help with
 [UNK] hi [UNK] sorry
 [UNK] hi there sorry
 [UNK] hi there can you
 [UNK] we can help with
 [UNK] hey there can you
 [UNK] we can help with
 [UNK] i apologize for
 [UNK] we can help with
 [UNK] we can help with
 [UNK] we can help you
 [UNK] we can help with
 [UNK] we can hel