In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Embedding
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

In [2]:
data = pd.read_csv('dataset/ask_play_station_preprocessed.csv', encoding= 'unicode_escape')
data.shape

(16701, 2)

In [3]:
data.dropna(inplace = True)

In [4]:
for index, row in data.iterrows():
        row[0] = str(row[0]).replace('Â\xa0', ' ', 1)
        row[1] = str(row[1]).replace('Â\xa0', ' ', 1)
        row[0] = str(row[0]).replace('Â\0xc2', ' ', 1)
        row[1] = str(row[1]).replace('Â\0xc2', ' ', 1)

# #data.User = data.User.astype(str)
# #data.Company = data.Company.astype(str)
# data.Company[25]

In [5]:
for index in data.index:
    data.loc[index,'Company'] = 'START ' + data.loc[index,'Company'] + ' END'
data.sample(10), data.Company[0]

(                                                    User  \
 3556   @AskPlayStation my PS4 keeps putting itself in...   
 41     @AskPlayStation I redeemed a VC code for nba 2...   
 6992                         @AskPlayStation Nw- 31291-6   
 10502  @AskPlayStation Why is it that every time I tr...   
 5813   @AskPlayStation when i try to accept the terms...   
 11670  @AskPlayStation hi, chat and social thingies d...   
 13624  @AskPlayStation the page  it told me to contac...   
 9557   @AskPlayStation yo, my ps4 connects to the int...   
 10262  @AskPlayStation  My password got randomly chan...   
 15941  @AskPlayStation what screw driver size shall i...   
 
                                                  Company  
 3556   START @218804 That's not good. Please turn off...  
 41     START @117009  No problem. Please follow the s...  
 6992   START @376798 Let's check out the next article...  
 10502  START @501406 Hi there. Let's look into that. ...  
 5813   START @316571 Stran

In [10]:
company_vectorizer = TextVectorization(max_tokens=10000, output_sequence_length=40)
company_ds = tf.data.Dataset.from_tensor_slices(data.Company).batch(128)
company_vectorizer.adapt(company_ds)

In [11]:
user_vectorizer = TextVectorization(max_tokens=10000, output_sequence_length=40)
user_ds = tf.data.Dataset.from_tensor_slices(data.User).batch(128)
user_vectorizer.adapt(user_ds)

In [12]:
output = user_vectorizer(["So, what's the november ps plus free game"])
output.numpy()


array([[  54,  269,    5, 1235,   57,  133,  304,   35,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]], dtype=int64)

In [13]:
output = company_vectorizer(["There is no info to share at the moment. Feel free to keep an eye on the PS Blog for news and updates: URL_POSITION"])
output.numpy()

array([[ 25,  56,  86,  67,   7, 194, 162,   4, 334, 138, 133,   7, 136,
         74, 471,  52,   4, 128, 260,   9, 343,  14, 275,   8,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0]], dtype=int64)

In [14]:
print("Company length: " + str(len(company_vectorizer.get_vocabulary())))
print("User length: " + str(len(user_vectorizer.get_vocabulary())))

Company length: 10000
User length: 10000


In [15]:
company_vocabulary = company_vectorizer.get_vocabulary()
company_word_index = dict(zip(company_vocabulary, range(len(company_vocabulary))))

In [16]:
user_vocabulary = user_vectorizer.get_vocabulary()
user_word_index = dict(zip(user_vocabulary, range(len(company_vocabulary))))

In [17]:
len(user_word_index), len(company_word_index)

(10000, 10000)

In [18]:
test = ["start", "november", "ps", "plus", "free", "game"]
[company_word_index[w] for w in test]

[2, 832, 128, 256, 133, 101]

In [20]:
embeddings_index = {}
with open('glove/glove.6B.50d.txt', encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [21]:
embeddings_index['what']

array([ 0.45323 ,  0.059811, -0.10577 , -0.333   ,  0.72359 , -0.08717 ,
       -0.61053 , -0.037695, -0.30945 ,  0.21805 , -0.43605 ,  0.47318 ,
       -0.76866 , -0.2713  ,  1.1042  ,  0.59141 ,  0.56962 , -0.18678 ,
        0.14867 , -0.67292 , -0.34672 ,  0.52284 ,  0.22959 , -0.072014,
        0.93967 , -2.3985  , -1.3238  ,  0.28698 ,  0.75509 , -0.76522 ,
        3.3425  ,  0.17233 , -0.51803 , -0.8297  , -0.29333 , -0.50076 ,
       -0.15228 ,  0.098973,  0.18146 , -0.1742  , -0.40666 ,  0.20348 ,
       -0.011788,  0.48252 ,  0.024598,  0.34064 , -0.084724,  0.5324  ,
       -0.25103 ,  0.62546 ], dtype=float32)

In [22]:
#Company GloVe embedding

company_num_tokens = len(company_vocabulary)
embedding_dim = 50
hits = 0
misses = 0

# Prepare company embedding matrix
company_embedding_matrix = np.zeros((company_num_tokens, embedding_dim))
for word, i in company_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        company_embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        #print(word)
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))


Converted 2614 words (7386 misses)


In [23]:
#User GloVe embedding

user_num_tokens = len(user_vocabulary)
embedding_dim = 50
hits = 0
misses = 0

# Prepare company embedding matrix
user_embedding_matrix = np.zeros((user_num_tokens, embedding_dim))
for word, i in user_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        user_embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        #print(word)
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 6527 words (3473 misses)


In [24]:
#check user validity
user_embedding_matrix[133]

array([-0.014226  ,  1.28209996,  0.47413999,  0.029297  ,  0.2624    ,
        0.21472   , -0.1075    , -0.38361999,  0.17601   ,  0.13776   ,
       -0.38643   , -0.19752   ,  0.42192999, -0.047165  ,  0.56528997,
       -0.76681   , -0.077477  ,  0.24017   , -0.24187   , -0.68089002,
        0.25938001, -0.40561   ,  0.49706   ,  0.31424001, -1.04999995,
       -0.088827  , -0.1934    , -0.24862   ,  0.15663999, -0.04671   ,
        3.16820002,  0.76967001,  0.045547  ,  0.95493001,  0.53040999,
        0.29933   ,  0.23246001, -0.088557  ,  0.12864999, -0.4375    ,
        0.67809999,  0.12878001,  0.48137   , -0.065299  , -0.62515998,
        0.040249  ,  0.014061  ,  0.51809001, -0.308     ,  0.62830001])

In [25]:
#check company validity
company_embedding_matrix[193]

array([-0.026071  , -0.14204   ,  0.50678998, -0.38536   , -0.25992   ,
        0.061203  , -0.25150001,  0.33658999,  0.10031   ,  0.19701999,
       -0.072183  ,  0.13847999, -0.57571   , -0.56156999,  0.63119   ,
        1.02530003, -0.51130003, -1.01349998,  0.15967   , -0.39377001,
        0.20737   , -0.046717  , -0.38705   , -0.63292998,  0.46724001,
       -1.55929995,  0.32508999,  0.46072   ,  0.60162002,  1.28859997,
        1.81799996,  0.96003997,  1.30320001, -0.62168998, -0.42491999,
       -0.46419999, -1.30859995, -0.88731003,  0.28600001, -0.79233998,
       -0.88091999,  0.31139001,  0.28845999,  0.084298  ,  1.25150001,
        0.47628   ,  0.39539   ,  1.02499998,  0.28852999, -0.4567    ])

In [26]:
#company embedding
companny_embedding_layer = Embedding(
    company_num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(company_embedding_matrix),
    trainable=False,
)

In [27]:
#user embedding
user_embedding_layer = Embedding(
    user_num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(user_embedding_matrix),
    trainable=False,
)

In [28]:
# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedded_sequences = user_embedding_layer(encoder_inputs)
encoder_lstm = LSTM(embedding_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedded_sequences)
encoder_states = [state_h, state_c]

In [29]:
# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedded_sequences = companny_embedding_layer(decoder_inputs)

decoder_lstm = LSTM(embedding_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedded_sequences,
                                     initial_state=encoder_states)
decoder_dense = Dense(user_num_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [30]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [31]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, None, 50)     500000      ['input_1[0][0]']                
                                                                                                  
 embedding (Embedding)          (None, None, 50)     500000      ['input_2[0][0]']                
                                                                                              

In [32]:
X = user_vectorizer(np.array([[s] for s in data.User])).numpy()
y = company_vectorizer(np.array([[s] for s in data.Company])).numpy()

In [33]:
X.shape, y.shape

((16701, 40), (16701, 40))

In [34]:
train_y_final_output = []
for i in y:
    train_y_final_output.append(i[1:])
train_y_final_output = pad_sequences(train_y_final_output, 40, padding='post', truncating='post')

In [35]:
len(train_y_final_output[0]), train_y_final_output[0], train_y_final_output.shape

(40,
 array([  1,  25,  56,  86,  67,   7, 194, 162,   4, 334, 138, 133,   7,
        136,  74, 471,  52,   4, 128, 260,   9, 343,  14, 275,   8,   3,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0]),
 (16701, 40))

In [36]:
train_y_final_output = to_categorical(train_y_final_output)

In [37]:
train_y_final_output.shape

(16701, 40, 10000)

In [38]:
model.fit([X, y], train_y_final_output, epochs = 15, validation_split = 0.2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x16f500bb6d0>

In [39]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(50,))
decoder_state_input_c = Input(shape=(50,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= companny_embedding_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

In [40]:
def decode_sequence(input_seq):
    input_seq = user_vectorizer(input_seq)
    print(input_seq)
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = company_word_index['start']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = company_vocabulary[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 40):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [42]:
print(decode_sequence(["@AskPlayStation Thank you💖you are awesome"]))

tf.Tensor(
[[   2  137    1   95 1409    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]], shape=(1, 40), dtype=int64)
 please check your dms for more instructions
