In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Embedding
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

In [2]:
data = pd.read_csv('dataset/ask_play_station_preprocessed.csv', encoding= 'unicode_escape')
data.shape

(16701, 2)

In [3]:
data.dropna(inplace = True)

In [4]:
for index, row in data.iterrows():
        row[0] = str(row[0]).replace('Â\xa0', ' ', 1)
        row[1] = str(row[1]).replace('Â\xa0', ' ', 1)
        row[0] = str(row[0]).replace('Â\0xc2', ' ', 1)
        row[1] = str(row[1]).replace('Â\0xc2', ' ', 1)

# #data.User = data.User.astype(str)
# #data.Company = data.Company.astype(str)
# data.Company[25]

In [5]:
for index in data.index:
    data.loc[index,'Company'] = 'START ' + data.loc[index,'Company'] + ' END'
data.sample(10), data.Company[0]

(                                                    User  \
 353    @AskPlayStation Is your live chat support down...   
 2399   @AskPlayStation I even tried on a different TV...   
 3479       @AskPlayStation I can't add a game to my cart   
 6155   @AskPlayStation funds were added to my wallet ...   
 3314   @AskPlayStation Wireless connection. Error cod...   
 9202   @AskPlayStation Everytime I try to log into my...   
 3688   @AskPlayStation what is going on with my servi...   
 15194  @AskPlayStation @117014 We just buy a PS4, and...   
 5644   @AskPlayStation changed my password I have 2 s...   
 13834  @AskPlayStation my ps4 won't let me play any o...   
 
                                                  Company  
 353    START @135663 Odd! We have sent you a Direct M...  
 2399   START @197938 That's not good. Please check yo...  
 3479   START @217676 That's odd. Try making the purch...  
 6155   START @346390 Happy to help! For refund info, ...  
 3314   START @212973 No wo

In [6]:
company_vectorizer = TextVectorization(max_tokens=7000, output_sequence_length=40)
company_ds = tf.data.Dataset.from_tensor_slices(data.Company).batch(128)
company_vectorizer.adapt(company_ds)

In [7]:
user_vectorizer = TextVectorization(max_tokens=7000, output_sequence_length=40)
user_ds = tf.data.Dataset.from_tensor_slices(data.User).batch(128)
user_vectorizer.adapt(user_ds)

In [8]:
output = user_vectorizer(["So, what's the november ps plus free game"])
output.numpy()


array([[  54,  269,    5, 1235,   57,  133,  304,   35,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0]], dtype=int64)

In [9]:
output = company_vectorizer(["There is no info to share at the moment. Feel free to keep an eye on the PS Blog for news and updates: URL_POSITION"])
output.numpy()

array([[ 25,  56,  86,  67,   7, 194, 162,   4, 334, 138, 133,   7, 136,
         74, 471,  52,   4, 128, 260,   9, 343,  14, 275,   8,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0]], dtype=int64)

In [10]:
print("Company length: " + str(len(company_vectorizer.get_vocabulary())))
print("User length: " + str(len(user_vectorizer.get_vocabulary())))

Company length: 7000
User length: 7000


In [11]:
company_vocabulary = company_vectorizer.get_vocabulary()
company_word_index = dict(zip(company_vocabulary, range(len(company_vocabulary))))

In [12]:
user_vocabulary = user_vectorizer.get_vocabulary()
user_word_index = dict(zip(user_vocabulary, range(len(company_vocabulary))))

In [13]:
len(user_word_index), len(company_word_index)

(7000, 7000)

In [14]:
test = ["start", "november", "ps", "plus", "free", "game"]
[company_word_index[w] for w in test]

[2, 832, 128, 256, 133, 101]

In [15]:
embeddings_index = {}
with open('C:/Users/Aleksandar/Desktop/glove/glove.6B.50d.txt', encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [16]:
embeddings_index['what']

array([ 0.45323 ,  0.059811, -0.10577 , -0.333   ,  0.72359 , -0.08717 ,
       -0.61053 , -0.037695, -0.30945 ,  0.21805 , -0.43605 ,  0.47318 ,
       -0.76866 , -0.2713  ,  1.1042  ,  0.59141 ,  0.56962 , -0.18678 ,
        0.14867 , -0.67292 , -0.34672 ,  0.52284 ,  0.22959 , -0.072014,
        0.93967 , -2.3985  , -1.3238  ,  0.28698 ,  0.75509 , -0.76522 ,
        3.3425  ,  0.17233 , -0.51803 , -0.8297  , -0.29333 , -0.50076 ,
       -0.15228 ,  0.098973,  0.18146 , -0.1742  , -0.40666 ,  0.20348 ,
       -0.011788,  0.48252 ,  0.024598,  0.34064 , -0.084724,  0.5324  ,
       -0.25103 ,  0.62546 ], dtype=float32)

In [17]:
#Company GloVe embedding

company_num_tokens = len(company_vocabulary)
embedding_dim = 50
hits = 0
misses = 0

# Prepare company embedding matrix
company_embedding_matrix = np.zeros((company_num_tokens, embedding_dim))
for word, i in company_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        company_embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        #print(word)
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))


Converted 2607 words (4393 misses)


In [18]:
#User GloVe embedding

user_num_tokens = len(user_vocabulary)
embedding_dim = 50
hits = 0
misses = 0

# Prepare company embedding matrix
user_embedding_matrix = np.zeros((user_num_tokens, embedding_dim))
for word, i in user_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        user_embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        #print(word)
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 5073 words (1927 misses)


In [19]:
#check user validity
user_embedding_matrix[133]

array([-0.014226  ,  1.28209996,  0.47413999,  0.029297  ,  0.2624    ,
        0.21472   , -0.1075    , -0.38361999,  0.17601   ,  0.13776   ,
       -0.38643   , -0.19752   ,  0.42192999, -0.047165  ,  0.56528997,
       -0.76681   , -0.077477  ,  0.24017   , -0.24187   , -0.68089002,
        0.25938001, -0.40561   ,  0.49706   ,  0.31424001, -1.04999995,
       -0.088827  , -0.1934    , -0.24862   ,  0.15663999, -0.04671   ,
        3.16820002,  0.76967001,  0.045547  ,  0.95493001,  0.53040999,
        0.29933   ,  0.23246001, -0.088557  ,  0.12864999, -0.4375    ,
        0.67809999,  0.12878001,  0.48137   , -0.065299  , -0.62515998,
        0.040249  ,  0.014061  ,  0.51809001, -0.308     ,  0.62830001])

In [20]:
#check company validity
company_embedding_matrix[193]

array([-0.026071  , -0.14204   ,  0.50678998, -0.38536   , -0.25992   ,
        0.061203  , -0.25150001,  0.33658999,  0.10031   ,  0.19701999,
       -0.072183  ,  0.13847999, -0.57571   , -0.56156999,  0.63119   ,
        1.02530003, -0.51130003, -1.01349998,  0.15967   , -0.39377001,
        0.20737   , -0.046717  , -0.38705   , -0.63292998,  0.46724001,
       -1.55929995,  0.32508999,  0.46072   ,  0.60162002,  1.28859997,
        1.81799996,  0.96003997,  1.30320001, -0.62168998, -0.42491999,
       -0.46419999, -1.30859995, -0.88731003,  0.28600001, -0.79233998,
       -0.88091999,  0.31139001,  0.28845999,  0.084298  ,  1.25150001,
        0.47628   ,  0.39539   ,  1.02499998,  0.28852999, -0.4567    ])

In [21]:
#company embedding
companny_embedding_layer = Embedding(
    company_num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(company_embedding_matrix),
    trainable=False,
)

In [22]:
#user embedding
user_embedding_layer = Embedding(
    user_num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(user_embedding_matrix),
    trainable=False,
)

In [23]:
# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedded_sequences = user_embedding_layer(encoder_inputs)
encoder_lstm = LSTM(embedding_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedded_sequences)
encoder_states = [state_h, state_c]

In [24]:
# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedded_sequences = companny_embedding_layer(decoder_inputs)

decoder_lstm = LSTM(embedding_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedded_sequences,
                                     initial_state=encoder_states)
decoder_dense = Dense(user_num_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [25]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [26]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, None, 50)     350000      ['input_1[0][0]']                
                                                                                                  
 embedding (Embedding)          (None, None, 50)     350000      ['input_2[0][0]']                
                                                                                              

In [27]:
X = user_vectorizer(np.array([[s] for s in data.User])).numpy()
y = company_vectorizer(np.array([[s] for s in data.Company])).numpy()

In [28]:
X.shape, y.shape

((16701, 40), (16701, 40))

In [29]:
train_y_final_output = []
for i in y:
    train_y_final_output.append(i[1:])
train_y_final_output = pad_sequences(train_y_final_output, 40, padding='post', truncating='post')

In [30]:
len(train_y_final_output[0]), train_y_final_output[0], train_y_final_output.shape

(40,
 array([  1,  25,  56,  86,  67,   7, 194, 162,   4, 334, 138, 133,   7,
        136,  74, 471,  52,   4, 128, 260,   9, 343,  14, 275,   8,   3,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0]),
 (16701, 40))

In [31]:
train_y_final_output = to_categorical(train_y_final_output)

In [32]:
train_y_final_output.shape

(16701, 40, 7000)

In [33]:
model.fit([X, y], train_y_final_output, epochs = 3, validation_split = 0.2)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x17b7680c730>

In [34]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(50,))
decoder_state_input_c = Input(shape=(50,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= companny_embedding_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

In [35]:
def decode_sequence(input_seq):
    input_seq = user_vectorizer(input_seq)
    print(input_seq)
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = company_word_index['start']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = company_vocabulary[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 40):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [41]:
print(decode_sequence(["@AskPlayStation That seems to have fixed the problem. Thank you so much."]))

tf.Tensor(
[[  2  26 338   6  16 299   5  65 137  21  54 386   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]], shape=(1, 40), dtype=int64)
 [UNK] please check your dms for further instructions
