In [1]:
import numpy as np 
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from sklearn.metrics import classification_report
from keras.callbacks import EarlyStopping

In [2]:
df = pd.read_csv("dataset/ask_play_station.csv")
df.dropna(inplace = True)
df

Unnamed: 0,User,Company
0,"@AskPlayStation So, what's the november ps plu...",@115743 There is no info to share at the momen...
1,@AskPlayStation It was when I would go to down...,"@115745 Glad to know that is downloading, ple..."
2,@AskPlayStation I bought Lego Star Wars in the...,@115745 Sorry for the inconvenience. Do you se...
3,@AskPlayStation can i block a community from s...,@115746 Glad to help. Please share the detai...
4,@AskPlayStation can you dm me I have a question,@116001 Glad to help! We have sent you a DM lo...
...,...,...
16711,@AskPlayStation I have already read and went o...,@640122 Please run a network connection test. ...
16712,@AskPlayStation I can not fully connect to my ...,@640122 Let's take a look! Check out the next ...
16713,@AskPlayStation i want to buy a ps4 pro . will...,@823562 Hello there. There isn't information a...
16714,"@AskPlayStation any idea when ""Steven Universe...",@823563 Glad to help! Please follow us via Tw...


In [3]:
df = pd.read_csv("dataset/amazon_data.csv")
df.dropna(inplace = True)
df = df[:15000]
df

Unnamed: 0,User,Company
0,Way to drop the ball on customer service @1158...,@115820 I'm sorry we've let you down! Without ...
1,@AmazonHelp 3 different people have given 3 di...,@115820 We'd like to take a further look into ...
2,@115823 I want my amazon payments account CLOS...,@115822 I am unable to affect your account via...
3,@115828 How about you guys figure out my Xbox ...,@115826 I'm sorry for the wait. You'll receive...
4,@AmazonHelp @115826 Yeah this is crazy we’re l...,@115827 Thanks for your patience. ^KM
...,...,...
15037,I'm so mad at @115821 rn.\n\nThe katana sword ...,@183848 I'm so sorry for the delay! We strive ...
15038,@AmazonHelp @183848 WORST SERVICE\nPLZ REPLY T...,@121400 We've responded to your DM as requeste...
15039,"@AmazonHelp Actually, email says Friday, but w...",@183848 Some items are not available to ship i...
15040,@AmazonHelp Why would I even be given an optio...,@183848 Our most accurate delivery date will b...


In [3]:
for index in df.index:
    df.loc[index,'Company'] = '<SOS> ' + df.loc[index,'Company'] + ' <EOS>'
df

Unnamed: 0,User,Company
0,"@AskPlayStation So, what's the november ps plu...",<SOS> @115743 There is no info to share at the...
1,@AskPlayStation It was when I would go to down...,<SOS> @115745 Glad to know that is downloadin...
2,@AskPlayStation I bought Lego Star Wars in the...,<SOS> @115745 Sorry for the inconvenience. Do ...
3,@AskPlayStation can i block a community from s...,<SOS> @115746 Glad to help. Please share the...
4,@AskPlayStation can you dm me I have a question,<SOS> @116001 Glad to help! We have sent you a...
...,...,...
16711,@AskPlayStation I have already read and went o...,<SOS> @640122 Please run a network connection ...
16712,@AskPlayStation I can not fully connect to my ...,<SOS> @640122 Let's take a look! Check out the...
16713,@AskPlayStation i want to buy a ps4 pro . will...,<SOS> @823562 Hello there. There isn't informa...
16714,"@AskPlayStation any idea when ""Steven Universe...",<SOS> @823563 Glad to help! Please follow us ...


In [4]:
train_data, test_data = train_test_split(df, test_size=0.3, random_state=42)

In [5]:
text_data = np.concatenate((train_data['User'].values, train_data['Company'].values))
text_data

array(["@AskPlayStation I'm having some problems with my acount please May you DM me",
       "@AskPlayStation I've been waiting 5 hours and it's not even half way to being able to start the application. No where near being done.The game is Dragon Age: Inquisition: GOTY edition.",
       '@AskPlayStation @90688   i cant download ep 2 :( i purchased ep 1 fine. Anywayy to fix this? https://t.co/TP5E7QhZQ0',
       ...,
       '<SOS> @317733 Sorry to know that. Please power cycle your network devices and try again:\xa0https://t.co/UUMNDRIFMj <EOS>',
       '<SOS> @161110 We can help with that! Please follow us on Twitter and send us a DM so we can assist you further! https://t.co/blzF3DE7ws <EOS>',
       '<SOS> @777362 Sorry to know that! Please check the following article with more information about requesting a refund: https://t.co/UYBWwzvFok <EOS>'],
      dtype=object)

In [6]:
MAX_NB_WORDS = 7000

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(text_data)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 19609 unique tokens.


In [7]:
word_counts  = tokenizer.word_counts 
rare_words_number = 0 
for index, (key, value) in enumerate(word_counts.items()):
    if (value < 2):
        rare_words_number = rare_words_number + 1
        #print( index, key, value)
rare_words_number

12571

In [8]:
train_data['User'].values[0]

"@AskPlayStation I'm having some problems with my acount please May you DM me"

In [9]:
seq = tokenizer.texts_to_sequences([train_data['User'].values[0]])
seq

[[5, 136, 124, 189, 315, 25, 11, 1326, 8, 397, 7, 72, 48]]

In [10]:
tokenizer.sequences_to_texts(seq)

["askplaystation i'm having some problems with my acount please may you dm me"]

In [11]:
train_X = tokenizer.texts_to_sequences(train_data['User'].values)
train_y = tokenizer.texts_to_sequences(train_data['Company'].values)

In [12]:
train_X[0]

[5, 136, 124, 189, 315, 25, 11, 1326, 8, 397, 7, 72, 48]

In [13]:
MAX_QUESTION_LENGTH = 50
#for el in df['User']:
 #   if(len(el) > MAX_QUESTION_LENGTH):
#        MAX_QUESTION_LENGTH = len(el)
print(MAX_QUESTION_LENGTH)

MAX_ANSWER_LENGTH = 50
#for el in df['Company']:
    #if(len(el) > MAX_ANSWER_LENGTH):
        #MAX_ANSWER_LENGTH = len(el)
print(MAX_ANSWER_LENGTH)

50
50


In [14]:
train_X = pad_sequences(train_X, maxlen=MAX_QUESTION_LENGTH, padding='post', truncating='post') #truncating dodati ako budem hteo avrage da koristim
train_y = pad_sequences(train_y, maxlen=MAX_ANSWER_LENGTH, padding='post', truncating='post')

In [15]:
len(train_X[0])

50

In [16]:
train_X[0][:50]

array([   5,  136,  124,  189,  315,   25,   11, 1326,    8,  397,    7,
         72,   48,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0])

In [17]:
train_y_final_output = []
for i in train_y:
    train_y_final_output.append(i[1:]) 

train_y_final_output = pad_sequences(train_y_final_output, MAX_ANSWER_LENGTH, padding='post', truncating='post')

In [18]:
train_y_final_output[0][:50]

array([45,  2, 50, 17,  8, 42, 18, 51, 55, 41, 27, 19, 59,  7, 10, 72, 25,
       29, 37, 64,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [19]:
train_y[0][:50]

array([ 3, 45,  2, 50, 17,  8, 42, 18, 51, 55, 41, 27, 19, 59,  7, 10, 72,
       25, 29, 37, 64,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [20]:
from tensorflow.keras.utils import to_categorical
train_y_final_output = to_categorical(train_y_final_output, MAX_NB_WORDS)

In [21]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Input


enc_inp = Input(shape=(50, ))
dec_inp = Input(shape=(50, ))

In [22]:
VOCAB_SIZE = MAX_NB_WORDS
embed = Embedding(VOCAB_SIZE+1, output_dim=50, 
                  input_length=50,
                  trainable=True                  
                  )

In [23]:
enc_embed = embed(enc_inp)
enc_lstm = LSTM(200, return_sequences=True, return_state=True)
enc_op, h, c = enc_lstm(enc_embed)
enc_states = [h, c]


In [24]:
embed2 = Embedding(VOCAB_SIZE+1, output_dim=50, 
                  input_length=50,
                  trainable=True                  
                  )

In [25]:
dec_embed = embed2(dec_inp)
dec_lstm = LSTM(200, return_sequences=True, return_state=True)
dec_op, _, _ = dec_lstm(dec_embed, initial_state=enc_states)

In [26]:
dense = Dense(VOCAB_SIZE, activation='softmax')
dense_op = dense(dec_op)
model = Model([enc_inp, dec_inp], dense_op)
model.compile(loss='categorical_crossentropy',metrics=['acc'],optimizer='adam')
print(model.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 50)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 50)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 50, 50)       350050      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 50, 50)       350050      ['input_2[0][0]']                
                                                                                              

In [27]:
model.eval()

AttributeError: 'Functional' object has no attribute 'eval'

In [233]:
model.fit([train_X, train_y],train_y_final_output,epochs=3, validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x26f991f3a90>

In [234]:
enc_model = Model(enc_inp, enc_states)

In [235]:
decoder_state_input_h = Input(shape=(200,))
decoder_state_input_c = Input(shape=(200,))

In [236]:
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [237]:
decoder_outputs, state_h, state_c = dec_lstm(dec_embed , 
                                    initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
#decoder_outputs = dense(decoder_outputs)

In [238]:
dec_model = Model([dec_inp]+ decoder_states_inputs,
                                      [decoder_outputs]+ decoder_states)

In [266]:
test_data['User'].values[7]

'@AmazonHelp Just did. Thanks'

In [267]:
test_question = tokenizer.texts_to_sequences([test_data['User'].values[7]])

#test_question = tokenizer.texts_to_sequences([train_data['User'].values[0]])
test_question

[[7, 90, 116, 98]]

In [268]:
test_X = pad_sequences(test_question, maxlen=MAX_QUESTION_LENGTH, padding='post', truncating='post')
test_X

array([[  7,  90, 116,  98,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])

In [273]:
test_answer = enc_model.predict( test_X )

In [274]:
empty_target_seq = np.zeros( ( 1 , 1) )
empty_target_seq

array([[0.]])

In [275]:
empty_target_seq[0, 0] = tokenizer.texts_to_sequences(['sos'])[0][0]
empty_target_seq

array([[3.]])

In [276]:
stop_condition = False
decoded_translation=''
i = 0
while not stop_condition :
    dec_outputs , h, c= dec_model.predict([empty_target_seq] + test_answer )
    decoder_concat_input = dense(dec_outputs)
    sampled_word_index = np.argmax( decoder_concat_input[0, -1, :] )
    
    if i == 0:
        i = 1
        indes = np.argpartition(decoder_concat_input[0, -1, :], -10)[-10:]
        indes = tokenizer.sequences_to_texts([indes])
        print('\n', indes)
        print('\n', sampled_word_index)
    
    sampled_word = tokenizer.sequences_to_texts([[sampled_word_index]])[0] + ' '
    
    if sampled_word != 'eos ':
        decoded_translation += sampled_word  

    if sampled_word == 'eos ' or len(decoded_translation.split()) > 50:
        stop_condition = True 

    empty_target_seq = np.zeros( ( 1 , 1 ) )  
    empty_target_seq[ 0 , 0 ] = sampled_word_index
    ## <SOS> - > hi
    ## hi --> <EOS>
    test_answer = [h, c]

print("Question: ", test_data['User'].values[7])
print("\nExpected: ", test_data['Company'].values[7])
print("\nGiven: ")
print(decoded_translation)


 ["we're we'd oh thanks i we sorry hi please i'm"]

 39
Question:  @AmazonHelp Just did. Thanks

Expected:  <SOS> @135307 Great. Please keep us informed about the outcome.^PJ <EOS>

Given: 
i'm sorry for the trouble with your order please reach out to us here https t co jzp7hla23b so we can look into this with you https t co jzp7hla23b wt 
