In [63]:
import numpy as np 
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from sklearn.metrics import classification_report
from keras.callbacks import EarlyStopping

In [113]:
#df = pd.read_csv("dataset/ask_play_station.csv")
df = pd.read_csv("dataset/mixed_data_preprocessed.csv")
df.dropna(inplace = True)
df

Unnamed: 0,User,Company
0,Way to drop the ball on customer service @1158...,@115820 I'm sorry we've let you down! Without ...
1,@AmazonHelp 3 different people have given 3 di...,@115820 We'd like to take a further look into ...
2,@115823 I want my amazon payments account CLOS...,@115822 I am unable to affect your account via...
3,@115828 How about you guys figure out my Xbox ...,@115826 I'm sorry for the wait. You'll receive...
4,@AmazonHelp @115826 Yeah this is crazy we’re l...,@115827 Thanks for your patience. ^KM
...,...,...
19299,@ChipotleTweets Fort Worth off heritage Trace :(,@157688 I'm reaching out to this location's le...
19300,@ChipotleTweets Slow down! Half/half rice shou...,@157689 Sorry for the trouble. What location w...
19301,"@ChipotleTweets 818 Howe St, Vancouver, BC. 12...",@157689 I'm sharing your concerns with their l...
19302,@ChipotleTweets I just got home from chipotle ...,@157690 I'd be disappointed too. Let a manager...


In [114]:
for index in df.index:
    df.loc[index,'Company'] = '<SOS> ' + df.loc[index,'Company'] + ' <EOS>'
df

Unnamed: 0,User,Company
0,Way to drop the ball on customer service @1158...,<SOS> @115820 I'm sorry we've let you down! Wi...
1,@AmazonHelp 3 different people have given 3 di...,<SOS> @115820 We'd like to take a further look...
2,@115823 I want my amazon payments account CLOS...,<SOS> @115822 I am unable to affect your accou...
3,@115828 How about you guys figure out my Xbox ...,<SOS> @115826 I'm sorry for the wait. You'll r...
4,@AmazonHelp @115826 Yeah this is crazy we’re l...,<SOS> @115827 Thanks for your patience. ^KM <EOS>
...,...,...
19299,@ChipotleTweets Fort Worth off heritage Trace :(,<SOS> @157688 I'm reaching out to this locatio...
19300,@ChipotleTweets Slow down! Half/half rice shou...,<SOS> @157689 Sorry for the trouble. What loca...
19301,"@ChipotleTweets 818 Howe St, Vancouver, BC. 12...",<SOS> @157689 I'm sharing your concerns with t...
19302,@ChipotleTweets I just got home from chipotle ...,<SOS> @157690 I'd be disappointed too. Let a m...


In [187]:
train_data, test_data = train_test_split(df, test_size=0.3, random_state=42)

In [188]:
text_data = np.concatenate((train_data['User'].values, train_data['Company'].values))
text_data

array(['@AppleSupport suspect fraud for you to look into #fraud #crime #dontfallforit URL_POSITION',
       '@VirginTrains 1st class loungeoit of action AGAIN. Why am I paying a higher price for tickets when it’s not available.  Pls get it sorted!',
       '@British_Airways And customer service line closes at 2000 which is a tad odd for an international airline.',
       ...,
       '<SOS> @122964 Hi Clare, I have replied to your DM. TY - Chris <EOS>',
       '<SOS> @120613 Please allow us 6-12 hours to get back to you with an update. Appreciate your patience, Ananya. ^JC <EOS>',
       '<SOS> @122546 Really sorry Jimmy, can you give me the barcode from the pack? Liz <EOS>'],
      dtype=object)

In [189]:
MAX_NB_WORDS = 14000

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(text_data)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 30379 unique tokens.


In [190]:
word_counts  = tokenizer.word_counts 
rare_words_number = 0 
for index, (key, value) in enumerate(word_counts.items()):
    if (value < 2):
        rare_words_number = rare_words_number + 1
        #print( index, key, value)
rare_words_number

17462

In [191]:
train_data['User'].values[0]

'@AppleSupport suspect fraud for you to look into #fraud #crime #dontfallforit URL_POSITION'

In [192]:
seq = tokenizer.texts_to_sequences([train_data['User'].values[0]])
seq

[[151, 4711, 2053, 9, 5, 1, 61, 76, 2053, 6372, 12918, 13, 12]]

In [193]:
tokenizer.sequences_to_texts(seq)

['applesupport suspect fraud for you to look into fraud crime dontfallforit url position']

In [194]:
train_X = tokenizer.texts_to_sequences(train_data['User'].values)
train_y = tokenizer.texts_to_sequences(train_data['Company'].values)

In [195]:
train_X[0]

[151, 4711, 2053, 9, 5, 1, 61, 76, 2053, 6372, 12918, 13, 12]

In [196]:
MAX_QUESTION_LENGTH = 15
#for el in df['User']:
 #   if(len(el) > MAX_QUESTION_LENGTH):
#        MAX_QUESTION_LENGTH = len(el)
print(MAX_QUESTION_LENGTH)

MAX_ANSWER_LENGTH = 15
#for el in df['Company']:
    #if(len(el) > MAX_ANSWER_LENGTH):
        #MAX_ANSWER_LENGTH = len(el)
print(MAX_ANSWER_LENGTH)

15
15


In [197]:
train_X = pad_sequences(train_X, maxlen=MAX_QUESTION_LENGTH, padding='post', truncating='post') #truncating dodati ako budem hteo avrage da koristim
train_y = pad_sequences(train_y, maxlen=MAX_ANSWER_LENGTH, padding='post', truncating='post')

In [198]:
len(train_X[0])

15

In [199]:
train_X[0][:50]

array([  151,  4711,  2053,     9,     5,     1,    61,    76,  2053,
        6372, 12918,    13,    12,     0,     0])

In [200]:
train_y_final_output = []
for i in train_y:
    train_y_final_output.append(i[1:]) 

train_y_final_output = pad_sequences(train_y_final_output, MAX_ANSWER_LENGTH, padding='post', truncating='post')

In [201]:
train_y_final_output[0][:50]

array([10690,    16,    14,    33,    11,   565,    64,     6,  5014,
        2215,    89,    86,     1,  1811,     0])

In [202]:
train_y[0][:50]

array([    3, 10690,    16,    14,    33,    11,   565,    64,     6,
        5014,  2215,    89,    86,     1,  1811])

In [203]:
from tensorflow.keras.utils import to_categorical
train_y_final_output = to_categorical(train_y_final_output, MAX_NB_WORDS)

In [205]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Input


enc_inp = Input(shape=(15, ))
dec_inp = Input(shape=(15, ))

In [207]:
VOCAB_SIZE = MAX_NB_WORDS
embed = Embedding(VOCAB_SIZE+1, output_dim=15, 
                  input_length=15,
                  trainable=True                  
                  )

In [208]:
enc_embed = embed(enc_inp)
enc_lstm = LSTM(200, return_sequences=True, return_state=True)
enc_op, h, c = enc_lstm(enc_embed)
enc_states = [h, c]


In [209]:
embed2 = Embedding(VOCAB_SIZE+1, output_dim=15, 
                  input_length=15,
                  trainable=True                  
                  )

In [210]:
dec_embed = embed2(dec_inp)
dec_lstm = LSTM(200, return_sequences=True, return_state=True)
dec_op, _, _ = dec_lstm(dec_embed, initial_state=enc_states)

In [211]:
dense = Dense(VOCAB_SIZE, activation='softmax')
dense_op = dense(dec_op)
model = Model([enc_inp, dec_inp], dense_op)
model.compile(loss='categorical_crossentropy',metrics=['acc'],optimizer='adam')
print(model.summary())

Model: "model_13"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_23 (InputLayer)          [(None, 15)]         0           []                               
                                                                                                  
 input_24 (InputLayer)          [(None, 15)]         0           []                               
                                                                                                  
 embedding_7 (Embedding)        (None, 15, 15)       210015      ['input_23[0][0]']               
                                                                                                  
 embedding_8 (Embedding)        (None, 15, 15)       210015      ['input_24[0][0]']               
                                                                                           

In [212]:
model.fit([train_X, train_y],train_y_final_output,epochs=5, validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x218cc788880>

In [213]:
enc_model = Model(enc_inp, enc_states)

In [214]:
decoder_state_input_h = Input(shape=(200,))
decoder_state_input_c = Input(shape=(200,))

In [215]:
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [216]:
decoder_outputs, state_h, state_c = dec_lstm(dec_embed , 
                                    initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
#decoder_outputs = dense(decoder_outputs)

In [217]:
dec_model = Model([dec_inp]+ decoder_states_inputs,
                                      [decoder_outputs]+ decoder_states)

In [224]:
test_question = tokenizer.texts_to_sequences([test_data['User'].values[100]])

#test_question = tokenizer.texts_to_sequences([train_data['User'].values[0]])
test_question

[[113,
  7,
  3289,
  301,
  134,
  1200,
  37,
  11,
  74,
  752,
  196,
  38,
  17,
  6,
  1463,
  2175,
  778,
  822,
  1,
  1409,
  82,
  15,
  1009,
  685,
  235,
  446,
  6,
  5617,
  183,
  636,
  243,
  27,
  337,
  499,
  42,
  532,
  21,
  196,
  1045,
  813,
  1009,
  313]]

In [225]:
test_X = pad_sequences(test_question, maxlen=MAX_QUESTION_LENGTH, padding='post', truncating='post')
test_X

array([[ 113,    7, 3289,  301,  134, 1200,   37,   11,   74,  752,  196,
          38,   17,    6, 1463]])

In [226]:
test_answer = enc_model.predict( test_X )

In [227]:
empty_target_seq = np.zeros( ( 1 , 1) )
empty_target_seq

array([[0.]])

In [228]:
empty_target_seq[0, 0] = tokenizer.texts_to_sequences(['sos'])[0][0]
empty_target_seq

array([[3.]])

In [229]:
stop_condition = False
decoded_translation=''
i = 0
while not stop_condition :
    dec_outputs , h, c= dec_model.predict([empty_target_seq] + test_answer )
    decoder_concat_input = dense(dec_outputs)
    sampled_word_index = np.argmax( decoder_concat_input[0, -1, :] )
    
    if i == 0:
        i = 1
        indes = np.argpartition(decoder_concat_input[0, -1, :], -10)[-10:]
        indes = tokenizer.sequences_to_texts([indes])
        print('\n', indes)
        print('\n', sampled_word_index)
    
    sampled_word = tokenizer.sequences_to_texts([[sampled_word_index]])[0] + ' '
    print(decoded_translation)
    if sampled_word != 'eos ':
        decoded_translation += sampled_word  

    if sampled_word == 'eos ' or len(decoded_translation.split()) > 15:
        stop_condition = True 

    empty_target_seq = np.zeros( ( 1 , 1 ) )  
    empty_target_seq[ 0 , 0 ] = sampled_word_index
    ## <SOS> - > hi
    ## hi --> <EOS>
    test_answer = [h, c]

print("Question: ", test_data['User'].values[100])
print("\nExpected: ", test_data['Company'].values[100])
print("\nGiven: ")
print(decoded_translation)


 ["here that's thanks i hello sorry hey we're we hi"]

 36

hi 
hi there 
hi there we 
hi there we can 
hi there we can you 
hi there we can you dm 
hi there we can you dm us 
hi there we can you dm us your 
hi there we can you dm us your account's 
hi there we can you dm us your account's email 
hi there we can you dm us your account's email address 
hi there we can you dm us your account's email address and 
hi there we can you dm us your account's email address and can 
hi there we can you dm us your account's email address and can you 
hi there we can you dm us your account's email address and can you  
hi there we can you dm us your account's email address and can you   
hi there we can you dm us your account's email address and can you    
hi there we can you dm us your account's email address and can you     
hi there we can you dm us your account's email address and can you      
hi there we can you dm us your account's email address and can you       
hi there we can you dm u

hi there we can you dm us your account's email address and can you                                                                                
hi there we can you dm us your account's email address and can you                                                                                 
hi there we can you dm us your account's email address and can you                                                                                  
hi there we can you dm us your account's email address and can you                                                                                   
hi there we can you dm us your account's email address and can you                                                                                    
hi there we can you dm us your account's email address and can you                                                                                     
hi there we can you dm us your account's email address and can you                                     

hi there we can you dm us your account's email address and can you                                                                                                                                     
hi there we can you dm us your account's email address and can you                                                                                                                                      
hi there we can you dm us your account's email address and can you                                                                                                                                       
hi there we can you dm us your account's email address and can you                                                                                                                                        
hi there we can you dm us your account's email address and can you                                                                                                                                

hi there we can you dm us your account's email address and can you                                                                                                                                                                               
hi there we can you dm us your account's email address and can you                                                                                                                                                                                
hi there we can you dm us your account's email address and can you                                                                                                                                                                                 
hi there we can you dm us your account's email address and can you                                                                                                                                                                                  
hi there we can you dm us 

KeyboardInterrupt: 