In [1]:
import keras
print(keras.__version__)

Using TensorFlow backend.


2.2.4


In [2]:
import numpy as np
import pandas as pd
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM,  Dropout
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [4]:
dataset = pd.read_csv('ecommercereviews.csv')
dataset.head()

Unnamed: 0,Rating,Review
0,1,3 yıldır tık demedi. :)
1,1,3 yıldır kullanıyorum müthiş
2,1,Ürün bugün elime geçti çok fazla inceleme fırs...
3,1,Almaya karar verdim. Hemencecik geldi. Keyifle...
4,1,Günlük kullanımınızı çok çok iyi karsılıyor kı...


In [5]:
# get all labels and reviews as a list
target = dataset['Rating'].values.tolist()
data = dataset['Review'].values.tolist()

In [6]:
seperation = int(len(data) * 0.80)
x_train, x_test = data[:seperation], data[seperation:]
y_train, y_test = target[:seperation], target[seperation:]

In [7]:
dataset.shape

(243497, 2)

In [9]:
# We will get 10000 most frequently used words in our dataset
num_words = 10000

# Define tokenizer with Keras...
# If we don't define num_words, then we use all words in our dataset.
tokenizer = Tokenizer(num_words=num_words)

In [10]:
# Now tokenize the data
tokenizer.fit_on_texts(data)

In [11]:
# saving tokenizer
import pickle

with open('turkish_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# load tokenizer
with open('turkish_tokenizer.pickle', 'rb') as handle:
    turkish_tokenizer = pickle.load(handle)

In [13]:
"""
Tokenize our training data. This will only works for the words in our 10000 tokenizer. 
If a word is not in 10000 tokenized words, it will be ignored.
"""
x_train_tokens = turkish_tokenizer.texts_to_sequences(x_train)

In [14]:
x_train[100]

'Bu fiyata bu kalite kaçırmayın derim '

In [15]:
x_train_tokens[100]

[5, 39, 5, 131, 323, 143]

In [16]:
x_test_tokens = turkish_tokenizer.texts_to_sequences(x_test)

In [17]:
"""
We will add Padding for our comments.
In RNN, we give predefined-sized inputs. But our comments consist of different sized inputs, so we need to define
a input size for comments. If size > comment, then add 0s for the gap, otherwise trim the comment.
"""

# How many tokens in each comment?
num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]

# Convert list to numpy array
num_tokens = np.array(num_tokens)
num_tokens.shape

(243497,)

In [18]:
# In average, how many tokens in one comment?
np.mean(num_tokens)

20.744703220162876

In [19]:
# Max token amount?
np.max(num_tokens)

295

In [20]:
# Index of max token
np.argmax(num_tokens)

21941

In [21]:
# Define max tokens for all comments
max_tokens = np.mean(num_tokens) + 2*np.std(num_tokens) #returns float
max_tokens = int(max_tokens)
max_tokens

59

In [22]:
# How many tokens are smaller than max_tokens?
np.sum(num_tokens < max_tokens) / len(num_tokens) * 100  # output: 96%. which means we will only lose info in 4%.

95.97982726686571

In [23]:
# Let's add paddings... So, all datas will be in the same size.
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens)
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens)

In [24]:
# Check the sizes
print(x_train_pad.shape)
print(x_test_pad.shape)

(194797, 59)
(48700, 59)


In [25]:
# In Keras, we can get tokens from strings but not vice versa.
# So we will write a function to get strings from tokens
idx = turkish_tokenizer.word_index

# in idx, key value pair like 'çok': 1. But we want it reverse.
inverse_map = dict(zip(idx.values(), idx.keys()))
first_five = {k: inverse_map[k] for k in sorted(inverse_map.keys())[:5]}
first_five

{1: 'çok', 2: 'bir', 3: 've', 4: 'ürün', 5: 'bu'}

In [26]:
def convert_tokens_to_string(tokens):
    words = [inverse_map[token] for token in tokens if token != 0]
    text = ' '.join(words)
    return text

In [27]:
model = Sequential()

In [28]:
embedding_size = 50 # vector with 50 size for every word

In [29]:
"""
Now we will create an embedding layer in Keras.
We won't use word2vec or glove, instead we create word vectors randomly.
"""

# Add embedding layer to our model.
# embedding matris size = num_words * embedding_size -> 10.000 * 50
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='embedding_layer'))

Instructions for updating:
Colocations handled automatically by placer.


In [30]:
# 3-layered LSTM
model.add(LSTM(units=16, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=8, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=4, return_sequences=False))
model.add(Dropout(0.2))
# Dense layer: aka fully connected layer. Consists of one neuron.
model.add(Dense(1, activation='sigmoid'))

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [31]:
from tensorflow.python.keras.optimizers import Adam
# Adam optimizer
optimizer = Adam(lr=1e-3)

In [32]:
# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])

In [33]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 59, 50)            500000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 59, 16)            4288      
_________________________________________________________________
dropout_1 (Dropout)          (None, 59, 16)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 59, 8)             800       
_________________________________________________________________
dropout_2 (Dropout)          (None, 59, 8)             0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 4)                 208       
_________________________________________________________________
dropout_3 (Dropout)          (None, 4)                 0         
__________

In [34]:
# epoch -> how many times we are going to train our data.
# batch_size -> feeding size
model.fit(x_train_pad, y_train, epochs=10, batch_size=256)

Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x20db9ec4860>

In [35]:
result = model.evaluate(x_test_pad, y_test)
result



[0.20415101394971386, 0.949034907597536]

In [32]:
accuracy = (result[1]) * 100
accuracy

95.3264887063655

In [33]:
# first we will save the model
model.save('lstm_nlp1.h5')

In [35]:
text1 = "büyük bir hayal kırıklığı yaşadım bu ürün bu markaya yakışmamış"
text2 = "tasarımı harika ancak kargo çok geç geldi ve ürün açılmıştı tavsiye etmem :("
text3 = "hiç resimde gösterildiği gibi değil..."
text4 = "kötü yorumlar gözümü korkutmuştu ancak hiçbir sorun yaşamadım teşekkürler"
text5 = "hiç bu kadar kötü bir satıcıya denk gelmemiştim. ürünü iade ediyorum"
text6 = "tam bir fiyat performans ürünü"
text7 = "güzel bir ürün değil"
texts = [text1, text2,text3,text4,text5,text6,text7]

In [36]:
tokens = turkish_tokenizer.texts_to_sequences(texts)

In [37]:
# tokens
# tokenizer1 = Tokenizer(num_words=59)
tokens = turkish_tokenizer.texts_to_sequences(texts)
tokens

[[104, 2, 1032, 2333, 1466, 5, 4, 5, 1779],
 [553, 61, 82, 27, 1, 458, 33, 3, 4, 9, 1031],
 [46, 1096, 6419, 20, 50],
 [177, 735, 7728, 82, 263, 105, 326, 16],
 [46, 5, 30, 177, 2, 1717, 1244, 19, 677, 83],
 [74, 2, 28, 111, 19],
 [7, 2, 4, 50]]

In [38]:
# tokenizer1 = Tokenizer(num_words=10000)
# tokenizer1.fit_on_texts(data)
# tokens = tokenizer1.texts_to_sequences(texts)
# tokens

In [39]:
tokens_pad = pad_sequences(tokens, maxlen=max_tokens)

In [40]:
model.predict(tokens_pad)

array([[0.01560246],
       [0.47864303],
       [0.00632283],
       [0.9986553 ],
       [0.00390736],
       [0.99942845],
       [0.932442  ]], dtype=float32)

In [41]:
def convert(texts):
    tokens = turkish_tokenizer.texts_to_sequences(texts)
    tokens_pad = pad_sequences(tokens, maxlen=max_tokens)
    return model.predict(tokens_pad)[0][0]

In [42]:
convert(texts)

0.015602457