<a href="https://colab.research.google.com/github/ozgekokyay/sentiment_analysis_of_hepsiburada_reviews/blob/main/sentiment_analysis_of_hepsiburada_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import (
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
    GlobalAveragePooling1D,
    GRU,
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Download the review dataset and store it as hepsiburada.csv in the working directory.
!wget https://www.dropbox.com/s/a7z7suwt85llb9a/hepsiburada.csv?dl=1 -O hepsiburada.csv

In [None]:
# Load the downloaded CSV into a DataFrame and preview its first rows.
dataset = pd.read_csv('hepsiburada.csv')
dataset.head()

In [None]:
# Pull the label column ('Rating') and the text column ('Review') out of the
# DataFrame as plain Python lists.
target, data = (dataset[column].values.tolist() for column in ('Rating', 'Review'))


In [None]:
# Sequential 90/10 split: the first 90% of the rows (in file order) are used
# for training, the remaining rows are held out for evaluation.
cutoff = int(0.9 * len(data))
x_train, y_train = data[:cutoff], target[:cutoff]
x_test, y_test = data[cutoff:], target[cutoff:]

In [None]:
# Vocabulary size limit handed to the tokenizer and, later, to the embedding layer.
num_words = 7000
tokenizer = Tokenizer(num_words=num_words)
# Build the vocabulary from the training reviews only, so that nothing from
# the held-out test reviews leaks into the preprocessing step.
tokenizer.fit_on_texts(x_train)

In [None]:
# Look up the integer index the tokenizer assigned to the word 'iyi'.
tokenizer.word_index['iyi']

In [None]:
# Convert every training review into a list of integer word indices.
x_train_tokens = tokenizer.texts_to_sequences(x_train)
# Show one raw review next to its token sequence as a sanity check.
x_train[1000], x_train_tokens[1000]

In [None]:
# Convert every held-out review into a list of integer word indices.
x_test_tokens = tokenizer.texts_to_sequences(x_test)
# Show one raw review next to its own token sequence. A single index is used
# for both sides; different indices would pair the text of one review with
# the tokens of another.
sample_idx = 1000
x_test[sample_idx], x_test_tokens[sample_idx]

In [None]:
# Token count of every review: training reviews first, then test reviews.
num_tokens = np.array(list(map(len, x_train_tokens + x_test_tokens)))
# Mean length, maximum length, and position of the longest review.
num_tokens.mean(), num_tokens.max(), num_tokens.argmax()



In [None]:
# Show the review with the most tokens. The position is computed from
# `num_tokens` instead of being hard-coded, and it is looked up in the same
# train-then-test order that `num_tokens` was built from, so the cell stays
# correct when the data or the tokenizer changes.
longest_idx = int(np.argmax(num_tokens))
(x_train + x_test)[longest_idx]

In [None]:
# Sequence-length cap: mean token count plus two standard deviations,
# truncated to an integer.
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())
max_tokens

In [None]:
# Fraction of reviews that fit into `max_tokens` without losing any token.
# `<=` is used because a sequence of exactly `max_tokens` tokens is kept
# whole when padding with maxlen=max_tokens; only longer ones are truncated.
np.mean(num_tokens <= max_tokens)

In [None]:
# Bring every token sequence to exactly `max_tokens` entries so the reviews
# can be stacked into 2-D arrays.
x_train_pad, x_test_pad = (
    pad_sequences(sequences, maxlen=max_tokens)
    for sequences in (x_train_tokens, x_test_tokens)
)
x_train_pad.shape, x_test_pad.shape

In [None]:
# Reverse lookup table for the tokenizer vocabulary: integer index -> word.
idx = tokenizer.word_index
inverse_map = {index: word for word, index in idx.items()}


In [None]:
def tokens_to_string(tokens):
    """Rebuild a space-separated text from a sequence of token indices.

    Entries equal to 0 are skipped; every other index is translated to its
    word through the module-level `inverse_map`.
    """
    return ' '.join(inverse_map[tok] for tok in tokens if tok != 0)

In [None]:
# Round-trip check: show one token sequence next to the text rebuilt from it.
x_train_tokens[100], tokens_to_string(x_train_tokens[100])

In [None]:
# Width of the learned word vectors.
embedding_size = 42

# Sentiment classifier: embedding -> three stacked GRUs -> dense head.
# The final Dense(1) has no activation, so the model outputs a raw logit.
model = Sequential([
    Embedding(input_dim=num_words,
              output_dim=embedding_size,
              input_length=max_tokens,
              name='embedding_layer'),
    # The first two GRUs return the full sequence so the next GRU can consume it.
    GRU(units=16, return_sequences=True),
    GRU(units=8, return_sequences=True),
    # The last GRU returns only its final output.
    GRU(units=4),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1),
])
model.summary()

In [None]:

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by newer Keras releases.
optimizer = Adam(learning_rate=1e-3)
# from_logits=True because the model's last layer is a Dense(1) without activation.
# NOTE(review): with logit outputs, the string 'accuracy' may threshold the raw
# outputs at 0.5 instead of 0 — confirm, and consider
# tf.keras.metrics.BinaryAccuracy(threshold=0.0) if so.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=optimizer,
              metrics=['accuracy'])


In [None]:
# Batch size shared by both pipelines and by the step counts below.
batch_size = 256

# Alias to the dataset class through the already-imported `tf` module.
Dataset = tf.data.Dataset

# Training pipeline: (padded tokens, label) pairs, shuffled and batched.
train_ds = Dataset.from_tensor_slices((x_train_pad, y_train))
# The shuffle buffer spans the whole training set: a small buffer only mixes
# neighbouring rows, so any ordering present in the file would survive into
# the batches.
train_ds = train_ds.shuffle(len(x_train_pad)).batch(batch_size)

# Validation pipeline: same pairing, batched but not shuffled.
val_ds = Dataset.from_tensor_slices((x_test_pad, y_test)).batch(batch_size)

# Number of full batches per pass over each split.
steps_per_epoch = len(x_train_pad) // batch_size
validation_steps = len(x_test_pad) // batch_size

In [None]:
# Train for 5 epochs. Both datasets are repeated indefinitely, so the
# explicit step counts define where each epoch and validation pass ends.
model.fit(
    train_ds.repeat(),
    epochs=5,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_ds.repeat(),
    validation_steps=validation_steps,
)

In [None]:
# Save the trained model to 'model.h5', then list the working directory.
model.save('model.h5')
!ls

In [None]:
# Offer the saved model as a browser download when running on Google Colab.
# `google.colab` is only importable on Colab; elsewhere the step is skipped
# instead of aborting a full notebook run with ImportError.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('model.h5')

In [None]:
# Reload the saved model from 'model.h5' and print its architecture.
model2 = load_model('model.h5')
model2.summary()

In [None]:
# Hand-written sample reviews, run through the same tokenize-and-pad steps
# as the training data.
tests = [
    "Henüz fazla test edemedim ama ürün iyi görünüyor",
    "Oldukça kullanışlı, uzun süre kullanacağımı düşünüyorum",
    "Ben hiç beğenmedim, kesinlikle tavsiye etmem",
    "Bu ürünü uzun süre kullanmak imkansız",
]
test_tokens = tokenizer.texts_to_sequences(tests)
test_pad = pad_sequences(test_tokens, maxlen=max_tokens)
test_pad.shape

In [None]:
# Score the sample reviews with the reloaded model, then pass the raw model
# outputs through a sigmoid to obtain values between 0 and 1.
logits = model2.predict(test_pad)
preds = tf.nn.sigmoid(logits)
preds