In [1]:
import pandas as pd


df = pd.read_csv('drive/MyDrive/yelp_labelled.txt', names = ['sentence', 'label'], delimiter= '\t', header = None)

In [2]:
df.head()

Unnamed: 0,sentence,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
df.isnull().sum()

sentence    0
label       0
dtype: int64

In [4]:
# bagi dataset menjadi data train dan data test

from sklearn.model_selection import train_test_split

sentences = df['sentence'].values
y = df['label'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size = 0.2)

print('Sentences Train', sentences_train)
print('Sentences Test', sentences_test)
print('y Train', y_train)
print('y Test', y_test)

Sentences Train ['Google mediocre and I imagine Smashburger will pop up.'
 'Food was average at best.'
 "No, I'm going to eat the potato that I found some strangers hair in it."
 "We asked for the bill to leave without eating and they didn't bring that either."
 'The atmosphere here is fun.'
 'Our server was fantastic and when he found out the wife loves roasted garlic and bone marrow, he added extra to our meal and another marrow to go!'
 '5 stars for the brick oven bread app!' 'Avoid at all cost!'
 'For service, I give them no stars.'
 'I had the mac salad and it was pretty bland so I will not be getting that again.'
 'The waitresses are very friendly.' 'We loved the biscuits!!!'
 'My husband said she was very rude... did not even apologize for the bad food or anything.'
 'Waitress was a little slow in service.'
 'Worst service to boot, but that is the least of their worries.'
 'Last night was my second time dining here and I was so happy I decided to go back!'
 'Please stay away fro

In [5]:
# lakukan tokenisasi dan gunakan fungsi tokenizer pada test dan train
# dan gunakan pad_sequences agar sequence sama panjang 

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words = 250, oov_token = 'x')
tokenizer.fit_on_texts(sentences_train)
tokenizer.fit_on_texts(sentences_test)


seq_train = tokenizer.texts_to_sequences(sentences_train)
seq_test = tokenizer.texts_to_sequences(sentences_test)

padded_train = pad_sequences(seq_train, maxlen = 20)
padded_test = pad_sequences(seq_test, maxlen =20)

In [6]:
#  menggunakan layer embedding, argumen pertama nya adalah jumlah vocab
#  kata yang dipakai pada tokenizer
# argumen berikut adalah dimensi embedding dan input length adalah panjang dari seq
#  tidak menggunakan layer flatten tetapi dengn GlobalAveragePooling1D

import tensorflow as tf
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(250, 16, input_length = 20),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation = 'relu'),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
    
])

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [7]:
num_epochs = 30
history = model.fit(padded_train, y_train, epochs=num_epochs, 
                    validation_data=(padded_test, y_test), verbose=2)

Epoch 1/30
25/25 - 1s - loss: 0.6928 - accuracy: 0.5050 - val_loss: 0.6943 - val_accuracy: 0.4400 - 793ms/epoch - 32ms/step
Epoch 2/30
25/25 - 0s - loss: 0.6904 - accuracy: 0.5425 - val_loss: 0.6923 - val_accuracy: 0.4800 - 62ms/epoch - 2ms/step
Epoch 3/30
25/25 - 0s - loss: 0.6861 - accuracy: 0.5537 - val_loss: 0.6894 - val_accuracy: 0.5250 - 78ms/epoch - 3ms/step
Epoch 4/30
25/25 - 0s - loss: 0.6785 - accuracy: 0.6137 - val_loss: 0.6814 - val_accuracy: 0.5750 - 63ms/epoch - 3ms/step
Epoch 5/30
25/25 - 0s - loss: 0.6668 - accuracy: 0.6400 - val_loss: 0.6734 - val_accuracy: 0.5750 - 61ms/epoch - 2ms/step
Epoch 6/30
25/25 - 0s - loss: 0.6515 - accuracy: 0.6737 - val_loss: 0.6571 - val_accuracy: 0.6050 - 74ms/epoch - 3ms/step
Epoch 7/30
25/25 - 0s - loss: 0.6293 - accuracy: 0.6963 - val_loss: 0.6362 - val_accuracy: 0.6550 - 69ms/epoch - 3ms/step
Epoch 8/30
25/25 - 0s - loss: 0.6030 - accuracy: 0.7375 - val_loss: 0.6183 - val_accuracy: 0.6450 - 73ms/epoch - 3ms/step
Epoch 9/30
25/25 - 0s 

In [9]:
import tensorflow as tf
model_lstm = tf.keras.Sequential([
    # implementasi lstm 
    tf.keras.layers.Embedding(input_dim = 5000, output_dim = 16),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(24, activation = 'relu'),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
    
])

model_lstm.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [10]:
num_epochs = 30
history = model_lstm.fit(padded_train, y_train, epochs=num_epochs, 
                    validation_data=(padded_test, y_test), verbose=2)

Epoch 1/30
25/25 - 3s - loss: 0.6927 - accuracy: 0.4975 - val_loss: 0.6967 - val_accuracy: 0.4400 - 3s/epoch - 104ms/step
Epoch 2/30
25/25 - 0s - loss: 0.6845 - accuracy: 0.5713 - val_loss: 0.6885 - val_accuracy: 0.5150 - 274ms/epoch - 11ms/step
Epoch 3/30
25/25 - 0s - loss: 0.6502 - accuracy: 0.6325 - val_loss: 0.6143 - val_accuracy: 0.7350 - 285ms/epoch - 11ms/step
Epoch 4/30
25/25 - 0s - loss: 0.5886 - accuracy: 0.7250 - val_loss: 0.5767 - val_accuracy: 0.7250 - 274ms/epoch - 11ms/step
Epoch 5/30
25/25 - 0s - loss: 0.4636 - accuracy: 0.8025 - val_loss: 0.5097 - val_accuracy: 0.7550 - 277ms/epoch - 11ms/step
Epoch 6/30
25/25 - 0s - loss: 0.4057 - accuracy: 0.8200 - val_loss: 0.5234 - val_accuracy: 0.7950 - 280ms/epoch - 11ms/step
Epoch 7/30
25/25 - 0s - loss: 0.3431 - accuracy: 0.8413 - val_loss: 0.5276 - val_accuracy: 0.8000 - 298ms/epoch - 12ms/step
Epoch 8/30
25/25 - 0s - loss: 0.3086 - accuracy: 0.8763 - val_loss: 0.5257 - val_accuracy: 0.7950 - 284ms/epoch - 11ms/step
Epoch 9/30