# Sentiment Analysis
Analisis sentimen (juga dikenal sebagai opinion mining atau emotion AI) adalah penggunaan pemrosesan bahasa alami (natural language processing), analisis teks, linguistik komputasi, dan biometrik untuk mengidentifikasi, mengekstrak, mengukur, dan mempelajari keadaan afektif dan informasi subjektif secara sistematis.

# Movie Review Dataset
Data yang digunakan adalah kumpulan data ulasan film IMDB yang dimuat dari Keras. Dataset ini berisi 25.000 ulasan dari IMDB yang masing-masing sudah diproses sebelumnya dan diberi label positif atau negatif. Setiap kata dalam ulasan dikodekan dengan bilangan bulat yang mewakili seberapa umum kata tersebut di seluruh kumpulan data. Misalnya, kata yang dikodekan dengan bilangan bulat 3 adalah kata ke-3 yang paling umum dalam kumpulan data.

# IMPORT DATA



In [1]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

# Vocabulary size: passed to imdb.load_data as num_words and reused as the
# first argument of the Embedding layer.
VOCAB_SIZE = 88584

# Target length handed to pad_sequences for every review.
MAXLEN = 250
# NOTE(review): BATCH_SIZE is not referenced anywhere in the visible code --
# confirm whether it was meant to be passed to model.fit.
BATCH_SIZE = 64

# Load the pre-encoded IMDB reviews and their labels as train/test splits.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=VOCAB_SIZE,
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [2]:
# Token count of the second training review before any padding is applied.
len(train_data[1])

189

# DATA PROCESSING

In [3]:
# Bring every review to a fixed length of MAXLEN tokens. The encoded sample
# printed later in this file shows the zeros being added at the front.
train_data = sequence.pad_sequences(train_data, maxlen=MAXLEN)
test_data = sequence.pad_sequences(test_data, maxlen=MAXLEN)

In [4]:
# Length of the same review, re-checked after padding.
len(train_data[1])

250

# MEMBANGUN MODEL

In [5]:
# Binary sentiment classifier: word embeddings -> LSTM -> one sigmoid unit.
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=32),
        tf.keras.layers.LSTM(units=32),
        tf.keras.layers.Dense(units=1, activation="sigmoid"),
    ]
)

In [6]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          2834688   
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 2843041 (10.85 MB)
Trainable params: 2843041 (10.85 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [7]:
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)
# Train for 10 epochs, holding out 20% of the training data for validation.
# NOTE(review): no batch_size is passed, so the framework default is used
# rather than BATCH_SIZE -- confirm this is intended.
history = model.fit(
    train_data,
    train_labels,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
# Persist the trained model to the file "lstm.h5".
# Alternative left disabled by the author: model.save("lstm_model")
model.save("lstm.h5")

  saving_api.save_model(


In [9]:
# Reload the saved model from lstm.h5 into a separate object.
new_model = tf.keras.models.load_model('lstm.h5')

# EVALUASI

In [10]:
# Score the reloaded model on the test set.
# NOTE(review): given the compile settings, the printed list is presumably
# [loss, accuracy] -- confirm.
results=new_model.evaluate(test_data,test_labels)
print(results)

[0.526312530040741, 0.8602799773216248]


In [11]:
# Score the in-memory model on the same test set; the recorded output matches
# the reloaded model's result.
results=model.evaluate(test_data,test_labels)
print(results)

[0.526312530040741, 0.8602799773216248]


# MEMBUAT PREDIKSI

In [12]:
# Mapping from each vocabulary word to its integer index.
word_index=imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [13]:
# Preview the first ten (word, index) pairs of the vocabulary mapping.
# The items are materialised once; the previous version rebuilt the full key
# and value lists on every loop iteration.
for word, index in list(word_index.items())[:10]:
    print(word, ':', index)

fawn : 34701
tsukino : 52006
nunnery : 52007
sonja : 16816
vani : 63951
woods : 1408
spiders : 16115
hanging : 2345
woody : 2289
trawling : 52008


In [14]:
def encode_text(text):
    """Turn raw review text into a fixed-length vector of word indices.

    The text is split into words with Keras' ``text_to_word_sequence``; each
    word is looked up in ``word_index`` (words that are missing map to 0), and
    the resulting list is padded to ``MAXLEN`` entries by ``pad_sequences``.

    NOTE(review): per the Keras docs, ``imdb.load_data`` shifts the training
    indices by ``index_from`` (3 by default), while the raw ``word_index``
    values used here are not shifted. Confirm whether the same offset should
    be applied here so inputs line up with what the model was trained on.
    """
    words = keras.preprocessing.text.text_to_word_sequence(text)
    indices = [word_index.get(word, 0) for word in words]
    padded = sequence.pad_sequences([indices], MAXLEN)
    return padded[0]

In [15]:
# Encode a short sample sentence and show the padded index vector.
text="that movie was good"
encoded=encode_text(text)
print(encoded)

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0 12 17 13 49]


In [16]:
# Decode function that converts integers to text

# Inverse of word_index: integer index -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_integers(integers):
    """Convert a sequence of word indices back into a space-separated string.

    Entries equal to 0 (the padding value) are skipped; every other entry is
    looked up in ``reverse_word_index``. An all-padding or empty input yields
    an empty string.

    Raises:
        KeyError: if a non-zero entry is missing from ``reverse_word_index``.
    """
    PAD = 0
    # Join once instead of growing a string with += and then trimming the
    # trailing space; the resulting text is the same.
    return " ".join(reverse_word_index[num] for num in integers if num != PAD)

# Round-trip check: decode the encoded sample back into words.
print(decode_integers(encoded))

that movie was good


In [17]:
def predict(text):
    """Print the model's score for a piece of review text.

    The text is encoded with ``encode_text``, reshaped into a batch of one,
    and passed to ``model.predict``; the first row of the result is printed.
    Nothing is returned.
    """
    encoded_text = encode_text(text)
    # Add a leading batch axis. -1 lets NumPy infer the sequence length, so
    # this keeps working if MAXLEN is changed from 250.
    batch = encoded_text.reshape(1, -1)
    result = model.predict(batch)
    print(result[0])

In [18]:
# Score one positive-sounding and one negative-sounding review.
# NOTE(review): scores closer to 1 presumably indicate positive sentiment --
# confirm against the dataset's label encoding.
positive_review="That was a good movie, i will definitely watch it again"
predict(positive_review)

negative_review="Don't waste your time watching this movie, so disappointing"
predict(negative_review)

[0.95828027]
[0.37579876]
