In [1]:
! unzip movie_sentences.zip

Archive:  movie_sentences.zip
  inflating: imdb_indonesian_movies_2.csv  


In [2]:
import pandas as pd

df = pd.read_csv('imdb_indonesian_movies_2.csv')
df.head()

Unnamed: 0,judul_film,ringkasan_sinopsis,genre
0,Sunan Kalijaga,Raden Mas Said putra sulung Tumenggung Wilarik...,Drama
1,Gie,Soe Hok Gie adalah seorang aktivis yang hidup ...,Drama
2,Guru Bangsa Tjokroaminoto,Guru Bangsa Tjokroaminoto menceritakan tentang...,Drama
3,POL Movie,POL menceritakan kisah hidup yang luar biasa d...,Drama
4,Sang pencerah,Perjalanan pahlawan Indonesia KH Ahmad Dahlan ...,Drama


In [3]:
df = df.drop(columns='judul_film', axis=1)
df.head()

Unnamed: 0,ringkasan_sinopsis,genre
0,Raden Mas Said putra sulung Tumenggung Wilarik...,Drama
1,Soe Hok Gie adalah seorang aktivis yang hidup ...,Drama
2,Guru Bangsa Tjokroaminoto menceritakan tentang...,Drama
3,POL menceritakan kisah hidup yang luar biasa d...,Drama
4,Perjalanan pahlawan Indonesia KH Ahmad Dahlan ...,Drama


In [5]:
category = pd.get_dummies(df.genre)
df_new = pd.concat([df, category], axis=1)
df_new = df_new.drop(columns='genre', axis=1)
df_new

Unnamed: 0,ringkasan_sinopsis,Drama,Horor,Komedi,Laga,Romantis
0,Raden Mas Said putra sulung Tumenggung Wilarik...,1,0,0,0,0
1,Soe Hok Gie adalah seorang aktivis yang hidup ...,1,0,0,0,0
2,Guru Bangsa Tjokroaminoto menceritakan tentang...,1,0,0,0,0
3,POL menceritakan kisah hidup yang luar biasa d...,1,0,0,0,0
4,Perjalanan pahlawan Indonesia KH Ahmad Dahlan ...,1,0,0,0,0
...,...,...,...,...,...,...
1000,Winter in Tokyo berpusat pada kehidupan Ishida...,0,0,0,0,1
1001,Markonah melarikan diri ke Jakarta karena akan...,0,0,0,0,1
1002,"Tempat aking lebih dari 36 jam, Last Night ada...",0,0,0,0,1
1003,Proyek baru ini adalah tentang seorang lelaki ...,0,0,0,0,1


In [6]:
df_new['ringkasan_sinopsis'] = df_new['ringkasan_sinopsis'].str.lower()
df_new.head()

Unnamed: 0,ringkasan_sinopsis,Drama,Horor,Komedi,Laga,Romantis
0,raden mas said putra sulung tumenggung wilarik...,1,0,0,0,0
1,soe hok gie adalah seorang aktivis yang hidup ...,1,0,0,0,0
2,guru bangsa tjokroaminoto menceritakan tentang...,1,0,0,0,0
3,pol menceritakan kisah hidup yang luar biasa d...,1,0,0,0,0
4,perjalanan pahlawan indonesia kh ahmad dahlan ...,1,0,0,0,0


In [7]:
import nltk
nltk.download('stopwords')
nltk.download("punkt")
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

stop_word = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r"\w+")

df_new['ringkasan_sinopsis'] = df_new['ringkasan_sinopsis'].apply(lambda x:' '.join([word for word in x.split() if word not in (stop_word)]))
df_new['ringkasan_sinopsis'] = df_new['ringkasan_sinopsis'].map(tokenizer.tokenize)
df_new.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,ringkasan_sinopsis,Drama,Horor,Komedi,Laga,Romantis
0,"[raden, mas, said, putra, sulung, tumenggung, ...",1,0,0,0,0
1,"[soe, hok, gie, adalah, seorang, aktivis, yang...",1,0,0,0,0
2,"[guru, bangsa, tjokroaminoto, menceritakan, te...",1,0,0,0,0
3,"[pol, menceritakan, kisah, hidup, yang, luar, ...",1,0,0,0,0
4,"[perjalanan, pahlawan, indonesia, kh, ahmad, d...",1,0,0,0,0


In [8]:
review = df_new['ringkasan_sinopsis'].values
label = df_new.iloc[:, 1:].values

print(review.shape)
print(label.shape)

(1005,)
(1005, 5)


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(review, label, test_size=0.2)

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
 
tokenizer = Tokenizer(num_words=10000, oov_token='x')
tokenizer.fit_on_texts(X_train) 
 
sekuens_latih = tokenizer.texts_to_sequences(X_train)
sekuens_test = tokenizer.texts_to_sequences(X_test)
 
padded_latih = pad_sequences(sekuens_latih, padding='post', maxlen=20, truncating='post') 
padded_test = pad_sequences(sekuens_test, padding='post', maxlen=20, truncating='post')

In [11]:
import tensorflow as tf
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=16),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(5, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [12]:
class myCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs={}):
    if(logs.get('val_accuracy')>0.8):
      print('\nAkurasi telah mencapai >80%!!')
      self.model.stop_training = True
callbacks = myCallback()

In [13]:
num_epochs = 30
history = model.fit(padded_latih, y_train, epochs=num_epochs, callbacks = [callbacks],
                    validation_data=(padded_test, y_test), verbose=2)

Epoch 1/30
26/26 - 4s - loss: 1.6102 - accuracy: 0.1965 - val_loss: 1.6106 - val_accuracy: 0.1841 - 4s/epoch - 156ms/step
Epoch 2/30
26/26 - 0s - loss: 1.6092 - accuracy: 0.2201 - val_loss: 1.6116 - val_accuracy: 0.1841 - 456ms/epoch - 18ms/step
Epoch 3/30
26/26 - 0s - loss: 1.5959 - accuracy: 0.2475 - val_loss: 1.6788 - val_accuracy: 0.1841 - 436ms/epoch - 17ms/step
Epoch 4/30
26/26 - 0s - loss: 1.3892 - accuracy: 0.3371 - val_loss: 1.6730 - val_accuracy: 0.2338 - 460ms/epoch - 18ms/step
Epoch 5/30
26/26 - 0s - loss: 1.1818 - accuracy: 0.4303 - val_loss: 1.7922 - val_accuracy: 0.2189 - 438ms/epoch - 17ms/step
Epoch 6/30
26/26 - 0s - loss: 1.0249 - accuracy: 0.4988 - val_loss: 2.1310 - val_accuracy: 0.2388 - 442ms/epoch - 17ms/step
Epoch 7/30
26/26 - 0s - loss: 0.7926 - accuracy: 0.6418 - val_loss: 3.0791 - val_accuracy: 0.2239 - 444ms/epoch - 17ms/step
Epoch 8/30
26/26 - 0s - loss: 0.5210 - accuracy: 0.7998 - val_loss: 3.7825 - val_accuracy: 0.2537 - 441ms/epoch - 17ms/step
Epoch 9/30