<a href="https://colab.research.google.com/github/mdzikrim/DeepLearning/blob/main/IMDB_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import numpy as np

In [2]:
data, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)
train_data, test_data = data['train'], data['test']

# Convert ke list (agar bisa diproses keras tokenizer)
train_sentences, train_labels = [], []
test_sentences, test_labels = [], []

for text, label in tfds.as_numpy(train_data):
    train_sentences.append(text.decode("utf-8"))
    train_labels.append(label)

for text, label in tfds.as_numpy(test_data):
    test_sentences.append(text.decode("utf-8"))
    test_labels.append(label)




Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.70TZWH_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.70TZWH_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.70TZWH_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [3]:
# Tokenizer dan konversi ke integer
tokenizer = Tokenizer(num_words=40000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_sentences)

maxlen = 400

X_train_seq = tokenizer.texts_to_sequences(train_sentences)
X_test_seq = tokenizer.texts_to_sequences(test_sentences)

X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding="post", truncating="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding="post", truncating="post")

y_train_tensor = torch.tensor(train_labels, dtype=torch.long)
y_test_tensor = torch.tensor(test_labels, dtype=torch.long)


In [4]:
X_train_tensor = torch.tensor(X_train_pad, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_pad, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)


In [5]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True, bidirectional=True, dropout=0.3)
        self.rnn.flatten_parameters = lambda: None
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim*2, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.embedding(x)
        out, (hidden, _) = self.rnn(x)
        out = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        return self.fc(out)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RNNModel(vocab_size=40000, embedding_dim=128, hidden_dim=128).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())


In [6]:
train_losses, train_accs = [], []

for epoch in range(5):
    model.train()
    total_loss, correct, total = 0, 0, 0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.float().to(device)
        optimizer.zero_grad()
        output = model(x_batch).squeeze()
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = (output > 0.5).float()
        correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)

    acc = correct / total
    train_losses.append(total_loss)
    train_accs.append(acc)
    print(f"Epoch {epoch+1}: Loss={total_loss:.4f}, Acc={acc:.4f}")


  result = _VF.lstm(


Epoch 1: Loss=269.7017, Acc=0.5227
Epoch 2: Loss=263.5400, Acc=0.5850
Epoch 3: Loss=229.0172, Acc=0.6927
Epoch 4: Loss=136.3321, Acc=0.8567
Epoch 5: Loss=89.1284, Acc=0.9139


In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import torch

# Prediksi dengan model
model.eval()

y_pred_probs_all = []  # Store predictions for all batches
y_true_all = []  # Store ground truth for all batches

with torch.no_grad():
    for x_batch, y_batch in test_loader:  # Iterate through test data in batches
        x_batch = x_batch.to(device)
        y_pred_probs_batch = model(x_batch).squeeze().cpu()
        y_pred_probs_all.extend(y_pred_probs_batch.tolist())  # Append batch predictions
        y_true_all.extend(y_batch.tolist())  # Append batch ground truth

# Convert to tensors for metric calculation
y_pred_probs = torch.tensor(y_pred_probs_all)
y_pred_labels = (y_pred_probs >= 0.5).int()
y_true = torch.tensor(y_true_all)

# Hitung metrik evaluasi
accuracy = accuracy_score(y_true, y_pred_labels)
precision = precision_score(y_true, y_pred_labels)
recall = recall_score(y_true, y_pred_labels)
f1 = f1_score(y_true, y_pred_labels)
auc = roc_auc_score(y_true, y_pred_probs)

# Cetak hasil
print(f"Akurasi   : {accuracy:.4f}")
print(f"Presisi   : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1-Score  : {f1:.4f}")
print(f"AUC       : {auc:.4f}")

Akurasi   : 0.8580
Presisi   : 0.8864
Recall    : 0.8213
F1-Score  : 0.8526
AUC       : 0.9368


In [8]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np


In [9]:
(train_data, test_data), info = tfds.load(
    'imdb_reviews',
    split=['train', 'test'],
    as_supervised=True,
    with_info=True
)


In [10]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=30000, oov_token="<OOV>")
train_sentences = []
train_labels = []

for s, l in tfds.as_numpy(train_data):
    train_sentences.append(s.decode('utf-8'))
    train_labels.append(l)

tokenizer.fit_on_texts(train_sentences)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, maxlen=400, padding='post', truncating='post')
train_labels = np.array(train_labels)


In [11]:
test_sentences = []
test_labels = []

for s, l in tfds.as_numpy(test_data):
    test_sentences.append(s.decode('utf-8'))
    test_labels.append(l)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=400, padding='post', truncating='post')
test_labels = np.array(test_labels)


In [12]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(30000, 128, input_length=400),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tf.keras.metrics.AUC()]
)

model.summary()




In [13]:
history = model.fit(
    train_padded, train_labels,
    epochs=5,
    batch_size=128,
    validation_split=0.2
)


Epoch 1/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 84ms/step - accuracy: 0.6375 - auc: 0.6942 - loss: 0.6066 - precision: 0.6342 - recall: 0.6470 - val_accuracy: 0.8296 - val_auc: 0.8905 - val_loss: 0.4415 - val_precision: 0.8349 - val_recall: 0.8269
Epoch 2/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 79ms/step - accuracy: 0.8655 - auc: 0.9224 - loss: 0.3493 - precision: 0.8397 - recall: 0.9040 - val_accuracy: 0.8642 - val_auc: 0.9319 - val_loss: 0.3394 - val_precision: 0.8422 - val_recall: 0.9004
Epoch 3/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 79ms/step - accuracy: 0.9310 - auc: 0.9715 - loss: 0.2063 - precision: 0.9285 - recall: 0.9332 - val_accuracy: 0.8460 - val_auc: 0.9253 - val_loss: 0.4196 - val_precision: 0.8000 - val_recall: 0.9277
Epoch 4/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 81ms/step - accuracy: 0.9613 - auc: 0.9881 - loss: 0.1231 - precision: 0.9592 - recall: 0.96

In [14]:
# Prediksi
y_pred_prob = model.predict(test_padded).flatten()
y_pred = np.where(y_pred_prob > 0.5, 1, 0)

# Hitung metrik
accuracy = accuracy_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred)
recall = recall_score(test_labels, y_pred)
f1 = f1_score(test_labels, y_pred)
auc = roc_auc_score(test_labels, y_pred_prob)

# Tampilkan hasil
print("Accuracy :", round(accuracy, 4))
print("Precision:", round(precision, 4))
print("Recall   :", round(recall, 4))
print("F1-Score :", round(f1, 4))
print("ROC AUC  :", round(auc, 4))


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 20ms/step
Accuracy : 0.8211
Precision: 0.8475
Recall   : 0.783
F1-Score : 0.814
ROC AUC  : 0.9038
