<a href="https://colab.research.google.com/github/mdzikrim/DeepLearning/blob/main/IMDB_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install tensorflow_datasets
!pip install scikit-learn



In [1]:
!pip uninstall torch
!pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cpu

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Would remove:
    /usr/local/bin/torchfrtrace
    /usr/local/bin/torchrun
    /usr/local/lib/python3.11/dist-packages/functorch/*
    /usr/local/lib/python3.11/dist-packages/torch-2.6.0+cu124.dist-info/*
    /usr/local/lib/python3.11/dist-packages/torch/*
    /usr/local/lib/python3.11/dist-packages/torchgen/*
Proceed (Y/n)? y
  Successfully uninstalled torch-2.6.0+cu124
Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch==2.1.2
  Downloading https://download.pytorch.org/whl/cpu/torch-2.1.2%2Bcpu-cp311-cp311-linux_x86_64.whl (184.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.9/184.9 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.21.0

In [1]:
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import numpy as np

In [2]:
data, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)
train_data, test_data = data['train'], data['test']

# Convert ke list (agar bisa diproses keras tokenizer)
train_sentences, train_labels = [], []
test_sentences, test_labels = [], []

for text, label in tfds.as_numpy(train_data):
    train_sentences.append(text.decode("utf-8"))
    train_labels.append(label)

for text, label in tfds.as_numpy(test_data):
    test_sentences.append(text.decode("utf-8"))
    test_labels.append(label)




Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.7VEBNY_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.7VEBNY_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.7VEBNY_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [3]:
# Tokenizer dan konversi ke integer
tokenizer = Tokenizer(num_words=40000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_sentences)

maxlen = 400

X_train_seq = tokenizer.texts_to_sequences(train_sentences)
X_test_seq = tokenizer.texts_to_sequences(test_sentences)

X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding="post", truncating="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding="post", truncating="post")

y_train_tensor = torch.tensor(train_labels, dtype=torch.long)
y_test_tensor = torch.tensor(test_labels, dtype=torch.long)


In [4]:
X_train_tensor = torch.tensor(X_train_pad, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_pad, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)


In [5]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True, bidirectional=True, dropout=0.3)
        self.rnn.flatten_parameters = lambda: None
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim*2, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.embedding(x)
        out, (hidden, _) = self.rnn(x)
        out = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        return self.fc(out)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RNNModel(vocab_size=40000, embedding_dim=128, hidden_dim=128).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())


In [6]:
train_losses, train_accs = [], []

for epoch in range(5):
    model.train()
    total_loss, correct, total = 0, 0, 0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.float().to(device)
        optimizer.zero_grad()
        output = model(x_batch).squeeze()
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = (output > 0.5).float()
        correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)

    acc = correct / total
    train_losses.append(total_loss)
    train_accs.append(acc)
    print(f"Epoch {epoch+1}: Loss={total_loss:.4f}, Acc={acc:.4f}")


  result = _VF.lstm(


Epoch 1: Loss=256.1545, Acc=0.6059
Epoch 2: Loss=229.9846, Acc=0.6870
Epoch 3: Loss=160.7929, Acc=0.8186
Epoch 4: Loss=113.4847, Acc=0.8805
Epoch 5: Loss=89.6992, Acc=0.9119


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import torch

# Prediksi dengan model
model.eval()

y_pred_probs_all = []  # Store predictions for all batches
y_true_all = []  # Store ground truth for all batches

with torch.no_grad():
    for x_batch, y_batch in test_loader:  # Iterate through test data in batches
        x_batch = x_batch.to(device)
        y_pred_probs_batch = model(x_batch).squeeze().cpu()
        y_pred_probs_all.extend(y_pred_probs_batch.tolist())  # Append batch predictions
        y_true_all.extend(y_batch.tolist())  # Append batch ground truth

# Convert to tensors for metric calculation
y_pred_probs = torch.tensor(y_pred_probs_all)
y_pred_labels = (y_pred_probs >= 0.5).int()
y_true = torch.tensor(y_true_all)

# Hitung metrik evaluasi
accuracy = accuracy_score(y_true, y_pred_labels)
precision = precision_score(y_true, y_pred_labels)
recall = recall_score(y_true, y_pred_labels)
f1 = f1_score(y_true, y_pred_labels)
auc = roc_auc_score(y_true, y_pred_probs)

# Cetak hasil
print(f"Akurasi   : {accuracy:.4f}")
print(f"Presisi   : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1-Score  : {f1:.4f}")
print(f"AUC       : {auc:.4f}")

  result = _VF.lstm(


Akurasi   : 0.8483
Presisi   : 0.8908
Recall    : 0.7939
F1-Score  : 0.8396
AUC       : 0.9319
