<a href="https://colab.research.google.com/github/mdzikrim/DeepLearning/blob/main/RNN_ReviewTokoBaju.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk scikit-learn matplotlib



#PyTorch

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import nltk
import re
from nltk.tokenize import word_tokenize
from collections import Counter
from tqdm import tqdm

nltk.download('punkt_tab')

nltk.download('punkt')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
df = pd.read_csv("/content/ReviewTokoBaju.csv")

df = df[['Review Text', 'Recommended IND']].dropna()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    return tokens

df['tokens'] = df['Review Text'].apply(clean_text)

all_words = [word for tokens in df['tokens'] for word in tokens]
vocab = Counter(all_words)
vocab = {word: i+2 for i, (word, _) in enumerate(vocab.most_common(10000))}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

def encode_tokens(tokens, vocab, max_len=100):
    encoded = [vocab.get(word, vocab['<UNK>']) for word in tokens]
    if len(encoded) < max_len:
        encoded += [vocab['<PAD>']] * (max_len - len(encoded))
    else:
        encoded = encoded[:max_len]
    return encoded

df['encoded'] = df['tokens'].apply(lambda x: encode_tokens(x, vocab))

X = np.array(df['encoded'].tolist())
y = df['Recommended IND'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
class ReviewDataset(Dataset):
    def __init__(self, reviews, labels):
        self.reviews = torch.tensor(reviews, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        return self.reviews[idx], self.labels[idx]

batch_size = 64

train_dataset = ReviewDataset(X_train, y_train)
test_dataset = ReviewDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [6]:
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(RNNClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        out = self.fc(hidden.squeeze(0))
        return self.sigmoid(out)

vocab_size = len(vocab)
model = RNNClassifier(vocab_size, embed_dim=128, hidden_dim=128)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


RNNClassifier(
  (embedding): Embedding(10002, 128, padding_idx=0)
  (rnn): RNN(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [7]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_losses, test_losses = [], []
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for inputs, labels in tqdm(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs).squeeze()
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    train_losses.append(epoch_loss / len(train_loader))

    model.eval()
    test_loss = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels)
            test_loss += loss.item()
    test_losses.append(test_loss / len(test_loader))

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}")


100%|██████████| 283/283 [00:01<00:00, 162.96it/s]


Epoch 1/5, Train Loss: 0.4812, Test Loss: 0.4737


100%|██████████| 283/283 [00:00<00:00, 354.80it/s]


Epoch 2/5, Train Loss: 0.4722, Test Loss: 0.4737


100%|██████████| 283/283 [00:00<00:00, 353.16it/s]


Epoch 3/5, Train Loss: 0.4674, Test Loss: 0.4901


100%|██████████| 283/283 [00:00<00:00, 298.40it/s]


Epoch 4/5, Train Loss: 0.4602, Test Loss: 0.4776


100%|██████████| 283/283 [00:01<00:00, 274.41it/s]


Epoch 5/5, Train Loss: 0.4561, Test Loss: 0.4880


In [8]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs).squeeze().cpu().numpy()
        all_preds.extend(outputs)
        all_labels.extend(labels.numpy())

pred_binary = [1 if p >= 0.5 else 0 for p in all_preds]

acc = accuracy_score(all_labels, pred_binary)
prec = precision_score(all_labels, pred_binary)
rec = recall_score(all_labels, pred_binary)
f1 = f1_score(all_labels, pred_binary)
auc = roc_auc_score(all_labels, all_preds)
fpr, tpr, _ = roc_curve(all_labels, all_preds)

print(f"Akurasi  : {acc:.4f}")
print(f"Presisi  : {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-Score : {f1:.4f}")
print(f"AUC      : {auc:.4f}")


Akurasi  : 0.8101
Presisi  : 0.8247
Recall   : 0.9761
F1-Score : 0.8940
AUC      : 0.5134


#Tensorflow

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
# Load data
df = pd.read_csv("/content/ReviewTokoBaju.csv")
df = df[['Review Text', 'Recommended IND']].dropna()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    return tokens

df['tokens'] = df['Review Text'].apply(clean_text)

all_words = [word for tokens in df['tokens'] for word in tokens]
vocab = Counter(all_words)
vocab = {word: i+2 for i, (word, _) in enumerate(vocab.most_common(10000))}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

def encode_tokens(tokens, vocab, max_len=100):
    encoded = [vocab.get(word, vocab['<UNK>']) for word in tokens]
    if len(encoded) < max_len:
        encoded += [vocab['<PAD>']] * (max_len - len(encoded))
    else:
        encoded = encoded[:max_len]
    return encoded

df['encoded'] = df['tokens'].apply(lambda x: encode_tokens(x, vocab))

X = np.array(df['encoded'].tolist())
y = df['Recommended IND'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
vocab_size = len(vocab)
embed_dim = 128
hidden_dim = 128
max_len = 100

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_len),
    SimpleRNN(hidden_dim),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()




In [12]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64
)


Epoch 1/5
[1m283/283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.8111 - loss: 0.4854 - val_accuracy: 0.8207 - val_loss: 0.4724
Epoch 2/5
[1m283/283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.8201 - loss: 0.4671 - val_accuracy: 0.8207 - val_loss: 0.4705
Epoch 3/5
[1m283/283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.8213 - loss: 0.4723 - val_accuracy: 0.8207 - val_loss: 0.4736
Epoch 4/5
[1m283/283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.8167 - loss: 0.4688 - val_accuracy: 0.8159 - val_loss: 0.4869
Epoch 5/5
[1m283/283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.8303 - loss: 0.4381 - val_accuracy: 0.8064 - val_loss: 0.4995


In [13]:
y_pred_probs = model.predict(X_test).flatten()
y_pred = (y_pred_probs >= 0.5).astype(int)

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_probs)
fpr, tpr, _ = roc_curve(y_test, y_pred_probs)

print(f"Akurasi  : {acc:.4f}")
print(f"Presisi  : {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-Score : {f1:.4f}")
print(f"AUC      : {auc:.4f}")


[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step
Akurasi  : 0.8064
Presisi  : 0.8208
Recall   : 0.9774
F1-Score : 0.8923
AUC      : 0.4890
