<a href="https://colab.research.google.com/github/phamquocanh149/IMDB_REVIEW/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from datasets import load_dataset

# Load the "imdb" dataset through `datasets.load_dataset` and keep its two splits.
imdb = load_dataset("imdb")
train_data = imdb["train"]
test_data = imdb["test"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import pandas as pd

In [None]:
# Materialise each split as a pandas DataFrame.
train, test = (pd.DataFrame(split) for split in (train_data, test_data))

In [None]:
# Preview the first rows of the training DataFrame.
print(train.head())

                                                text  label
0  I rented I AM CURIOUS-YELLOW from my video sto...      0
1  "I Am Curious: Yellow" is a risible and preten...      0
2  If only to avoid making this type of film in t...      0
3  This film was probably inspired by Godard's Ma...      0
4  Oh, brother...after hearing about this ridicul...      0


In [None]:
# Report the (rows, columns) shape of the training DataFrame.
print(train.shape)

(25000, 2)


In [None]:
# Preview the first rows of the test DataFrame.
print(test.head())

                                                text  label
0  I love sci-fi and am willing to put up with a ...      0
1  Worth the entertainment value of a rental, esp...      0
2  its a totally average film with a few semi-alr...      0
3  STAR RATING: ***** Saturday Night **** Friday ...      0
4  First off let me say, If you haven't enjoyed a...      0


In [None]:
# Report the (rows, columns) shape of the test DataFrame.
print(test.shape)

(25000, 2)


In [None]:
# Count how many training reviews carry each label value (class-balance check).
print(train['label'].value_counts())

label
0    12500
1    12500
Name: count, dtype: int64


In [None]:
import spacy
# Load the spaCy pipeline once to confirm it is installed; the returned object
# is not kept (the tokenizer is created separately further down).
spacy.load('en_core_web_sm')

<spacy.lang.en.English at 0x7968655cddc0>

In [None]:
pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 torchtext==0.18.0




In [None]:
import torch
from torchtext.transforms import VocabTransform
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer




In [None]:
# Build a tokenizer callable backed by the spaCy "en_core_web_sm" pipeline.
tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

In [None]:
# Tokenize every review and store the resulting token lists in a new
# 'token' column on each DataFrame.
for frame in (train, test):
    frame['token'] = frame['text'].apply(tokenizer)

In [None]:
import string
# Drop tokens made up solely of punctuation characters.
#
# The check is done per character: `token not in string.punctuation` is a
# substring test against one long string, so multi-character punctuation
# tokens such as "..." or "--" were never removed by it.
_punctuation_chars = set(string.punctuation)


def _drop_punctuation(tokens):
  """Return the tokens that contain at least one non-punctuation character.

  Empty tokens are dropped as well (as they were by the substring test).
  """
  return [token for token in tokens
          if not all(ch in _punctuation_chars for ch in token)]


X_train = train['token'].apply(_drop_punctuation)
X_test = test['token'].apply(_drop_punctuation)

In [None]:
# Preview the filtered token lists for the first training reviews.
print(X_train.head())

0    [I, rented, I, AM, CURIOUS, YELLOW, from, my, ...
1    [I, Am, Curious, Yellow, is, a, risible, and, ...
2    [If, only, to, avoid, making, this, type, of, ...
3    [This, film, was, probably, inspired, by, Goda...
4    [Oh, brother, ..., after, hearing, about, this...
Name: token, dtype: object


In [None]:
def _to_lower(tokens):
  """Return a new list with every token lower-cased."""
  return [tok.lower() for tok in tokens]


# Normalise case so that e.g. "Film" and "film" share one vocabulary entry.
X_train = X_train.apply(_to_lower)
X_test = X_test.apply(_to_lower)

In [None]:
# Build the vocabulary from the training token lists only, reserving two
# special entries for unknown and padding tokens.
vocab = build_vocab_from_iterator(iter(X_train), specials=["<unk>", "<pad>"])

# Any token that is not in the vocabulary is mapped to the "<unk>" index.
unk_index = vocab["<unk>"]
vocab.set_default_index(unk_index)

In [None]:
# Number of entries in the vocabulary, including the special tokens.
print(len(vocab))

101135


In [None]:
# Check the Python type of a single label value from the test DataFrame.
print(type(test['label'][0]))

<class 'numpy.int64'>


In [None]:
# Wrap the vocabulary in a transform that is applied to token lists below.
transform = VocabTransform(vocab)

In [None]:
# Replace each token list with the list of its vocabulary indices.
X_train = X_train.apply(transform)
X_test = X_test.apply(transform)

In [None]:
# Show the encoded training sequences (pandas truncates the printed Series).
print(X_train)

0        [10, 1591, 10, 240, 2007, 4182, 38, 63, 373, 1...
1        [10, 240, 2007, 4182, 7, 4, 19726, 3, 1951, 89...
2        [49, 65, 6, 863, 231, 11, 549, 5, 21, 9, 2, 70...
3        [11, 21, 15, 238, 1576, 35, 7436, 13, 81956, 7...
4        [467, 558, 78, 104, 2218, 45, 11, 654, 21, 17,...
                               ...                        
24995    [4, 562, 33, 2, 61, 20, 160, 131, 32062, 16, 3...
24996    [10, 120, 11, 19, 39, 59, 87, 167, 61, 10, 83,...
24997    [11, 21, 3, 8, 13, 771, 3472, 16249, 1752, 28,...
24998    [2, 2486, 5, 3472, 16249, 636, 118, 16, 4, 605...
24999    [2, 66, 4678, 191, 3472, 16249, 37, 208, 142, ...
Name: token, Length: 25000, dtype: object


In [None]:
def encode_sen(encode, pad_index, max_length = 300):
  """Return a copy of `encode` padded or truncated to exactly `max_length` items.

  Args:
    encode: sequence of token indices for one sentence.
    pad_index: value appended on the right when the sequence is too short.
    max_length: length of the returned list.

  Returns:
    A new list of length `max_length`. The input sequence is never modified;
    the previous `encode += ...` padded the caller's list in place.
  """
  # Slicing both truncates long inputs and copies short ones.
  fixed = list(encode[:max_length])
  # A non-positive repeat count yields an empty list, so this is a no-op
  # once the sequence is already `max_length` long.
  fixed.extend([pad_index] * (max_length - len(fixed)))
  return fixed

In [None]:
# Index used to right-pad short sequences.
pad_index = vocab['<pad>']


def _pad_or_truncate(indices):
  """Bring one encoded review to encode_sen's default fixed length."""
  return encode_sen(indices, pad_index)


X_train = X_train.apply(_pad_or_truncate)
X_test = X_test.apply(_pad_or_truncate)

In [None]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

class TextDataset(Dataset):
  """Map-style dataset pairing encoded sequences with their labels.

  Both inputs are converted to int64 (`torch.long`) tensors when the
  dataset is constructed, so indexing only slices existing tensors.
  """

  def __init__(self, X, y):
    """Store `X` (encoded sequences) and `y` (labels) as long tensors."""
    features = torch.tensor(X, dtype=torch.long)
    labels = torch.tensor(y, dtype=torch.long)
    self.X = features
    self.y = labels

  def __len__(self):
    """Return the number of samples (length of the first dimension of X)."""
    return len(self.X)

  def __getitem__(self, idx):
    """Return the `(sequence, label)` pair stored at position `idx`."""
    sample = self.X[idx]
    target = self.y[idx]
    return sample, target


In [None]:
import numpy as np

# Labels as integer arrays, taken from the DataFrames before the names are rebound.
train_labels = np.array(train['label'], dtype=int)
test_labels = np.array(test['label'], dtype=int)

# NOTE(review): `train` / `test` change from DataFrames to TextDataset objects
# here, so this cell cannot be re-run without first re-creating the DataFrames.
train = TextDataset(X_train, train_labels)
test = TextDataset(X_test, test_labels)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# Only the training batches are shuffled; the test loader keeps dataset order.
train_loader = DataLoader(dataset=train, batch_size=128, shuffle=True)
test_loader = DataLoader(dataset=test, batch_size=128, shuffle=False)

In [None]:
class RNN(torch.nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier over token-id sequences."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        """Create the three sub-modules.

        Args:
            input_dim: number of rows in the embedding table (vocabulary size).
            embedding_dim: size of each token embedding.
            hidden_dim: size of the LSTM hidden state.
            output_dim: number of output logits per sequence.
        """
        super().__init__()

        self.embedding = torch.nn.Embedding(num_embeddings=input_dim,
                                            embedding_dim=embedding_dim)
        self.rnn = torch.nn.LSTM(input_size=embedding_dim,
                                 hidden_size=hidden_dim,
                                 batch_first=True)
        self.fc = torch.nn.Linear(in_features=hidden_dim,
                                  out_features=output_dim)

    def forward(self, text):
        """Return logits of shape [batch size, output dim] for a batch of token ids.

        `text` has shape [batch size, sentence length].
        """
        # [batch size, sentence length, embedding dim]
        token_vectors = self.embedding(text)

        # With batch_first=True the per-step outputs (unused here) are
        # [batch size, sentence length, hidden dim]; the final hidden state
        # is [1, batch size, hidden dim] for this single-layer LSTM.
        _, (final_hidden, _) = self.rnn(token_vectors)

        # Drop the leading layer dimension -> [batch size, hidden dim].
        sentence_state = final_hidden.squeeze(0)
        return self.fc(sentence_state)

In [None]:
# Seed PyTorch's RNG so that weight initialisation is repeatable.
torch.manual_seed(14)
model = RNN(input_dim=len(vocab),  # embedding rows follow the actual vocabulary size
            embedding_dim=32,
            hidden_dim=32,
            output_dim=2 # could use 1 for binary classification
)

model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [None]:
import torch
import torch.nn as nn

# Cross-entropy over the model's two output logits, with integer class labels as targets.
criterion = nn.CrossEntropyLoss()

def train_model(model, train_loader, valid_loader, optimizer, criterion, device, epochs=5):
    """Train `model` for `epochs` epochs, printing train and validation metrics.

    Args:
        model: module mapping a batch of inputs to class logits.
        train_loader: iterable of `(X_batch, y_batch)` tensor pairs used for training.
        valid_loader: iterable passed to `evaluate` after every epoch.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking `(predictions, y_batch)`. It is assumed to return
            the mean loss over the batch (the default reduction of
            `nn.CrossEntropyLoss`).
        device: device the batches are moved to.
        epochs: number of passes over `train_loader`.

    Loss and accuracy are averaged over samples rather than over batches, so a
    shorter final batch does not get the same weight as a full one. Nothing is
    returned; one summary line is printed per epoch.
    """
    for epoch in range(epochs):
        model.train()  # Set model to training mode
        loss_sum = 0.0
        correct = 0
        seen = 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            predictions = model(X_batch)       # [batch_size, output_dim]

            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()

            batch_size = len(y_batch)
            # Undo the per-batch mean so the epoch average is per sample.
            loss_sum += loss.item() * batch_size
            # Compute accuracy: count the correct predictions in this batch.
            correct += (predictions.argmax(1) == y_batch).sum().item()
            seen += batch_size

        # An empty loader gives NaN metrics instead of a ZeroDivisionError.
        avg_loss = loss_sum / seen if seen else float('nan')
        avg_acc = correct / seen if seen else float('nan')

        # Evaluate on the validation set
        model.eval() # Set model to evaluation mode
        val_loss, val_acc = evaluate(model, valid_loader, criterion, device)

        print(f"Epoch {epoch+1}: "
              f"Train Loss={avg_loss:.4f}, Train Acc={avg_acc:.4f} | "
              f"Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

In [None]:

def evaluate(model, loader, criterion, device):
    """Compute the average loss and accuracy of `model` over `loader`.

    Args:
        model: module mapping a batch of inputs to class logits.
        loader: iterable of `(X_batch, y_batch)` tensor pairs.
        criterion: loss taking `(predictions, y_batch)`. It is assumed to return
            the mean loss over the batch (the default reduction of
            `nn.CrossEntropyLoss`).
        device: device the batches are moved to.

    Returns:
        `(avg_loss, avg_acc)` averaged over samples, so batches of different
        sizes (e.g. a shorter final batch) are weighted by their sample count.
        Both values are NaN when `loader` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    seen = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            predictions = model(X_batch)
            batch_size = len(y_batch)

            # Undo the per-batch mean so the final average is per sample.
            loss_sum += criterion(predictions, y_batch).item() * batch_size
            correct += (predictions.argmax(1) == y_batch).sum().item()
            seen += batch_size

    if seen == 0:
        return float('nan'), float('nan')
    return loss_sum / seen, correct / seen

In [None]:
# NOTE(review): the test split is passed as `valid_loader`, so the "Val" numbers
# printed each epoch are measured on the test data.
train_model(model, train_loader, test_loader, optimizer, criterion, DEVICE, epochs=10)


Epoch 1: Train Loss=0.5674, Train Acc=0.6816 | Val Loss=0.5625, Val Acc=0.7638
Epoch 2: Train Loss=0.3522, Train Acc=0.8624 | Val Loss=0.4894, Val Acc=0.8084
Epoch 3: Train Loss=0.2055, Train Acc=0.9314 | Val Loss=0.4761, Val Acc=0.8251
Epoch 4: Train Loss=0.1222, Train Acc=0.9647 | Val Loss=0.6332, Val Acc=0.8040
Epoch 5: Train Loss=0.0765, Train Acc=0.9805 | Val Loss=0.6142, Val Acc=0.8234
Epoch 6: Train Loss=0.0528, Train Acc=0.9874 | Val Loss=0.6723, Val Acc=0.8274
Epoch 7: Train Loss=0.0379, Train Acc=0.9919 | Val Loss=0.7315, Val Acc=0.8252
Epoch 8: Train Loss=0.0264, Train Acc=0.9951 | Val Loss=0.8359, Val Acc=0.8201
Epoch 9: Train Loss=0.0218, Train Acc=0.9960 | Val Loss=0.7955, Val Acc=0.8265
Epoch 10: Train Loss=0.0196, Train Acc=0.9958 | Val Loss=0.8419, Val Acc=0.8247
