In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
import time
import random
import pandas as pd
import urllib

# torch.backends.cudnn.deterministic = True

In [None]:
# Reproducibility: fix the PyTorch RNG seed before anything stochastic runs.
random_seed = 123
torch.manual_seed(random_seed)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data and optimisation settings.
vocabulary_size = 20000
batch_size = 128
num_epochs = 15
lr = 0.005

# Model dimensions: embedding width, LSTM hidden width, number of output classes.
embedding_dim, hidden_dim, num_classes = 128, 256, 2

### Downloading dataset
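Unduh dataset dari [tautan ini](https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz), lalu ekstrak berkas `movie_data.csv.gz` sehingga tersedia sebagai `../data/movie_data.csv` sebelum menjalankan sel berikutnya.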

In [None]:
# The dataset CSV is stored in the `data` folder, one level above this notebook.
dataset_path = '../data/movie_data.csv' 
df = pd.read_csv(dataset_path)
# Rename the two columns to the names the torchtext fields use later in this notebook.
df.columns = ['text_column', 'label_column']
print(df.shape)
# Last expression of the cell: shows the first 10 rows as the cell output.
df.head(10)

In [None]:
# Write the frame (with the renamed header, without the index column) to a new CSV;
# this is the file that TabularDataset reads later in this notebook.
df.to_csv('../data/movie_data_cleaned.csv', index=False)
# Remove the DataFrame name; the following cells work from the CSV on disk.
del df

### Mempersiapkan Data

**Prasyarat:**
- Paket `spacy` harus sudah terinstal di lingkungan Python Anda
- Anda juga perlu mengunduh model bahasa Inggris spaCy (`en_core_web_sm`) dengan menjalankan perintah di bawah ini pada terminal Anda

```bash
python -m spacy download en_core_web_sm
```

**Penjelasan:**
- Versi `torchtext` yang digunakan dalam tutorial ini adalah `0.6.0`. Jika Anda ingin menggunakan `torchtext` versi terbaru, silakan merujuk pada [standar API baru torchtext](https://colab.research.google.com/github/pytorch/text/blob/master/examples/legacy_tutorial/migration_tutorial.ipynb#scrollTo=jXUgsnxw70-M)
- Tokenize akan mengubah kalimat pada teks menjadi token. Misalnya : `'Hello world'` menjadi `['Hello', 'world']`
- Detail tentang `torchtext.data` dapat dilihat pada [tautan berikut](https://torchtext.readthedocs.io/en/latest/data.html)

In [None]:
# Field for the review text, tokenized with spaCy's small English model.
text = torchtext.data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm')

# Field for the class label, stored as an integer (long) tensor.
label = torchtext.data.LabelField(dtype=torch.long)

# (column name, field) pairs, listed in the same order as the CSV columns.
fields = [
    ('text_column', text),
    ('label_column', label),
]

# Read the cleaned CSV; its header row is skipped because `fields` already names the columns.
dataset = torchtext.data.TabularDataset(
    fields=fields,
    format='csv',
    path='../data/movie_data_cleaned.csv',
    skip_header=True,
)

# Show the attributes of the first parsed example as a sanity check.
print(vars(dataset[0]))

In [None]:
# Seed Python's RNG and hand its state to split() explicitly.
# random.seed() returns None, so passing its return value as `random_state`
# only worked through the seeding side effect plus torchtext's fallback to
# the current global RNG state; passing the state itself yields the same split.
random.seed(random_seed)

# torchtext interprets a three-element split_ratio as [train, test, valid],
# while the returned datasets are ordered (train, valid, test). With the
# ratios below, validation therefore receives 10% and test 20% of the data.
train_data, val_data, test_data = dataset.split(
    split_ratio=[0.7, 0.2, 0.1],
    random_state=random.getstate(),
)

print(f'Train data size: {len(train_data)}')
print(f'Test data size: {len(test_data)}')
print(f'Validation data size: {len(val_data)}')

# Inspect one training example
print(vars(train_data[0]))

### Membangun Vocabulary / Kamus Kata
- Vocabulary dibatasi sebesar 20000 (hanya menyimpan 20000 kata yang paling sering muncul pada data latih)

In [None]:
# Build both vocabularies from the training split only.
# `max_size` caps the number of distinct words kept for the text field.
# NOTE(review): torchtext presumably adds special tokens (e.g. <unk>, <pad>) on top
# of that cap, so the printed size may be slightly larger than vocabulary_size — confirm.
text.build_vocab(train_data, max_size=vocabulary_size)
label.build_vocab(train_data)

print(f'Vocabulary size: {len(text.vocab)}')
print(f'Label size: {len(label.vocab)}')

In [None]:
# Most frequent tokens in the training data
print(text.vocab.freqs.most_common(20))

# First 10 entries of the integer-to-string table (itos)
print(text.vocab.itos[:10])

# stoi: string-to-integer lookup
print(text.vocab.stoi['and'])

# Label-to-index mapping. The original author observed label '1' (positive) at
# index 0 and label '0' (negative) at index 1 — verify against the printed
# mapping, since this cannot be confirmed from the code alone.
print(label.vocab.stoi)

### Data Loader

In [None]:
# One bucketing iterator per split; examples are grouped by token count
# (the sort key), and batches are placed on `device`.
train_loader, val_loader, test_loader = torchtext.data.BucketIterator.splits(
    (train_data, val_data, test_data),
    device=device,
    batch_size=batch_size,
    sort_key=lambda x: len(x.text_column),
    sort_within_batch=False,
)

# Peek at the first two training batches ...
check_iter = iter(train_loader)
print(next(check_iter))
print(next(check_iter))

# ... and at the first batch of the validation and test iterators.
for loader in (val_loader, test_loader):
    check_iter = iter(loader)
    print(next(check_iter))

### Model

In [None]:
class RNN(nn.Module):
    """Sequence classifier: embedding layer -> single-layer LSTM -> linear head.

    Args:
        input_size: Number of rows in the embedding table (the vocabulary size).
        embedding_dim: Size of each token embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        output_size: Number of logits produced per example.
    """

    def __init__(self, input_size, embedding_dim, hidden_dim, output_size):
        super().__init__()

        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, text):
        """Compute class logits for a batch of token-index sequences.

        Args:
            text: Integer tensor of token indices. The LSTM is created with
                the default ``batch_first=False``, so batched input is
                expected as ``(seq_len, batch)``.

        Returns:
            Tensor of logits with shape ``(batch, output_size)``.
        """
        embedded = self.embedding(text)
        _, (hidden, _) = self.rnn(embedded)
        # `hidden` is (num_layers, batch, hidden_dim) with num_layers == 1.
        # Drop only that leading axis: an argument-less squeeze would also
        # remove the batch axis for a batch of one example (or the feature
        # axis when hidden_dim == 1) and produce wrongly shaped logits.
        final_hidden = hidden.squeeze(0)
        return self.fc(final_hidden)

In [None]:
# Build the classifier sized to the text vocabulary and place it on the target device.
model = RNN(
    len(text.vocab),
    embedding_dim,
    hidden_dim,
    num_classes,
).to(device)

# Adam over all model parameters, using the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)


In [None]:
# Train for `num_epochs` epochs; after each epoch, report accuracy on the
# validation iterator. Wall-clock training time is printed at the end.
train_start_time = time.time()

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()

    for batch_idx, batch_data in enumerate(train_loader):

        # Do not call this variable `text`: that name is bound to the
        # torchtext Field whose vocabulary (`text.vocab`) is used by other
        # cells, and rebinding it to a tensor breaks them on re-run.
        features = batch_data.text_column.to(device)
        labels = batch_data.label_column.to(device)

        logits = model(features)
        loss = F.cross_entropy(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log the loss every 10th batch.
        if batch_idx % 10 == 0:
            print(f'Epoch: {epoch} | Batch: {batch_idx}/{len(train_loader)} | Loss: {loss:.4f}')

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    with torch.no_grad():

        prediksi_benar = 0   # number of correct predictions
        jumlah_example = 0   # number of examples seen

        for batch_data in val_loader:

            features = batch_data.text_column.to(device)
            labels = batch_data.label_column.to(device)

            logits = model(features)
            # Predicted class = index of the largest logit along dim 1.
            _, preds = torch.max(logits, 1)

            jumlah_example += len(preds)
            prediksi_benar += (preds == labels).sum().item()

        print(f'Epoch: {epoch} | Accuracy: {prediksi_benar / jumlah_example}')

print(f'Train time: {time.time() - train_start_time}')