In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
import time
import random
import pandas as pd
import urllib

# torch.backends.cudnn.deterministic = True

In [3]:
# --- Reproducibility ---------------------------------------------------------
random_seed = 123
# Seed both RNGs used in this notebook: PyTorch (weight initialisation) and
# Python's `random` module (used for the dataset split further down).
torch.manual_seed(random_seed)
random.seed(random_seed)

# --- Data / training hyperparameters -----------------------------------------
vocabulary_size = 20000  # passed as `max_size` when building the text vocabulary
lr = 0.005               # learning rate handed to the optimizer
batch_size = 128
num_epochs = 15

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Model dimensions --------------------------------------------------------
embedding_dim = 128
hidden_dim = 256
num_classes = 2

### Downloading dataset
[Dataset Link](https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz)

In [4]:
# The dataset CSV is stored in the `data` folder, one level above this notebook.
dataset_path = '../data/movie_data.csv'

# Read the raw reviews and give the two columns explicit names.
df = pd.read_csv(filepath_or_buffer=dataset_path)
df.columns = ['text_column', 'label_column']

# Show the frame's dimensions, then preview the first ten rows.
print(df.shape)
df.head(n=10)

(50000, 2)


Unnamed: 0,text_column,label_column
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0
5,Leave it to Braik to put on a good show. Final...,1
6,Nathan Detroit (Frank Sinatra) is the manager ...,1
7,"To understand ""Crash Course"" in the right cont...",1
8,I've been impressed with Chavez's stance again...,1
9,This movie is directed by Renny Harlin the fin...,1


In [5]:
# Write the renamed frame to disk (without the index column), then release the
# DataFrame since it is no longer referenced after this cell.
df.to_csv(path_or_buf='../data/movie_data_cleaned.csv', index=False)
del df

### Mempersiapkan Data

**Prasyarat:**
- Paket `spacy` harus sudah terpasang di lingkungan Python Anda
- Anda juga perlu mengunduh model bahasa Inggris spaCy (`en_core_web_sm`) dengan menjalankan perintah di bawah ini pada terminal Anda

```bash
python -m spacy download en_core_web_sm
```

**Penjelasan:**
- Versi `torchtext` yang digunakan dalam tutorial ini adalah `0.6.0`. Jika Anda ingin menggunakan `torchtext` versi terbaru, silakan merujuk pada [standar API baru torchtext](https://colab.research.google.com/github/pytorch/text/blob/master/examples/legacy_tutorial/migration_tutorial.ipynb#scrollTo=jXUgsnxw70-M)
- Tokenisasi mengubah kalimat pada teks menjadi daftar token. Misalnya: `'Hello world'` menjadi `['Hello', 'world']`
- Detail tentang `torchtext.data` dapat dilihat pada [tautan berikut](https://torchtext.readthedocs.io/en/latest/data.html)

In [6]:
# Field for the review text: tokenized with spaCy's `en_core_web_sm` pipeline.
text = torchtext.data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm')

# Field for the label column, stored as `torch.long`.
label = torchtext.data.LabelField(dtype=torch.long)

# (column name, field) pairs for the two CSV columns.
fields = [
    ('text_column', text),
    ('label_column', label),
]

# Load the cleaned CSV written earlier; its header row is skipped.
dataset = torchtext.data.TabularDataset(
    fields=fields,
    skip_header=True,
    format='csv',
    path='../data/movie_data_cleaned.csv',
)

# Peek at the first example to check the tokenization.
print(vars(dataset[0]))

{'text_column': ['In', '1974', ',', 'the', 'teenager', 'Martha', 'Moxley', '(', 'Maggie', 'Grace', ')', 'moves', 'to', 'the', 'high', '-', 'class', 'area', 'of', 'Belle', 'Haven', ',', 'Greenwich', ',', 'Connecticut', '.', 'On', 'the', 'Mischief', 'Night', ',', 'eve', 'of', 'Halloween', ',', 'she', 'was', 'murdered', 'in', 'the', 'backyard', 'of', 'her', 'house', 'and', 'her', 'murder', 'remained', 'unsolved', '.', 'Twenty', '-', 'two', 'years', 'later', ',', 'the', 'writer', 'Mark', 'Fuhrman', '(', 'Christopher', 'Meloni', ')', ',', 'who', 'is', 'a', 'former', 'LA', 'detective', 'that', 'has', 'fallen', 'in', 'disgrace', 'for', 'perjury', 'in', 'O.J.', 'Simpson', 'trial', 'and', 'moved', 'to', 'Idaho', ',', 'decides', 'to', 'investigate', 'the', 'case', 'with', 'his', 'partner', 'Stephen', 'Weeks', '(', 'Andrew', 'Mitchell', ')', 'with', 'the', 'purpose', 'of', 'writing', 'a', 'book', '.', 'The', 'locals', 'squirm', 'and', 'do', 'not', 'welcome', 'them', ',', 'but', 'with', 'the', 'su

In [7]:
# Seed Python's RNG explicitly, then hand its state to `split`.
# `random.seed()` returns None, so passing its return value as `random_state`
# only worked through the seeding side effect; `random.getstate()` passes the
# seeded state on purpose.
random.seed(random_seed)

# NOTE: with this ratio list the recorded output of this cell shows
# 35000 / 5000 / 10000 examples for train / validation / test, i.e. the 0.2
# share ends up in `test_data` and the 0.1 share in `val_data`.
train_data, val_data, test_data = dataset.split(
    split_ratio=[0.7, 0.2, 0.1],
    random_state=random.getstate(),
)

print(f'Train data size: {len(train_data)}')
print(f'Test data size: {len(test_data)}')
print(f'Validation data size: {len(val_data)}')

# Inspect one training example.
print(vars(train_data[0]))

Train data size: 35000
Test data size: 10000
Validation data size: 5000
{'text_column': ['Pros', ':', 'Nothing', '<', 'br', '/><br', '/>Cons', ':', 'Everything', '<', 'br', '/><br', '/>Plot', 'summary', ':', 'A', 'female', 'reporter', 'runs', 'into', 'a', 'hitchhiker', 'that', 'tells', 'her', 'stories', 'about', 'the', 'deaths', 'of', 'people', 'that', 'were', 'killed', 'by', 'zombies.<br', '/><br', '/>Review', ':', 'Never', 'in', 'my', 'life', 'have', 'I', 'come', 'across', 'a', 'movie', 'as', 'bad', 'The', 'Zombie', 'Chronicles', '.', 'Filmed', 'on', 'a', 'budget', 'of', 'what', 'looks', 'to', 'be', 'about', '20', 'bucks', ',', 'TZC', 'is', 'a', 'completely', 'horrible', 'horror', 'movie', 'that', 'relies', 'on', 'lame', ',', 'forgetable', 'actors', 'whom', 'could', "n't", 'act', 'to', 'save', 'their', 'lives', 'and', 'gore', 'that', "'s", 'more', 'gross', 'than', 'frightening', '.', 'How', 'does', 'a', 'movie', 'like', 'this', 'even', 'get', 'made', '?', 'Simply', 'put', ',', 'avoid

### Membangun Vocabulary / Kamus Kata
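- Vocabulary dibatasi sebesar 20000 (hanya 20000 kata yang paling sering muncul pada data latih yang disimpan). Ukuran yang tercetak adalah 20002 karena ada dua token khusus tambahan: `<unk>` dan `<pad>`.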

In [8]:
# Build both vocabularies from the training split only.
# The text vocabulary is capped via `max_size`; the label vocabulary is not.
label.build_vocab(train_data)
text.build_vocab(train_data, max_size=vocabulary_size)

print(
    f'Vocabulary size: {len(text.vocab)}',
    f'Label size: {len(label.vocab)}',
    sep='\n',
)

Vocabulary size: 20002
Label size: 2


In [9]:
# The 20 most frequent tokens, followed by the first 10 entries of the
# integer-to-string table (itos).
print(
    text.vocab.freqs.most_common(20),
    text.vocab.itos[0:10],
    sep='\n',
)

# stoi: string-to-integer lookup for a single token.
print(text.vocab.stoi['and'])

# Label mapping. In the recorded output, label '1' (positive) is at index 0
# and label '0' (negative) is at index 1.
print(label.vocab.stoi)

[('the', 402761), (',', 381621), ('.', 328787), ('and', 217039), ('a', 216689), ('of', 200379), ('to', 185267), ('is', 150020), ('in', 122410), ('I', 108843), ('it', 106563), ('that', 96538), ('"', 89116), ("'s", 85279), ('this', 84271), ('-', 73508), ('/><br', 70760), ('was', 69368), ('as', 59751), ('movie', 59142)]
['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
5
defaultdict(None, {'1': 0, '0': 1})


### Data Loader

In [10]:
# One bucketing iterator per split. `sort_key` is the token count of each
# review; batches are not sorted internally.
train_loader, val_loader, test_loader = torchtext.data.BucketIterator.splits(
    (train_data, val_data, test_data),
    device=device,
    batch_size=batch_size,
    sort_key=lambda example: len(example.text_column),
    sort_within_batch=False,
)

# Sanity check: two batches from the training iterator ...
check_iter = iter(train_loader)
print(next(check_iter))
print(next(check_iter))

# ... one from the validation iterator ...
check_iter = iter(val_loader)
print(next(check_iter))

# ... and one from the test iterator.
check_iter = iter(test_loader)
print(next(check_iter))


[torchtext.data.batch.Batch of size 128]
	[.text_column]:[torch.LongTensor of size 946x128]
	[.label_column]:[torch.LongTensor of size 128]

[torchtext.data.batch.Batch of size 128]
	[.text_column]:[torch.LongTensor of size 1068x128]
	[.label_column]:[torch.LongTensor of size 128]

[torchtext.data.batch.Batch of size 128]
	[.text_column]:[torch.LongTensor of size 60x128]
	[.label_column]:[torch.LongTensor of size 128]

[torchtext.data.batch.Batch of size 128]
	[.text_column]:[torch.LongTensor of size 52x128]
	[.label_column]:[torch.LongTensor of size 128]


### Model

In [11]:
class RNN(nn.Module):
    """Sequence classifier: embedding -> single-layer LSTM -> linear layer.

    Despite the class name, the recurrent layer is an ``nn.LSTM``.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        output_size: Number of output logits (classes).
    """

    def __init__(self, input_size, embedding_dim, hidden_dim, output_size):
        super().__init__()

        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, text):
        """Return class logits for a batch of token-index sequences.

        Args:
            text: Integer tensor of token indices. The LSTM uses its default
                ``batch_first=False``, so the layout is (seq_len, batch).

        Returns:
            Tensor of shape (batch, output_size) holding unnormalised logits.
        """
        embedded = self.embedding(text)
        _, (hidden, _) = self.rnn(embedded)
        # `hidden` is (1, batch, hidden_dim) for this single-layer,
        # unidirectional LSTM. Drop only that leading axis: a bare `squeeze_()`
        # would also remove the batch axis when the batch has one example.
        return self.fc(hidden.squeeze(0))

In [12]:
# Build the classifier, sized to the vocabulary, and move it to the chosen device.
model = RNN(
    input_size=len(text.vocab),
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_size=num_classes,
).to(device)

# Adam over all model parameters, using the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)


In [None]:
# Train for `num_epochs` epochs and report validation accuracy after each one.
train_start_time = time.time()

for epoch in range(num_epochs):
    model.train()

    for batch_idx, batch_data in enumerate(train_loader):

        # Named `token_ids`, not `text`: reusing `text` would overwrite the
        # torchtext Field of that name and break any later `text.vocab` access
        # (e.g. re-running the model-construction cell).
        token_ids = batch_data.text_column.to(device)
        labels = batch_data.label_column.to(device)

        logits = model(token_ids)
        loss = F.cross_entropy(logits, labels)

        # Clear old gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:
            print(f'Epoch: {epoch} | Batch: {batch_idx}/{len(train_loader)} | Loss: {loss.item():.4f}')

    # Validation pass: no gradients, model in eval mode.
    model.eval()
    with torch.no_grad():

        correct_predictions = 0
        total_examples = 0

        for batch_idx, batch_data in enumerate(val_loader):

            token_ids = batch_data.text_column.to(device)
            labels = batch_data.label_column.to(device)

            logits = model(token_ids)
            # Predicted class = index of the largest logit per example.
            _, preds = torch.max(logits, 1)

            total_examples += len(preds)
            correct_predictions += (preds == labels).sum().item()

        print(f'Epoch: {epoch} | Accuracy: {correct_predictions / total_examples}')

print(f'Train time: {time.time() - train_start_time}')

### Testing

In [15]:
# Load pre-trained weights from '../assets/model.pth' (the author's laptop could
# not handle training the model). To evaluate the model trained in this notebook
# instead, comment out the `load_state_dict` line below.
# `map_location=device` lets a checkpoint saved on a GPU load on a CPU-only machine.
# NOTE(review): `torch.load` unpickles the file — only load checkpoints you trust.
model.load_state_dict(torch.load('../assets/model.pth', map_location=device))

model.eval()
with torch.no_grad():

    correct_predictions = 0
    total_examples = 0

    for batch_idx, batch_data in enumerate(test_loader):

        # Named `token_ids`, not `text`, so the torchtext Field called `text`
        # is not overwritten.
        token_ids = batch_data.text_column.to(device)
        labels = batch_data.label_column.to(device)

        logits = model(token_ids)
        # Predicted class = index of the largest logit per example.
        _, preds = torch.max(logits, 1)

        total_examples += len(preds)
        correct_predictions += (preds == labels).sum().item()

        # Running accuracy over the batches seen so far.
        print(f'Batch: {batch_idx}/{len(test_loader)} | Accuracy: {correct_predictions / total_examples}')

    print(f'Test accuracy: {correct_predictions / total_examples}')

Batch: 0/79 | Accuracy: 0.9140625
Batch: 1/79 | Accuracy: 0.87109375
Batch: 2/79 | Accuracy: 0.859375
Batch: 3/79 | Accuracy: 0.849609375
Batch: 4/79 | Accuracy: 0.8515625
Batch: 5/79 | Accuracy: 0.8463541666666666
Batch: 6/79 | Accuracy: 0.8359375
Batch: 7/79 | Accuracy: 0.8251953125
Batch: 8/79 | Accuracy: 0.8255208333333334
Batch: 9/79 | Accuracy: 0.8171875
Batch: 10/79 | Accuracy: 0.8153409090909091
Batch: 11/79 | Accuracy: 0.814453125
Batch: 12/79 | Accuracy: 0.8167067307692307
Batch: 13/79 | Accuracy: 0.8169642857142857
Batch: 14/79 | Accuracy: 0.81875
Batch: 15/79 | Accuracy: 0.82080078125
Batch: 16/79 | Accuracy: 0.8193933823529411
Batch: 17/79 | Accuracy: 0.8185763888888888
Batch: 18/79 | Accuracy: 0.8170230263157895
Batch: 19/79 | Accuracy: 0.816015625
Batch: 20/79 | Accuracy: 0.8177083333333334
Batch: 21/79 | Accuracy: 0.8196022727272727
Batch: 22/79 | Accuracy: 0.8206521739130435
Batch: 23/79 | Accuracy: 0.8206380208333334
Batch: 24/79 | Accuracy: 0.8190625
Batch: 25/79 | A