In [1]:
#### CODE ADOPTED FROM https://towardsdatascience.com/sentiment-analysis-using-lstm-step-by-step-50d074f09948 ####

In [2]:
import numpy as np
import torch

from pandas import read_csv
from collections import Counter
from torch.utils.data import DataLoader, TensorDataset# create Tensor datasets

In [3]:
filepath = 'dataset.csv'
column_names = ["Tweet", "Label"]

In [50]:
# membuat list dengan isi tweet dan label dari dataset
df = read_csv(filepath, header=0, sep=',', names=column_names)
tweets = df.Tweet.to_list()
labels = df.Label.to_list()

print(df)
print(len(tweets))
print(len(labels))

                                                 Tweet          Label
0    aaa kakk dah vaksin pulang mam minum obat dara...  Setuju Vaksin
1                             aaa tempat vaksin pulang  Setuju Vaksin
2    abis dpt vaksin sekolah ajar ptm ya nggak dari...  Setuju Vaksin
3                                          abis vaksin  Setuju Vaksin
4                             abis vaksin bawa ngantuk  Setuju Vaksin
..                                                 ...            ...
716                        yoon gue vaksin gue takuttt  Setuju Vaksin
717  yuk sobat sehat ajak anak vaksin covid lupa aj...  Setuju Vaksin
718                yuk vaksin jokowima ruf pimpin baik  Setuju Vaksin
719  zaman justifikasi benda justifikasi nilai juju...   Tidak Setuju
720  zoo kalo vaksin kucing domestik tuu berapa yaa...   Tidak Setuju

[721 rows x 2 columns]
721
721


In [7]:
words_ = ' '.join(tweets)
# buat list dengan isi semua kata yang ada di tweets
words = words_.split()
# hitung jumlah kata menggunakan class Counter
count_words = Counter(words)
words_count = len(count_words)
sorted_words = count_words.most_common(words_count)

In [8]:
# tokenize words
vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}

In [46]:
# tokenize tweets
tweets_int = []
for tweet in tweets:
    r = [vocab_to_int[w] for w in tweet.split()]
    tweets_int.append(r)
print (tweets_int[0:3])

[[426, 591, 17, 1, 320, 988, 99, 100, 208, 17, 208], [426, 120, 1, 320], [11, 209, 1, 171, 172, 427, 7, 141, 989, 321, 7]]


In [56]:
# tokenize labels
encoded_labels = [1 if label =='Tidak Setuju' else 0 for label in labels]
encoded_labels = np.array(encoded_labels)
print(encoded_labels[0:3])

[0 0 0]


In [57]:
# padding / truncating remaining data
def pad_features(tweets_int, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's or truncated to the input seq_length.
    '''
    features = np.zeros((len(tweets_int), seq_length), dtype = int)
    
    for i, tweet in enumerate(tweets_int):
        tweet_len = len(tweet)
        
        if tweet_len <= seq_length:
            zeroes = list(np.zeros(seq_length-tweet_len))
            new = zeroes+tweet
        elif tweet_len > seq_length:
            new = tweet[0:seq_length]
        
        features[i,:] = np.array(new)
        
    return features

sequence_length = 25
features = pad_features(tweets_int, sequence_length)

print (features[:10,:])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0 426 591  17   1
  320 988  99 100 208  17 208]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0 426 120   1 320]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0  11 209   1 171
  172 427   7 141 989 321   7]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0  11   1]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0  11   1 428 252]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0  11   1 990 991 592
  992 429 253 993  78 593 594]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   11   1 595  99 994   5  30]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0  11   1 596]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  11
    1   6 173 121 995 996 997]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0  

In [58]:
# split dataset to train 80% validation 10% test 10%
split_frac = 0.8
train_x = features[0:int(split_frac*len(features))]
train_y = encoded_labels[0:int(split_frac*len(features))]

remaining_x = features[int(split_frac*len(features)):]
remaining_y = encoded_labels[int(split_frac*len(features)):]

valid_x = remaining_x[0:int(len(remaining_x)*0.5)]
valid_y = remaining_y[0:int(len(remaining_y)*0.5)]

test_x = remaining_x[int(len(remaining_x)*0.5):]
test_y = remaining_y[int(len(remaining_y)*0.5):]

In [59]:
# data loading and batching
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))# dataloaders
batch_size = 50 # make sure to SHUFFLE your data
# train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

In [60]:
# define the model class
import torch.nn as nn

class SentimentLSTM(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden
        

In [61]:
# training
# instantiate the model w/ hyperparams
vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

SentimentLSTM(
  (embedding): Embedding(2561, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [62]:
# training data
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


# training params

epochs = 4 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping
train_on_gpu = 0

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        inputs = inputs.type(torch.LongTensor)
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                inputs = inputs.type(torch.LongTensor)
                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

RuntimeError: Expected hidden[0] size (2, 26, 256), got [2, 50, 256]

In [16]:
# testing data
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    # get predicted outputs
    inputs = inputs.type(torch.LongTensor)
    output, h = net(inputs, h)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

RuntimeError: Expected hidden[0] size (2, 23, 256), got [2, 50, 256]