In [127]:
import pandas as pd
import numpy as np
import random
import torch, torchtext
from torchtext import data
import random

In [128]:
df = pd.read_csv("/content/tweets.csv")
df.head()

Unnamed: 0,tweets,labels
0,Obama has called the GOP budget social Darwini...,1
1,"In his teen years, Obama has been known to use...",0
2,IPA Congratulates President Barack Obama for L...,0
3,RT @Professor_Why: #WhatsRomneyHiding - his co...,0
4,RT @wardollarshome: Obama has approved more ta...,1


In [129]:
df.shape

(1364, 2)

In [130]:
df.labels.value_counts()

0    931
1    352
2     81
Name: labels, dtype: int64

In [131]:
SEED = 1234
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f27f47b0970>

In [132]:
Tweet = torchtext.legacy.data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = torchtext.legacy.data.LabelField(tokenize = 'spacy', is_target=True, batch_first =True, sequential = False)

In [133]:
fields = [('tweets', Tweet), ('labels', Label)]

In [134]:
example = [torchtext.legacy.data.Example.fromlist([df.tweets[i],df.labels[i]], fields) for i in range(df.shape[0])] 

In [135]:
twitterDataset = torchtext.legacy.data.Dataset(example, fields)

In [136]:
(train, valid, test) = twitterDataset.split(split_ratio=[70, 15, 15], random_state = random.seed(SEED))

In [137]:
len(train), len(valid), len(test)

(955, 204, 205)

In [138]:
Tweet.build_vocab(train)
Label.build_vocab(train)

In [139]:
print(f'Size of input vocab : ', len(Tweet.vocab))
print(f'Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Tweet.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  4132
Size of label vocab :  3
Top 10 words appreared repeatedly : [('Obama', 875), ('#', 670), (':', 637), ('.', 631), (',', 488), ('"', 444), ('the', 434), ('RT', 423), ('to', 320), ('?', 312)]
Labels :  defaultdict(None, {0: 0, 1: 1, 2: 2})


In [140]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [141]:
train_iterator, valid_iterator, test_iterator = torchtext.legacy.data.BucketIterator.splits((train, valid, test), batch_size = 16, 
                                                            sort_key = lambda x: len(x.tweets),
                                                            sort_within_batch=True, device = device)

In [142]:
next(iter(train_iterator))


[torchtext.legacy.data.batch.Batch of size 16]
	[.tweets]:('[torch.LongTensor of size 16x22]', '[torch.LongTensor of size 16]')
	[.labels]:[torch.LongTensor of size 16]

In [143]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Tweet.vocab.stoi, tokens)

In [144]:
import torch.nn as nn
import torch.nn.functional as F

In [145]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, batch_first=True)
        # Here you can use nn.GRU(emb_dim, hid_dim) or 
        # nn.RNN(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        #self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_len):
        
        #src = [src len, batch size]
        
        embedded = self.embedding(text)
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_len.cpu(), batch_first=True)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        print(packed_output[0])
        print(hidden)

        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        
        return hidden, cell

In [146]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, n_layers):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        #self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(hid_dim, hid_dim, num_layers=n_layers)
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        #self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        #input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        #embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
        output , (hidden,cell) = self.rnn(input,(hidden,cell))
                
        #output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        prediction = self.fc_out(hidden.squeeze(0))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden, cell

In [147]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        #print(encoder.hid_dim)
        #print(decoder.hid_dim)
        
        
    def forward(self, text, text_len):

        #hidden, cell = self.encoder(text,text_len)

        hidden, cell = self.encoder(text,text_len)
        
        input = hidden

        output,hidden,cell = self.decoder(input,hidden,cell)

        return output

In [148]:
INPUT_DIM = len(Tweet.vocab)
OUTPUT_DIM = len(Label.vocab)
ENC_EMB_DIM = 256
#DEC_EMB_DIM = 256
HID_DIM = 512
NUM_LAYERS = 1
#ENC_DROPOUT = 0.5
#DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, NUM_LAYERS)
dec = Decoder(OUTPUT_DIM, HID_DIM, NUM_LAYERS)

model = Seq2Seq(enc, dec, device).to(device)

In [149]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4132, 256)
    (rnn): LSTM(256, 512, batch_first=True)
  )
  (decoder): Decoder(
    (rnn): LSTM(512, 512)
    (fc_out): Linear(in_features=512, out_features=3, bias=True)
  )
)

In [150]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,737,539 trainable parameters


In [151]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [152]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        tweet, tweet_lengths = batch.tweets   
        
        # convert to 1D tensor
        predictions = model(tweet, tweet_lengths).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.labels)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.labels)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [153]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            tweet, tweet_lengths = batch.tweets
            
            # convert to 1d tensor
            predictions = model(tweet, tweet_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.labels)
            acc = binary_accuracy(predictions, batch.labels)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [154]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\t Epoch: {epoch} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Epoch: {epoch} | Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
         [-8.0515e-05,  8.0611e-02,  4.3958e-02,  ..., -8.5936e-02,
          -1.9301e-02,  8.3078e-02],
         [-4.8192e-03,  3.5157e-03, -4.3486e-03,  ..., -1.9609e-02,
           3.7288e-03,  4.1609e-02],
         [-4.8192e-03,  3.5157e-03, -4.3486e-03,  ..., -1.9609e-02,
           3.7288e-03,  4.1609e-02]]])
tensor([[-0.0006, -0.0050,  0.0041,  ..., -0.0112,  0.0097,  0.0135],
        [-0.0006, -0.0050,  0.0041,  ..., -0.0112,  0.0097,  0.0135],
        [-0.0006, -0.0050,  0.0041,  ..., -0.0112,  0.0097,  0.0135],
        ...,
        [-0.0149,  0.1817,  0.0177,  ..., -0.1472, -0.0718,  0.1594],
        [-0.0145,  0.1681,  0.0135,  ..., -0.1294, -0.0904,  0.1373],
        [-0.0149,  0.1817,  0.0177,  ..., -0.1472, -0.0718,  0.1594]])
tensor([[[-0.0036,  0.1683,  0.0224,  ..., -0.1275, -0.0584,  0.1625],
         [-0.0165,  0.1654,  0.0307,  ..., -0.1469, -0.0704,  0.1579],
         [ 0.0142,  0.0700,  0.0199,  ...,

In [155]:
model.load_state_dict(torch.load('saved_weights.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

tensor([[ 0.0155,  0.0143,  0.0224,  ..., -0.0256,  0.0080,  0.0292],
        [-0.0004, -0.0067,  0.0031,  ..., -0.0100,  0.0112,  0.0133],
        [ 0.0155,  0.0143,  0.0224,  ..., -0.0256,  0.0080,  0.0292],
        ...,
        [ 0.0049,  0.0557,  0.0015,  ..., -0.0491, -0.0065,  0.0900],
        [ 0.0098,  0.0849,  0.0272,  ..., -0.0999, -0.0264,  0.1057],
        [-0.0024,  0.1749,  0.0307,  ..., -0.1320, -0.0902,  0.1609]])
tensor([[[-1.3401e-02,  6.0917e-02, -1.3910e-03,  ..., -7.5410e-02,
          -1.5644e-02,  8.8389e-02],
         [-7.9631e-03,  1.6865e-01,  4.0862e-02,  ..., -1.4001e-01,
          -9.6008e-02,  1.6248e-01],
         [-1.1132e-02,  6.3603e-02,  1.2421e-04,  ..., -8.8473e-02,
          -1.9058e-02,  1.1184e-01],
         ...,
         [ 9.3821e-03,  4.9345e-02,  1.6371e-02,  ..., -4.5711e-02,
          -1.5305e-02,  6.8913e-02],
         [ 8.2067e-03,  7.0082e-02,  2.1768e-02,  ..., -5.3151e-02,
          -2.5721e-02,  9.4326e-02],
         [ 1.3803e-02,  4.1

In [156]:
#load weights and tokenizer

path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_tweet(tweet):
    
    categories = {0: "Negative", 1:"Positive", 2:"Neutral"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor)

    _, pred = torch.max(prediction, 1) 
    
    return categories[pred.item()]

In [157]:
classify_tweet("A valid explanation for why Trump won't let women on the golf course.")

tensor([[ 0.0210, -0.0197,  0.0242,  ...,  0.0091,  0.0325,  0.0210],
        [ 0.0210,  0.0033,  0.0217,  ..., -0.0066,  0.0355,  0.0303],
        [ 0.0220,  0.0168,  0.0178,  ..., -0.0330,  0.0207,  0.0623],
        ...,
        [-0.0113,  0.0594,  0.0137,  ..., -0.0901,  0.0098,  0.0859],
        [ 0.0072,  0.0260,  0.0115,  ..., -0.0590,  0.0381,  0.0646],
        [ 0.0184, -0.0123,  0.0127,  ..., -0.0446,  0.0580,  0.0564]],
       grad_fn=<CatBackward>)
tensor([[[ 1.8373e-02, -1.2261e-02,  1.2666e-02, -9.7141e-02,  8.6334e-03,
          -6.0782e-02, -1.1347e-02,  7.2468e-02,  4.1599e-02,  7.7586e-02,
          -1.1510e-02, -4.8481e-02,  4.4706e-04, -2.4693e-02, -6.4700e-02,
           2.8198e-02, -4.3064e-03, -3.4699e-02,  8.1605e-02,  4.5041e-02,
          -5.7188e-02,  8.5746e-02,  2.3082e-02,  5.8820e-02,  9.6590e-02,
          -3.2810e-02,  1.0441e-01,  6.3877e-02,  6.9899e-02,  1.5430e-02,
          -1.1115e-02,  9.1302e-02, -5.8680e-03, -2.6451e-02,  3.6626e-02,
           

'Positive'

In [158]:
classify_tweet("In his teen years, Obama has been known to use marijuana and cocaine.")

tensor([[-3.8041e-05,  2.1364e-03, -4.0289e-03,  ..., -1.0252e-02,
          1.3322e-02,  2.9709e-02],
        [ 1.4595e-02, -3.1903e-03,  3.9833e-03,  ..., -2.3418e-02,
          2.3923e-02,  5.6977e-02],
        [ 1.7002e-02,  2.1314e-02,  1.0119e-03,  ..., -2.9120e-02,
          1.3054e-02,  5.0704e-02],
        ...,
        [-3.8012e-03,  2.4857e-01,  5.6312e-02,  ..., -2.1072e-01,
         -2.1361e-01,  1.9703e-01],
        [-6.6846e-03,  2.9718e-01,  5.1003e-02,  ..., -2.3067e-01,
         -2.5275e-01,  2.0045e-01],
        [-1.4381e-02,  3.1642e-01,  4.2303e-02,  ..., -2.3787e-01,
         -2.6265e-01,  2.2136e-01]], grad_fn=<CatBackward>)
tensor([[[-1.4381e-02,  3.1642e-01,  4.2303e-02, -2.2719e-01, -1.2064e-01,
           1.4402e-01,  3.8223e-02,  2.7600e-01,  5.5136e-02,  8.1836e-02,
          -5.2942e-02, -1.5530e-01,  2.2224e-01,  7.4997e-02, -6.1915e-03,
          -2.8314e-02, -3.2588e-02, -1.1915e-01, -6.1170e-02, -1.2788e-01,
           2.5544e-02,  1.1416e-01,  3.2257e-

'Negative'