## Installing Modules

In [1]:
!pip install spacy==3



In [2]:
!python -m spacy download en_core_web_sm

2021-06-10 14:57:23.215546: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Imports

In [3]:
# Import Library
import random
import torch, torchtext
from torchtext.legacy import data 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


import pandas as pd
import sys, os, pickle
import numpy as np


import spacy
nlp = spacy.load('en_core_web_sm')

# Manual Seed
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


## Loading Data

In [4]:
!gdown --id 1wSZddg-GLEyHqAuwiqL-YOBGb6kDcwfp

df = pd.read_csv('tweets.csv')
df.head()

Downloading...
From: https://drive.google.com/uc?id=1wSZddg-GLEyHqAuwiqL-YOBGb6kDcwfp
To: /content/tweets.csv
  0% 0.00/160k [00:00<?, ?B/s]100% 160k/160k [00:00<00:00, 47.9MB/s]


Unnamed: 0,tweets,labels
0,Obama has called the GOP budget social Darwini...,1
1,"In his teen years, Obama has been known to use...",0
2,IPA Congratulates President Barack Obama for L...,0
3,RT @Professor_Why: #WhatsRomneyHiding - his co...,0
4,RT @wardollarshome: Obama has approved more ta...,1


## Defining Fields

In [5]:
Tweet = data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)



In [6]:
fields = [('tweets', Tweet),('labels',Label)]
example = [data.Example.fromlist([df.tweets[i],df.labels[i]], fields) for i in range(df.shape[0])] 

twitterDataset = data.Dataset(example, fields)

In [7]:
(train, valid) = twitterDataset.split(split_ratio=[0.85, 0.15], random_state=random.seed(SEED))

In [8]:
print(f'Number of training examples: {len(train)}')
print(f'Number of testing examples: {len(valid)}')

Number of training examples: 1159
Number of testing examples: 205


In [9]:
print(vars(train.examples[0]))

{'tweets': ['RT', '@BelleInABow', ':', 'Obama', 'does', "n't", 'need', 'to', 'be', 'your', 'favorite', 'person', ',', 'but', 'he', "'s", 'still', 'the', 'President', 'of', 'the', 'United', 'States', 'of', 'America', '.', 'That', 'always', 'deserves', 'respect', '.'], 'labels': 1}


## Building Vocabulary

In [10]:
Tweet.build_vocab(train)
Label.build_vocab(train)

In [11]:
print('Size of input vocab : ', len(Tweet.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Tweet.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  4655
Size of label vocab :  3
Top 10 words appreared repeatedly : [('Obama', 1064), ('#', 798), (':', 777), ('.', 755), (',', 595), ('"', 558), ('the', 534), ('RT', 513), ('to', 398), ('?', 392)]
Labels :  defaultdict(None, {0: 0, 1: 1, 2: 2})


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_iterator, valid_iterator = data.BucketIterator.splits((train, valid), batch_size = 32, 
                                                            sort_key = lambda x: len(x.tweets),
                                                            sort_within_batch=True, device = device)

In [13]:
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Tweet.vocab.stoi, tokens)

## Model

In [14]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True)        

        self.decoder = nn.LSTM(hidden_dim, 
                       hidden_dim, 
                       num_layers=n_layers, 
                       batch_first=True)

        self.fc= nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):

       
        embedded = self.embedding(text)
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden_enc, cell_enc) = self.encoder(packed_embedded)
        output_enc, output_enc_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)   
        print(output_enc)
        output_dec, (hidden_dec, cell_dec) = self.decoder(output_enc)
        print(output_dec)
        output_dense = self.fc(hidden_dec)   

        output = F.softmax(output_dense[0], dim=1)

        return output

In [15]:

INPUT_DIM = len(Tweet.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 100
OUTPUT_DIM = 3
NUM_LAYERS = 2
DROPOUT = 0.2


model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, NUM_LAYERS, DROPOUT)

In [16]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,800,003 trainable parameters


In [17]:
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)
criterion = nn.CrossEntropyLoss()

In [18]:
model = model.to(device)
criterion = criterion.to(device)

In [19]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    top_pred = preds.argmax(1, keepdim = True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [20]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        tweet, tweet_lengths = batch.tweets   
        
        # convert to 1D tensor
        predictions = model(tweet, tweet_lengths).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.labels)        
        
        # compute the binary accuracy
        acc = categorical_accuracy(predictions, batch.labels)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [21]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            tweet, tweet_lengths = batch.tweets
            
            # convert to 1d tensor
            predictions = model(tweet, tweet_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.labels)
            acc = categorical_accuracy(predictions, batch.labels)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [22]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
          -9.9345e-07, -2.6027e-06],
         [-1.2340e-05, -4.4133e-06, -8.5405e-06,  ...,  1.3380e-05,
          -4.8180e-06, -7.1422e-06],
         ...,
         [-7.6292e-05, -3.8113e-06, -7.1493e-05,  ...,  7.1932e-05,
          -4.1041e-05, -4.2745e-05],
         [-7.6459e-05, -3.6781e-06, -7.1986e-05,  ...,  7.1952e-05,
          -4.0941e-05, -4.2896e-05],
         [-7.6688e-05, -3.5141e-06, -7.2424e-05,  ...,  7.2016e-05,
          -4.0910e-05, -4.3061e-05]],

        [[-2.0247e-06, -1.0424e-06, -1.0973e-06,  ...,  3.1725e-06,
          -5.6651e-07, -1.7250e-06],
         [-1.3046e-05, -3.5733e-06, -1.1426e-05,  ...,  1.3694e-05,
          -5.0249e-06, -8.8247e-06],
         [-3.3106e-05, -5.5365e-06, -2.8884e-05,  ...,  3.1916e-05,
          -1.4338e-05, -1.9570e-05],
         ...,
         [-7.6845e-05, -3.5307e-06, -7.2027e-05,  ...,  7.2070e-05,
          -4.1239e-05, -4.2949e-05],
         [-7.7107e-05, -3.18

## Model Testing

In [23]:
#load weights and tokenizer

path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 



def classify_tweet(tweet):
    
    categories = {0: "Negative", 1:"Positive", 2:"Neutral"}
    
    # tokenize the tweet 
    tokenized = [tok.text for tok in nlp.tokenizer(tweet)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor)

    _, pred = torch.max(prediction, 1) 
    
    return categories[pred.item()]

In [24]:
classify_tweet("A valid explanation for why Trump won't let women on the golf course.")

tensor([[[ 0.0839, -0.0627, -0.0825,  ..., -0.0426, -0.0750, -0.0482],
         [ 0.1937, -0.1154, -0.1828,  ..., -0.0834, -0.1278, -0.1292],
         [ 0.3113, -0.1502, -0.2708,  ..., -0.1076, -0.1961, -0.2041],
         ...,
         [ 0.6527, -0.4815, -0.8081,  ..., -0.0718, -0.7175, -0.4683],
         [ 0.6836, -0.4875, -0.7952,  ..., -0.0728, -0.7162, -0.4802],
         [ 0.6706, -0.5181, -0.8372,  ..., -0.0509, -0.7595, -0.4574]]],
       device='cuda:0', grad_fn=<TransposeBackward0>)
tensor([[[-1.0397e-06, -4.9360e-07, -8.2656e-07,  ...,  1.1724e-06,
          -5.2796e-07, -6.2162e-07],
         [-3.9513e-06, -1.5027e-06, -3.2207e-06,  ...,  3.7692e-06,
          -2.1173e-06, -2.1966e-06],
         [-8.9399e-06, -2.7553e-06, -6.9920e-06,  ...,  8.2201e-06,
          -5.2166e-06, -4.5909e-06],
         ...,
         [-2.6777e-05, -3.7663e-06, -2.5168e-05,  ...,  2.5238e-05,
          -1.5981e-05, -1.2729e-05],
         [-2.6821e-05, -3.7482e-06, -2.5323e-05,  ...,  2.5267e-05,
  

'Negative'

In [25]:
for i in np.random.randint(0,len(valid),10):
  sent = " ".join((vars(valid.examples[i]))['tweets'])
  pred = classify_tweet(sent)
  label = (vars(valid.examples[i]))['labels']
  categories = {0: "Negative", 1:"Positive", 2:"Neutral"}
  print(f'Sentence: {sent[:60]} \t Predicted: {pred} \t Actual: {categories[label]}')
  

tensor([[[ 0.0619, -0.0506, -0.0729,  ..., -0.0113, -0.0349, -0.0390],
         [ 0.1409, -0.0733, -0.1024,  ..., -0.0628, -0.0629, -0.0849],
         [ 0.2322, -0.1946, -0.2378,  ..., -0.1223, -0.2371, -0.2334],
         ...,
         [ 0.7185, -0.4657, -0.8200,  ..., -0.0629, -0.7416, -0.5398],
         [ 0.6751, -0.4321, -0.8054,  ..., -0.0741, -0.7360, -0.5654],
         [ 0.6801, -0.4676, -0.8477,  ..., -0.0484, -0.7647, -0.5081]]],
       device='cuda:0', grad_fn=<TransposeBackward0>)
tensor([[[-6.6156e-07, -3.8608e-07, -3.9258e-07,  ...,  1.0034e-06,
          -2.3865e-07, -3.4380e-07],
         [-2.3832e-06, -1.0171e-06, -1.7535e-06,  ...,  2.6618e-06,
          -9.5414e-07, -1.2328e-06],
         [-6.7629e-06, -2.1811e-06, -5.5064e-06,  ...,  6.5268e-06,
          -3.4043e-06, -3.4645e-06],
         ...,
         [-2.6852e-05, -3.7677e-06, -2.5477e-05,  ...,  2.5290e-05,
          -1.5931e-05, -1.2840e-05],
         [-2.6862e-05, -3.7567e-06, -2.5495e-05,  ...,  2.5288e-05,
  