# Twitter Sentiment modeling using Glove and LSTM

In [1]:
# setup
import sys
import subprocess
import pkg_resources
from collections import Counter
import re


required = {'spacy', 'scikit-learn', 'numpy', 
            'pandas', 'torch', 'matplotlib'}
            
installed = {pkg.key for pkg in pkg_resources.working_set}
missing = required - installed

if missing:
    python = sys.executable
    subprocess.check_call([python, '-m', 'pip', 'install', *missing], stdout=subprocess.DEVNULL)

import spacy
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


from spacy.lang.en import English
!python -m spacy download en_core_web_md
import en_core_web_md
en = English()
nlp = en_core_web_md.load()

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
# this will set the device on which to train
#device = torch.device("cpu")
# if using collab, set your runtime to use GPU and use the line below
device = torch.device("cuda:0")
#Ensure GPU active
print('GPU active', torch.cuda.is_available())


Collecting en_core_web_md==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4MB)
[K     |████████████████████████████████| 96.4MB 1.1MB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-cp36-none-any.whl size=98051305 sha256=b18f728089303eb318fef93eecbe4aaf9875156f7f753f4bd3ff6abe919c6fd4
  Stored in directory: /tmp/pip-ephem-wheel-cache-3huj1zv7/wheels/df/94/ad/f5cf59224cea6b5686ac4fd1ad19c8a07bc026e13c36502d81
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
GPU active True


Utility methods

In [2]:
#remove punctuation and URLs, and stopwords
def tokenize(text, model=en, nostopwds=True):
   
    tokenlist = []
    doc = model(text)
    ent = ''
    for t in doc:
      
      if nostopwds and t.is_stop:
        #print(t.text)
        continue
      if t.like_url:
        tokenlist.append('URL')
        continue
      if not t.is_alpha:
        continue      
     
      tokenlist.append(t.lower_)
    return tokenlist

text= "Lol, th? oh @you:all got &amp friend for the d?g ?.. U.S. I'm at a  buffet... Cine there got amore wat... "
print(tokenize(text,nostopwds=False))


def doc_to_index(docs, vocab):
    # transform docs into series of indices
    docs_idxs = []
    for d in docs:
        w_idxs = []
        for w in d:
            if w in vocab:
                w_idxs.append(vocab[w])
            else:
                # unknown token = 1
                w_idxs.append(1)
        docs_idxs.append(w_idxs)
    return(docs_idxs)

def pad_sequence(seqs, seq_len=300):
    # function for adding padding to ensure all seq same length
    features = np.zeros((len(seqs), seq_len),dtype=int)
    for i, seq in enumerate(seqs):
        if len(seq) != 0:
            features[i, -len(seq):] = np.array(seq)[:seq_len]
    return features
    

['lol', 'th', 'oh', 'all', 'got', 'amp', 'friend', 'for', 'the', 'i', 'at', 'a', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']


Read in tweets

In [3]:
with open('clean_tweets_20k.pkl', 'rb') as f:
    df_tweet = pickle.load(f)
df_tweet.head()

Unnamed: 0,Target,text
0,0,woke up at 6.05am today and have just been tos...
1,4,@aaronSTEREOS Virtual ((hi five)) for you too!...
2,4,@jorskwen omg i sooo cannot wait. once it's pr...
3,4,todays a better day
4,0,@Aje0916 stinky mcstinkface


## Split data three ways
50,30,20- Train, test, validation

In [4]:
from sklearn.model_selection import train_test_split
X = df_tweet['text']
df_tweet['Target'] = df_tweet['Target'].replace(4,1)
y = df_tweet['Target']
X_trainval, X_test, y_trainval, y_test = train_test_split(X,y, test_size=.3, random_state=53)
X_train, X_val, y_train, y_val = train_test_split(X_trainval,y_trainval, test_size=.3, random_state=153)
print('Train count',X_train.count(), y_train.count())
print('Test count',X_test.count(), y_test.count())
print('Val count',X_val.count(), y_val.count())
y.value_counts()

Train count 9800 9800
Test count 6000 6000
Val count 4200 4200


1    10076
0     9924
Name: Target, dtype: int64

In [5]:
label = df_tweet['Target'].to_numpy()
all_tweets = df_tweet['text'].to_numpy()
print(label.shape, all_tweets.shape)


(20000,) (20000,)


In [6]:
#print(type(X_train), X_train)
parsed_train = [tokenize(str(d),nostopwds=False) for d in X_train]
parsed_val = [tokenize(str(d),nostopwds=False) for d in X_val]
parsed_test = [tokenize(str(d),nostopwds=False) for d in X_test]

In [None]:
print(len(parsed_train), parsed_train[0])

9800 ['i', 'really', 'really', 'really', 'miss', 'u', 'no', 'bullshit', 'no', 'other', 'motives', 'i', 'actually', 'just', 'want', 'b', 'in', 'ur', 'presence', 'amp', 'talk', 'u', 'again', 'like', 'now']


In [7]:
label_train = y_train.to_numpy()
label_test = y_test.to_numpy()
label_val = y_val.to_numpy()
print(label_train.shape)

(9800,)


In [8]:
# construct glove weight matrix
# construct vocab
cv = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False, min_df=.0001)
cv.fit(parsed_train)
vocab = cv.vocabulary_    #word to index dictionary
print("Size of vocab:", len(vocab), type(vocab))

vocab = dict([(v, vocab[v]+2) for v in vocab])
vocab['_UNK'] = 1
vocab['_PAD'] = 0
glove_vecs = np.zeros(shape=(len(vocab), 300))
for k, v in vocab.items():
    glove_vecs[v] = nlp(k).vector
print(glove_vecs.shape)



Size of vocab: 12664 <class 'dict'>
(12666, 300)


In [9]:
#save vocab dictionary
with open('vocab_20k.pkl', 'wb') as f:
    pickle.dump(vocab, f)

In [10]:

# idx  has indexes of words in vocab dictionary
#padded pads to 200 length each word sentence if needed
idx_train = doc_to_index(parsed_train, vocab)
padded_train = pad_sequence(idx_train)
idx_val = doc_to_index(parsed_val, vocab)
padded_val = pad_sequence(idx_val)
idx_test = doc_to_index(parsed_test, vocab)
padded_test = pad_sequence(idx_test)
print(len(idx_val),idx_val)
print(len(padded_val), padded_val, len(padded_val[0]))  #size is 2100 by 300 for seq length

4200 [[5296, 8812, 11186, 5406, 1, 1, 360, 5296, 4523, 10762, 9471, 7454, 8277, 11179, 4454, 9341], [7569, 6309, 11049, 4434, 12078], [2, 1285, 3, 1, 1548, 7518, 12244], [7725, 11000, 11323, 4469, 11710, 11179, 6429, 1, 3684], [12580, 499, 11235, 5984, 6416, 10991], [12195, 11000, 1, 8231, 1827, 5590, 140, 360, 5026, 5296, 5296, 12276, 11022, 12026, 7129, 7654, 5608], [4997, 2628, 12053, 11710, 5203, 11578, 3900, 11524, 11578, 1, 1813, 1, 750, 7279, 6642, 5608, 6439, 2346, 6981, 5590, 12132, 4746, 6551, 12473, 12459], [12122, 5406, 11000, 8174, 5608, 6444, 4600, 4482, 6526, 605, 11000, 1], [360, 10205, 5296, 6248, 12580, 4711, 3120], [7518, 4893, 2928, 10997, 9353, 4482], [5590, 3794, 11710, 360, 5590, 44, 11179, 4454, 5094, 1, 3, 7470, 7843, 603], [5296, 3803, 6303, 12416, 3, 1141, 11186, 1548, 12154, 44], [6416, 5296, 1000, 11000, 1, 12236, 889, 4913, 4171, 12580, 304, 6254, 5144, 11000, 9919, 5590, 8936, 5182], [7518, 1548, 5296, 6026, 12209, 8098, 8859, 12584, 11496], [5590, 3794, 

In [None]:
print(type(padded_train), padded_train.shape, type(label_train), label_train.shape)

<class 'numpy.ndarray'> (4900, 200) <class 'numpy.ndarray'> (4900,)


In [11]:
class SentimentNet(nn.Module):
    # sentiment classifier with single LSTM layer + Fully-connected layer, sigmoid activation and dropout
    # adapted from https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/
    def __init__(self,
                 weight_matrix=None,
                 vocab_size=1000, 
                 output_size=1,  
                 hidden_dim=512,
                 embedding_dim=400, 
                 n_layers=2, 
                 dropout_prob=0.5):
        super(SentimentNet, self).__init__()
        # size of the output, in this case it's one input to one output
        self.output_size = output_size
        # number of layers (default 2) one LSTM layer, one fully-connected layer
        self.n_layers = n_layers
        # dimensions of our hidden state, what is passed from one time point to the next
        self.hidden_dim = hidden_dim
        # initialize the representation to pass to the LSTM
        self.embedding, embedding_dim = self.init_embedding(
            vocab_size, 
            embedding_dim, 
            weight_matrix)
        # LSTM layer, where the magic happens
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=dropout_prob, batch_first=True)
        # dropout, similar to regularization
        self.dropout = nn.Dropout(dropout_prob)
        # fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)
        # sigmoid activiation
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        # forward pass of the network
        batch_size = x.size(0)
        # transform input
        embeds = self.embedding(x)
        # run input embedding + hidden state through model
        lstm_out, hidden = self.lstm(embeds, hidden)
        # reshape
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        # dropout certain pct of connections
        out = self.dropout(lstm_out)
        # fully connected layer
        out = self.fc(out)
        # activation function
        out = self.sigmoid(out)
        # reshape
        out = out.view(batch_size, -1)
        out = out[:,-1]
        # return the output and the hidden state
        return out, hidden
    
    def init_embedding(self, vocab_size, embedding_dim, weight_matrix):
        # initializes the embedding
        if weight_matrix is None:
            if vocab_size is None:
                raise ValueError('If no weight matrix, need a vocab size')
            # if embedding is a size, initialize trainable
            return(nn.Embedding(vocab_size, embedding_dim),
                   embedding_dim)
        else:
            # otherwise use matrix as pretrained
            weights = torch.FloatTensor(weight_matrix)
            return(nn.Embedding.from_pretrained(weights),
                  weights.shape[1])
    
    def init_hidden(self, batch_size):
        # initializes the hidden state
        hidden = (torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device),
                  torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device))
        return hidden

In [12]:
def train_model(model, train_loader, val_loader, model_params, training_params):
    # utility for running the training process
    model.to(device)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), 
                                 lr=training_params['learning_rate'])
    epochs = training_params['epochs']
    batch_size = training_params['batch_size']
    # print options
    counter = 0
    print_every = 5
    clip = 5
    valid_loss_min = np.Inf
    #print(training_params['learning_rate'])
    model.train()
    for i in range(epochs):
        h = model.init_hidden(batch_size)
        for inputs, labels in train_loader:
            counter += 1
            h = tuple([e.data for e in h])
            inputs, labels = inputs.to(device), labels.to(device)
            model.zero_grad()
            output, h = model(inputs, h)
            loss = criterion(output.squeeze(), labels.float())
            #print('loss',loss.item())
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

            if counter%print_every == 0:
                val_h = model.init_hidden(batch_size)
                val_losses = []
                model.eval()
                for inp, lab in val_loader:
                    val_h = tuple([each.data for each in val_h])
                    inp, lab = inp.to(device), lab.to(device)
                    out, val_h = model(inp, val_h)
                    val_loss = criterion(out.squeeze(), lab.float())
                    val_losses.append(val_loss.item())
                    #print('val loss',val_loss.item())

                model.train()
                
                print("Epoch: {}/{}...".format(i+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)))
                
                if np.mean(val_losses) <= valid_loss_min:
                    torch.save(model.state_dict(), './state_dict.pt')
                    print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                    valid_loss_min = np.mean(val_losses)
        #print('counter',counter)
    return(model)
    
def assess_accuracy(model, test_loader, model_params, training_params):
    # utility for assessing accuracy
    batch_size = training_params['batch_size']
    model.load_state_dict(torch.load('./state_dict.pt'))
    h = model.init_hidden(batch_size)
    num_correct = 0
    model.eval()
    for inputs, labels in test_loader:
        h = tuple([each.data for each in h])
        inputs, labels = inputs.to(device), labels.to(device)
        output, h = model(inputs, h)
        # takes output, rounds to 0/1
        pred = torch.round(output.squeeze())
        # take the correct labels, check against preds
        correct_tensor = pred.eq(labels.float().view_as(pred))
        correct = np.squeeze(correct_tensor.cpu().numpy())
        # sum the number of correct
        num_correct += np.sum(correct)
    # calc accuracy
    test_acc = num_correct/len(test_loader.dataset)
    print('LSTM accuracy:', test_acc)
    return test_acc

In [13]:
#construct datasets for loading by PyTorch

train_data = TensorDataset(torch.from_numpy(padded_train), torch.from_numpy(label_train))
val_data = TensorDataset(torch.from_numpy(padded_val), torch.from_numpy(label_val))
test_data = TensorDataset(torch.from_numpy(padded_test), torch.from_numpy(label_test))


In [None]:
glove_vecs.shape

(12665, 300)

#Train multiple iterations. Ideally should be a nice looping mechanism for tuning various parameters but due to system and time limitation just trying a few options.

In [14]:

model_params = {'weight_matrix': glove_vecs,
               'output_size': 1,
               'hidden_dim': 512,
               'n_layers': 2,
               'dropout_prob': 0.001}

print('Start training')

training_params={'learning_rate':.01,
                'batch_size':1000,
                'epochs':10}

batch_size = training_params['batch_size']
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,
                         drop_last=True) # this is to keep the size consistent
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size,
                       drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size,
                        drop_last=True)

model = SentimentNet(**model_params)

trainedmodel = train_model(model, train_loader, val_loader, model_params, training_params)
acc = assess_accuracy(trainedmodel, test_loader, model_params, training_params)

torch.save(trainedmodel, './lstm_model_20k.pt')

Start training
Epoch: 1/10... Step: 5... Loss: 0.837169... Val Loss: 0.760624
Validation loss decreased (inf --> 0.760624).  Saving model ...
Epoch: 2/10... Step: 10... Loss: 0.694106... Val Loss: 0.693390
Validation loss decreased (0.760624 --> 0.693390).  Saving model ...
Epoch: 2/10... Step: 15... Loss: 0.698848... Val Loss: 0.698375
Epoch: 3/10... Step: 20... Loss: 0.702969... Val Loss: 0.696326
Epoch: 3/10... Step: 25... Loss: 0.693145... Val Loss: 0.695044
Epoch: 4/10... Step: 30... Loss: 0.693017... Val Loss: 0.693184
Validation loss decreased (0.693390 --> 0.693184).  Saving model ...
Epoch: 4/10... Step: 35... Loss: 0.692664... Val Loss: 0.693288
Epoch: 5/10... Step: 40... Loss: 0.693331... Val Loss: 0.693264
Epoch: 5/10... Step: 45... Loss: 0.692599... Val Loss: 0.693144
Validation loss decreased (0.693184 --> 0.693144).  Saving model ...
Epoch: 6/10... Step: 50... Loss: 0.693434... Val Loss: 0.692945
Validation loss decreased (0.693144 --> 0.692945).  Saving model ...
Epoch:

  "type " + obj.__name__ + ". It won't be checked "


Only 50 % accuracy

In [15]:
#test run 20 iterations
model_params = {'weight_matrix': glove_vecs,
               'output_size': 1,
               'hidden_dim': 512,
               'n_layers': 2,
               'dropout_prob': 0.001}

print('Start training')

training_params={'learning_rate':.01,
                'batch_size':1000,
                'epochs':20}

batch_size = training_params['batch_size']
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,
                         drop_last=True) # this is to keep the size consistent
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size,
                       drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size,
                        drop_last=True)

model = SentimentNet(**model_params)

trainedmodel = train_model(model, train_loader, val_loader, model_params, training_params)
acc = assess_accuracy(trainedmodel, test_loader, model_params, training_params)

torch.save(trainedmodel, './lstm_model_20k_2.pt')

Start training
Epoch: 1/20... Step: 5... Loss: 0.709790... Val Loss: 0.693461
Validation loss decreased (inf --> 0.693461).  Saving model ...
Epoch: 2/20... Step: 10... Loss: 0.715023... Val Loss: 1.741908
Epoch: 2/20... Step: 15... Loss: 0.693606... Val Loss: 0.703359
Epoch: 3/20... Step: 20... Loss: 0.692345... Val Loss: 0.683308
Validation loss decreased (0.693461 --> 0.683308).  Saving model ...
Epoch: 3/20... Step: 25... Loss: 0.666897... Val Loss: 0.756612
Epoch: 4/20... Step: 30... Loss: 0.678770... Val Loss: 0.681154
Validation loss decreased (0.683308 --> 0.681154).  Saving model ...
Epoch: 4/20... Step: 35... Loss: 0.667444... Val Loss: 0.663672
Validation loss decreased (0.681154 --> 0.663672).  Saving model ...
Epoch: 5/20... Step: 40... Loss: 0.629398... Val Loss: 0.726714
Epoch: 5/20... Step: 45... Loss: 0.637992... Val Loss: 0.659084
Validation loss decreased (0.663672 --> 0.659084).  Saving model ...
Epoch: 6/20... Step: 50... Loss: 0.634481... Val Loss: 0.632270
Valida

  "type " + obj.__name__ + ". It won't be checked "


Drastic improvement to .7433 accuracy

In [16]:
model_params = {'weight_matrix': glove_vecs,
               'output_size': 1,
               'hidden_dim': 512,
               'n_layers': 2,
               'dropout_prob': 0.001}

print('Start training')

training_params={'learning_rate':.01,
                'batch_size':500,
                'epochs':10}

batch_size = training_params['batch_size']
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,
                         drop_last=True) # this is to keep the size consistent
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size,
                       drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size,
                        drop_last=True)

model = SentimentNet(**model_params)

trainedmodel = train_model(model, train_loader, val_loader, model_params, training_params)
acc = assess_accuracy(trainedmodel, test_loader, model_params, training_params)

torch.save(trainedmodel, './lstm_model_20k_3.pt')

Start training
Epoch: 1/10... Step: 5... Loss: 0.747051... Val Loss: 0.766833
Validation loss decreased (inf --> 0.766833).  Saving model ...
Epoch: 1/10... Step: 10... Loss: 0.688765... Val Loss: 0.687030
Validation loss decreased (0.766833 --> 0.687030).  Saving model ...
Epoch: 1/10... Step: 15... Loss: 0.681316... Val Loss: 0.673551
Validation loss decreased (0.687030 --> 0.673551).  Saving model ...
Epoch: 2/10... Step: 20... Loss: 0.631607... Val Loss: 0.593319
Validation loss decreased (0.673551 --> 0.593319).  Saving model ...
Epoch: 2/10... Step: 25... Loss: 0.588139... Val Loss: 0.623209
Epoch: 2/10... Step: 30... Loss: 0.583294... Val Loss: 0.581152
Validation loss decreased (0.593319 --> 0.581152).  Saving model ...
Epoch: 2/10... Step: 35... Loss: 0.580151... Val Loss: 0.554258
Validation loss decreased (0.581152 --> 0.554258).  Saving model ...
Epoch: 3/10... Step: 40... Loss: 0.577434... Val Loss: 0.533067
Validation loss decreased (0.554258 --> 0.533067).  Saving model 

  "type " + obj.__name__ + ". It won't be checked "


In [28]:
model_params = {'weight_matrix': glove_vecs,
               'output_size': 1,
               'hidden_dim': 512,
               'n_layers': 2,
               'dropout_prob': 0.001}

print('Start training')

training_params={'learning_rate':.01,
                'batch_size':500,
                'epochs':25}

batch_size = training_params['batch_size']
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,
                         drop_last=True) # this is to keep the size consistent
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size,
                       drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size,
                        drop_last=True)

model = SentimentNet(**model_params)

trainedmodel = train_model(model, train_loader, val_loader, model_params, training_params)
acc = assess_accuracy(trainedmodel, test_loader, model_params, training_params)

torch.save(trainedmodel, './lstm_model_20k_4.pt')

Start training
Epoch: 1/25... Step: 5... Loss: 0.746736... Val Loss: 0.718121
Validation loss decreased (inf --> 0.718121).  Saving model ...
Epoch: 1/25... Step: 10... Loss: 0.752775... Val Loss: 0.697846
Validation loss decreased (0.718121 --> 0.697846).  Saving model ...
Epoch: 1/25... Step: 15... Loss: 0.693148... Val Loss: 0.737211
Epoch: 2/25... Step: 20... Loss: 0.701163... Val Loss: 0.690895
Validation loss decreased (0.697846 --> 0.690895).  Saving model ...
Epoch: 2/25... Step: 25... Loss: 0.691809... Val Loss: 0.693848
Epoch: 2/25... Step: 30... Loss: 0.699478... Val Loss: 0.690668
Validation loss decreased (0.690895 --> 0.690668).  Saving model ...
Epoch: 2/25... Step: 35... Loss: 0.682450... Val Loss: 0.682670
Validation loss decreased (0.690668 --> 0.682670).  Saving model ...
Epoch: 3/25... Step: 40... Loss: 0.669544... Val Loss: 0.668988
Validation loss decreased (0.682670 --> 0.668988).  Saving model ...
Epoch: 3/25... Step: 45... Loss: 0.715576... Val Loss: 0.683592
E

  "type " + obj.__name__ + ". It won't be checked "


Performance improved with lower batch size and more epochs - .765
#Writing a predict method for the model to test prediction for 1 tweet

In [25]:
def predict(model,vocab, text, seqlen=300):
  parsed_text = tokenize(str(text),nostopwds=False)

  ##create word index
  w_idx = []
  for w in parsed_text:
    if w in vocab:
      w_idx.append(vocab[w])
    else:
      # unknown token = 1
      w_idx.append(1)
       
  print(parsed_text,'\n',w_idx)
  ## pad vector to 300 length 
  padded_text = np.zeros((1,seqlen),dtype=int)
  padded_text[0,-len(w_idx):] = np.array(w_idx)[:seqlen]
  #padded_text = pad_sequence(w_idx)
  print('padded_text', padded_text.shape)
  tensor_data = torch.from_numpy(padded_text)
  batch_size = tensor_data.size(0)
  print('batchsize',batch_size)

  model.eval()
  h = model.init_hidden(batch_size)
  tensor_data = tensor_data.to(device)
  output, h = model(tensor_data, h)
  # takes output, rounds to 0/1
  pred = torch.round(output.squeeze())
  if (pred.item() == 0):
     print('prediction negative sentiment', pred.item())
  else:
    print('prediction positive sentiment', pred.item())

In [27]:
idx = np.random.randint(len(df_tweet))
text = df_tweet.iloc[idx]['text']
print('actual data',df_tweet.iloc[idx]['text'], df_tweet.iloc[idx]['Target'])
predict(model,vocab, text)


actual data good day!  // blessed vibes!  1
['good', 'day', 'blessed', 'vibes'] 
 [4482, 2719, 1121, 11846]
padded_text (1, 300)
batchsize 1
prediction positive sentiment 1.0
