# Twitter sentiment modeling using GloVe and an LSTM

In [1]:
# setup
import sys
import subprocess
import pkg_resources
from collections import Counter
import re


# Third-party distributions this notebook needs at import time.
required = {'spacy', 'scikit-learn', 'numpy',
            'pandas', 'torch', 'matplotlib'}

# Keys of every distribution visible in the running environment.
installed = set(dist.key for dist in pkg_resources.working_set)
missing = required.difference(installed)

# Install whatever is absent using the interpreter that runs this kernel;
# pip's stdout is discarded (stderr is still shown).
if len(missing) > 0:
    python = sys.executable
    pip_cmd = [python, '-m', 'pip', 'install']
    pip_cmd.extend(missing)
    subprocess.check_call(pip_cmd, stdout=subprocess.DEVNULL)

import spacy
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


from spacy.lang.en import English
!python -m spacy download en_core_web_md
import en_core_web_md
# Blank English pipeline; this is the default `model` argument of tokenize().
en = English()
# Loaded en_core_web_md model; later cells call it to obtain word vectors
# for the vocabulary.
nlp = en_core_web_md.load()

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
# Device on which the model is trained and evaluated: the first CUDA GPU when
# PyTorch can see one, otherwise the CPU. Falling back keeps every later
# `.to(device)` call working on machines without a GPU.
# On Colab, set the runtime type to GPU to get CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Report whether a GPU is available
print('GPU active', torch.cuda.is_available())


Collecting en_core_web_md==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz (95.4MB)
[K     |████████████████████████████████| 95.4MB 1.1MB/s 
[?25hBuilding wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.1.0-cp36-none-any.whl size=97126236 sha256=8aa0aa2000858c0aab17a77a878e78ea5a2c0a5481960657044e6662b30fa5da
  Stored in directory: /tmp/pip-ephem-wheel-cache-1c_7zxf8/wheels/c1/2c/5f/fd7f3ec336bf97b0809c86264d2831c5dfb00fc2e239d1bb01
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')
GPU active True


Utility methods

In [11]:
def tokenize(text, model=en, nostopwds=True):
    """Split `text` into a list of normalised word tokens.

    Args:
        text: raw string to tokenize.
        model: callable pipeline applied to `text` that yields tokens exposing
            `is_stop`, `like_url`, `is_alpha` and `lower_`; defaults to the
            module-level `en` pipeline.
        nostopwds: when True, tokens flagged as stop words are dropped.

    Returns:
        A list of strings in document order. URL-like tokens are replaced by
        the placeholder 'URL', purely alphabetic tokens are lower-cased, and
        every other token (punctuation, numbers, mixed tokens) is dropped.
    """
    tokenlist = []
    for t in model(text):
        # The stop-word filter is applied before any other rule.
        if nostopwds and t.is_stop:
            continue
        if t.like_url:
            # URLs are collapsed to a single placeholder rather than removed.
            tokenlist.append('URL')
        elif t.is_alpha:
            tokenlist.append(t.lower_)
    return tokenlist

# Sanity check: run tokenize() on a noisy sample string, keeping stop words.
text= "Lol, th? oh @you:all got &amp friend for the d?g ?.. U.S. I'm at a  buffet... Cine there got amore wat... "
print(tokenize(text,nostopwds=False))


def doc_to_index(docs, vocab):
    """Map tokenized documents to lists of vocabulary indices.

    Args:
        docs: iterable of documents, each an iterable of tokens.
        vocab: mapping from token to integer index.

    Returns:
        A list with one list of indices per document; tokens missing from
        `vocab` are encoded as 1 (the unknown-token index).
    """
    unknown_idx = 1
    return [[vocab.get(token, unknown_idx) for token in doc] for doc in docs]

def pad_sequence(seqs, seq_len=200):
    """Left-pad (with zeros) or truncate index sequences to a fixed length.

    Args:
        seqs: sequence of index lists.
        seq_len: number of columns in the result.

    Returns:
        An integer array of shape (len(seqs), seq_len). Shorter sequences are
        right-aligned with leading zeros; longer ones keep only their first
        `seq_len` entries; empty sequences become all-zero rows.
    """
    padded = np.zeros((len(seqs), seq_len), dtype=int)
    for row, seq in zip(padded, seqs):
        kept = np.array(seq)[:seq_len]
        if kept.size:
            # `row` is a view into `padded`, so this writes in place.
            row[seq_len - kept.size:] = kept
    return padded
    

['lol', 'th', 'oh', 'all', 'got', 'amp', 'friend', 'for', 'the', 'i', 'at', 'a', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']


Read in tweets

In [3]:
# Load the pickled tweets DataFrame from the working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('clean_tweets_50k.pkl', 'rb') as f:
    df_tweet = pickle.load(f)
# Preview the first rows.
df_tweet.head()

Unnamed: 0,Target,text
0,4,@idontnow1 Hey nothing much and you ?
1,0,Had a wonderful dinner with his cuz. Explored ...
2,4,"@CarloAtYourServ what I got inside, I can't ma..."
3,0,The dog ate my homework. . . no. . . make that...
4,4,Bruno arghhhh i cant wait


## Split data three ways
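Approximately 49% / 30% / 21% train / test / validation: 30% of the data is held out for testing, then 30% of the remaining 70% is used for validation.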

In [4]:
from sklearn.model_selection import train_test_split
# Features are the raw tweet text; the target is recoded so label 4 becomes 1.
X = df_tweet['text']
df_tweet['Target'] = df_tweet['Target'].replace(4,1)
y = df_tweet['Target']
# First split: hold out 30% of all rows as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(X,y, test_size=.3, random_state=53)
# Second split: 30% of the remaining 70% (i.e. 21% of all rows) becomes the
# validation set, leaving 49% for training.
X_train, X_val, y_train, y_val = train_test_split(X_trainval,y_trainval, test_size=.3, random_state=153)
# Sanity-check the split sizes and the overall class balance.
print('Train count',X_train.count(), y_train.count())
print('Test count',X_test.count(), y_test.count())
print('Val count',X_val.count(), y_val.count())
y.value_counts()

Train count 24500 24500
Test count 15000 15000
Val count 10500 10500


0    25010
1    24990
Name: Target, dtype: int64

In [5]:
# NumPy arrays of the full (unsplit) label and text columns, with a shape check.
label = df_tweet['Target'].to_numpy()
all_tweets = df_tweet['text'].to_numpy()
print(label.shape, all_tweets.shape)


(50000,) (50000,)


In [6]:
# Tokenize every tweet of each split with tokenize(), keeping stop words.
# Each result is a list of token lists, one per tweet.
parsed_train = [tokenize(str(d),nostopwds=False) for d in X_train]
parsed_val = [tokenize(str(d),nostopwds=False) for d in X_val]
parsed_test = [tokenize(str(d),nostopwds=False) for d in X_test]

In [None]:
# Number of training documents and the tokens of the first one.
print(len(parsed_train), parsed_train[0])

24500 ['cake', 'is', 'shit', 'i', 'have', 'successfully', 'made', 'cake', 'that', 'tastes', 'of', 'olive', 'oil', 'impressive']


In [7]:
# Convert the label Series of each split to NumPy arrays for use with PyTorch.
label_train = y_train.to_numpy()
label_test = y_test.to_numpy()
label_val = y_val.to_numpy()
print(label_train.shape)

(24500,)


In [12]:
# Build the vocabulary from the training split only, then the embedding
# weight matrix aligned with it.
# The documents are already token lists, so the tokenizer is the identity and
# lower-casing is disabled; min_df=.0001 is a document-frequency threshold.
cv = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False, min_df=.0001)
cv.fit(parsed_train)
vocab = cv.vocabulary_    # word -> index dictionary
print("Size of vocab:", len(vocab), type(vocab))

# Shift every index up by 2 so that 0 and 1 are free for the special tokens.
vocab = dict([(v, vocab[v]+2) for v in vocab])
vocab['_UNK'] = 1
vocab['_PAD'] = 0
# One 300-dimensional row per vocabulary entry (special tokens included).
glove_vecs = np.zeros(shape=(len(vocab), 300))
# NOTE(review): this loop also covers '_UNK' and '_PAD', so their rows hold
# whatever nlp() returns for those literal strings instead of staying zero.
# Confirm this is intended, in particular for the padding row.
for k, v in vocab.items():
    glove_vecs[v] = nlp(k).vector
print(glove_vecs.shape)



Size of vocab: 6178 <class 'dict'>
(6180, 300)


In [13]:
# Persist the word -> index vocabulary (including _UNK and _PAD) to disk.
with open('vocab_50k.pkl', 'wb') as f:
    pickle.dump(vocab, f)

In [14]:

# idx_* hold, per document, the vocabulary index of every token
# (1 for tokens that are not in the vocabulary).
# padded_* are the same sequences left-padded with 0 / truncated to the
# default length of 200 by pad_sequence().
idx_train = doc_to_index(parsed_train, vocab)
padded_train = pad_sequence(idx_train)
idx_val = doc_to_index(parsed_val, vocab)
padded_val = pad_sequence(idx_val)
idx_test = doc_to_index(parsed_test, vocab)
padded_test = pad_sequence(idx_test)
# NOTE(review): this prints the entire validation index list, which floods the
# cell output; consider printing a slice such as idx_val[:3] instead.
print(len(idx_val),idx_val)
print(len(padded_val), padded_val, len(padded_val[0]))  # number of documents, padded matrix, sequence length (200)

10500 [[2635, 475, 5720, 4817, 1172, 2195, 378, 5483, 4856], [2409, 2409, 2635, 4659, 5948, 2635, 798, 1940, 2015, 6100, 5242, 3148], [2635, 163, 4905, 4200, 5483, 4601, 5387, 2635, 163, 5389, 1321, 3728, 5389, 2930, 5948, 1, 5389, 1629, 5999, 2497, 2360, 1, 3470], [2750, 4558, 4836, 311, 2532, 5936, 4732, 1172, 448, 2201, 5483, 3, 1], [6108, 2015, 2094, 3358, 1714, 5389, 2117], [2750, 5389, 1816, 5407, 1, 5483, 5389, 1988, 3788, 2750, 2759, 4933, 1684], [3148, 2195, 6145], [5424, 5404, 5924, 2693, 5404, 3407, 1636, 4905, 1, 3550, 1, 756, 5392, 2005, 2750, 2268, 2635, 3438, 1, 3697], [5501, 2635, 163, 2201, 5483, 2728, 3, 1, 1, 2759, 2750, 2205, 3556, 448, 2574, 4905, 2562, 5888, 4832, 2682, 5389, 2123, 86, 5387], [2559, 5663, 5514, 2759, 1638, 5485, 5397], [5888, 5077, 3759, 5389, 4763, 187, 5938, 5401, 265, 3655, 1, 276, 5942, 2750, 5883, 1861, 2074, 5389, 4763, 6145, 4771, 4030, 5489], [2532, 2074, 4619, 2156, 890, 1642, 3, 640, 3728, 1, 5397, 2635, 163, 3729, 5483, 5389, 3733, 5718

In [15]:
# Check container types and shapes of the training inputs and labels.
print(type(padded_train), padded_train.shape, type(label_train), label_train.shape)

<class 'numpy.ndarray'> (24500, 200) <class 'numpy.ndarray'> (24500,)


In [16]:
class SentimentNet(nn.Module):
    """Sentiment classifier: embedding -> stacked LSTM -> dropout -> linear -> sigmoid.

    Adapted from
    https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/

    Args:
        weight_matrix: optional 2-D array of pretrained embeddings
            (rows = vocabulary entries, columns = embedding dimension). When
            given, `vocab_size` and `embedding_dim` are ignored and the
            embedding is created with `nn.Embedding.from_pretrained` (frozen
            by that function's default).
        vocab_size: number of embedding rows when no `weight_matrix` is given.
        output_size: number of output units of the final linear layer.
        hidden_dim: size of the LSTM hidden state.
        embedding_dim: embedding width when no `weight_matrix` is given.
        n_layers: number of stacked LSTM layers.
        dropout_prob: dropout probability used both between LSTM layers and
            before the linear layer.

    Raises:
        ValueError: if both `weight_matrix` and `vocab_size` are None.
    """

    def __init__(self,
                 weight_matrix=None,
                 vocab_size=1000,
                 output_size=1,
                 hidden_dim=512,
                 embedding_dim=400,
                 n_layers=2,
                 dropout_prob=0.5):
        super().__init__()
        # number of units produced by the final linear layer
        self.output_size = output_size
        # number of stacked LSTM layers (the linear layer is not counted)
        self.n_layers = n_layers
        # width of the hidden state passed from one time step to the next
        self.hidden_dim = hidden_dim
        # embedding layer; embedding_dim is replaced by the matrix width when
        # pretrained weights are supplied
        self.embedding, embedding_dim = self.init_embedding(
            vocab_size,
            embedding_dim,
            weight_matrix)
        # recurrent layer(s); inputs are (batch, seq, feature)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=dropout_prob, batch_first=True)
        # dropout applied to the LSTM outputs before the linear layer
        self.dropout = nn.Dropout(dropout_prob)
        # fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)
        # sigmoid activation
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden):
        """Run one batch through the network.

        Args:
            x: integer tensor of token indices, one row per sample.
            hidden: (h, c) tuple as produced by `init_hidden` or by a
                previous call.

        Returns:
            (out, hidden): `out` holds one value per sample, namely the last
            column of the per-sample flattened sigmoid outputs (the final
            time step when `output_size` is 1); `hidden` is the updated LSTM
            state.
        """
        batch_size = x.size(0)
        # look up embeddings for every token index
        embeds = self.embedding(x)
        # run embeddings + previous state through the LSTM
        lstm_out, hidden = self.lstm(embeds, hidden)
        # flatten to one row per (sample, time step)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        # dropout, linear layer and sigmoid are applied to every time step
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.sigmoid(out)
        # regroup per sample and keep only the last value of each row
        out = out.view(batch_size, -1)
        out = out[:, -1]
        return out, hidden

    def init_embedding(self, vocab_size, embedding_dim, weight_matrix):
        """Create the embedding layer.

        Returns:
            (embedding, embedding_dim): a trainable `nn.Embedding` of the
            requested size when `weight_matrix` is None, otherwise an
            embedding built from `weight_matrix` together with its width.

        Raises:
            ValueError: if both `weight_matrix` and `vocab_size` are None.
        """
        if weight_matrix is None:
            if vocab_size is None:
                raise ValueError('If no weight matrix, need a vocab size')
            # no pretrained weights: randomly initialised, trainable embedding
            return (nn.Embedding(vocab_size, embedding_dim),
                    embedding_dim)
        # pretrained weights: convert to float32 and wrap as an embedding
        weights = torch.FloatTensor(weight_matrix)
        return (nn.Embedding.from_pretrained(weights),
                weights.shape[1])

    def init_hidden(self, batch_size):
        """Return a zero (h, c) state for `batch_size` samples.

        The tensors are created with the device and dtype of the model's own
        parameters, so the state always lives wherever the model currently
        is instead of depending on a module-level `device` variable.
        """
        reference = next(self.parameters())
        state_shape = (self.n_layers, batch_size, self.hidden_dim)
        hidden = (reference.new_zeros(state_shape),
                  reference.new_zeros(state_shape))
        return hidden

In [17]:
def train_model(model, train_loader, val_loader, model_params, training_params):
    """Train `model` with Adam and binary cross-entropy, checkpointing the best state.

    Args:
        model: network exposing `init_hidden(batch_size)` and callable as
            `model(inputs, hidden) -> (output, hidden)`.
        train_loader: iterable of (inputs, labels) batches used for training.
        val_loader: iterable of (inputs, labels) batches used for validation.
        model_params: not used by this function; kept for interface
            compatibility.
        training_params: dict with the keys 'learning_rate', 'epochs' and
            'batch_size'.

    Returns:
        The same `model` object with the weights of the last training step.
        The weights with the lowest validation loss are the ones written to
        './state_dict.pt'.

    Side effects:
        Moves the model to the module-level `device`, prints progress every
        5 steps and overwrites './state_dict.pt' whenever the mean validation
        loss does not exceed the best value seen so far.
    """
    model.to(device)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=training_params['learning_rate'])
    epochs = training_params['epochs']
    batch_size = training_params['batch_size']
    counter = 0          # global step count across epochs
    print_every = 5      # validate / report every this many steps
    clip = 5             # max gradient norm
    # np.inf rather than the np.Inf alias, which NumPy 2.0 removed
    valid_loss_min = np.inf
    model.train()
    for i in range(epochs):
        h = model.init_hidden(batch_size)
        for inputs, labels in train_loader:
            counter += 1
            # Detach the carried-over state so gradients do not flow back
            # into previous batches.
            h = tuple([e.detach() for e in h])
            inputs, labels = inputs.to(device), labels.to(device)
            model.zero_grad()
            output, h = model(inputs, h)
            loss = criterion(output.squeeze(), labels.float())
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

            if counter % print_every == 0:
                val_h = model.init_hidden(batch_size)
                val_losses = []
                model.eval()
                # No gradients are needed for validation; disabling autograd
                # avoids building a graph for every validation batch.
                with torch.no_grad():
                    for inp, lab in val_loader:
                        inp, lab = inp.to(device), lab.to(device)
                        out, val_h = model(inp, val_h)
                        val_loss = criterion(out.squeeze(), lab.float())
                        val_losses.append(val_loss.item())

                model.train()

                mean_val_loss = np.mean(val_losses)
                print("Epoch: {}/{}...".format(i+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(mean_val_loss))

                if mean_val_loss <= valid_loss_min:
                    torch.save(model.state_dict(), './state_dict.pt')
                    print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min, mean_val_loss))
                    valid_loss_min = mean_val_loss
    return(model)
    
def assess_accuracy(model, test_loader, model_params, training_params):
    """Load the checkpointed weights and report accuracy on `test_loader`.

    Args:
        model: network exposing `init_hidden(batch_size)` and callable as
            `model(inputs, hidden) -> (output, hidden)`; its weights are
            overwritten in place with those stored in './state_dict.pt'.
        test_loader: iterable of (inputs, labels) batches.
        model_params: not used by this function; kept for interface
            compatibility.
        training_params: dict providing 'batch_size'.

    Returns:
        Fraction of evaluated samples whose rounded output equals the label
        (0.0 if the loader yields no batches). The denominator is the number
        of samples actually evaluated, so samples skipped by a loader created
        with `drop_last=True` no longer count as errors.
    """
    batch_size = training_params['batch_size']
    # map_location lets a checkpoint saved on one device load on another.
    model.load_state_dict(torch.load('./state_dict.pt', map_location=device))
    h = model.init_hidden(batch_size)
    num_correct = 0
    num_seen = 0
    model.eval()
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            output, h = model(inputs, h)
            # round the sigmoid output to a hard 0/1 prediction
            pred = torch.round(output.squeeze())
            # compare predictions with the labels
            correct_tensor = pred.eq(labels.float().view_as(pred))
            num_correct += int(correct_tensor.sum().item())
            num_seen += labels.size(0)
    # accuracy over the samples that were actually evaluated
    test_acc = num_correct / num_seen if num_seen else 0.0
    print('LSTM accuracy:', test_acc)
    return test_acc

In [18]:
# Wrap the padded index matrices and the label arrays of each split in
# TensorDatasets so they can be fed to PyTorch DataLoaders.

train_data = TensorDataset(torch.from_numpy(padded_train), torch.from_numpy(label_train))
val_data = TensorDataset(torch.from_numpy(padded_val), torch.from_numpy(label_val))
test_data = TensorDataset(torch.from_numpy(padded_test), torch.from_numpy(label_test))


In [19]:
# Embedding matrix shape: (vocabulary size incl. special tokens, vector width).
glove_vecs.shape

(6180, 300)

In [20]:
# Single end-to-end run: build the model on the pretrained vectors, train it,
# evaluate on the test split and save the result.
model_params = {'weight_matrix': glove_vecs,
               'output_size': 1,
               'hidden_dim': 512,
               'n_layers': 2,
               'dropout_prob': 0.001}

print('Start training')

training_params={'learning_rate':.01,
                'batch_size':1000,
                'epochs':10}

batch_size = training_params['batch_size']
# drop_last=True keeps every batch at exactly batch_size samples, matching the
# hidden state created by init_hidden(batch_size).
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,
                         drop_last=True) # this is to keep the size consistent
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size,
                       drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size,
                        drop_last=True)

model = SentimentNet(**model_params)

# train_model checkpoints the best weights to ./state_dict.pt;
# assess_accuracy reloads that checkpoint into the model before scoring.
trainedmodel = train_model(model, train_loader, val_loader, model_params, training_params)
acc = assess_accuracy(trainedmodel, test_loader, model_params, training_params)

# Saves the whole model object (not just its state_dict).
torch.save(trainedmodel, './lstm_model_50k_1000.pt')

Start training
Epoch: 1/10... Step: 5... Loss: 0.736000... Val Loss: 0.693266
Validation loss decreased (inf --> 0.693266).  Saving model ...
Epoch: 1/10... Step: 10... Loss: 0.694029... Val Loss: 0.697492
Epoch: 1/10... Step: 15... Loss: 0.693326... Val Loss: 0.693229
Validation loss decreased (0.693266 --> 0.693229).  Saving model ...
Epoch: 1/10... Step: 20... Loss: 0.693614... Val Loss: 0.696396
Epoch: 2/10... Step: 25... Loss: 0.704118... Val Loss: 0.697240
Epoch: 2/10... Step: 30... Loss: 0.697536... Val Loss: 0.694586
Epoch: 2/10... Step: 35... Loss: 0.693474... Val Loss: 0.698580
Epoch: 2/10... Step: 40... Loss: 0.697458... Val Loss: 0.699062
Epoch: 2/10... Step: 45... Loss: 0.694958... Val Loss: 0.695373
Epoch: 3/10... Step: 50... Loss: 0.693888... Val Loss: 0.692961
Validation loss decreased (0.693229 --> 0.692961).  Saving model ...
Epoch: 3/10... Step: 55... Loss: 0.692918... Val Loss: 0.692197
Validation loss decreased (0.692961 --> 0.692197).  Saving model ...
Epoch: 3/10

  "type " + obj.__name__ + ". It won't be checked "


Result: 0.754 accuracy after 10 epochs.