In [1]:
# setup
import sys
import subprocess
import pkg_resources
from collections import Counter
import re


required = {'spacy', 'scikit-learn', 'numpy', 
            'pandas', 'torch', 'matplotlib'}
            
installed = {pkg.key for pkg in pkg_resources.working_set}
missing = required - installed

if missing:
    python = sys.executable
    subprocess.check_call([python, '-m', 'pip', 'install', *missing], stdout=subprocess.DEVNULL)

import spacy
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


from spacy.lang.en import English
en = English()
#!python -m spacy download en_core_web_md
#import en_core_web_md
#nlp = en_core_web_md.load()

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
# this will set the device on which to train
#device = torch.device("cpu")
# if using collab, set your runtime to use GPU and use the line below
device = torch.device("cuda:0")
#Ensure GPU active
print('GPU active', torch.cuda.is_available())

GPU active True


In [2]:
print('GPU active', torch.cuda.is_available())
if not torch.cuda.is_available():
  device = torch.device("cpu")

GPU active True


Tokenizer and some helper methods

In [3]:

#remove punctuation and URLs, and stopwords
def tokenize(text, model=en, nostopwds=True):
   
    tokenlist = []
    doc = model(text)
    ent = ''
    for t in doc:
      
      if nostopwds and t.is_stop:
        #print(t.text)
        continue
      if t.like_url:
        tokenlist.append('URL')
        continue
      if not t.is_alpha:
        continue      
      tokenlist.append(t.lower_)
    return tokenlist
text= "Lol, th? oh @you:all got &amp friend for the d?g ?.. U.S. I'm at a  buffet... Cine there got amore wat... "
print(tokenize(text,nostopwds=False))


def doc_to_index(docs, vocab):
    # transform docs into series of indices
    docs_idxs = []
    for d in docs:
        w_idxs = []
        for w in d:
            if w in vocab:
                w_idxs.append(vocab[w])
            else:
                # unknown token = 1
                w_idxs.append(1)
        docs_idxs.append(w_idxs)
    return(docs_idxs)

def pad_sequence(seqs, seq_len=300):
    # function for adding padding to ensure all seq same length
    features = np.zeros((len(seqs), seq_len),dtype=int)
    for i, seq in enumerate(seqs):
        if len(seq) != 0:
            features[i, -len(seq):] = np.array(seq)[:seq_len]
    return features

['lol', 'th', 'oh', 'all', 'got', 'amp', 'friend', 'for', 'the', 'i', 'at', 'a', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']


copying Sentiment class, could not get import to work

In [4]:
##Ideally should be a separate class import module but could not get import to work so for now adding directly 
class SentimentNet(nn.Module):
    # sentiment classifier with single LSTM layer + Fully-connected layer, sigmoid activation and dropout
    # adapted from https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/
    def __init__(self,
                 weight_matrix=None,
                 vocab_size=1000, 
                 output_size=1,  
                 hidden_dim=512,
                 embedding_dim=400, 
                 n_layers=2, 
                 dropout_prob=0.5):
        super(SentimentNet, self).__init__()
        # size of the output, in this case it's one input to one output
        self.output_size = output_size
        # number of layers (default 2) one LSTM layer, one fully-connected layer
        self.n_layers = n_layers
        # dimensions of our hidden state, what is passed from one time point to the next
        self.hidden_dim = hidden_dim
        # initialize the representation to pass to the LSTM
        self.embedding, embedding_dim = self.init_embedding(
            vocab_size, 
            embedding_dim, 
            weight_matrix)
        # LSTM layer, where the magic happens
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=dropout_prob, batch_first=True)
        # dropout, similar to regularization
        self.dropout = nn.Dropout(dropout_prob)
        # fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)
        # sigmoid activiation
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        # forward pass of the network
        batch_size = x.size(0)
        # transform input
        embeds = self.embedding(x)
        # run input embedding + hidden state through model
        lstm_out, hidden = self.lstm(embeds, hidden)
        # reshape
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        # dropout certain pct of connections
        out = self.dropout(lstm_out)
        # fully connected layer
        out = self.fc(out)
        # activation function
        out = self.sigmoid(out)
        # reshape
        out = out.view(batch_size, -1)
        out = out[:,-1]
        # return the output and the hidden state
        return out, hidden
    
    def init_embedding(self, vocab_size, embedding_dim, weight_matrix):
        # initializes the embedding
        if weight_matrix is None:
            if vocab_size is None:
                raise ValueError('If no weight matrix, need a vocab size')
            # if embedding is a size, initialize trainable
            return(nn.Embedding(vocab_size, embedding_dim),
                   embedding_dim)
        else:
            # otherwise use matrix as pretrained
            weights = torch.FloatTensor(weight_matrix)
            return(nn.Embedding.from_pretrained(weights),
                  weights.shape[1])
    
    def init_hidden(self, batch_size):
        # initializes the hidden state
        hidden = (torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device),
                  torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device))
        return hidden

#Initialize model

In [5]:
#from ModelLSTM import SentimentNet
def initialize_model():
  print('Loading model')
  model = torch.load('trained_lstm.pt')
  
  return model


In [6]:

model = initialize_model()


Loading model




In [7]:
if model is not None:
  print('Model on cuda?',next(model.parameters()).is_cuda)


Model on cuda? True


#Load model vocabulary

In [8]:
with open('vocab_dict_10k.pkl', 'rb') as f:
    vocab = pickle.load(f)

Loading sample data for testing

In [9]:
##prepare tensor for running thru model
with open('clean_tweets_5k.pkl', 'rb') as f:
    df_tweet = pickle.load(f)

df_tweet['Target'] = df_tweet['Target'].replace(4,1)
print(df_tweet['Target'].value_counts())

1    2508
0    2492
Name: Target, dtype: int64


In [10]:
##prepare tensor for running thru model for large dataset

def prepare_data(vocab, df_tweet):

  parsed_text = [tokenize(str(d),nostopwds=False) for d in df_tweet]
  # idx  has indexes of words in vocab dictionary
  #padded pads to 200 length each word sentence if needed
  idx = doc_to_index(parsed_text, vocab)
  padded_text = pad_sequence(idx)
  print('padded text',padded_text.shape)
  tensor_data = torch.from_numpy(padded_text)
  return (tensor_data)

Predict_Multiple Method that can be used in a batch usecase

In [11]:
##This method can take multiple tweets and create output
def predict_multiple(model,vocab, df_tweet):
    # utility for assessing accuracy
    tensor_data = prepare_data(vocab, df_tweet)
    batch_size = tensor_data.size(0)   # can improve logic to do batch_size using all length
    text_loader = DataLoader(tensor_data, shuffle=True, batch_size=batch_size,
                         drop_last=True) 
    for i,inputs in enumerate(text_loader):
        print(i, type(inputs), len(text_loader))
    print('batchsize', batch_size)
    model.eval()
    h = model.init_hidden(batch_size)
    
    print('Start Predicting')
    for i,inputs in enumerate(text_loader):
        h = tuple([each.data for each in h])
        inputs = inputs.to(device)
        output, h = model(inputs, h)
        # takes output, rounds to 0/1
        pred = torch.round(output.squeeze())
        print(pred)

Generate sample data for testing on multiple tweets

In [12]:
idx = np.random.randint(len(df_tweet))

sample_tweet = df_tweet.iloc[idx:idx+5]['text']
sample_tweet
#predict_multiple(model,vocab, sample_tweet)
#prepare_data(df_tweet.iloc[5:10])

26285    @aaronmartirano  so easy to appreciate people ...
29532    Interested in having me write about fashion/sh...
32041                        @unkleEL thanks 4 the follow 
16605    @kattysukamto kesel gw... Konflik ktr is sucks...
22020    Not feeling great today....as it should have b...
Name: text, dtype: object

#Do prediction on multiple tweets

In [13]:
predict_multiple(model, vocab, sample_tweet)

padded text (5, 300)
0 <class 'torch.Tensor'> 1
batchsize 5
Start Predicting
tensor([0., 0., 0., 0., 0.], device='cuda:0', grad_fn=<RoundBackward>)


Method for predicting on a single tweet to be used from a web api

In [14]:
def predict(model,vocab, text, seqlen=300):
  parsed_text = tokenize(str(text),nostopwds=False)
 
  ##create word index
  w_idx = []
  for w in parsed_text:
    if w in vocab:
      w_idx.append(vocab[w])
    else:
      # unknown token = 1
      w_idx.append(1)
       
  #print(parsed_text,'\n',w_idx)
  padded_text = np.zeros((1,seqlen),dtype=int)
  padded_text[0,-len(w_idx):] = np.array(w_idx)[:seqlen]
  #padded_text = pad_sequence(w_idx)
  #print('padded_text', padded_text.shape)
  tensor_data = torch.from_numpy(padded_text)
  batch_size = tensor_data.size(0)
  print('batchsize',batch_size)

  model.eval()
  h = model.init_hidden(batch_size)
  tensor_data = tensor_data.to(device)
  output, h = model(tensor_data, h)
  # takes output, rounds to 0/1
  pred = torch.round(output.squeeze())
  if (pred.item() == 0):
     print('prediction negative sentiment')
  else:
    print('prediction positive sentiment')

#Do prediction on 1 tweet

In [15]:

##Generate test data using existing tweets
idx = np.random.randint(len(df_tweet))
test_tweet = df_tweet.iloc[idx]['text']
print(test_tweet,df_tweet.iloc[idx]['Target'] )
predict(model,vocab, test_tweet)

Damn Friday Mondays  0
batchsize 1
prediction negative sentiment
