# Network Sentiment Inference Pipeline


In [1]:
import pandas as pd
from tqdm import tqdm_notebook
import json
import nltk
import os
import random
import re
import torch
import numpy as np
import pickle

from torch import nn, optim
import torch.nn.functional as F

# Load Network Data

In [2]:
# Load the network tweets from CSV into a DataFrame and preview the first rows.
df_net = pd.read_csv("df_tweet_r2.csv")
df_net.head()

Unnamed: 0,date,tweet,id,lang
0,2019-04-15 15:58:36,ConsenSys is reportedly trying to attract outs...,Cointelegraph,en
1,2019-04-15 15:11:55,Don't miss the most memorable quotes from last...,Cointelegraph,en
2,2019-04-15 15:09:47,Portland State University researchers have pre...,Cointelegraph,en
3,2019-04-15 14:01:08,Reuters: France to ask other EU states to adop...,Cointelegraph,en
4,2019-04-15 13:11:07,Ethereum engineer Lukas Hohl has joined Swissc...,Cointelegraph,en


In [3]:
# Per-column summary (count / unique / top / freq); differing counts reveal missing values.
df_net.describe()

Unnamed: 0,date,tweet,id,lang
count,4983,4960,4983,4921
unique,4910,4918,129,32
top,2019-04-16 13:24:07,A rich life: 1. You and your loved ones are he...,evankirstel,en
freq,3,3,897,4573


In [4]:
# Keep only the rows tagged as English; copy so that columns added later
# do not write into a view of df_net.
is_english = df_net['lang'] == 'en'
df_net_eng = df_net.loc[is_english].copy()

# Load Vocab from Model

In [5]:
# Load the pickled vocabulary; predict() uses it to map words to ids.
# NOTE(review): pickle.load can execute arbitrary code -- only open vocab files from a trusted source.
with open('vocab_r1.pickle', 'rb') as handle:
    vocab = pickle.load(handle)

In [6]:
# One-time setup: uncomment the nltk.download lines if the corpora are not installed locally
# (preprocess() uses the WordNet lemmatizer and the stop-word list).
#nltk.download('wordnet')
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words held in a set; preprocess() tests token membership against it.
stop_words = set(stopwords.words('english'))


def preprocess(message):
    """
    Turn a raw tweet into a list of cleaned, lemmatized tokens.

    Steps, in order:
        - lowercase
        - remove URLs (anything starting with "http")
        - remove ticker symbols (words starting with "$")
        - remove usernames (words starting with "@")
        - replace every run of non-letter characters with a space
        - tokenize by splitting on whitespace
        - drop English stop words (module-level ``stop_words``)
        - lemmatize each token as a verb with the WordNet lemmatizer

    Single-character tokens are not filtered explicitly by this function.

    Parameters
    ----------
        message : The text message to be preprocessed. Non-string values
                  (e.g. None, or the NaN pandas uses for a missing tweet)
                  are treated as an empty message.

    Returns
    -------
        tokens: The preprocessed text as a list of tokens (possibly empty).
    """
    # Guard against missing values: calling .lower() on a float NaN or None
    # would raise AttributeError, so treat them as "no text".
    if not isinstance(message, str):
        return []

    # Lowercase the message
    text = message.lower()

    # Replace URLs with a space in the message
    text = re.sub(r"http\S+", " ", text)

    # Replace ticker symbols with a space. The ticker symbols are any stock symbol that starts with $.
    text = re.sub(r"\$\S+", " ", text)

    # Replace usernames with a space. The usernames are any word that starts with @.
    text = re.sub(r"\@\S+", " ", text)

    # Replace everything not a letter with a space
    text = re.sub(r"[^a-z]+", " ", text)

    # Tokenize on whitespace and drop stop words in one pass
    tokens = [w for w in text.split() if w not in stop_words]

    # Lemmatize every remaining token, treating it as a verb
    wnl = nltk.stem.WordNetLemmatizer()
    return [wnl.lemmatize(k, pos="v") for k in tokens]

# Load Model

In [7]:
# Detect CUDA once and keep the flag; TextClassifier.init_hidden reads it.
train_on_gpu = torch.cuda.is_available()

# Report which device will be used (message text kept as in the training notebook).
print('Training on GPU.' if train_on_gpu else 'No GPU available, training on CPU.')

No GPU available, training on CPU.


In [8]:
# Select the compute device: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [9]:
# Define Model Class
class TextClassifier(nn.Module):
    """
    LSTM text classifier that returns log-probabilities over the output classes.

    Layer order: embedding -> stacked LSTM -> dropout -> linear -> log-softmax.
    The LSTM is built with batch_first=False, so inputs are sequence-first,
    i.e. shaped (seq_len, batch_size).
    """

    def __init__(self, vocab_size, embed_size, lstm_size, output_size, lstm_layers=2, dropout=0.1):
        """
        Initialize the model by setting up the layers.
        Parameters
        ----------
            vocab_size : The vocabulary size.
            embed_size : The embedding layer size.
            lstm_size : The LSTM layer size.
            output_size : The output size.
            lstm_layers : The number of LSTM layers.
            dropout : The dropout probability applied between LSTM layers.
        """

        super(TextClassifier, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.lstm_size = lstm_size
        self.output_size = output_size
        self.lstm_layers = lstm_layers

        # Setup embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_size)

        # Setup LSTM layer; batch_first=False because the input is (seq_len, batch_size)
        self.lstm = nn.LSTM(embed_size, lstm_size, lstm_layers, dropout=dropout, batch_first=False)

        # Dropout on the last LSTM output. The rate is fixed at 0.3 and is
        # independent of the `dropout` argument (which only configures the LSTM).
        self.dropout = nn.Dropout(0.3)

        # Linear layer mapping the LSTM state to class scores
        self.fc = nn.Linear(lstm_size, output_size)

        # Log-softmax over the class dimension (attribute name kept as `sig`)
        self.sig = nn.LogSoftmax(dim=1)


    def init_hidden(self, batch_size):
        """
        Create a zeroed (hidden state, cell state) pair for the LSTM.

        Both tensors are allocated with the dtype and device of the model's own
        parameters, so they always match wherever the model currently lives.

        Parameters
        ----------
            batch_size : The size of batches.
        Returns
        -------
            hidden_state : Tuple of two zero tensors, each of shape
                           (lstm_layers, batch_size, lstm_size).
        """
        # new_zeros inherits dtype/device from the reference parameter, which
        # avoids relying on a global GPU flag that may disagree with the model.
        weight = next(self.parameters())
        state_shape = (self.lstm_layers, batch_size, self.lstm_size)

        return (weight.new_zeros(state_shape), weight.new_zeros(state_shape))


    def forward(self, nn_input, hidden):
        """
        Perform a forward pass of our model on nn_input.
        Parameters
        ----------
            nn_input : The batch of token ids, shaped (seq_len, batch_size).
            hidden : The LSTM (hidden state, cell state) tuple.
        Returns
        -------
            logps: Log-softmax output, shaped (batch_size, output_size).
            hidden_state: The new hidden state.
        """
        # Embed the token ids and run them through the LSTM
        embeds = self.embedding(nn_input)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the output at the final time step feeds the classifier
        lstm_out = lstm_out[-1, :, :]

        # Dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)

        # Log-softmax over classes
        logps = self.sig(out)

        return logps, hidden

In [10]:
# Instantiate Model
# Positional args are (vocab_size, embed_size, lstm_size, output_size).
# NOTE(review): len(vocab)+1 presumably reserves id 0 for padding -- confirm against the training notebook.
model = TextClassifier(len(vocab)+1, 400, 256, 5, lstm_layers=2, dropout=0.2)

# Load the saved weights; the map_location callable returns each storage as-is,
# so the tensors stay on the CPU.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
pretrained_dict = torch.load("text_class_040819.pth", map_location=lambda storage, loc: storage)
model_dict = model.state_dict()

#pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}

# NOTE(review): model_dict is updated here but is not what gets passed to
# load_state_dict below, so this merge does not affect the loaded weights.
model_dict.update(pretrained_dict)

model.load_state_dict(pretrained_dict)

# Switch to evaluation mode; as the cell's last expression this also displays the module summary.
model.eval()

TextClassifier(
  (embedding): Embedding(20939, 400)
  (lstm): LSTM(400, 256, num_layers=2, dropout=0.2)
  (dropout): Dropout(p=0.3)
  (fc): Linear(in_features=256, out_features=5, bias=True)
  (sig): LogSoftmax()
)

## Make Predictions

In [11]:
# Model Prediction Routine
def predict(text, model, vocab, min_len=12):
    """
    Make a prediction on a single sentence.

    Parameters
    ----------
        text : The string to make a prediction on.
        model : The model to use for making the prediction.
        vocab : Dictionary for word to word ids. The key is the word and the value is the word id.
        min_len : Minimum sequence length; shorter inputs are left-padded with
                  id 0 up to this length. Longer inputs are passed through unchanged.
    Returns
    -------
        pred : Float tensor of shape (1, n_classes) with class probabilities,
               on the CPU. All zeros when no token of `text` is in `vocab`.
    """
    # Clean the sentence, tokenize, and map in-vocabulary words to their ids
    tokens = preprocess(text)
    token_ids = [vocab[w] for w in tokens if w in vocab]

    # Nothing recognisable in the text: return an "indeterminate" all-zero row
    # with the same dtype and shape as a real prediction.
    if not token_ids:
        return torch.zeros(1, getattr(model, "output_size", 5))

    # Left-pad with zeros up to min_len
    padding = [0] * max(0, min_len - len(token_ids))

    # Put the input and hidden state wherever the model's weights live
    model_device = next(model.parameters()).device

    # unsqueeze(1) adds the batch dimension -> (seq_len, 1)
    text_input = torch.LongTensor(padding + token_ids).unsqueeze(1).to(model_device)
    hidden = tuple(h.to(model_device) for h in model.init_hidden(1))

    # Inference only: skip building the autograd graph
    with torch.no_grad():
        logps, _ = model(text_input, hidden)

    # Convert log-probabilities to probabilities; return on CPU so callers can use .numpy()
    return torch.exp(logps).cpu()

In [12]:
# Pick one tweet (index label 2) as a smoke-test input and display it.
test_text = df_net_eng.loc[2, 'tweet']
test_text

'Portland State University researchers have presented a blockchain protocol to fight counterfeit pharmaceuticals https://cointelegraph.com/news/us-researchers-develop-blockchain-protocol-to-fight-counterfeit-pharmaceuticals …pic.twitter.com/AI4MGNWKnE'

In [13]:
# Run the sample tweet through predict() and display the returned tensor
# (one row with one value per sentiment class).
test_result = predict(test_text, model, vocab)
test_result

tensor([[0.0221, 0.0752, 0.5878, 0.2892, 0.0258]], grad_fn=<ExpBackward>)

In [14]:
# Score every English tweet and store the per-class probabilities as
# columns sen_0 ... sen_4 of df_net_eng.

NUM_CLASSES = 5
sentiment_cols = ['sen_%d' % k for k in range(NUM_CLASSES)]

scores = []   # one list of NUM_CLASSES values per tweet, in row order
failed = []   # (index label, error) for tweets that could not be scored

# total= lets the progress bar show real progress; the items() iterator has no length.
for idx, tweet in tqdm_notebook(df_net_eng['tweet'].items(), total=len(df_net_eng)):
    try:
        # predict returns a (1, NUM_CLASSES) tensor; keep its single row as a list
        probs = predict(tweet, model, vocab).detach().numpy().tolist()[0]
    except Exception as err:
        # Best effort: record the failure and fall back to an all-zero row
        # instead of silently hiding it (and without trapping KeyboardInterrupt).
        failed.append((idx, repr(err)))
        probs = [0] * NUM_CLASSES
    scores.append(probs)

if failed:
    print('%d tweet(s) could not be scored; first failure: %s' % (len(failed), failed[0]))

# write one column per class to the dataframe
for k, col in enumerate(sentiment_cols):
    df_net_eng[col] = [row[k] for row in scores]

df_net_eng.info()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


<class 'pandas.core.frame.DataFrame'>
Int64Index: 4573 entries, 0 to 4982
Data columns (total 9 columns):
date     4573 non-null object
tweet    4573 non-null object
id       4573 non-null object
lang     4573 non-null object
sen_0    4573 non-null float64
sen_1    4573 non-null float64
sen_2    4573 non-null float64
sen_3    4573 non-null float64
sen_4    4573 non-null float64
dtypes: float64(5), object(4)
memory usage: 517.3+ KB


In [15]:
# Preview the first rows, now including the sen_0 ... sen_4 columns added above.
df_net_eng.head()

Unnamed: 0,date,tweet,id,lang,sen_0,sen_1,sen_2,sen_3,sen_4
0,2019-04-15 15:58:36,ConsenSys is reportedly trying to attract outs...,Cointelegraph,en,0.046712,0.061868,0.649117,0.18982,0.052482
1,2019-04-15 15:11:55,Don't miss the most memorable quotes from last...,Cointelegraph,en,0.013756,0.147028,0.751772,0.079675,0.007769
2,2019-04-15 15:09:47,Portland State University researchers have pre...,Cointelegraph,en,0.022057,0.075189,0.587784,0.289166,0.025804
3,2019-04-15 14:01:08,Reuters: France to ask other EU states to adop...,Cointelegraph,en,0.017009,0.063318,0.69627,0.203236,0.020167
4,2019-04-15 13:11:07,Ethereum engineer Lukas Hohl has joined Swissc...,Cointelegraph,en,0.00645,0.02855,0.670341,0.278097,0.016562


In [16]:
# Write the scored tweets to CSV for indexing by the search engine.
# index=False leaves the DataFrame index out of the file.
df_net_eng.to_csv("df_tweet_net_eng_sen_r2.csv", index=False, encoding='utf-8')