In [None]:
# Import libraries for data processing 
import random as rnd
import string
import re
import os
import nltk # Natural language toolkit
nltk.download('twitter_samples') # Sample tweets for training the model
nltk.download('stopwords') # Words that don't affect the meaning of the sentence
from nltk.tokenize import TweetTokenizer 
from nltk.corpus import stopwords, twitter_samples

import pandas as pd
import numpy as np

# Install and Import deep learning library
!pip install -q -U trax 
import trax  
# import trax.layers
from trax import layers as tl

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[K     |████████████████████████████████| 471kB 5.3MB/s 
[K     |████████████████████████████████| 174kB 35.2MB/s 
[K     |████████████████████████████████| 2.6MB 37.9MB/s 
[K     |████████████████████████████████| 71kB 8.4MB/s 
[K     |████████████████████████████████| 3.7MB 39.3MB/s 
[K     |████████████████████████████████| 1.1MB 46.3MB/s 
[K     |████████████████████████████████| 1.4MB 24.4MB/s 
[K     |████████████████████████████████| 348kB 47.4MB/s 
[K     |████████████████████████████████| 2.9MB 48.7MB/s 
[K     |████████████████████████████████| 890kB 49.7MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


# 1. Data preparation

## 1.1 Loading the data

In [None]:
# Load the labelled sample tweets shipped with NLTK's twitter_samples corpus
positive_tweets, negative_tweets = (
    twitter_samples.strings(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)

# Report how many examples each class contains
print(f"The number of positive tweets: {len(positive_tweets)}")
print(f"The number of negative tweets: {len(negative_tweets)}")

The number of positive tweets: 5000
The number of negative tweets: 5000


In [None]:
# Split examples into training (80%) and validation (20%) sets.
# The cut-off index is derived from the positive set and applied to both classes.
t = int(len(positive_tweets) * 0.8) # threshold for split

train_pos, val_pos = positive_tweets[:t], positive_tweets[t:]
train_neg, val_neg = negative_tweets[:t], negative_tweets[t:]

# Features: positives first, then negatives
train_x = train_pos + train_neg
val_x = val_pos + val_neg

# Labels follow the same ordering: 1 for positive, 0 for negative
train_y = np.concatenate([np.ones(len(train_pos)), np.zeros(len(train_neg))])
valid_y = np.concatenate([np.ones(len(val_pos)), np.zeros(len(val_neg))])

print(f"length of train_x {len(train_x)}")
print(f"length of val_x {len(val_x)}")

length of train_x 8000
length of val_x 2000


## 1.2 Processing the data

In [None]:
from nltk.stem import PorterStemmer

stopwords_english = stopwords.words('english')

# Helpers built once and shared by every process_tweet call: the function runs
# for every tweet in every batch, so rebuilding them per call is wasted work.
_STOPWORDS_SET = frozenset(stopwords_english)  # constant-time stop-word lookup
_TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
_STEMMER = PorterStemmer()

def process_tweet(tweet):
    '''
    Clean, tokenize and stem a single tweet.

    Input: 
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
                      (stop words and punctuation removed, remaining tokens stemmed)
    
    '''
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'http\S+', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets (tokenizer is configured with preserve_case=False,
    # strip_handles=True and reduce_len=True)
    tweet_tokens = _TWEET_TOKENIZER.tokenize(tweet)

    # Keep tokens that are neither stop words nor punctuation, then stem them.
    # NOTE: `word not in string.punctuation` is a substring test, so a
    # multi-character token that happens to be a contiguous slice of
    # string.punctuation is dropped as well; this matches the original behaviour.
    tweets_clean = [
        _STEMMER.stem(word)
        for word in tweet_tokens
        if word not in _STOPWORDS_SET and word not in string.punctuation
    ]

    return tweets_clean

In [None]:
# Sanity-check process_tweet on the first positive training tweet
print("Original tweet at training position 0")
print(train_pos[0], '\n')

print("Tweet at training position 0 after processing:")
# Bare last expression so the notebook displays the returned list of tokens
process_tweet(train_pos[0])

Original tweet at training position 0
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :) 

Tweet at training position 0 after processing:


['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']

## 1.3 Build vocabulary

Map each word in each tweet to an integer (an "index"). 

The vocabulary will also include some special tokens:
- `__PAD__` : padding
- `__</e>__`: end of line
- `__UNK__`: a token representing any word that is not in the vocabulary.


In [None]:
# Ids 0-2 are reserved for the special tokens
vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2} 

# Give every previously unseen processed training word the next free id
for tweet in train_x:
  for word in process_tweet(tweet):
    # len(vocab) is evaluated before insertion, so ids stay consecutive
    vocab.setdefault(word, len(vocab))

print("Total words in vocab are",len(vocab))

Total words in vocab are 9168


## 1.4 Convert tweet to tensor

In [None]:
def tweet_to_tensor(tweet, vocab, unk_token='__UNK__'):
    '''
    Encode a raw tweet as a list of vocabulary ids.

    Input: 
        tweet - A string containing a tweet
        vocab - Dictionary mapping each known word to its integer id
        unk_token - Key in vocab whose id replaces words missing from vocab
    Output:
        A python list with one integer id per word returned by process_tweet
        
    '''
    # Clean/tokenize first, then resolve the fallback id for unknown words
    tokens = process_tweet(tweet)
    fallback_id = vocab[unk_token]

    # Look every token up, substituting the fallback id when it is unknown
    return [vocab.get(token, fallback_id) for token in tokens]

In [None]:
# Show a raw training tweet next to the list of vocabulary ids produced for it
print("Original tweet\n", train_pos[1])
print("\nTensor of tweet:\n", tweet_to_tensor(train_pos[1], vocab=vocab))

Original tweet
 @Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!

Tensor of tweet:
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 9, 21, 22]


## 1.5 Create batch generator

The data generator takes in the positive and negative tweets and yields batches of training examples. Each batch contains the model inputs, the targets (positive or negative labels), and a weight for each target (weights let us treat some examples as more important to get right than others, but commonly they are all 1.0). 


In [None]:
def data_generator(data_pos, data_neg, batch_size, loop, vocab_dict, shuffle=False):
    '''
    Yield balanced batches of padded tweet tensors with labels and weights.

    Each batch holds batch_size // 2 positive tweets followed by
    batch_size // 2 negative tweets. Both datasets are walked one example at
    a time, so every example is visited once per pass.

    Input: 
        data_pos - Set of positive examples
        data_neg - Set of negative examples
        batch_size - number of samples per batch. Must be even
        loop - True to wrap around and keep yielding indefinitely; False to
               stop as soon as either dataset cannot fill its half of a batch
               (that incomplete batch is discarded)
        vocab_dict - The words dictionary
        shuffle - Shuffle the data order (re-shuffled on every wrap-around)
    Yield:
        inputs - Array of token ids, one row per tweet, right-padded with 0
                 to the length of the longest tweet in the batch
        targets - The corresponding labels (1 for positive, 0 for negative)
        example_weights - An array of ones specifying the importance of each example
        
    '''     
    # Make sure the batch size is an even number
    # to allow an equal number of positive and negative samples
    assert batch_size % 2 == 0
    
    # Half of every batch is positive, the other half negative
    n_to_take = batch_size // 2
    
    # Cursors into the (possibly shuffled) index lists of each dataset
    pos_index = 0
    neg_index = 0
    
    len_data_pos = len(data_pos)
    len_data_neg = len(data_neg)
    
    # Visiting order for each dataset
    pos_index_lines = list(range(len_data_pos))
    neg_index_lines = list(range(len_data_neg))
    
    # Shuffle lines if shuffle is set to True
    if shuffle:
        rnd.shuffle(pos_index_lines)
        rnd.shuffle(neg_index_lines)
        
    stop = False
    
    # Loop indefinitely (or until a dataset is exhausted when loop is False)
    while not stop:  
        
        # Token-id lists for this batch: positives first, then negatives
        batch = []
        
        # First part: pack n_to_take positive examples
        for _ in range(n_to_take):
                    
            # Reached the end of the positive dataset
            if pos_index >= len_data_pos: 
                
                # Without looping, the data is exhausted: stop generating
                if not loop:
                    stop = True
                    break
                
                # Otherwise start another pass over the positive data
                pos_index = 0
                
                if shuffle:
                    rnd.shuffle(pos_index_lines)
                    
            tweet = data_pos[pos_index_lines[pos_index]]
            batch.append(tweet_to_tensor(tweet, vocab_dict))
            
            # The cursor advances exactly once per consumed example. It must
            # not be advanced again after the batch, or examples get skipped.
            pos_index += 1

        # Second part: pack n_to_take negative examples
        for _ in range(n_to_take):
            
            # Reached the end of the negative dataset
            if neg_index >= len_data_neg:
                
                # Without looping, the data is exhausted: stop generating
                if not loop:
                    stop = True
                    break
                    
                # Otherwise start another pass over the negative data
                neg_index = 0
                
                if shuffle:
                    rnd.shuffle(neg_index_lines)

            tweet = data_neg[neg_index_lines[neg_index]]
            batch.append(tweet_to_tensor(tweet, vocab_dict))
            
            # Same rule as above: one step per consumed example
            neg_index += 1

        # A dataset ran out while filling this batch: discard it and finish
        if stop:
            break

        # Pad every tweet with zeros up to the longest tweet in the batch
        max_len = max(len(tensor) for tensor in batch)
        tensor_pad_l = [tensor + [0] * (max_len - len(tensor)) for tensor in batch]

        # Model inputs: one padded row per tweet
        inputs = np.array(tensor_pad_l)
  
        # Labels in the same order as the batch: ones for the positive half,
        # zeros for the negative half
        targets = np.array([1] * n_to_take + [0] * n_to_take)

        # Example weights: treat all examples as equally important
        example_weights = np.ones_like(targets)

        yield inputs, targets, example_weights

In [None]:
# Seed Python's random module, which data_generator uses for shuffling
rnd.seed(30) 

def train_generator(batch_size, shuffle = False):
    """Return an endlessly looping batch generator over the training tweets."""
    return data_generator(
        data_pos=train_pos,
        data_neg=train_neg,
        batch_size=batch_size,
        loop=True,
        vocab_dict=vocab,
        shuffle=shuffle,
    )

def val_generator(batch_size, shuffle = False):
    """Return an endlessly looping batch generator over the validation tweets."""
    return data_generator(
        data_pos=val_pos,
        data_neg=val_neg,
        batch_size=batch_size,
        loop=True,
        vocab_dict=vocab,
        shuffle=shuffle,
    )

# Create the validation data generator (without indefinite looping)
def test_generator(batch_size, shuffle = False):
    return data_generator(val_pos, val_neg, batch_size, False, vocab, shuffle)

# Pull a single shuffled batch of 4 tweets from a fresh training generator
inputs, targets, example_weights = next(train_generator(4, shuffle=True))

# Each row of inputs is one tweet's ids, padded with zeros to a common length
print(f'Inputs: {inputs}')
print(f'Targets: {targets}')
print(f'Example Weights: {example_weights}')

Inputs: [[2030 4492 3231    9    0    0    0    0    0    0    0]
 [5009  571 2025 1475 5233 3532  142 3532  132  464    9]
 [3798  111   96  587 2960 4007    0    0    0    0    0]
 [ 256 3798    0    0    0    0    0    0    0    0    0]]
Targets: [1 1 0 0]
Example Weights: [1 1 1 1]


In [None]:
# Test the train_generator

# Create a data generator for training data,
# which produces batches of size 4 (for tensors and their respective targets).
# shuffle is left at its default (False), so tweets come in dataset order.
tmp_data_gen = train_generator(batch_size = 4)

# Call the data generator to get one batch: inputs, targets and example weights
tmp_inputs, tmp_targets, tmp_example_weights = next(tmp_data_gen)

print(f"The inputs shape is {tmp_inputs.shape}")
print(f"The targets shape is {tmp_targets.shape}")
print(f"The example weights shape is {tmp_example_weights.shape}")

# NOTE: this loop rebinds the notebook-level names `i` and `t`
for i,t in enumerate(tmp_inputs):
    print(f"input tensor: {t}; target {tmp_targets[i]}; example weights {tmp_example_weights[i]}")

The inputs shape is (4, 14)
The targets shape is (4,)
The example weights shape is (4,)
input tensor: [3 4 5 6 7 8 9 0 0 0 0 0 0 0]; target 1; example weights 1
input tensor: [10 11 12 13 14 15 16 17 18 19 20  9 21 22]; target 1; example weights 1
input tensor: [5807 2931 3798    0    0    0    0    0    0    0    0    0    0    0]; target 0; example weights 1
input tensor: [ 865  261 3689 5808  313 4499  571 1248 2795  333 1220 3798    0    0]; target 0; example weights 1


# 2. Implement model

We will create a classifier using artificial neural networks.
For this implementation, we will use Trax library and its Serial combinator, which allows us to execute one layer after another.

For example: tl.Serial(tl.Embedding(...), tl.Mean(...), tl.Dense(...), tl.LogSoftmax(...))

In [None]:
def classifier(vocab_size=None, embedding_dim=256, output_dim=2, mode='train'):
    '''
    Build the sentiment classifier: Embedding -> Mean -> Dense -> LogSoftmax.

    Input:
        vocab_size - number of rows in the embedding table. Defaults to
                     len(vocab), read when the function is called, so a
                     vocabulary rebuilt after this cell ran is still honoured
        embedding_dim - size of each word embedding
        output_dim - number of output classes
        mode - accepted for interface compatibility; not used by this model
    Output:
        model - a tl.Serial combinator chaining the four layers
    '''
    # Resolve the default at call time. A `len(vocab)` default in the
    # signature would be frozen when the `def` statement is executed.
    if vocab_size is None:
        vocab_size = len(vocab)

    # Layers are applied in the order listed
    model = tl.Serial(
        # Map each token id to a vector of size embedding_dim
        tl.Embedding(vocab_size=vocab_size, d_feature=embedding_dim),
        # Average the word embeddings along axis 1
        tl.Mean(axis=1),
        # Project onto one unit per output class
        tl.Dense(n_units=output_dim),
        # Turn the class scores into log probabilities
        tl.LogSoftmax(),
    )

    return model

In [None]:
# Build a throwaway model with the default sizes and show its layer structure
temp_model = classifier()
print(type(temp_model))
display(temp_model)

<class 'trax.layers.combinators.Serial'>


Serial[
  Embedding_9168_256
  Mean
  Dense_2
  LogSoftmax
]

# 3. Training 


## 3.1 Training the neural network model

In [None]:
from trax.supervised import training

# Number of tweets per batch (half positive, half negative)
batch_size = 16
# Re-seed Python's random module before the shuffled generators are created
rnd.seed(200)

# Define training task: shuffled training batches scored with cross-entropy loss
train_task = training.TrainTask(
    labeled_data = train_generator(batch_size=batch_size, shuffle=True),
    loss_layer = tl.CrossEntropyLoss(),
    # 0.01 is Adam's first positional argument (presumably the learning rate; confirm in the trax docs)
    optimizer = trax.optimizers.Adam(0.01),
    n_steps_per_checkpoint = 10,
)

# Define evaluation task: validation batches scored with loss and accuracy
eval_task = training.EvalTask(
    labeled_data = val_generator(batch_size=batch_size, shuffle=True),
    metrics = [tl.CrossEntropyLoss(), tl.Accuracy()],
)

# Define classifier model (default vocabulary size, embedding size and classes)
model = classifier()





In [None]:
# Folder handed to the training loop as its output directory; '~' is expanded
# to the current user's home directory
output_dir = '~/model/'
output_dir_expand = os.path.expanduser(output_dir)
print(output_dir_expand)

/root/model/


In [None]:
def train_model(classifier, train_task, eval_task, n_steps, output_dir):
    '''
    Build a training loop for the model and run it.

    Input: 
        classifier - the model to train
        train_task - Training task
        eval_task - Evaluation task
        n_steps - number of steps passed to the loop's run()
        output_dir - folder handed to the loop as its output directory
    Output:
        the training.Loop instance after run(n_steps) has returned
    '''
    # The single evaluation task is wrapped in a list, as the loop takes eval_tasks
    loop = training.Loop(classifier, train_task, eval_tasks=[eval_task], output_dir=output_dir)
    loop.run(n_steps)
    return loop

In [None]:
# Run the training loop for 100 steps, using the expanded output directory
training_loop = train_model(model, train_task, eval_task, 100, output_dir_expand)


Step      1: Total number of trainable weights: 2347522
Step      1: Ran 1 train steps in 1.63 secs
Step      1: train CrossEntropyLoss |  0.70103645
Step      1: eval  CrossEntropyLoss |  0.70262021
Step      1: eval          Accuracy |  0.43750000

Step     10: Ran 9 train steps in 8.06 secs
Step     10: train CrossEntropyLoss |  0.63601637
Step     10: eval  CrossEntropyLoss |  0.56654179
Step     10: eval          Accuracy |  0.81250000

Step     20: Ran 10 train steps in 2.91 secs
Step     20: train CrossEntropyLoss |  0.44222397
Step     20: eval  CrossEntropyLoss |  0.21864718
Step     20: eval          Accuracy |  1.00000000

Step     30: Ran 10 train steps in 2.61 secs
Step     30: train CrossEntropyLoss |  0.20902090
Step     30: eval  CrossEntropyLoss |  0.08811120
Step     30: eval          Accuracy |  1.00000000

Step     40: Ran 10 train steps in 0.87 secs
Step     40: train CrossEntropyLoss |  0.09095457
Step     40: eval  CrossEntropyLoss |  0.03796640
Step     40: eva

## 3.2 Practice prediction making

In [None]:
# Create generator object (unshuffled training data, 16 tweets per batch)
tmp_train_generator = train_generator(batch_size=16)

# Generate one batch of 16 padded tweet tensors
tmp_batch = next(tmp_train_generator)

# Get inputs, targets and weights from batch
tmp_inputs, tmp_targets, tmp_weights = tmp_batch

print(f"The shape of the tweet tensors is {tmp_inputs.shape} (num of examples, length of tweet tensors)")
print(f"The shape of the labels is {tmp_targets.shape}, which is the batch size.")
# Report the weights of THIS batch (tmp_weights), not the tmp_example_weights
# variable left over from an earlier cell's 4-example batch
print(f"The shape of the example_weights is {tmp_weights.shape}, which is the same as inputs/targets size.")

The shape of the tweet tensors is (16, 15) (num of examples, length of tweet tensors)
The shape of the labels is (16,), which is the batch size.
The shape of the example_weights is (4,), which is the same as inputs/targets size.


In [None]:
# Feed the tweet tensors into the model to get a prediction.
# The model ends with a LogSoftmax layer, so each row holds log probabilities
# (values <= 0), not probabilities.
tmp_pred = training_loop.eval_model(tmp_inputs)

print(f"The prediction shape is {tmp_pred.shape}, num of tensor_tweets as rows")
print("Column 0 is the log probability of a negative sentiment (class 0)")
print("Column 1 is the log probability of a positive sentiment (class 1)")
print()
print("View the prediction array")
tmp_pred

The prediction shape is (16, 2), num of tensor_tweets as rows
Column 0 is the probability of a negative sentiment (class 0)
Column 1 is the probability of a positive sentiment (class 1)

View the prediction array


DeviceArray([[-1.1202906e+01, -1.3351440e-05],
             [-1.0113113e+01, -4.0531158e-05],
             [-5.9028568e+00, -2.7353764e-03],
             [-7.9232450e+00, -3.6239624e-04],
             [-4.1797781e+00, -1.5420198e-02],
             [-7.2388878e+00, -7.1835518e-04],
             [-8.6851482e+00, -1.6927719e-04],
             [-7.3744698e+00, -6.2727928e-04],
             [-3.2093525e-03, -5.7433105e+00],
             [-6.3610077e-04, -7.3603740e+00],
             [-1.3709068e-03, -6.5930486e+00],
             [-5.2452087e-06, -1.2121555e+01],
             [-5.0520897e-04, -7.5907302e+00],
             [-1.7967224e-03, -6.3226938e+00],
             [-4.5390129e-03, -5.3973246e+00],
             [-4.0984154e-04, -7.7998676e+00]], dtype=float32)

In [None]:
# A tweet is predicted positive when column 1 of its row exceeds column 0
tmp_is_positive = tmp_pred[:, 1] > tmp_pred[:, 0]

# Print both columns, the decision and the true label for every tweet in the batch
for i, p in enumerate(tmp_is_positive):
    print(f"Neg log prob {tmp_pred[i,0]:.4f}\tPos log prob {tmp_pred[i,1]:.4f}\t is positive? {p}\t actual {tmp_targets[i]}")

Neg log prob -11.2029	Pos log prob -0.0000	 is positive? True	 actual 1
Neg log prob -10.1131	Pos log prob -0.0000	 is positive? True	 actual 1
Neg log prob -5.9029	Pos log prob -0.0027	 is positive? True	 actual 1
Neg log prob -7.9232	Pos log prob -0.0004	 is positive? True	 actual 1
Neg log prob -4.1798	Pos log prob -0.0154	 is positive? True	 actual 1
Neg log prob -7.2389	Pos log prob -0.0007	 is positive? True	 actual 1
Neg log prob -8.6851	Pos log prob -0.0002	 is positive? True	 actual 1
Neg log prob -7.3745	Pos log prob -0.0006	 is positive? True	 actual 1
Neg log prob -0.0032	Pos log prob -5.7433	 is positive? False	 actual 0
Neg log prob -0.0006	Pos log prob -7.3604	 is positive? False	 actual 0
Neg log prob -0.0014	Pos log prob -6.5930	 is positive? False	 actual 0
Neg log prob -0.0000	Pos log prob -12.1216	 is positive? False	 actual 0
Neg log prob -0.0005	Pos log prob -7.5907	 is positive? False	 actual 0
Neg log prob -0.0018	Pos log prob -6.3227	 is positive? False	 actual

In [None]:
# Convert the boolean decisions to integers (True -> 1, False -> 0) so they
# can be compared with the integer targets
tmp_is_positive_int = tmp_is_positive.astype('int32')

print(f'Array of predictions: \n{tmp_is_positive_int}')

Array of predictions: 
[1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0]


# 4. Evaluation

## 4.1 Computing accuracy on a single batch

In [None]:
def compute_accuracy(preds, y, y_weights):
    """
    Weighted accuracy of two-class predictions against the true labels.

    Input: 
        preds: a tensor of shape (dim_batch, output_dim); column 1 scores the
               positive class and column 0 the negative class
        y: an array of shape (dim_batch,) with the true labels (0 or 1)
        y_weights: an array with a weight for each example
    Output: 
        accuracy: weighted_num_correct divided by sum_weights
        weighted_num_correct: Sum of the weights of the correct predictions
        sum_weights: Sum of the weights
    """
    # Predicted label is 1 when the positive column beats the negative one
    predicted_labels = (preds[:, 1] > preds[:, 0]).astype(np.int32)

    # 1.0 where the prediction matches the true label, 0.0 elsewhere
    hits = (predicted_labels == y).astype(np.float32)

    # Weight each hit, then total the hits and the weights
    weighted_num_correct = np.sum(np.multiply(hits, y_weights))
    sum_weights = np.sum(y_weights)

    accuracy = weighted_num_correct / sum_weights

    return accuracy, weighted_num_correct, sum_weights



In [None]:
# Test compute_accuracy on one batch of 64 validation tweets
tmp_val_generator = val_generator(64)

# get one batch
tmp_batch = next(tmp_val_generator)
tmp_inputs, tmp_targets, tmp_example_weights = tmp_batch

# feed the tweet tensors into the model to get a prediction
tmp_pred = training_loop.eval_model(tmp_inputs)

tmp_acc, tmp_num_correct, tmp_num_predictions = compute_accuracy(preds=tmp_pred, y=tmp_targets, y_weights=tmp_example_weights)

# The batch comes from val_generator, so report it as a validation batch
print(f"Model's prediction accuracy on a single validation batch is: {100 * tmp_acc}%")
print(f"Weighted number of correct predictions {tmp_num_correct}; weighted number of total observations predicted {tmp_num_predictions}")

Model's prediction accuracy on a single training batch is: 100.0%
Weighted number of correct predictions 64.0; weighted number of total observations predicted 64


## 4.2 Compute accuracy on validation set

In [None]:
def test_model(generator, model):
    '''
    Input: 
        generator: an iterator instance that provides batches of inputs and targets
        model: a model instance 
    Output: 
        accuracy: float corresponding to the accuracy
    '''
    
    accuracy = 0.
    total_num_correct = 0
    total_num_pred = 0
    
    for batch in generator: 
        
        # Retrieve the inputs from the batch
        inputs = batch[0]
        
        # Retrieve the targets (actual labels) from the batch
        targets = batch[1]
        
        # Retrieve the example weight.
        example_weight = batch[2]

        # Make predictions using the inputs
        pred = training_loop.eval_model(inputs)
        
        # Calculate accuracy for the batch by comparing its predictions and targets
        batch_accuracy, batch_num_correct, batch_num_pred = compute_accuracy(pred, targets, example_weight)
        
        # Update the total number of correct predictions
        # by adding the number of correct predictions from this batch
        total_num_correct += batch_num_correct
        
        # Update the total number of predictions 
        # by adding the number of predictions made for the batch
        total_num_pred += batch_num_pred

    # Calculate accuracy over all examples
    accuracy = total_num_correct / total_num_pred
    
    return accuracy

In [None]:
# NOTE: this rebinds the notebook-level name `model` to the training loop's
# eval_model; predict() below reads this global
model = training_loop.eval_model
# test_generator does not loop, so this iteration ends when the data runs out
accuracy = test_model(test_generator(16), model)

print(f'The accuracy of the model on the validation set is {accuracy:.4f}', )

The accuracy of the model on the validation set is 0.9950


In [None]:
# Scratch demo: indexing with None inserts a leading axis, turning a (3,)
# array into a (1, 3) array; predict() uses the same indexing on its inputs
a = np.array([1,2,3])
a[None, :]

array([[1, 2, 3]])

# 5. Testing on own input

## 5.1 Predict sentiment 

In [None]:
def predict(sentence):
    '''
    Classify the sentiment of one sentence with the notebook-level `model`.

    Input:
        sentence - a string to classify
    Output:
        preds - 1 when the model's positive score exceeds the negative one, else 0
        sentiment - 'positive' when preds is 1, otherwise 'negative'
    '''
    # Encode the sentence with the notebook-level vocabulary
    token_ids = np.array(tweet_to_tensor(sentence, vocab))

    # Add a leading batch axis: the model is fed a batch of size 1
    batch = token_ids[None, :]

    # Row 0 holds the scores for the single sentence
    scores = model(batch)

    # Compare the positive column (1) with the negative column (0)
    preds = int(scores[0, 1] > scores[0, 0])

    sentiment = 'positive' if preds == 1 else "negative"

    return preds, sentiment

In [None]:
# Try a positive sentence
sentence = "The movie was great"
tmp_pred, tmp_sentiment = predict(sentence)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")

# Blank line between the two results
print()
# Try a negative sentence
sentence = "I am  uncomfortable"
tmp_pred, tmp_sentiment = predict(sentence)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")

The sentiment of the sentence 
***
"The movie was great"
***
is positive.

The sentiment of the sentence 
***
"I am  uncomfortable"
***
is negative.


## 5.2 Load own dataset

In [None]:
# Read csv file
# NOTE(review): '/twitter.csv' is an absolute path at the filesystem root, so
# the file must exist there for this cell to run
twitter_df = pd.read_csv('/twitter.csv')

# Convert tweets column to Numpy array (one entry per row of the 'tweet' column)
tweets = np.array(twitter_df['tweet'])


In [None]:
# Preview the first rows of the loaded dataset
twitter_df.head()

Unnamed: 0,tweet_id,user_id,tweet,datetime
0,1335684285915930630,760631850662494209,@yepicturepalace @GoingMedieval I love how lit...,2020-12-06 20:35:24
1,1335684305507401728,1221520201008603137,@CBSNews Translation: Welfare queen elon musk...,2020-12-06 20:35:28
2,1335684309143851009,20352228,@BittrexGlobal @Apple @Tesla @amazon When will...,2020-12-06 20:35:29
3,1335684341553377280,581786786,@TeslaChillMode @Tesla @elonmusk Is it a priva...,2020-12-06 20:35:37
4,1335684360473899015,16306374,@Thoug4Thoughts @AliAbdaal @Tesla Haters gonna...,2020-12-06 20:35:42


In [None]:
# Create empty arrays (same shape and dtype as tweets) to store predictions
predictions = np.empty_like(tweets)
sentiments = np.empty_like(tweets)

# Predict sentiment tweet by tweet. Prediction stays best-effort: a tweet that
# cannot be scored is marked NaN instead of aborting the whole run.
n_failed = 0
for i, tweet_text in enumerate(tweets):
    try:
        predictions[i], sentiments[i] = predict(tweet_text)
    except Exception:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit still stop the loop
        predictions[i], sentiments[i] = np.nan, np.nan
        n_failed += 1

# Surface failures: NaN rows are silently excluded from value_counts()
if n_failed:
    print(f"{n_failed} of {len(tweets)} tweets could not be scored and were marked NaN")

# Create dataframe with tweets and sentiment predictions
tweets_sentiment = pd.DataFrame({
    'tweet': tweets,
    'prediction': predictions,
    'sentiment': sentiments
})

In [None]:
# Tweet sentiment breakdown: number of tweets per predicted sentiment label
tweets_sentiment.sentiment.value_counts()

negative    1386
positive    1215
Name: sentiment, dtype: int64