This notebook builds a baseline model for binary sentiment classification. The model uses pretrained GloVe Twitter embeddings.

In [2]:
# Point Python at a user-level package directory located under
# /home/ec2-user/SageMaker so that custom packages do not have to be
# reinstalled after every reboot.

import site
import sys

_SAGEMAKER_USER_BASE = '/home/ec2-user/SageMaker/.local'

# Tell `site` where the user base lives, and put its site-packages first on
# the import path so packages installed there take precedence.
site.USER_BASE = _SAGEMAKER_USER_BASE
sys.path.insert(0, _SAGEMAKER_USER_BASE + '/lib/python3.6/site-packages')

In [8]:
# One-off environment setup, kept commented out so it does not run on every
# execution. Uncomment to install the pinned torchtext/spaCy versions (and the
# spaCy English model) into the persistent user base.
#!export PYTHONUSERBASE=/home/ec2-user/SageMaker/.local; pip install --quiet --user --timeout 60 torchtext==0.3.1 spacy==2.1.4
#!export PYTHONUSERBASE=/home/ec2-user/SageMaker/.local; python3 -m spacy download en

# Debugging aids for checking that the install landed in the expected place.
#!export PYTHONUSERBASE=/home/ec2-user/SageMaker/.local; pip list | grep torch
#from torchtext import data
#sys.path
#site.USER_BASE

from sagemaker import get_execution_role

# Look up the execution role attached to this notebook instance.
# NOTE(review): `role` is not referenced anywhere else in the visible notebook.
role = get_execution_role()

Couldn't call 'get_role' to get Role ARN from role name AWSGlueServiceSageMakerNotebookRole-airline-sentiment to get Role path.


In [7]:
'''
Binary classifier
Reworked from examples here to play with torchtext
https://github.com/bentrevett/pytorch-sentiment-analysis
'''

import pandas as pd
import torch
import torch.nn as nn
import pickle
import sys

import sagemaker
import boto3
import os
import zipfile
import io

# Local working directories and artifact names (relative to the notebook).
model_dir = "../model/"
model_file = 'rnn_binary_pretrain_model.pt'
data_dir = "../data/"

# S3 layout: raw input, intermediate artifacts, and final ("master") outputs
# all live as prefixes inside the SageMaker session's default bucket.
sess = sagemaker.Session()
# NOTE(review): the recorded run of this cell failed with AccessDenied on
# CreateBucket, presumably raised from default_bucket() -- verify that the
# notebook role may create/access the default bucket.
bucket = sess.default_bucket()
s3_raw_dir = 'raw'
raw_data_file = 'Tweets.csv'
s3_intermediate_dir = 'intermediate'
s3_master_dir = 'master'

s3 = boto3.client('s3')
# S3 keys are always '/'-separated, so build the key explicitly instead of
# using os.path.join, whose separator depends on the local operating system.
obj = s3.get_object(Bucket=bucket, Key=f'{s3_raw_dir}/{raw_data_file}')

# Read the whole object into memory and parse it as CSV.
sentiments = pd.read_csv(io.BytesIO(obj['Body'].read()))

ClientError: An error occurred (AccessDenied) when calling the CreateBucket operation: Access Denied

Clean and process the data. Drop rows with a missing tweet text, airline, sentiment label, or tweet id, and keep only those four columns. Restrict the labels to positive and negative so that we can build a binary classifier, then split the dataset into train, test, and validation sets.

In [None]:
from torchtext import data
from sklearn.model_selection import train_test_split

# Columns the rest of the pipeline relies on: tweet id, tweet text, airline,
# and the sentiment label. Rows missing any of them are dropped.
required_columns = ['tweet_id', 'text', 'airline', 'airline_sentiment']
clean_df = sentiments.dropna(subset=required_columns)

# use only tweet(text), airline, label (airline_sentiment) and tweet id
final_df = clean_df.filter(required_columns, axis=1)

# use only positive and negative sentiment
row_vals = ['positive', 'negative']
final_df = final_df.loc[final_df['airline_sentiment'].isin(row_vals)]

# Fixed seed so the train/test/validation membership is identical on every
# run; without it each execution evaluates on a different random split.
SPLIT_SEED = 42

# split into train, test, val (.7, .15, .15): first carve off 30%, then halve it
train_df, testval_df = train_test_split(final_df, test_size=0.3,
                                        random_state=SPLIT_SEED)
test_df, val_df = train_test_split(testval_df, test_size=0.5,
                                   random_state=SPLIT_SEED)

Convert dataframes back to csv so we can load into Torchtext easily

In [None]:
# Make sure the output directory exists; on a fresh checkout it may not.
os.makedirs(data_dir, exist_ok=True)

# convert df back to csv, with column names
train_df.to_csv(os.path.join(data_dir, 'train.csv'), index=False)
test_df.to_csv(os.path.join(data_dir, 'test.csv'), index=False)
val_df.to_csv(os.path.join(data_dir, 'val.csv'), index=False)

# load into torchtext: one Field per CSV column
ID = data.Field()
TEXT = data.Field(tokenize='spacy')
SENTIMENT = data.LabelField(dtype=torch.float)
AIRLINE = data.Field()

# The order must match the CSV column order written above
# (tweet_id, text, airline, airline_sentiment).
# access using batch.id, batch.text etc
fields = [('id', ID), ('text', TEXT), ('airline', AIRLINE), ('label', SENTIMENT)]
train_data, valid_data, test_data = data.TabularDataset.splits(path=data_dir,
                                                               train='train.csv',
                                                               validation='val.csv',
                                                               test='test.csv',
                                                               format='csv',
                                                               fields=fields,
                                                               skip_header=True)

In [None]:
#Download of vector cache files can be slow.  Pre-download to s3 default bucket
#Source: http://nlp.stanford.edu/data/glove.twitter.27B.zip

import sagemaker
import boto3
import os
import zipfile

sess = sagemaker.Session()
bucket = sess.default_bucket()

# Where the vectors live in S3 (empty prefix = bucket root) and where to
# cache them locally.
vec_file_prefix = ''
#vec_file='glove.twitter.27B.zip'
vec_file = 'glove.twitter.27B.25d.txt'
dest_dir = '.vector_cache/'

# S3 keys are always '/'-separated, independent of the local OS separator.
if vec_file_prefix:
    src_vec_file = f"{vec_file_prefix.rstrip('/')}/{vec_file}"
else:
    src_vec_file = vec_file
dest_vec_file = os.path.join(dest_dir, vec_file)

# Create the cache directory if needed; exist_ok avoids the check-then-create
# race and makes the cell safe to re-run.
os.makedirs(dest_dir, exist_ok=True)

# Only download when the file is not already cached locally.
if not os.path.exists(dest_vec_file):
    s3 = boto3.resource('s3')
    s3.meta.client.download_file(bucket, src_vec_file, dest_vec_file)

# Alternative path when the zipped archive is stored in S3 instead:
#if not os.path.exists('.vector_cache/glove.twitter.27B.25d.txt'):
#    with zipfile.ZipFile(dest_vec_file, 'r') as zip_ref:
#        zip_ref.extractall(dest_dir)

In [None]:
# Build every field's vocabulary from the training split only.
MAX_VOCAB_SIZE = 10_000

# The text vocabulary is capped at MAX_VOCAB_SIZE entries and paired with the
# 25-dimensional GloVe Twitter vectors. unk_init supplies the initializer
# torchtext applies to tokens that have no pretrained vector.
TEXT.build_vocab(
    train_data,
    vectors="glove.twitter.27B.25d",
    unk_init=torch.Tensor.normal_,
    max_size=MAX_VOCAB_SIZE,
)

# The remaining fields need no size cap or pretrained vectors.
for _field in (ID, SENTIMENT, AIRLINE):
    _field.build_vocab(train_data)

# Quick look at the most frequent tokens.
print(TEXT.vocab.freqs.most_common(20))

In [None]:
# save this - need for model prediction
vocab_file = 'vocab_index.pkl'
vocab_path = os.path.join(model_dir, vocab_file)

# The model directory may not exist yet on a fresh checkout.
os.makedirs(model_dir, exist_ok=True)

# Persist the token -> index mapping. The context manager guarantees the file
# is flushed and closed before it is re-opened for upload.
with open(vocab_path, 'wb') as outfile:
    pickle.dump(TEXT.vocab.stoi, outfile, pickle.HIGHEST_PROTOCOL)

s3 = boto3.client('s3')
# Upload from a managed handle so it is closed afterwards (previously the
# handle passed as Body was never closed). S3 keys are always '/'-separated.
with open(vocab_path, 'rb') as vocab_fh:
    obj = s3.put_object(Bucket=bucket,
                        Key=f'{s3_intermediate_dir}/{vocab_file}',
                        Body=vocab_fh)


Make data iterators so we can load batches properly

In [None]:
# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 32

# BucketIterator groups examples of similar length to minimise padding, so the
# sort key must be the token count. Sorting by the token list itself orders
# examples lexicographically and does not group by length.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key=lambda x: len(x.text),  # bucket by number of tokens
    batch_size=BATCH_SIZE,
    device=device)


Simple RNN model 

In [None]:
# model
class RNN(nn.Module):
    """Embedding -> single-layer RNN -> linear head.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the RNN hidden state.
        output_dim: size of the linear head's output.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return ``(logits, hidden)`` for a batch of token indices.

        ``text`` is laid out as [sent len, batch size]; ``logits`` is the
        linear head applied to the final hidden state, and ``hidden`` is that
        final state as returned by the RNN ([1, batch size, hid dim]).
        """
        # [sent len, batch size] -> [sent len, batch size, emb dim]
        token_vectors = self.embedding(text)

        # step_outputs: [sent len, batch size, hid dim]
        # hidden:       [1, batch size, hid dim]
        step_outputs, hidden = self.rnn(token_vectors)
        final_state = hidden.squeeze(0)

        # Sanity check: the last time step's output is the final hidden state.
        assert torch.equal(step_outputs[-1], final_state)

        logits = self.fc(final_state)
        return logits, hidden


Set the model parameters and instantiate the RNN. The input dimension is the size of the vocabulary (at most 10,000 tokens, plus the special tokens torchtext adds, such as `<unk>` and `<pad>`); the embedding dimension is the length of the pretrained GloVe vectors (25). The hidden dimension is the size of the hidden layer, and the output dimension is 1.

In [None]:
# Model hyperparameters.
INPUT_DIM = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 25           # matches the 25-d GloVe Twitter vectors
HIDDEN_DIM = 256
OUTPUT_DIM = 1               # a single logit for the binary label

model = RNN(input_dim=INPUT_DIM,
            embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM,
            output_dim=OUTPUT_DIM)


Copy pretrained vector into the embedding layer

In [None]:
# Vectors attached to the TEXT vocabulary when it was built.
pretrained_embeddings = TEXT.vocab.vectors
# Print the shape as a sanity check; it is expected to match the embedding
# layer's weight shape -- TODO confirm, since copy_ below relies on that.
print(pretrained_embeddings.shape)
# Overwrite the embedding layer's weights in place with the pretrained vectors.
model.embedding.weight.data.copy_(pretrained_embeddings)


Define training and evaluation functions. 

In [None]:
import torch.optim as optim

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    For every batch: zero the gradients, run ``model`` on ``batch.text``,
    score the squeezed predictions against ``batch.label`` with ``criterion``
    and ``binary_accuracy``, back-propagate, and step ``optimizer``.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the number of
        batches (``len(iterator)``).
    """
    model.train()

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        optimizer.zero_grad()

        # The model also returns its hidden state, which is not needed here.
        logits, _ = model(batch.text)
        logits = logits.squeeze(1)

        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating its weights.

    Puts the model in eval mode, disables gradient tracking, and for every
    batch scores the squeezed predictions against ``batch.label`` with
    ``criterion`` and ``binary_accuracy``.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the number of
        batches (``len(iterator)``).
    """
    model.eval()

    loss_total, acc_total = 0, 0

    with torch.no_grad():
        for batch in iterator:
            # The model also returns its hidden state, which is not needed here.
            logits, _ = model(batch.text)
            logits = logits.squeeze(1)

            loss_total += criterion(logits, batch.label).item()
            acc_total += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

Define helper functions

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole parts.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - whole_minutes * 60)
    return whole_minutes, leftover_seconds

def binary_accuracy(preds, y):
    """Fraction of correct predictions in a batch (e.g. 8/10 right -> 0.8, not 8).

    ``preds`` are raw logits: they are passed through a sigmoid and rounded to
    the nearest integer before being compared element-wise with ``y``.
    Returns a tensor holding the accuracy.
    """
    hard_labels = torch.sigmoid(preds).round()
    # Cast the boolean matches to float so they can be summed and divided.
    hits = hard_labels.eq(y).float()
    return hits.sum() / len(hits)

Set the optimizer and loss criterion, train the model, and evaluate the best checkpoint on the test set.

In [None]:
learning_rate = 1e-3
N_EPOCHS = 5

# Move the model (and loss) to the target device BEFORE building the
# optimizer, as the torch.optim documentation recommends, so the optimizer is
# constructed over the parameters it will actually update.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Local checkpoint location; make sure its directory exists before saving.
checkpoint_path = os.path.join(model_dir, model_file)
os.makedirs(model_dir, exist_ok=True)

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights of the epoch with the lowest validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), checkpoint_path)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

# Upload the best checkpoint from a managed handle so it is closed afterwards
# (previously the handle passed as Body was never closed). S3 keys are always
# '/'-separated.
with open(checkpoint_path, 'rb') as checkpoint_fh:
    boto3.client('s3').put_object(Bucket=bucket,
                                  Key=f'{s3_master_dir}/{model_file}',
                                  Body=checkpoint_fh)

# Restore the best weights before the final test-set evaluation.
model.load_state_dict(torch.load(checkpoint_path))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')


Return prediction, embedding, tweet and airline for front end using test dataset.

In [None]:
def predict_sentiment_from_dataset(model, tokenized):
    """Score one already-tokenized example with ``model``.

    Args:
        model: module that returns ``(logit, hidden)`` for a
            [sent len, batch size] tensor of token indices.
        tokenized: sequence of tokens; each is mapped to its index through
            ``TEXT.vocab.stoi``.

    Returns:
        ``(prediction, hidden)`` where ``prediction`` is the sigmoid of the
        model's logit as a Python float and ``hidden`` is the hidden state the
        model returned.
    """
    model.eval()
    # tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # Shape [sent len] -> [sent len, 1]: a batch containing a single example.
    tensor = torch.tensor(indexed, dtype=torch.long, device=device).unsqueeze(1)
    # Inference only: skip autograd bookkeeping so no graph is built and the
    # returned hidden state carries no gradient history.
    with torch.no_grad():
        sentiment, hidden = model(tensor)
        prediction = torch.sigmoid(sentiment)
    return prediction.item(), hidden


In [None]:
# save for front-end application
prediction_list = []
embedding_list = []
airline_list = []
tweet_list = []
for example in test_data:
    text = example.text  # this is tokenized
    airline = example.airline
    prediction, embedding = predict_sentiment_from_dataset(model, text)
    tweet_list.append(text)
    prediction_list.append(prediction)
    # Detach and copy to host memory first: calling .numpy() directly on a
    # CUDA tensor raises, and the model runs on the GPU when one is available.
    embedding_list.append(embedding.detach().cpu().numpy().squeeze(1))
    airline_list.append(airline)

# Parallel lists: entry i of each list describes the same test example.
output_dict = {"prediction": prediction_list,
               "embedding": embedding_list,
               "tweet": tweet_list,
               "airline": airline_list}

frontend_file = 'frontend_data'
frontend_path = os.path.join(data_dir, frontend_file)

# Context managers guarantee the pickle is flushed and closed before it is
# re-opened for upload, and that the upload handle is closed afterwards.
with open(frontend_path, 'wb') as outfile:
    pickle.dump(output_dict, outfile, pickle.HIGHEST_PROTOCOL)

# S3 keys are always '/'-separated.
with open(frontend_path, 'rb') as frontend_fh:
    boto3.client('s3').put_object(Bucket=bucket,
                                  Key=f'{s3_master_dir}/{frontend_file}',
                                  Body=frontend_fh)
