# Acknowledgements

The LSTM baseline model was run against AWS Sagemaker. To create the baseline model, we adapted the following tutorial that demonstrates a pytorch implementation of LSTMs in the Sagemaker environment. Initial data loading and preprocessimng steps were modified to account for the structure of our training data. The hyperparameters we experimented with were (1) whether STOP words were included in the vocabulary (2) word embedding dimension and (3) epochs. The best micro F1 score optained was 0.12

Reference https://github.com/danwild/sagemaker-sentiment-analysis/blob/master/SageMaker%20Project.ipynb

In [79]:
import os
import glob
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
import numpy as np
import pickle
import sagemaker
import torch
import torch.utils.data
import torch.optim as optim
from train.model_lstm import LSTMClassifier
from sagemaker.pytorch import PyTorch
import collections
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [80]:
df = pd.read_csv('./data/L3train_nonOneHot_7topics.csv')
list(df.category.unique())

['pro-choice',
 'pro-immigration',
 'pro-guns',
 'anti-spending',
 'anti-immigration',
 'oppose-medicare',
 'pro-life',
 'pro-free',
 'pro-medicare',
 'tax-wealthy',
 'anti-tax-wealthy',
 'anti-guns',
 'anti-free',
 'pro-spending']

In [81]:
label_dict = {'pro-immigration':0, 'anti-immigration':1, 'pro-guns':2, 'anti-guns':3, 'pro-medicare':4, 'oppose-medicare':5, 'pro-choice': 6, 'pro-life':7,
             'pro-spending':8, 'anti-spending':9, 'pro-free':10, 'anti-free':11, 'tax-wealthy':12, 'anti-tax-wealthy':13}


In [82]:
df.groupby('category').count()['text']

category
anti-free           666
anti-guns           745
anti-immigration    385
anti-spending       839
anti-tax-wealthy    587
oppose-medicare     639
pro-choice          886
pro-free            614
pro-guns            759
pro-immigration     900
pro-life            905
pro-medicare        673
pro-spending        439
tax-wealthy         950
Name: text, dtype: int64

In [83]:
data_y = df["category"].values
data_y = [label_dict[x] for x in data_y]
data_X = df["text"].values

In [84]:
# data_X, data_y = combine_data(data, labels)
print("Full dataset (combined): ", len(data_X))

Full dataset (combined):  9987


In [85]:
train_X, dev_X, train_y, dev_y = train_test_split( data_X, data_y, test_size=0.90, random_state=42)

In [86]:
train_X[33], train_y[33]

('That is the essence of democracy, and our democracy is based on the idea that we have a social contract that will ensure the well-being of all citizens — not just gun fetishists.',
 3)

In [87]:
collections.Counter(train_y)

Counter({8: 40,
         13: 68,
         3: 63,
         0: 81,
         6: 84,
         5: 58,
         4: 70,
         1: 40,
         12: 109,
         9: 85,
         7: 97,
         2: 71,
         11: 63,
         10: 69})

In [88]:
def sent_to_words(text):
    nltk.download("stopwords", quiet=True)
    
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Convert to lower case
    words = text.split() # Split string into words
#     words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    
    return words

In [89]:
# See an example
sent_to_words(train_X[1])

['fortunately', 'i', 'm', 'not', 'alone']

In [90]:
words_train = [sent_to_words(sent) for sent in train_X]
words_dev = [sent_to_words(sent) for sent in dev_X]

In [92]:
cache_dir = os.path.join("../cache", "lstm_baseline_single_label")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

def preprocess_data(data_train, data_dev, labels_train, labels_dev, cache_dir=cache_dir, cache_file="preprocessed_data5.pkl"):
    """Convert each sentence to words; read from cache if available."""

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except:
            pass  # unable to read from cache, but that's okay
    
    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        # Preprocess training and test data to obtain words for each review
        # words_train = list(map(review_to_words, data_train))
        # words_test = list(map(review_to_words, data_test))
        words_train = [sent_to_words(text) for text in data_train]
        words_dev = [sent_to_words(text) for text in data_dev]
        
        # Write to cache file for future runs
        if cache_file is not None:
            cache_data = dict(words_train=words_train, words_dev=words_dev,
                              labels_train=labels_train, labels_dev=labels_dev)
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words_train, words_dev, labels_train, labels_dev = (cache_data['words_train'],
                cache_data['words_dev'], cache_data['labels_train'], cache_data['labels_dev'])
    
    return words_train, words_dev, labels_train, labels_dev

In [93]:
# Preprocess data
train_X, dev_X, train_y, dev_y = preprocess_data(train_X, dev_X, train_y, dev_y)

Wrote preprocessed data to cache file: preprocessed_data5.pkl


# Transform Input

In [94]:
def build_dict(data, vocab_size = 50000):
    """Construct and return a dictionary mapping each of the most frequently appearing words to a unique integer."""
    
    # Determine how often each word appears in `data`. Note that `data` is a list of sentences and that a
    # sentence is a list of words.
    
    # A dict storing the words that appear in the reviews along with how often they occur
    word_count = {} 
    
    # tally up the word counts
    for sentence in data:
        for word in sentence:
            word_count[word] = word_count[word] + 1 if word in word_count else 1
    
    # Sort the words found in `data` so that sorted_words[0] is the most frequently appearing word and
    # sorted_words[-1] is the least frequently appearing word. 
    word_count_sorted = sorted(word_count.items(), key=(lambda item: item[1]), reverse=True)
    sorted_words = [item[0] for item in word_count_sorted]
    
    # This is what we are building, a dictionary that translates words into integers
    word_dict = {} 
    for idx, word in enumerate(sorted_words[:vocab_size - 2]): # The -2 is so that we save room for the 'no word'
        word_dict[word] = idx + 2                              # 'infrequent' labels
        
    return word_dict

In [95]:

def convert_and_pad(word_dict, sentence, pad=128):
    NOWORD = 0 # We will use 0 to represent the 'no word' category
    INFREQ = 1 # and we use 1 to represent the infrequent words, i.e., words not appearing in word_dict
    
    working_sentence = [NOWORD] * pad
    
    for word_index, word in enumerate(sentence[:pad]):
        if word in word_dict:
            working_sentence[word_index] = word_dict[word]
        else:
            working_sentence[word_index] = INFREQ
            
    return working_sentence, min(len(sentence), pad)

def convert_and_pad_data(word_dict, data, pad=128):
    result = []
    lengths = []
    
    for sentence in data:
        converted, leng = convert_and_pad(word_dict, sentence, pad)
        result.append(converted)
        lengths.append(leng)
        
    return np.array(result), np.array(lengths)

In [96]:
word_dict = build_dict(train_X)

In [97]:
list(word_dict.keys())[:5]

['the', 'to', 'of', 'and', 'a']

In [98]:
## Save Progress so far

data_dir = './data/pytorch' # The folder we will use for storing data
if not os.path.exists(data_dir): # Make sure that the folder exists
    print("making folder")
    os.makedirs(data_dir)

In [99]:
with open(os.path.join(data_dir, 'word_dict.pkl'), "wb") as f:
    print("Pickling file")
    pickle.dump(word_dict, f)

Pickling file


In [100]:
train_X_num, train_X_len = convert_and_pad_data(word_dict, train_X)
dev_X_num, dev_X_len = convert_and_pad_data(word_dict, dev_X)

In [101]:
print(train_X[1])
print(train_X_num[1])

['fortunately', 'i', 'm', 'not', 'alone']
[2021   65  303   22  526    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [102]:
max([len(x) for x in train_X_num])

128

# Upload Data to Sagemaker

In [103]:
data_dir

'./data/pytorch'

In [104]:
pd.concat([pd.DataFrame(train_y), pd.DataFrame(train_X_len), pd.DataFrame(train_X_num)], axis=1) \
        .to_csv(os.path.join(data_dir, 'lstm_train.csv'), header=False, index=False)

In [105]:
sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/sentiment_rnn'

role = sagemaker.get_execution_role()

In [106]:
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

# Build and Train Pytorch Model

In [107]:
!pygmentize train/model_lstm.py

[34mimport[39;49;00m [04m[36mtorch.nn[39;49;00m [34mas[39;49;00m [04m[36mnn[39;49;00m

[34mclass[39;49;00m [04m[32mLSTMClassifier[39;49;00m(nn.Module):
    [33m"""[39;49;00m
[33m    This is the simple RNN model we will be using to perform Sentiment Analysis.[39;49;00m
[33m    """[39;49;00m

    [34mdef[39;49;00m [32m__init__[39;49;00m([36mself[39;49;00m, embedding_dim, hidden_dim, vocab_size):
        [33m"""[39;49;00m
[33m        Initialize the model by settingg up the various layers.[39;49;00m
[33m        """[39;49;00m
        [36msuper[39;49;00m(LSTMClassifier, [36mself[39;49;00m).[32m__init__[39;49;00m()

        [36mself[39;49;00m.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=[34m0[39;49;00m)
        [36mself[39;49;00m.lstm = nn.LSTM(embedding_dim, hidden_dim)
        [36mself[39;49;00m.dense = nn.Linear(in_features=hidden_dim, out_features=[34m14[39;49;00m)
        [36mself[39;49;00m.sm = nn.Soft

In [30]:
import torch
import torch.utils.data

# Read in only the first 500 rows for testing
train_sample = pd.read_csv(os.path.join(data_dir, 'lstm_train.csv'), header=None, names=None, nrows=500)

# Turn the input pandas dataframe into tensors
train_sample_y = torch.from_numpy(train_sample[[0]].values).long().squeeze()
train_sample_X = torch.from_numpy(train_sample.drop([0], axis=1).values).long()

# Build the dataset
train_sample_ds = torch.utils.data.TensorDataset(train_sample_X, train_sample_y)
# Build the dataloader
train_sample_dl = torch.utils.data.DataLoader(train_sample_ds, batch_size=50)

In [31]:
train_sample[[0]].values

array([[ 8],
       [13],
       [ 3],
       [13],
       [ 0],
       [ 6],
       [ 3],
       [13],
       [ 5],
       [ 4],
       [ 1],
       [ 3],
       [12],
       [ 9],
       [ 7],
       [ 4],
       [ 7],
       [ 5],
       [ 2],
       [11],
       [ 9],
       [ 9],
       [ 0],
       [ 1],
       [ 1],
       [11],
       [ 2],
       [ 5],
       [ 9],
       [12],
       [ 8],
       [ 7],
       [ 4],
       [ 3],
       [11],
       [ 6],
       [ 4],
       [ 5],
       [ 7],
       [12],
       [ 1],
       [ 6],
       [13],
       [ 0],
       [13],
       [ 2],
       [ 3],
       [ 6],
       [ 6],
       [ 4],
       [10],
       [ 2],
       [ 6],
       [12],
       [ 9],
       [ 6],
       [ 5],
       [ 1],
       [ 9],
       [ 2],
       [11],
       [ 9],
       [13],
       [ 4],
       [ 8],
       [12],
       [ 4],
       [ 2],
       [ 7],
       [ 9],
       [ 2],
       [10],
       [ 2],
       [ 0],
       [ 0],
       [ 2],
       [ 6],

In [32]:
def train(model, train_loader, epochs, optimizer, loss_fn, device):
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        for batch in train_loader:         
            batch_X, batch_y = batch
            
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            
            # TODO: Complete this train method to train the model provided.
            optimizer.zero_grad()
            output = model(batch_X)
            loss = loss_fn(output, batch_y)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.data.item()
        print("Epoch: {}, Cross Entropy Loss: {}".format(epoch, total_loss / len(train_loader)))

In [33]:
import torch.optim as optim
from train.model_lstm import LSTMClassifier

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(100, 100, 50000).to(device)
optimizer = optim.Adam(model.parameters())
loss_fn = torch.nn.CrossEntropyLoss()

train(model, train_sample_dl, 5, optimizer, loss_fn, device)

Epoch: 1, Cross Entropy Loss: 2.6392315864562987
Epoch: 2, Cross Entropy Loss: 2.634082627296448
Epoch: 3, Cross Entropy Loss: 2.6289652824401855
Epoch: 4, Cross Entropy Loss: 2.6221601009368896
Epoch: 5, Cross Entropy Loss: 2.6114129304885862


In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for bt in train_sample_dl:
    bX, bY = bt
    
    bX = bX.to(device)
    bY = bY.to(device)
    
    model = LSTMClassifier(100, 100, 50000).to(device)
    optimizer.zero_grad()
    output = model(bX)
    print(output.shape)

torch.Size([50, 14])
torch.Size([50, 14])
torch.Size([50, 14])
torch.Size([50, 14])
torch.Size([50, 14])
torch.Size([50, 14])
torch.Size([50, 14])
torch.Size([50, 14])
torch.Size([50, 14])
torch.Size([50, 14])


# Now do it on the full training dataset!

In [137]:
estimator = PyTorch(entry_point="train_lstm.py",
                    source_dir="train",
                    role=role,
                    framework_version='0.4.0',
                    train_instance_count=1,
                    train_instance_type='ml.p2.xlarge',
                    hyperparameters={
                        'epochs': 10,
                        'hidden_dim': 200,
                    })

In [138]:
estimator.fit({'training': input_data})

2020-04-06 05:43:02 Starting - Starting the training job...
2020-04-06 05:43:04 Starting - Launching requested ML instances......
2020-04-06 05:44:05 Starting - Preparing the instances for training......
2020-04-06 05:45:20 Downloading - Downloading input data...
2020-04-06 05:45:56 Training - Downloading the training image...
2020-04-06 05:46:27 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-04-06 05:46:26,812 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-04-06 05:46:26,836 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-04-06 05:46:29,890 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-04-06 05:46:30,171 sagemaker-containers INFO     Module train_lstm does not provide a setup.py

In [139]:
# TODO: Deploy the trained model
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

-------------!

In [140]:
# Evaluate on Dev Dataset

In [141]:
dev_X = pd.concat([pd.DataFrame(dev_X_len), pd.DataFrame(dev_X_num)], axis=1)

In [142]:
# We split the data into chunks and send each chunk seperately, accumulating the results.

def predict(data, rows=512):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = np.array([])
    for array in split_array:
        predictions = np.append(predictions, predictor.predict(array))
    
    return predictions

In [143]:
predictions = predict(dev_X.values)
predictions_reshape = predictions.reshape((int(predictions.shape[0]/14), 14))
pred_vals = np.argmax(predictions_reshape, axis=1)

In [145]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
print('Accuracy:', accuracy_score(dev_y, pred_vals))
print('F1:', f1_score(dev_y, pred_vals, average = 'micro'))

Accuracy: 0.09500500611858939
F1: 0.09500500611858939


# Don't Forget to Delete!!!

In [146]:
predictor.delete_endpoint()