# Understanding User Comments via Sentiment Analysis

---
*TensorFlow*

Nathaniel Haddad - 2019

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import urllib
import pandas as pd

In [None]:
# Show which TensorFlow version this notebook is running against.
tf.__version__

In [None]:
# Input-pipeline and vocabulary settings shared by the cells below.
BUFFER_SIZE = 10000  # shuffle buffer size for the training dataset
BATCH_SIZE = 64      # examples per batch for training and evaluation
VOCAB_SIZE = 1000    # max_tokens passed to the text vectorization layer

## Functions

In [None]:
def create_model_LSTM(text_encoder=None, embedding_dim=64, lstm_units=64,
                      dense_units=64):
    """
    function: create_model_LSTM
    param(s): text_encoder: adapted text vectorization layer that maps raw
                  strings to token ids; when None, the notebook-level
                  `encoder` is used (the original behaviour)
              embedding_dim (int): size of each token embedding
              lstm_units (int): units in each direction of the bidirectional LSTM
              dense_units (int): units in the hidden dense layer
    returns: an uncompiled tf.keras.Sequential model
    does: stacks text encoder -> embedding -> bidirectional LSTM -> dense
          -> single-unit output. The output layer has no activation, so the
          model emits one raw score (logit) per input string.
    """
    if text_encoder is None:
        # Backward-compatible default: callers that pass nothing still get the
        # globally defined, already-adapted `encoder`.
        text_encoder = encoder

    return tf.keras.Sequential([
        text_encoder,
        tf.keras.layers.Embedding(
            input_dim=len(text_encoder.get_vocabulary()),
            output_dim=embedding_dim,
            # Use masking to handle the variable sequence lengths
            mask_zero=True),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units)),
        tf.keras.layers.Dense(dense_units, activation='relu'),
        tf.keras.layers.Dense(1)
    ])

In [None]:
def plot_history(history):
    """
    function: plot_history
    param(s): history: object whose `.history` dict holds the per-epoch metrics
    returns: nothing
    does: draws accuracy (left) and loss (right) curves side by side on one
          16x8 figure, capping the accuracy axis at 1 and flooring the loss
          axis at 0
    """
    plt.figure(figsize=(16, 8))
    # (metric name, (lower y-limit, upper y-limit)) for each panel, left to right.
    panels = (
        ('accuracy', (None, 1)),
        ('loss', (0, None)),
    )
    for position, (metric, (y_low, y_high)) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plot_graphs(history, metric)
        plt.ylim(y_low, y_high)

In [None]:
def download_file(url: str, fname: str) -> None:
    """
    function: download_file
    param(s): url (str): URL of the resource to fetch;
              fname (str): local path the resource is written to
    returns: nothing
    does: downloads the resource at `url` and saves it as `fname`
    """
    # `import urllib` alone does not load the `request` submodule; import it
    # explicitly instead of relying on some other package having done so.
    import urllib.request

    urllib.request.urlretrieve(url, fname)

In [None]:
def plot_graphs(history, metric):
    """
    from tensorflow docs

    Plots the training curve for `metric` and its validation counterpart
    (`'val_' + metric`) from `history.history` on the current axes, labelling
    the x-axis "Epochs" and the y-axis with the metric name.
    """
    val_metric = 'val_' + metric
    per_epoch = history.history
    plt.plot(per_epoch[metric])
    plt.plot(per_epoch[val_metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])

## Data

In [None]:
# Figshare download links for the annotated comments and their annotations.
ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7554634'
ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7554637'

# Fetch each file into the working directory under the name the loading cell reads.
for _url, _fname in (
    (ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv'),
    (ANNOTATIONS_URL, 'attack_annotations.tsv'),
):
    download_file(_url, _fname)

Create pandas dataframes for quick data preprocessing

In [None]:
# Load the two tab-separated files. The comments frame uses its first column
# as the index (presumably rev_id -- confirm against the file header).
comments = pd.read_csv(
    'attack_annotated_comments.tsv',
    index_col=0,
    sep='\t',
)
annotations = pd.read_csv(
    'attack_annotations.tsv',
    sep='\t',
)

In [None]:
# Label a comment as an attack when the mean of its annotators' `attack`
# values exceeds 0.5 (i.e. a majority, assuming the values are 0/1).
attack_rate = annotations.groupby('rev_id')['attack'].mean()
labels = attack_rate > 0.5

In [None]:
# Join labels and comments: `labels` is indexed by rev_id, and pandas aligns
# the assignment on the index of `comments` (assumes that index is also
# rev_id -- confirm against the TSV header).
comments['attack'] = labels

In [None]:
# Replace the literal NEWLINE_TOKEN / TAB_TOKEN placeholders with spaces.
# Uses the vectorised string accessor with regex=False (plain substring
# replacement); unlike a per-row lambda it passes missing values through
# instead of raising on non-string entries.
comments['comment'] = (
    comments['comment']
    .str.replace("NEWLINE_TOKEN", " ", regex=False)
    .str.replace("TAB_TOKEN", " ", regex=False)
)

In [None]:
# Preview the first three rows after labelling and placeholder cleanup.
comments.head(3)

Convert pandas dataframe to TensorFlow tf.data.Dataset

In [None]:
# Partition the rows using the dataset's own `split` column.
is_train = comments['split'] == 'train'
is_test = comments['split'] == 'test'
train_data = comments[is_train]
test_data = comments[is_test]

# Detach the target column so the frames hold only the input features.
train_labels = train_data.pop('attack')
test_labels = test_data.pop('attack')

In [None]:
# Pair each comment string with its label, then wrap the pairs as tf.data
# datasets that yield one (comment, label) element at a time.
train_tensors = (train_data['comment'].values, train_labels.values)
test_tensors = (test_data['comment'].values, test_labels.values)

train_dataset = tf.data.Dataset.from_tensor_slices(train_tensors)
test_dataset = tf.data.Dataset.from_tensor_slices(test_tensors)

In [None]:
# Inspect the type specification of one (comment, label) dataset element.
train_dataset.element_spec

Get an example feature and label from training set

In [None]:
# Print the first three (comment, label) pairs of the unbatched training set.
for example, label in train_dataset.take(3):
    print(f'text: {example.numpy()}\n')
    print(f'label: {label.numpy()}')

Shuffle the training dataset using the specified buffer size, then batch and prefetch both the training and test datasets

In [None]:
# Training pipeline: shuffle within a BUFFER_SIZE window, batch, then prefetch.
train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

# Test pipeline: same batching and prefetching, but no shuffling.
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# Print the first three texts and labels from each of three batches.
# NOTE: `example` stays bound to the last batch after the loop and is reused
# by the encoding cells further down, so the loop variable name matters.
for example, label in train_dataset.take(3):
    print(f'texts: {example.numpy()[:3]}\n')
    print(f'labels: {label.numpy()[:3]}')

## Text preprocessing

In [None]:
# Resolve the TextVectorization layer class. Prefer the stable
# `tf.keras.layers.TextVectorization` location and fall back to the older
# `experimental.preprocessing` path on TensorFlow releases that only ship it
# there, so the notebook runs on both.
try:
    _TextVectorization = tf.keras.layers.TextVectorization
except AttributeError:
    _TextVectorization = tf.keras.layers.experimental.preprocessing.TextVectorization

# Keep at most VOCAB_SIZE tokens in the vocabulary.
encoder = _TextVectorization(max_tokens=VOCAB_SIZE)
# Learn the vocabulary from the training texts only (labels are dropped).
encoder.adapt(train_dataset.map(lambda text, label: text))

After the text vectorization layer has been adapted, `get_vocabulary()` returns the most frequent tokens (including the padding and unknown-token entries)

In [None]:
# Materialise the learned vocabulary as a NumPy array so it can later be
# indexed with arrays of token ids (used for the round-trip printout).
vocab = np.array(encoder.get_vocabulary())
# Show the first 20 vocabulary entries.
vocab[:20]

Encode the example texts as token indices (zero-padded to the length of the longest sequence in the batch)

In [None]:
# `example` is the last batch left bound by the earlier batched
# `train_dataset.take(3)` loop; encode it to token ids and keep the first
# three rows as a NumPy array.
encoded_example = encoder(example)[:3].numpy()
encoded_example

In [None]:
# Compare each original text with the text rebuilt from its token ids.
for n in range(3):
    original_text = example[n].numpy()
    round_trip_text = " ".join(vocab[encoded_example[n]])
    print(f"Original: {original_text}")
    print(f"Round-trip: {round_trip_text}\n")

## Training

In [None]:
# Instantiate the model; create_model_LSTM reads the `encoder` layer adapted above.
model = create_model_LSTM()

In [None]:
# List, per layer, whether it supports masking (relevant because the
# Embedding layer is created with mask_zero=True).
print([layer.supports_masking for layer in model.layers])

In [None]:
# predict on a sample text without padding.
# The model has not been trained yet at this point (fit happens later), so
# the printed value only reflects the initial weights.
sample_text = ('What a great addition to Wikipedia '
               'thanks so much for your contribution.')
predictions = model.predict(np.array([sample_text]))
print(predictions[0])

In [None]:
# predict on a sample text with padding
# Batching the sample with a much longer text ("the " repeated 2000 times)
# forces the sample to be padded; only the sample's own prediction (row 0)
# is printed -- presumably to compare against the unpadded result above and
# confirm that masking makes padding irrelevant.
padding = "the " * 2000
predictions = model.predict(np.array([sample_text, padding]))
print(predictions[0])

In [None]:
# The final Dense(1) layer has no activation, so the model outputs raw logits.
# The loss is told so via from_logits=True, and accuracy must likewise
# threshold at 0.0 (logit 0 == probability 0.5); the plain 'accuracy' string
# would threshold the logits at 0.5 instead. The metric is named 'accuracy'
# so the history keys ('accuracy' / 'val_accuracy') read by the plotting
# helpers stay the same.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy',
                                                       threshold=0.0)])

In [None]:
# Train for five epochs. Validation runs on the first five batches of the
# test dataset after each epoch.
# NOTE(review): the test set doubles as validation data here, so it is not a
# fully held-out set if these curves are used for tuning.
fit_options = {
    'epochs': 5,
    'validation_data': test_dataset,
    'validation_steps': 5,
}
history = model.fit(train_dataset, **fit_options)

In [None]:
# Evaluate on the full test dataset; yields the loss followed by the single
# metric configured at compile time (accuracy).
test_loss, test_acc = model.evaluate(test_dataset)

## Metrics

In [None]:
# Report the final evaluation results.
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_acc}')

In [None]:
# Plot training vs. validation accuracy and loss across epochs.
plot_history(history)

## Test examples

In [None]:
# Score a benign comment. The model outputs a raw logit (the output layer has
# no activation), so a negative value leans "not an attack" and a positive
# value leans "attack".
sample_text = 'This is a good comment. Great job!'
predictions = model.predict(np.array([sample_text]))
# Display the score, as the earlier prediction cells do; otherwise the result
# is computed and silently discarded.
print(predictions[0])

In [None]:
# Score a hostile comment. The model outputs a raw logit (the output layer has
# no activation), so a positive value leans "attack".
# The trailing space after 'person!' matters: adjacent string literals are
# concatenated verbatim, and without it the text reads "person!No one".
sample_text = ('This is a bad comment. You are a terrible person! '
               'No one should ever have to read your stupid ideas!')
predictions = model.predict(np.array([sample_text]))
# Display the score, as the earlier prediction cells do; otherwise the result
# is computed and silently discarded.
print(predictions[0])