# Computational Intelligence Lab 2018 -- Text Sentiment Classification
### Larissa Laich, Lukas Jendele, Michael Wiegner, Ondrej Skopek
Department of Computer Science, ETH Zurich, Switzerland

## Project definition
The use of microblogging and text messaging as a medium of communication has greatly increased over the past 10 years. Such large volumes of data amplify the need for automatic methods to understand the opinion conveyed in a text.

### Resources
All the necessary resources (including training data) are available at https://inclass.kaggle.com/c/cil-text-classification-2017

### Training Data
For this problem, we have acquired 2.5M tweets classified as either positive or negative.

### Evaluation Metrics
*Classification Accuracy*

## Data exploration notebook

In [0]:
import sklearn as skl
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [0]:
#@title Google Drive Code

from google.colab import auth
import io
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload, MediaFileUpload
# Authenticate the current Colab user, then build the Drive v3 API client
# that the Drive helper functions in this cell use as a module-level global.
auth.authenticate_user()
drive_service = build('drive', 'v3')


def uploadToDrive(filepath, name, mimeType="text/plain"):
  """Upload a local file to Google Drive and print the ID of the created file.

  Uses the module-level `drive_service` client. See:
    https://developers.google.com/drive/v3/reference/files/create
    https://developers.google.com/drive/v3/web/manage-uploads

  Arguments:
  filepath: path of the local file to upload.
  name: name the file gets on Drive.
  mimeType: MIME type used both for the Drive metadata and for the upload.
  """
  upload = MediaFileUpload(filepath, mimetype=mimeType, resumable=True)
  create_request = drive_service.files().create(
      body={'name': name, 'mimeType': mimeType},
      media_body=upload,
      fields='id')  # only the new file's ID is requested back
  response = create_request.execute()
  print('File ID: %s' % response.get('id'))

def downloadFromDrive(file_id, downloaded):
  """Download a Drive file chunk by chunk into `downloaded`, printing progress.

  Uses the module-level `drive_service` client.

  Arguments:
  file_id: Drive ID of the file to fetch
    (a file ID looks like: 1uBtlaggVyWshwcyP6kEI-y_W3P8D26sz).
  downloaded: writable binary file-like object handed to MediaIoBaseDownload;
    the file content is written into it.
  """
  request = drive_service.files().get_media(fileId=file_id)
  downloader = MediaIoBaseDownload(downloaded, request)
  done = False
  last_progress = 0
  print("Downloading %s:\t%d%%..." % (file_id, last_progress))
  while not done:
    status, done = downloader.next_chunk()
    progress = int(status.progress() * 100)
    # Report whenever at least 10 more percentage points have arrived. Chunk
    # boundaries rarely land exactly 10 points apart, so an equality test would
    # almost never fire. The final 100% line is printed once, after the loop.
    if not done and progress >= last_progress + 10:
      last_progress = progress
      print("Downloading %s:\t%d%%..." % (file_id, last_progress))
  print("Downloading %s:\t%d%%..." % (file_id, 100))


def findAndDownload(name):
  """Find a Drive file whose name contains `name` and download it to /tmp.

  The first match returned by the Drive listing is downloaded.

  Arguments:
  name: (part of) the Drive file name; also used as the local file name.

  Returns: path of the downloaded file ("/tmp/" + name).

  Raises:
  ValueError: if the Drive listing returns no matching file.
  """
  results = drive_service.files().list(q='name contains "' + name + '"').execute()
  matches = results.get("files", [])
  if not matches:
    raise ValueError("No such file found: " + name)

  # Open the target only after the lookup succeeded, so a failed lookup does
  # not leave an empty file behind in /tmp.
  fname = "/tmp/" + name
  with open(fname, "wb") as f:
    downloadFromDrive(matches[0]["id"], f)
  return fname

In [0]:
def read_lines(filename):
  """Read a text file and return its lines with surrounding whitespace removed.

  Arguments:
  filename: path of the text file to read.

  Returns: a list with one stripped string per line (lines are not split
  into tokens); an empty file yields an empty list.
  """
  with open(filename) as handle:
    return [raw_line.strip() for raw_line in handle]

# File description
* `train_pos.txt` and `train_neg.txt` -- a small set of training tweets for each of the two classes.
* `train_pos_full.txt` and `train_neg_full.txt` -- a complete set of training tweets for each of the two classes, about 1M tweets per class.
* `test_data.txt` -- the test set, that is the tweets for which you have to predict the sentiment label.
* `sampleSubmission.csv` -- a sample submission file in the correct format, note that each test tweet is numbered.
  * Submission of predictions: -1 = negative prediction, 1 = positive prediction

Note that all tweets have already been tokenized, so that words and punctuation are properly separated by whitespace.

In [0]:
# TODO: Try more preprocessing: some smileys are not detected (<3), etc; what about case (upper/lower)? Remove URLs?
# TODO: Look at RCNN, Lukas thinks it did exactly this thing on this dataset (custom LSTM cell)
# TODO: Look at number of unique words (or actually, at the words themselves)

In [0]:
# Choose which training files to load. With the small datasets each download
# should finish in about 10 seconds; uncomment the two lines below (and comment
# out the *_full ones) to use them.
# train_pos_filename="train_pos.txt"
# train_neg_filename="train_neg.txt"
train_pos_filename="train_pos_full.txt"
train_neg_filename="train_neg_full.txt"
test_filename="test_data.txt" # TODO: load the test data (the name is not used in this cell yet)
# Local paths built the same way findAndDownload builds them; not read in this cell.
train_pos_file="/tmp/" + train_pos_filename
train_neg_file="/tmp/" + train_neg_filename
# Fetch each file from Drive and read it into a list of stripped lines.
X_pos = read_lines(findAndDownload(train_pos_filename))
X_neg = read_lines(findAndDownload(train_neg_filename))

In [0]:
# Glove Twitter embeddings (TODO: Try different ones)
#!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip
#!unzip glove.twitter.27B.zip
# Produces files: glove.twitter.27B.100d.txt  glove.twitter.27B.200d.txt  glove.twitter.27B.25d.txt  glove.twitter.27B.50d.txt

In [0]:
# Seed NumPy's global RNG; SEED is also passed as random_state to both splits below.
SEED = 42
np.random.seed(SEED)
from sklearn.model_selection import train_test_split
# Labels: +1 for each positive tweet, -1 for each negative one, in the same order as X_s.
y_s = [+1] * len(X_pos) + [-1] * len(X_neg)
X_s = X_pos + X_neg
# NOTE(review): test_size=0.8 keeps only 20% of the data in X_train and sends
# 80% to the held-out pool -- confirm this is intended and not a swapped ratio.
X_train, X_dev, y_train, y_dev = train_test_split(X_s, y_s, random_state=SEED, test_size=0.8)
# Carve 1% of the held-out pool off as a local test split; the rest stays as the dev split.
X_dev, X_test, y_dev, y_test = train_test_split(X_dev, y_dev, random_state=SEED, test_size=0.01)

In [0]:
import numpy as np

class CILDataset:
    """In-memory CIL Twitter dataset encoded at word and character level.

    Every input line is split on single spaces into words. Each word is mapped
    to a word id and to a "charseq" id that points at the word's sequence of
    character ids. Batches are drawn without replacement from a random
    permutation of the sentences.
    """

    def __init__(self, lines, sentiments, add_bow_eow=False, train=None):
        """Encode the given lines and their sentiment labels.

        Arguments:
        lines: sequence of already tokenized tweets (words separated by spaces).
        sentiments: labels, indexable in parallel with `lines`.
        add_bow_eow: Whether to add BOW/EOW characters to the word characters.
        train: If given, the vocabularies from the training data will be reused
          (and not extended); unseen words and characters become '<unk>'.

        Raises:
        KeyError: if `train` is given and a label is missing from its
          sentiment vocabulary.
        """
        reuse_vocabulary = train is not None

        # Create vocabulary_maps
        if reuse_vocabulary:
            self._vocabulary_maps = train._vocabulary_maps
        else:
            self._vocabulary_maps = {'chars': {'<pad>': 0, '<unk>': 1, '<bow>': 2, '<eow>': 3},
                                     'words': {'<pad>': 0, '<unk>': 1, '\n': 2}, # \n represents EOS
                                     'tags': {'<pad>': 0, '<unk>': 1, '\n': 2}, # \n represents EOS
                                     'sentiments': {}}
        char_map = self._vocabulary_maps['chars']
        word_map = self._vocabulary_maps['words']
        sentiment_map = self._vocabulary_maps['sentiments']

        self._word_ids = []
        self._charseq_ids = []
        # Charseq id 0 is reserved for '<pad>'. The slot must really exist in
        # _charseqs, otherwise the first real word would be assigned id 0 as
        # well and a literal '<pad>' token would alias that word's characters.
        self._charseqs_map = {'<pad>': 0}
        self._charseqs = [[char_map['<pad>']]]
        self._sentiments = []

        # Load the sentences
        for idx, line in enumerate(lines):
            sentiment = sentiments[idx]
            line = line.rstrip("\r\n")
            if not reuse_vocabulary and sentiment not in sentiment_map:
                sentiment_map[sentiment] = len(sentiment_map)
            self._sentiments.append(sentiment_map[sentiment])

            sentence_word_ids = []
            sentence_charseq_ids = []
            self._word_ids.append(sentence_word_ids)
            self._charseq_ids.append(sentence_charseq_ids)
            for token in line.split(" "):
                # An empty token (e.g. from an empty line) is encoded as "\n" (EOS).
                word = token if token else "\n"

                # Characters: every distinct word string is encoded only once.
                if word not in self._charseqs_map:
                    self._charseqs_map[word] = len(self._charseqs)
                    charseq = []
                    if add_bow_eow:
                        charseq.append(char_map['<bow>'])
                    for c in word:
                        if c not in char_map:
                            if reuse_vocabulary:
                                c = '<unk>'
                            else:
                                char_map[c] = len(char_map)
                        charseq.append(char_map[c])
                    if add_bow_eow:
                        charseq.append(char_map['<eow>'])
                    self._charseqs.append(charseq)
                sentence_charseq_ids.append(self._charseqs_map[word])

                # Words
                if word not in word_map:
                    if reuse_vocabulary:
                        word = '<unk>'
                    else:
                        word_map[word] = len(word_map)
                sentence_word_ids.append(word_map[word])

        # Compute sentence lengths
        self._sentence_lens = np.array([len(ids) for ids in self._word_ids], np.int32)

        # Create vocabularies (id -> token lists, the inverse of the maps)
        if reuse_vocabulary:
            self._vocabularies = train._vocabularies
        else:
            self._vocabularies = {}
            for feature, mapping in self._vocabulary_maps.items():
                tokens = [""] * len(mapping)
                for token, index in mapping.items():
                    tokens[index] = token
                self._vocabularies[feature] = tokens

        self._permutation = np.random.permutation(len(self._sentence_lens))

    def vocabulary(self, feature):
        """Return vocabulary (list indexed by id) for required feature.

        The features are the following:
        words
        chars
        tags
        sentiments
        """
        return self._vocabularies[feature]

    def next_batch(self, batch_size):
        """Return the next batch.

        At most `batch_size` sentences are taken from the remaining part of the
        current permutation; the last batch of an epoch may be smaller.

        Arguments:
        batch_size: maximum number of sentences in the batch.

        Returns: (sentence_lens, word_ids, charseq_ids, charseqs, charseq_lens, sentiments)
        sentence_lens: batch of sentence_lens
        word_ids: batch of word_ids
        charseq_ids: batch of charseq_ids (the same shape as word_ids, but with the ids pointing into charseqs).
        charseqs: unique charseqs in the batch, indexable by charseq_ids;
          contain indices of characters from vocabulary('chars')
        charseq_lens: length of charseqs
        sentiments: batch of sentiments
        """
        batch_size = min(batch_size, len(self._permutation))
        batch_perm = self._permutation[:batch_size]
        self._permutation = self._permutation[batch_size:]
        return self._next_batch(batch_perm)

    def epoch_finished(self):
        """Return True when all sentences were consumed, starting a new permutation.

        Note the side effect: when the epoch is over, a fresh random permutation
        is drawn so that the next epoch can begin immediately.
        """
        if len(self._permutation) == 0:
            self._permutation = np.random.permutation(len(self._sentence_lens))
            return True
        return False

    def whole_data_as_batch(self):
        """Return the whole dataset, in original order, in the format of next_batch."""
        return self._next_batch(np.arange(len(self._sentence_lens)))

    def _next_batch(self, batch_perm):
        """Assemble the zero-padded batch arrays for the sentence indices in `batch_perm`."""
        batch_size = len(batch_perm)

        # General data
        batch_sentence_lens = self._sentence_lens[batch_perm]
        # np.max raises on an empty array, so an empty batch gets width 0.
        max_sentence_len = np.max(batch_sentence_lens) if batch_size else 0

        # Word-level data
        batch_word_ids = np.zeros([batch_size, max_sentence_len], np.int32)
        for i in range(batch_size):
            batch_word_ids[i, 0:batch_sentence_lens[i]] = self._word_ids[batch_perm[i]]

        batch_sentiments = np.array([self._sentiments[index] for index in batch_perm], np.int32)

        # Character-level data: renumber the charseqs used in this batch from 0
        # in order of first appearance, so that `charseqs` holds each one once.
        batch_charseq_ids = np.zeros([batch_size, max_sentence_len], np.int32)
        charseqs_map, charseqs = {}, []
        for i in range(batch_size):
            for j, charseq_id in enumerate(self._charseq_ids[batch_perm[i]]):
                if charseq_id not in charseqs_map:
                    charseqs_map[charseq_id] = len(charseqs)
                    charseqs.append(self._charseqs[charseq_id])
                batch_charseq_ids[i, j] = charseqs_map[charseq_id]

        batch_charseq_lens = np.array([len(charseq) for charseq in charseqs], np.int32)
        max_charseq_len = np.max(batch_charseq_lens) if charseqs else 0
        batch_charseqs = np.zeros([len(charseqs), max_charseq_len], np.int32)
        for i, charseq in enumerate(charseqs):
            batch_charseqs[i, 0:len(charseq)] = charseq

        return batch_sentence_lens, batch_word_ids, batch_charseq_ids, batch_charseqs, batch_charseq_lens, batch_sentiments


In [0]:
import datetime

import numpy as np
import tensorflow as tf

class Network:
    """Bidirectional-RNN sentiment classifier over word and character inputs.

    Built with the TensorFlow 1.x graph/session API. Each word is represented
    by the concatenation of a word embedding and a character-level
    representation produced by a bidirectional RNN over the word's characters.
    A second bidirectional RNN reads the sentence, and its final states feed a
    dense hidden layer and a CLASSES-way softmax output.
    """

    CLASSES = 2

    def __init__(self, rnn_cell, rnn_cell_dim, num_words, num_chars, logdir, expname, threads=1, seed=SEED, word_embedding=100, char_embedding=100, keep_prob=0.5, learning_rate=1e-4):
        """Create the graph, the session and the summary writer.

        Arguments:
        rnn_cell: "LSTM" or "GRU".
        rnn_cell_dim: number of units of every RNN cell.
        num_words: size of the word vocabulary.
        num_chars: size of the character vocabulary.
        logdir: directory under which the summary directory is created.
        expname: experiment name, used in the summary directory name.
        threads: inter-/intra-op parallelism of the session.
        seed: graph-level random seed.
        word_embedding: word embedding size, or -1 for one-hot words.
        char_embedding: character embedding size, or -1 for one-hot characters.
        keep_prob: accepted for interface compatibility but not read here; the
          keep probability is fed per step through the `self.keep_prob` placeholder.
        learning_rate: learning rate of the Adam optimizer.

        Raises:
        ValueError: if `rnn_cell` is neither "LSTM" nor "GRU".
        """
        # Create an empty graph and a session
        graph = tf.Graph()
        graph.seed = seed
        self.session = tf.Session(graph=graph, config=tf.ConfigProto(inter_op_parallelism_threads=threads,
                                                                     intra_op_parallelism_threads=threads))

        timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
        self.summary_writer = tf.summary.FileWriter("{}/{}-{}".format(logdir, timestamp, expname), flush_secs=10)

        # Construct the graph
        with self.session.graph.as_default():
            # Each direction gets its own cell object: reusing one cell for both
            # directions of a bidirectional RNN ties the directions' weights.
            word_cell_fw = self._make_cell(rnn_cell, rnn_cell_dim)
            word_cell_bw = self._make_cell(rnn_cell, rnn_cell_dim)

            self.global_step = tf.Variable(0, dtype=tf.int64, trainable=False, name="global_step")
            self.sentence_lens = tf.placeholder(tf.int32, [None])
            self.word_ids = tf.placeholder(tf.int32, [None, None])
            self.charseq_ids = tf.placeholder(tf.int32, [None, None])
            self.charseqs = tf.placeholder(tf.int32, [None, None])
            self.charseq_lens = tf.placeholder(tf.int32, [None])
            self.sentiments = tf.placeholder(tf.int32, [None])
            self.is_training = tf.placeholder_with_default(False, [])
            # Defaults to 1.0 (no dropout) unless a value is fed, i.e. outside training.
            self.keep_prob = tf.placeholder_with_default(1.0, [])

            # Input and output dropout on the sentence-level cells only.
            word_cell_fw = tf.nn.rnn_cell.DropoutWrapper(word_cell_fw, self.keep_prob, self.keep_prob)
            word_cell_bw = tf.nn.rnn_cell.DropoutWrapper(word_cell_bw, self.keep_prob, self.keep_prob)

            # Character-level word representations
            if char_embedding == -1:
                input_chars = tf.one_hot(self.charseqs, num_chars)
            else:
                input_chars = tf.nn.embedding_lookup(tf.get_variable("char_emb", shape=[num_chars, char_embedding]),
                                                     self.charseqs)
            print("input_chars", input_chars.get_shape())

            char_cell_fw = self._make_cell(rnn_cell, rnn_cell_dim)
            char_cell_bw = self._make_cell(rnn_cell, rnn_cell_dim)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(char_cell_fw, char_cell_bw, input_chars,
                                                                      self.charseq_lens, dtype=tf.float32, scope="rnn_chars")
            # One vector per unique charseq: the sum of both directions' final outputs.
            input_chars = self._final_output(state_fw) + self._final_output(state_bw)
            print("input_chars", input_chars.get_shape())

            # Spread the per-charseq vectors over the word positions of the batch.
            input_char_words = tf.nn.embedding_lookup(input_chars, self.charseq_ids)
            print("input_char_words", input_char_words.get_shape())

            if word_embedding == -1:
                input_words = tf.one_hot(self.word_ids, num_words)
            else:
                input_words = tf.nn.embedding_lookup(tf.get_variable("word_emb", shape=[num_words, word_embedding]),
                                                     self.word_ids)

            # TODO: Add GLOVE

            print("input_words", input_words.get_shape())
            inputs = tf.concat([input_char_words, input_words], axis=2)
            print("inputs", inputs.get_shape())

            # Sentence-level encoder
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(word_cell_fw, word_cell_bw, inputs,
                                                                      self.sentence_lens, dtype=tf.float32)
            states = tf.concat([self._final_output(state_fw), self._final_output(state_bw)], axis=1)
            print("states", states.get_shape())

            hidden = tf.layers.dense(states, 300, activation=tf.nn.leaky_relu)
            # tf.layers.dropout expects the fraction to DROP and is an identity
            # unless `training` is true, hence 1 - keep_prob and the training flag.
            d1 = tf.layers.dropout(hidden, rate=1.0 - self.keep_prob, training=self.is_training)
            output_layer = tf.layers.dense(d1, self.CLASSES, activation=None)
            print("output_layer", output_layer.get_shape())

            self.loss = tf.losses.sparse_softmax_cross_entropy(self.sentiments, output_layer,
                                                               reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
            self.training = tf.train.AdamOptimizer(learning_rate).minimize(self.loss, self.global_step)
            self.predictions = tf.cast(tf.argmax(output_layer, 1), tf.int32)
            self.accuracy = tf.contrib.metrics.accuracy(self.sentiments, self.predictions)

            self.train_summary = tf.summary.merge([tf.summary.scalar("loss", self.loss, family="train"),
                                                   tf.summary.scalar("accuracy", self.accuracy, family="train")])
            self.dev_summary = tf.summary.merge([tf.summary.scalar("loss", self.loss, family="dev"),
                                                 tf.summary.scalar("accuracy", self.accuracy, family="dev")])

            # Initialize variables (replaces the deprecated tf.initialize_all_variables)
            self.session.run(tf.global_variables_initializer())

    @staticmethod
    def _make_cell(rnn_cell, rnn_cell_dim):
        """Return a new RNN cell of the requested kind ("LSTM" or "GRU")."""
        if rnn_cell == "LSTM":
            return tf.nn.rnn_cell.LSTMCell(rnn_cell_dim)
        if rnn_cell == "GRU":
            return tf.nn.rnn_cell.GRUCell(rnn_cell_dim)
        raise ValueError("Unknown rnn_cell {}".format(rnn_cell))

    @staticmethod
    def _final_output(state):
        """Return the output tensor of a final RNN state.

        LSTM cells return an LSTMStateTuple (c, h); its `h` part is used.
        Any other state (e.g. the GRU state tensor) is returned unchanged.
        """
        if isinstance(state, tf.nn.rnn_cell.LSTMStateTuple):
            return state.h
        return state

    @property
    def training_step(self):
        """Current value of the global step (evaluated in the session)."""
        return self.session.run(self.global_step)

    def train_epoch(self, data):
        """Train for one epoch on `data`, writing a train summary after every batch.

        Batch size and keep probability are read from the global FLAGS.
        """
        while not data.epoch_finished():
            sentence_lens, word_ids, charseq_ids, charseqs, charseq_lens, sentiments = \
                data.next_batch(FLAGS.batch_size)
            _, summary = \
                self.session.run([self.training, self.train_summary],
                                 {self.sentence_lens: sentence_lens, self.word_ids: word_ids,
                                  self.charseq_ids: charseq_ids, self.charseqs: charseqs, self.charseq_lens: charseq_lens,
                                  self.sentiments: sentiments, self.is_training: True, self.keep_prob: FLAGS.keep_prob})
            self.summary_writer.add_summary(summary, self.training_step)
            print(".", end='')
        print("")

    def evaluate_epoch(self, data):
        """Evaluate one epoch of `data` without training.

        Returns: (accuracy, loss), both averaged over examples, so a smaller
        last batch is weighted correctly.

        Raises:
        ValueError: if `data` yields no examples.
        """
        total_acc = 0.0
        total_loss = 0.0
        examples = 0
        # Evaluation does not advance the global step, so read it only once.
        step = self.training_step
        while not data.epoch_finished():
            sentence_lens, word_ids, charseq_ids, charseqs, charseq_lens, sentiments = \
                data.next_batch(FLAGS.batch_size)
            accuracy, loss, summary = \
                self.session.run([self.accuracy, self.loss, self.dev_summary],
                                 {self.sentence_lens: sentence_lens, self.word_ids: word_ids,
                                  self.charseq_ids: charseq_ids, self.charseqs: charseqs, self.charseq_lens: charseq_lens,
                                  self.sentiments: sentiments})
            self.summary_writer.add_summary(summary, step)
            batch_examples = len(sentence_lens)
            total_acc += accuracy * batch_examples
            total_loss += loss * batch_examples
            examples += batch_examples
        # TODO: Proper per epoch summaries for dev
        if examples == 0:
            raise ValueError("Cannot evaluate an empty dataset")
        return (total_acc / examples, total_loss / examples)

    def predict_epoch(self, data):
        """Return the predicted class ids for one epoch of `data`.

        NOTE(review): predictions are collected in the order the batches are
        drawn from `data.next_batch`, not in dataset order. With a dataset that
        draws batches from a random permutation they cannot be aligned with
        the input lines -- confirm before using them for a submission.
        """
        predictions = []
        while not data.epoch_finished():
            sentence_lens, word_ids, charseq_ids, charseqs, charseq_lens, sentiments = \
                data.next_batch(FLAGS.batch_size)
            batch_predictions = self.session.run(self.predictions,
                                {self.sentence_lens: sentence_lens, self.word_ids: word_ids,
                                 self.charseq_ids: charseq_ids, self.charseqs: charseqs, self.charseq_lens: charseq_lens})
            predictions.extend(batch_predictions)
        return predictions

In [0]:
import os

# Module-level handle to the TensorFlow flag values read by the cells below.
global FLAGS
FLAGS = tf.app.flags.FLAGS


def define_flags():
    """Register all command-line flags of the experiment on tf.app.flags.

    The private global parser of tf.app.flags is first replaced by a fresh
    argparse parser -- presumably so that re-running this cell does not fail
    on already defined flags (TODO confirm for the TensorFlow version in use).
    """
    import argparse as _argparse

    flags = tf.app.flags
    flags._global_parser = _argparse.ArgumentParser()

    # The checkpoint location depends on the operating system.
    checkpoint_dir = ('e:/temp/tensorflow/checkpoints/' if os.name == 'nt'
                      else '/tmp/tensorflow/checkpoints/')

    # (definer, flag name, default value, help text), in registration order.
    flag_specs = [
        # Directories
        (flags.DEFINE_string, 'checkpoint_dir', checkpoint_dir,
         'Directory to save checkpoints in (once per epoch)'),
        # TF parameters
        (flags.DEFINE_boolean, "no_gpu", False,
         'Disables GPU usage even if a GPU is available'),
        # Optimization parameters
        (flags.DEFINE_integer, 'epochs', 15, 'Training epoch count'),
        (flags.DEFINE_integer, 'batch_size', 1000, 'Training batch size'),
        # Jupyter notebook params
        # Only to avoid raising UnrecognizedFlagError
        (flags.DEFINE_string, 'f', 'kernel', 'Kernel'),
        # Model parameters
        (flags.DEFINE_string, 'logdir', 'logs', 'Logdir name.'),
        (flags.DEFINE_string, 'rnn_cell', "GRU", 'RNN cell type.'),
        (flags.DEFINE_integer, 'rnn_cell_dim', 100, 'RNN cell dimension.'),
        (flags.DEFINE_integer, 'threads', 8, 'Maximum number of threads to use.'),
        (flags.DEFINE_integer, 'word_embedding', 100, 'word_embedding'),
        (flags.DEFINE_integer, 'char_embedding', 100, 'char_embedding'),
        (flags.DEFINE_float, 'keep_prob', 0.5, 'dropout probability'),
        (flags.DEFINE_float, 'learning_rate', 1e-4, 'learning rate'),
    ]
    for define, flag_name, default, description in flag_specs:
        define(flag_name, default, description)

In [0]:
# Fix random seed: re-seed NumPy's global RNG with the SEED defined in the split cell.
np.random.seed(SEED)

# Parse arguments: register the experiment's flags so that FLAGS.<name> can be read below.
define_flags()

In [0]:
import sys

# Load the data. The training set builds the vocabularies; the dev and test
# sets reuse them through train=data_train, so their unseen words and
# characters are mapped to '<unk>'.
data_train = CILDataset(X_train, y_train)
data_dev = CILDataset(X_dev, y_dev, train=data_train)
data_test = CILDataset(X_test, y_test, train=data_train)

In [0]:
# Construct the network
print("Constructing the network.", file=sys.stderr)
# The experiment name encodes the main hyperparameters; Network uses it in the summary directory name.
expname = "{}{}-bs{}-epochs{}-char{}-word{}".format(FLAGS.rnn_cell, FLAGS.rnn_cell_dim, FLAGS.batch_size, FLAGS.epochs, FLAGS.char_embedding, FLAGS.word_embedding)
# Vocabulary sizes come from the training set, whose vocabularies the dev and test sets share.
network = Network(rnn_cell=FLAGS.rnn_cell, rnn_cell_dim=FLAGS.rnn_cell_dim,
                  num_words=len(data_train.vocabulary('words')), num_chars=len(data_train.vocabulary('chars')),
                  logdir=FLAGS.logdir, expname=expname, threads=FLAGS.threads,
                  word_embedding=FLAGS.word_embedding, char_embedding=FLAGS.char_embedding,
                  keep_prob=FLAGS.keep_prob, learning_rate=FLAGS.learning_rate)

# Train
# These two are only referenced by the commented-out "best predictions" code below.
best_dev_accuracy = 0
test_predictions = None

# One pass over the training set, then one evaluation pass over the dev set, per epoch.
for epoch in range(FLAGS.epochs):
    print("Training epoch {}".format(epoch + 1), file=sys.stderr)
    network.train_epoch(data_train)
    dev_accuracy, dev_loss = network.evaluate_epoch(data_dev)
    # Accuracy is reported in percent.
    print("Development accuracy after epoch {} is {:.2f}. Dev loss is {:.2f}".format(epoch + 1, 100. * dev_accuracy, dev_loss), file=sys.stderr)

# TODO: Enable predictions best saving
#     if dev_accuracy > best_dev_accuracy:
#         best_dev_accuracy = dev_accuracy
#         sentence_lens, word_ids, charseq_ids, charseqs, charseq_lens, sentiments = \
#             data_test.whole_data_as_batch()
#         test_predictions = network.predict_epoch(sentence_lens, word_ids, charseq_ids, charseqs, charseq_lens)

# Print test predictions
# for prediction in test_predictions:
#     print(data_test.vocabulary('sentiments')[prediction])