# CS 182: RNN

We decided to use two data sets.

# GitHub Data

Our primary data set comes from Peter Downs's clickbait-classifier repository on GitHub.

Link: https://github.com/peterldowns/clickbait-classifier

In [6]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys
import subprocess
import os

import numpy as np
import pandas
from sklearn import metrics
import tensorflow as tf

from tensorflow.contrib import learn

# Presumably left over from a command-line script (argparse is imported
# above); it is never assigned or read in the cells visible here.
FLAGS = None

# Length handed to VocabularyProcessor when titles are converted to word ids.
MAX_DOCUMENT_LENGTH = 10
# Embedding dimension for both models; rnn_model also uses it as the GRU size.
EMBEDDING_SIZE = 50
# Vocabulary size read by the model functions when they build their graph.
# Placeholder only: the vocabulary-processing cell rebinds it to the real size.
n_words = 0


def bag_of_words_model(features, target):
  """A bag-of-words model. Note it disregards the word order in the text.

  Reads the module-level `n_words` and `EMBEDDING_SIZE` when the graph is
  built, so `n_words` must already hold the real vocabulary size.

  Args:
    features: word-id tensor passed to `bow_encoder` (presumably
      [batch_size, MAX_DOCUMENT_LENGTH] integer ids -- TODO confirm).
    target: integer class labels; one-hot encoded below.

  Returns:
    A `(predictions, loss, train_op)` tuple. `predictions` is a dict holding
    the arg-max class under 'class' and the softmax output under 'prob'.
  """
  # The labels are binary and rnn_model uses 2 output classes. This model
  # previously hard-coded 15, which left 13 logits that could never be a
  # correct answer and made 'prob' 15 columns wide.
  n_classes = 2
  target = tf.one_hot(target, n_classes, 1, 0)
  features = tf.contrib.layers.bow_encoder(
      features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE)
  # Linear layer (no activation) producing one logit per class.
  logits = tf.contrib.layers.fully_connected(
      features, n_classes, activation_fn=None)
  loss = tf.contrib.losses.softmax_cross_entropy(logits, target)
  train_op = tf.contrib.layers.optimize_loss(
      loss, tf.contrib.framework.get_global_step(),
      optimizer='Adam', learning_rate=0.01)
  return (
      {'class': tf.argmax(logits, 1), 'prob': tf.nn.softmax(logits)},
      loss, train_op)


def rnn_model(features, target):
  """GRU-based classifier mapping a sequence of word ids to one of 2 classes.

  Reads the module-level `n_words` and `EMBEDDING_SIZE` when the graph is
  built.

  Args:
    features: word-id tensor, one row per document.
    target: integer class labels; one-hot encoded with depth 2 below.

  Returns:
    A `(predictions, loss, train_op)` tuple. `predictions` is a dict holding
    the arg-max class under 'class' and the softmax output under 'prob'.
  """
  # Look up an embedding for every word id: the [n_words, EMBEDDING_SIZE]
  # embedding matrix turns the ids into a
  # [batch_size, sequence_length, EMBEDDING_SIZE] tensor.
  embedded = tf.contrib.layers.embed_sequence(
      features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE, scope='words')

  # One [batch_size, EMBEDDING_SIZE] tensor per word position.
  per_step_inputs = tf.unstack(embedded, axis=1)

  # GRU cell whose hidden size equals EMBEDDING_SIZE, unrolled over every
  # word position; only the final state is kept.
  gru = tf.nn.rnn_cell.GRUCell(EMBEDDING_SIZE)
  _, final_state = tf.nn.rnn(gru, per_step_inputs, dtype=tf.float32)

  # Logistic regression over the two output classes, fed by the final state.
  onehot_labels = tf.one_hot(target, 2, 1, 0)
  logits = tf.contrib.layers.fully_connected(
      final_state, 2, activation_fn=None)
  loss = tf.contrib.losses.softmax_cross_entropy(logits, onehot_labels)

  # Adam training step on the cross-entropy loss.
  train_op = tf.contrib.layers.optimize_loss(
      loss, tf.contrib.framework.get_global_step(),
      optimizer='Adam', learning_rate=0.01)

  predictions = {'class': tf.argmax(logits, 1), 'prob': tf.nn.softmax(logits)}
  return (predictions, loss, train_op)

In [7]:
def removeNonAscii(s):
    """Return `s` with every character whose ordinal is 128 or above dropped."""
    kept = []
    for ch in s:
        if ord(ch) < 128:
            kept.append(ch)
    return "".join(kept)
def removeCommas(s):
    """Return `s` with every ',' character removed."""
    without_commas = filter(lambda ch: ch != ',', s)
    return "".join(without_commas)

def modifyCSV(filename):
    """Write a cleaned, header-less copy of a CSV alongside the original.

    Reads `filename`, drops the 'article_url' and 'source' columns, strips
    non-ASCII characters and commas from 'article_title', and writes the
    remaining columns without index or header. The output file sits in the
    same directory as `filename`, named 'rnn_' plus its base name
    (e.g. 'train.csv' -> 'rnn_train.csv').

    Args:
        filename: path of the CSV to read. It must contain the columns
            'article_url', 'source' and 'article_title'.

    Returns:
        None. The result is the file written to disk.
    """
    augmented_df = pandas.read_csv(filename)
    del augmented_df['article_url']
    del augmented_df['source']

    cleaner = lambda x: removeCommas(removeNonAscii(x))
    augmented_df['article_title'] = augmented_df['article_title'].map(cleaner)
    mycsv = augmented_df.to_csv(index=False, header=False)

    # Prefix only the base name so that a path such as 'data/train.csv' maps
    # to 'data/rnn_train.csv' rather than the non-existent 'rnn_data/train.csv'.
    directory, base_name = os.path.split(filename)
    out_path = os.path.join(directory, 'rnn_' + base_name)

    # The context manager closes the file even if the write fails.
    with open(out_path, 'w') as newarticles:
        newarticles.write(mycsv)
    
# Clean both splits; each call writes an 'rnn_'-prefixed copy of its input.
for csv_name in ('train.csv', 'test.csv'):
    modifyCSV(csv_name)

In [8]:
from tensorflow.contrib.learn.python.learn.datasets import base

# Load the cleaned CSVs written by modifyCSV. The builtin `int` / `str` are
# used for the dtypes: `np.int` and `np.str` were only aliases of these
# builtins and no longer exist in recent NumPy releases.
train = base.load_csv_without_header(
    filename="rnn_train.csv",
    target_dtype=int,
    features_dtype=str)
test = base.load_csv_without_header(
    filename="rnn_test.csv",
    target_dtype=int,
    features_dtype=str)

dataset = base.Datasets(train=train, validation=None, test=test)

# Keep feature column 0 (presumably the article title -- confirm against the
# CSV layout) as the model input, and the loader's target as the label.
x_train = pandas.DataFrame(dataset.train.data)[0]
y_train = pandas.Series(dataset.train.target)
x_test = pandas.DataFrame(dataset.test.data)[0]
y_test = pandas.Series(dataset.test.target)

In [9]:
# Process vocabulary
# The vocabulary is fitted on the training titles only; the test titles are
# transformed with that same fitted processor.
# NOTE: x_train / x_test are rebound here from raw titles to word-id arrays,
# so re-run the loading cell before running this cell a second time.
vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
train_word_ids = list(vocab_processor.fit_transform(x_train))
test_word_ids = list(vocab_processor.transform(x_test))
x_train = np.array(train_word_ids)
x_test = np.array(test_word_ids)

# The model functions read this global when they build their graph.
n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % n_words)

Total words: 16084


In [10]:
### RNN Classifier ###

# Train the GRU model for 100 steps, then score it on the held-out test set.
model_fn = rnn_model
classifier = learn.Estimator(model_fn=model_fn)
classifier.fit(x_train, y_train, steps=100)

# Collect the predicted class of every test example.
y_predicted = []
for p in classifier.predict(x_test, as_iterable=True):
    y_predicted.append(p['class'])

score = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy: {0:f}'.format(score))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'save_summary_steps': 100, '_num_ps_replicas': 0, '_task_type': None, '_environment': 'local', '_is_chief': True, 'save_checkpoints_secs': 600, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x123868050>, 'tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_task_id': 0, 'tf_random_seed': None, 'keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', 'save_checkpoints_steps': None, '_master': '', 'keep_checkpoint_max': 5}
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_si