## W266 Final Project: Project Milestone

### Classifying the Political Ideology of News Articles

#### Matt Acconciamessa and Megan Pera



In [1]:
%matplotlib inline

# Import necessary libraries
import pickle
import numpy as np
from scipy import sparse
import collections
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import *
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn import metrics



### Loading, Cleaning and Exploring Data

In [2]:
# Load the dataset and unpack it into liberal, conservative and neutral objects.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# load 'ibcData.pkl' when it comes from a trusted source.
with open('ibcData.pkl', 'rb') as ibc_file:  # context manager closes the file handle
    [lib, con, neutral] = pickle.load(ibc_file)

In [3]:
# Data samples, by classification: for each group print how many sentences it
# holds, followed by the words of its first five entries.
sample_groups = (
    ('Liberal examples (out of ', lib),
    ('\nConservative examples (out of ', con),
    ('\nNeutral examples (out of ', neutral),
)
for header, trees in sample_groups:
    print(header, len(trees), ' sentences): ')
    for tree in trees[:5]:
        print(tree.get_words())

Liberal examples (out of  2025  sentences): 
Forcing middle-class workers to bear a greater share of the cost of government weakens their support for needed investments and stirs resentment toward those who depend on public services the most .
Because it would not be worthwhile to bring a case for $ 30.22 , the arbitration clause would , as a practical matter , deny the Concepcions any relief and , more important , eliminate a class action that might punish AT&T for its pattern of fraudulent behavior .
Indeed , Lind argues that high profits and high wages reinforce each other because workers then have the wherewithal to buy the products they are making .
In fairness , it should be noted that he devotes an entire chapter to New York Times political columnist Maureen Dowd , a liberal who makes much of the outsized rivalries , jealousies , and personalities that dominate American politics .
Psychological tactics are social control techniques that operate at the level of the mind , with th

In [4]:
# Formatting data into workable arrays: wrap each loaded group in a NumPy array.
# separate_data_and_labels below only relies on len() and integer indexing of
# these arrays, then iterates over each indexed element.
# NOTE(review): the resulting dtype/shape depends on what the pickled objects
# look like to NumPy -- presumably 1-D object arrays; confirm.
liberal = np.array(lib)
conserv = np.array(con)
neut = np.array(neutral)

# Separating data and labels
def separate_data_and_labels(label_class):
    """Collect the text and label of every labelled node in `label_class`.

    Args:
      label_class: indexable collection (supports len() and integer indexing)
        whose elements are iterables of nodes.

    Returns:
      (data, labels): two NumPy arrays of equal length. For each node that has
      a `label` attribute, `data` holds `node.get_words()` and `labels` holds
      `node.label`; nodes without a `label` attribute are skipped.
    """
    # Gather (words, label) pairs in traversal order, then split into columns.
    pairs = [
        (node.get_words(), node.label)
        for idx in range(len(label_class))
        for node in label_class[idx]
        if hasattr(node, 'label')
    ]
    data = np.array([words for words, _ in pairs])
    labels = np.array([label for _, label in pairs])
    return data, labels

# Extract (text, label) arrays for each group. Labels come from the individual
# nodes rather than from the group, so one group can contain several different
# label values (the printed conservative labels below show this).
lib_data, lib_labs = separate_data_and_labels(liberal)
con_data, con_labs = separate_data_and_labels(conserv)
neut_data, neut_labs = separate_data_and_labels(neut)

In [5]:
# Show the first extracted sentence and the first ten labels of every group.
print('Examples:')
for heading, sentences, sentence_labels in (
        ('\n Liberal', lib_data, lib_labs),
        ('\n Conservative', con_data, con_labs),
        ('\n Neutral', neut_data, neut_labs)):
    print(heading)
    print(sentences[0], '\n', sentence_labels[:10])

Examples:

 Liberal
Forcing middle-class workers to bear a greater share of the cost of government weakens their support for needed investments and stirs resentment toward those who depend on public services the most . 
 ['Liberal' 'Liberal' 'Liberal' 'Liberal' 'Liberal' 'Liberal' 'Liberal'
 'Liberal' 'Liberal' 'Liberal']

 Conservative
Gore is getting rich from environmentalism , not just by being paid a whopping $ 175,000 per speech but by using political pressure to force government policy in a direction that benefits his business interests . 
 ['Conservative' 'Conservative' 'Conservative' 'Conservative' 'Neutral'
 'Neutral' 'Neutral' 'Conservative' 'Liberal' 'Liberal']

 Neutral
In this country , the beneficiaries of Apple 's success are , first , the designers , who have done wonders working with Steve Jobs to produce products that are beautiful and effective . 
 ['Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral' 'Neutral'
 'Neutral' 'Neutral' 'Neutral']


In [6]:
# Combining into one dataset: stack the groups along the first axis in the
# order neutral, liberal, conservative, keeping data and labels aligned.
data_all = np.concatenate([neut_data, lib_data, con_data])
labs_all = np.concatenate([neut_labs, lib_labs, con_labs])

# Both shapes should report the same number of rows.
for combined in (data_all, labs_all):
    print(combined.shape)

(22621,)
(22621,)


In [7]:
# Randomly mixing data&labels so that they can be split into test and train
def shuffle_in_unison(a, b):
    """Return shuffled copies of `a` and `b` that share one random permutation.

    Args:
      a, b: NumPy arrays with the same length along their first axis.

    Returns:
      (shuffled_a, shuffled_b): new arrays with the shapes and dtypes of the
      inputs; element `i` of each input is placed at index `permutation[i]`, so
      pairs that were aligned in the inputs stay aligned in the outputs.

    Raises:
      AssertionError: if the two inputs differ in length.
    """
    assert len(a) == len(b)
    permutation = np.random.permutation(len(a))
    shuffled_a = np.empty(a.shape, dtype=a.dtype)
    shuffled_b = np.empty(b.shape, dtype=b.dtype)
    # Scatter: old index i goes to new index permutation[i], for both arrays.
    shuffled_a[permutation] = a
    shuffled_b[permutation] = b
    return shuffled_a, shuffled_b

# NOTE(review): no NumPy seed is set in the visible cells before this call, so
# the order (and therefore the train/test split below) changes on every run.
# The call also rebinds data_all/labs_all, so re-running this cell reshuffles
# already-shuffled data.
data_all, labs_all = shuffle_in_unison(data_all, labs_all)

In [None]:
# Split data into test (20%) and train (80%).
# The boundary index is named `split_idx` rather than `slice` so that the
# built-in `slice` type is not shadowed for the rest of the notebook.
split_idx = int(.8 * labs_all.shape[0])
data_train = data_all[:split_idx]
labs_train = labs_all[:split_idx]
data_test = data_all[split_idx:]
labs_test = labs_all[split_idx:]

# Sanity check: the test and train sizes should add up to the total.
print(labs_all.shape)
print(labs_test.shape)
print(labs_train.shape)

In [None]:
# Turning dataset into word tokens: bag-of-words matrix over the training
# sentences, densified so it can be clipped in place.
count_vect = CountVectorizer()
data = count_vect.fit_transform(data_train).toarray()
vocab = count_vect.get_feature_names()

# Clip every entry to 0/1 so a word is counted at most once per sentence; the
# column sums are then the number of training sentences containing each word.
np.clip(data, 0, 1, out=data)
dist = data.sum(axis=0)
counts = [(word, freq) for word, freq in zip(vocab, dist)]  # (word, count) pairs

# Total vocab size and word count (the total is the sum of the clipped counts)
print("Total word count:", dist.sum())
print("Vocabulary size:", len(vocab))

In [None]:
# Order the (word, count) pairs by descending count and show the top 20.
counts.sort(key=lambda pair: pair[1], reverse=True)
counts[:20]

In [None]:
# Plotting top 50 results: transpose the (word, count) pairs into two tuples.
ordered = list(zip(*counts))
x, y = ordered[0][:50], ordered[1][:50]  # x: words (tick labels), y: counts (bar heights)

# Plotting figure: one bar per word, tick labels shifted by half a bar width.
fig = plt.figure(figsize=(15.0, 6.0))
width = .5
indexes = np.arange(50)
plt.bar(indexes, y, width)
plt.xticks(indexes + width * 0.5, x, rotation=70)
plt.show()

### Baseline model: Multinomial Naive Bayes


In [None]:
# This model predicts the political leanings of sentences and sub-sentences

# Vectorize: the vocabulary is fitted on the training split only and then
# re-used to transform the test split.
vect = CountVectorizer()
train_vocab = vect.fit_transform(data_train)
test_vocab = vect.transform(data_test)

# Sweep the smoothing parameter and report the weighted F1 on the test split.
print("")
print("Multinomial Naive Bayes:")
for a in [0.0001, 0.01, .05, 0.1, 0.2, 1.0]:
    mnb = MultinomialNB(alpha=a).fit(train_vocab, labs_train)
    mnbpreds = mnb.predict(test_vocab)
    weighted_f1 = metrics.f1_score(labs_test, mnbpreds, average='weighted')
    print("alpha:", a, "F1:", weighted_f1)

In [None]:
# Showing examples for alpha = 0.001: refit, then pair every test sentence with
# its predicted label and the per-class probabilities.
mnb = MultinomialNB(alpha=0.001)
mnb.fit(train_vocab, labs_train)
mnbpreds = mnb.predict(test_vocab)
mnbpred_prob = mnb.predict_proba(test_vocab)
probs = list(zip(data_test.tolist(), mnbpreds.tolist(), mnbpred_prob.tolist()))

# Print the first five test sentences with their true and predicted labels.
for i in range(5):
    sentence, predicted, class_probs = probs[i]
    print('Sentence:', sentence)
    print('Actual Label:', labs_test[i])
    print('Predicted Label:', predicted)
    print('Predicted Label Probability:', max(class_probs), '\n')

In [None]:
# Finding and printing out mistakes: indices of test sentences whose predicted
# label differs from the true label.
errors = [i for i in range(len(probs)) if not (labs_test[i] == probs[i][1])]

print('MNB missclassified',len(errors),'sentences','\n')

# Show the first five misclassified sentences.
for i in errors[:5]:
    sentence, predicted, class_probs = probs[i]
    print('Sentence:', sentence)
    print('Actual Label:', labs_test[i])
    print('Predicted Label:', predicted)
    print('Predicted Label Probability:', max(class_probs), '\n')

### Scoring News Articles with Baseline Naive Bayes


### LSTM Model


### Office hour questions

We're first training a language model on the Brown corpus, which has its own vocabulary. Then we want to train an LSTM to classify text into liberal, conservative, and neutral buckets, initializing its parameters from the Brown-trained language model.
1. The Brown vocabulary is likely slightly different than the vocabulary of our ideological training data, which is different from news articles that we are going to score later on. Is this a problem? If so, how do we deal with this to build a unified vocabulary?
2. What variables do we need to carry over from the Brown language model? Just final h? What about word embeddings (W in)?
3. How do we replace the softmax layer over the whole vocabulary with one over just the three classes?


In [41]:
import json, os, re, shutil, sys, time
import collections, itertools
import unittest
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("0.12"))

# utils.pretty_print_matrix uses Pandas. Configure float format here.
import pandas as pd
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# Helper libraries
from shared_lib import vocabulary, utils, utils_ideo

# LSTM code
import rnnlm
import rnnlm_ideo

### Preprocessing data FINAL MODEL

In [9]:
# Loading Brown data to train language model to initialize LSTM parameters.
# NOTE: this rebinds `vocab`, which the exploration cell above had set to the
# CountVectorizer feature names.
# NOTE(review): shuffle=42 is presumably used as the shuffle seed -- confirm
# against shared_lib.utils.load_corpus.
vocab, train_ids, test_ids = utils.load_corpus("brown", split=0.8, V=10000, shuffle=42)

Loaded 57340 sentences (1.16119e+06 tokens)
Training set: 45872 sentences (924077 tokens)
Test set: 11468 sentences (237115 tokens)


In [10]:
# Smaller Brown train/test subsets to make sure model works.
# NOTE(review): if load_corpus returns flattened token-id arrays, these slices
# keep the first 1000 tokens rather than the first 1000 sentences -- confirm.
mini_train_ids = train_ids[0:1000]
mini_test_ids = test_ids[0:1000]

In [11]:
# Loading ideological data: build a vocabulary and train/test ids and labels
# from the shuffled data_all/labs_all arrays created above.
# NOTE(review): the "tokens" figures printed by this call work out to well over
# 100 per sentence, which looks like a character count -- verify in utils_ideo.
vocab_ideo, train_ids_ideo, test_ids_ideo, train_labs_ideo, test_labs_ideo = utils_ideo.process_data(
    data_all, labs_all, split=0.8, V=10000)

Loaded 22621 sentences (2.83457e+06 tokens)
Training set: 18096 sentences (2261010 tokens)
Test set: 4525 sentences (573561 tokens)


In [12]:
# Smaller ideo subsets to make sure model works: first 100 entries of each array.
# NOTE(review): if the id arrays are flattened token ids while the label arrays
# hold one label per sentence, these slices no longer line up (100 tokens vs.
# 100 sentence labels) -- confirm against utils_ideo.process_data.
mini_train_ids_ideo = train_ids_ideo[0:100]
mini_test_ids_ideo = test_ids_ideo[0:100]
mini_train_labs_ideo = train_labs_ideo[0:100]
mini_test_labs_ideo = test_labs_ideo[0:100]

### Running the LSTM

In [29]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=0.1):
    """Run one pass over `batch_iterator` and return the mean batch loss.

    Args:
      lm: model object exposing the tensors/ops read below (train_step_,
        train_loss_, loss_, initial_h_, final_h_, input_w_, target_y_,
        learning_rate_).
      session: session used to evaluate them (must provide `run`).
      batch_iterator: iterable of (w, y) array pairs.
      train (bool): if True, run lm.train_step_ and report lm.train_loss_;
        otherwise run a no-op and report lm.loss_.
      verbose (bool): if True, print progress at most every `tick_s` seconds.
      tick_s (number): minimum number of seconds between progress lines.
      learning_rate (float): value fed to lm.learning_rate_ on every batch.

    Returns:
      (final_cost, h): the loss averaged over batches, and the value fetched
      from lm.final_h_ after the last batch.

    Raises:
      ZeroDivisionError: if `batch_iterator` yields no batches.
    """
    epoch_start = time.time()
    last_tick = epoch_start  # time of the last progress line
    cost_sum = 0.0           # sum of the per-batch losses
    n_batches = 0
    n_words = 0

    if train:
        step_op, loss_op = lm.train_step_, lm.train_loss_
    else:
        # Evaluation: nothing to update, and use the true loss in case
        # train_loss_ is an approximation.
        step_op, loss_op = tf.no_op(), lm.loss_

    for batch_idx, (w, y) in enumerate(batch_iterator):
        # The first batch of the epoch starts from a clean initial state;
        # later batches carry over the state returned by the previous one.
        if batch_idx == 0:
            h = session.run(lm.initial_h_, {lm.input_w_: w})

        feed_dict = {
            lm.input_w_: w,
            lm.learning_rate_: learning_rate,
            lm.initial_h_: h,
            lm.target_y_: y.reshape([-1, 1]),  # targets as one long column
        }
        batch_cost, _, h = session.run([loss_op, step_op, lm.final_h_],
                                       feed_dict=feed_dict)

        cost_sum += batch_cost
        n_batches = batch_idx + 1
        n_words += w.size  # w.size = batch_size * max_time

        # Print average loss-so-far for the epoch.
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - last_tick >= tick_s):
            mean_cost = cost_sum / n_batches
            words_per_s = n_words / (time.time() - epoch_start)
            print ("[batch %d]: seen %d words at %d wps, loss = %.3f" % (
                batch_idx, n_words, words_per_s, mean_cost))
            last_tick = time.time()  # reset time ticker

    final_cost = cost_sum / n_batches
    return final_cost, h

In [30]:
def run_epoch_classification(lm, session, batch_iterator, final_h,
              train=False, verbose=False,tick_s=10, 
              learning_rate=0.1):
    """Run one pass over `batch_iterator`, starting from a supplied state.

    Same loop as run_epoch, except that the first batch starts from `final_h`
    instead of the model's clean initial state, and only the cost is returned.

    Args:
      lm: model object exposing train_step_, train_loss_, loss_, initial_h_,
        final_h_, input_w_, target_y_ and learning_rate_.
      session: session used to evaluate them (must provide `run`).
      batch_iterator: iterable of (w, y) array pairs.
      final_h: state fed to lm.initial_h_ for the first batch.
      train (bool): if True, run lm.train_step_ and report lm.train_loss_;
        otherwise run a no-op and report lm.loss_.
      verbose (bool): if True, print progress at most every `tick_s` seconds.
      tick_s (number): minimum number of seconds between progress lines.
      learning_rate (float): value fed to lm.learning_rate_ on every batch.

    Returns:
      final_cost: the loss averaged over batches.

    Raises:
      ZeroDivisionError: if `batch_iterator` yields no batches.
    """
    epoch_start = time.time()
    last_tick = epoch_start  # time of the last progress line
    cost_sum = 0.0           # sum of the per-batch losses
    n_batches = 0
    n_words = 0

    if train:
        step_op, loss_op = lm.train_step_, lm.train_loss_
    else:
        # Evaluation: nothing to update, and use the true loss in case
        # train_loss_ is an approximation.
        step_op, loss_op = tf.no_op(), lm.loss_

    for batch_idx, (w, y) in enumerate(batch_iterator):
        # First batch: start from the state passed in by the caller; later
        # batches carry over the state returned by the previous one.
        if batch_idx == 0:
            h = final_h

        feed_dict = {
            lm.input_w_: w,
            lm.learning_rate_: learning_rate,
            lm.initial_h_: h,
            lm.target_y_: y.reshape([-1, 1]),  # targets as one long column
        }
        batch_cost, _, h = session.run([loss_op, step_op, lm.final_h_],
                                       feed_dict=feed_dict)

        cost_sum += batch_cost
        n_batches = batch_idx + 1
        n_words += w.size  # w.size = batch_size * max_time

        # Print average loss-so-far for the epoch.
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - last_tick >= tick_s):
            mean_cost = cost_sum / n_batches
            words_per_s = n_words / (time.time() - epoch_start)
            print ("[batch %d]: seen %d words at %d wps, loss = %.3f" % (batch_idx, n_words, words_per_s, mean_cost))
            last_tick = time.time()  # reset time ticker

    final_cost = cost_sum / n_batches
    return final_cost

In [31]:
def score_dataset(lm, session, ids, name="Data"):
    """Evaluate `lm` on `ids` and print its average loss and perplexity.

    Args:
      lm: language model passed through to run_epoch.
      session: session passed through to run_epoch.
      ids: token ids handed to utils.batch_generator.
      name (str): label used as the prefix of the printed line.

    Returns:
      None; the result is only printed.
    """
    # Scoring runs with train=False, so larger batches are used to speed it up.
    batches = utils.batch_generator(ids, batch_size=100, max_time=100)
    cost, _ = run_epoch(lm, session, batches, train=False, verbose=False,
                        tick_s=3600, learning_rate=1.0)
    perplexity = np.exp(cost)
    print ("%s: avg. loss: %.03f  (perplexity: %.02f)" % (name, cost, perplexity))

# specify classification labels
def score_dataset_ideo(cm, session, ids, labels, name="Data", final_h=None):
    """Evaluate the classifier `cm` on (ids, labels) and print its average loss.

    Args:
      cm: model passed through to run_epoch_classification.
      session: session passed through to run_epoch_classification.
      ids: token ids handed to utils.batch_generator_ideology.
      labels: labels handed to utils.batch_generator_ideology.
      name (str): label used as the prefix of the printed line.
      final_h: state to start the first batch from. When omitted, the
        notebook-level variable `final_state` (set by the language-model
        training cell) is used, which is what this function always did before
        the parameter existed.

    Returns:
      None; the result is only printed.

    Raises:
      NameError: if `final_h` is omitted and no global `final_state` exists.
    """
    if final_h is None:
        # Backward-compatible fallback to the hidden notebook global.
        final_h = final_state
    # For scoring, we can use larger batches to speed things up.
    bi = utils.batch_generator_ideology(ids, labels, batch_size=50, max_time=100)
    cost = run_epoch_classification(cm, session, bi, final_h, learning_rate=1.0,
                                    train=False, verbose=False, tick_s=3600)
    print ("%s: avg. loss: %.03f  (perplexity: %.02f)" % (name, cost, np.exp(cost)))

In [32]:
## Set up language model and parameters

# Directory that receives the graph written by summary_writer below.
TF_GRAPHDIR = "tf_graph"

# Clear old log directory.
shutil.rmtree(TF_GRAPHDIR, ignore_errors=True)

# NOTE(review): this `lm` is only used for the graph summary written below; the
# training cell that follows rebuilds `lm` from model_params.
lm = rnnlm.RNNLM(V=10000, H=200, num_layers=2)
lm.BuildCoreGraph()
lm.BuildTrainGraph()
lm.BuildSamplerGraph()

summary_writer = tf.summary.FileWriter(TF_GRAPHDIR, lm.graph)

# Training parameters (read as globals by the training cells below)
max_time = 20
batch_size = 50
learning_rate = 0.5
num_epochs = 5

# Model parameters; V follows the size of the Brown vocabulary loaded earlier.
model_params = dict(V=vocab.size, H=200, softmax_ns=200, num_layers=2)

# Checkpoint locations: per-epoch checkpoints and the final trained model.
TF_SAVEDIR = "tf_saved"
checkpoint_filename = os.path.join(TF_SAVEDIR, "language_model")
trained_filename = os.path.join(TF_SAVEDIR, "language_model_trained")

In [39]:
####
# GENERAL LANGUAGE MODEL for initializing parameters
####

# Will print status every this many seconds
# NOTE(review): print_interval is not read anywhere in this cell.
print_interval = 5

# Clear old log directory
shutil.rmtree("tf_summaries", ignore_errors=True)

lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()

# Start from an empty checkpoint directory
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    # NOTE(review): the seed is set after the graph and initializer were
    # created; confirm it still affects variable initialization.
    tf.set_random_seed(42)

    session.run(initializer)

    for epoch in range(1, num_epochs + 1):
        t0_epoch = time.time()
        bi = utils.batch_generator(mini_train_ids, batch_size, max_time)
        print ("[epoch %d] Starting epoch %d" % (epoch, epoch))

        # Run a training epoch. final_state is kept as a notebook global: the
        # classification cells and score_dataset_ideo read it later.
        cost, final_state = run_epoch(lm, session, bi, learning_rate=learning_rate,
                                      train=True, verbose=False, tick_s=3600)

        print ("[epoch %d] Completed in %s" % (epoch, utils.pretty_timedelta(since=t0_epoch)))

        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)

    # score_dataset prints its own summary line and returns None, so emit the
    # epoch tag separately instead of printing the call's return value (which
    # produced a stray "[epoch N] None" line).
    print ("[epoch %d]" % epoch, end=" ")
    score_dataset(lm, session, mini_train_ids, name="Train set")
    print ("[epoch %d]" % epoch, end=" ")
    score_dataset(lm, session, mini_test_ids, name="Test set")
    print ("")

    # Save final model
    saver.save(session, trained_filename)

[epoch 1] Starting epoch 1
[epoch 1] Completed in 0:00:00
[epoch 2] Starting epoch 2
[epoch 2] Completed in 0:00:00
[epoch 3] Starting epoch 3
[epoch 3] Completed in 0:00:00
[epoch 4] Starting epoch 4
[epoch 4] Completed in 0:00:00
[epoch 5] Starting epoch 5
[epoch 5] Completed in 0:00:00
Train set: avg. loss: 7.539  (perplexity: 1880.33)
[epoch 5] None
Test set: avg. loss: 7.798  (perplexity: 2436.20)
[epoch 5] None



In [40]:
####
# CLASSIFICATION MODEL for predicting liberal, neutral, conservative
####

# Relies on `lm`, `saver`, `final_state` and `trained_filename` left behind by
# the language-model training cell above.
with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    tf.set_random_seed(42)

    # Restore variables from language model
    saver.restore(session, trained_filename)

    for epoch in range(1, num_epochs + 1):
        t0_epoch = time.time()

        bi = utils.batch_generator_ideology(mini_train_ids_ideo, mini_train_labs_ideo, batch_size, max_time)
        print ("[epoch %d] Starting epoch %d" % (epoch, epoch))

        # Run a training epoch, starting from the language model's final state.
        cost = run_epoch_classification(lm, session, bi, final_state, learning_rate=learning_rate,
                                        train=True, verbose=False, tick_s=3600)

        print ("[epoch %d] Completed in %s" % (epoch, utils.pretty_timedelta(since=t0_epoch)))

        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)

    # score_dataset_ideo prints its own summary line and returns None, so emit
    # the epoch tag separately instead of printing the call's return value
    # (which produced a stray "[epoch N] None" line).
    print ("[epoch %d]" % epoch, end=" ")
    score_dataset_ideo(lm, session, mini_train_ids_ideo, mini_train_labs_ideo, name="Train set")
    print ("[epoch %d]" % epoch, end=" ")
    score_dataset_ideo(lm, session, mini_test_ids_ideo, mini_test_labs_ideo, name="Test set")
    print ("")

    # Save final model
    # NOTE: this overwrites the language-model file at trained_filename.
    saver.save(session, trained_filename)


[epoch 1] Starting epoch 1
[epoch 1] Completed in 0:00:00
[epoch 2] Starting epoch 2
[epoch 2] Completed in 0:00:00
[epoch 3] Starting epoch 3
[epoch 3] Completed in 0:00:00
[epoch 4] Starting epoch 4
[epoch 4] Completed in 0:00:00
[epoch 5] Starting epoch 5
[epoch 5] Completed in 0:00:00
Train set: avg. loss: 2.000  (perplexity: 7.39)
[epoch 5] None
Test set: avg. loss: 2.733  (perplexity: 15.38)
[epoch 5] None



## FINAL MODEL END

In [None]:
## Set up classification model and parameters

# rnnlm_ideo is already imported in the import cell above; repeated here so the
# cell can be run on its own.
import rnnlm_ideo

# Directory that receives the graph written by summary_writer below.
TF_GRAPHDIR = "tf_graph"

# Clear old log directory.
shutil.rmtree(TF_GRAPHDIR, ignore_errors=True)

# Training parameters (same values as for the language model; rebinding them
# here affects any cell run afterwards)
max_time = 20
batch_size = 50
learning_rate = 0.5
num_epochs = 5

# Model parameters
# NOTE(review): C=3 is presumably the number of output classes -- confirm in
# rnnlm_ideo.
model_params = dict(C = 3, V=10000, H=200, softmax_ns=200, num_layers=2)

cm = rnnlm_ideo.RNNLM(**model_params)
cm.BuildCoreGraph()
cm.BuildTrainGraph()
cm.BuildSamplerGraph()

summary_writer = tf.summary.FileWriter(TF_GRAPHDIR, cm.graph)
# Checkpoint locations; these rebind the names the language-model cells used.
TF_SAVEDIR = "tf_saved"
checkpoint_filename = os.path.join(TF_SAVEDIR, "classification_model")
trained_filename = os.path.join(TF_SAVEDIR, "classification_model_trained")

In [None]:
####
# CLASSIFICATION MODEL for predicting liberal, neutral, conservative
####

# Will print status every this many seconds
# NOTE(review): print_interval is not read anywhere in this cell.
print_interval = 5

# Explicitly add global initializer and variable saver to the classification graph
with cm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()

# Clear old log directory
# NOTE(review): TF_SAVEDIR is the same directory the language-model cell saves
# to, so this also removes the language-model checkpoints.
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)


with tf.Session(graph=cm.graph) as session:
    # Seed RNG for repeatability
    tf.set_random_seed(42)

    # Fresh initialization of every variable in cm's graph; nothing is restored
    # from the language-model checkpoint in this cell.
    session.run(initializer)

    for epoch in range(1, num_epochs + 1):
        t0_epoch = time.time()
        # Batches carry the ideology labels alongside the token ids.
        bi = utils.batch_generator_ideology(mini_train_ids_ideo, mini_train_labs_ideo, batch_size, max_time)
        print ("[epoch %d] Starting epoch %d" % (epoch, epoch))

        # Run a training epoch.
        # Fixes: `state` was never defined (the language-model cell stores the
        # state as `final_state`), and run_epoch_classification returns a single
        # cost value, so it cannot be unpacked into two names.
        cost = run_epoch_classification(cm, session, bi, final_state, learning_rate=learning_rate,
                                        train=True, verbose=False, tick_s=3600)

        print ("[epoch %d] Completed in %s" % (epoch, utils.pretty_timedelta(since=t0_epoch)))

        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)

    # score_dataset_ideo prints its own summary line and returns None, so emit
    # the epoch tag separately instead of printing the call's return value.
    print ("[epoch %d]" % epoch, end=" ")
    score_dataset_ideo(cm, session, mini_train_ids_ideo, mini_train_labs_ideo, name="Train set")
    print ("[epoch %d]" % epoch, end=" ")
    score_dataset_ideo(cm, session, mini_test_ids_ideo, mini_test_labs_ideo, name="Test set")
    print ("")

    # Save final model
    saver.save(session, trained_filename)


## Building pipeline for sentences

In [None]:
import re
import time
import itertools
import numpy as np

# For pretty-printing
import pandas as pd
from IPython.display import display, HTML
import jinja2

def flatten(list_of_lists):
    """Concatenate the items of every inner iterable into one new list.

    Only one level of nesting is removed; the inner iterables are consumed in
    order.
    """
    return [item for inner in list_of_lists for item in inner]

# HTML/JavaScript snippet appended by render_matrix when `highlight` is set.
# Template variables:
#   filter_cond     -- text spliced directly after `e.innerText` to form the
#                      JavaScript predicate that selects table cells (it is
#                      also shown in the button caption)
#   highlight_color -- CSS background applied to the selected cells
# The script toggles the selected cells of every `table.dataframe` between the
# highlight colour and white each time the button is pressed.
HIGHLIGHT_BUTTON_TMPL = jinja2.Template("""
<script>
colors_on = true;
function color_cells() {
  var ffunc = function(i,e) {return e.innerText {{ filter_cond }}; }
  var cells = $('table.dataframe').children('tbody')
                                  .children('tr')
                                  .children('td')
                                  .filter(ffunc);
  if (colors_on) {
    cells.css('background', 'white');
  } else {
    cells.css('background', '{{ highlight_color }}');
  }
  colors_on = !colors_on;
}
$( document ).ready(color_cells);
</script>
<form action="javascript:color_cells()">
<input type="submit" value="Toggle highlighting (val {{ filter_cond }})"></form>
""")

# JavaScript snippet appended by render_matrix when min_size > 0.
# Template variables `w` and `h` are the width and height (in pixels) applied to
# every body cell of every `table.dataframe` on the page.
RESIZE_CELLS_TMPL = jinja2.Template("""
<script>
var df = $('table.dataframe');
var cells = df.children('tbody').children('tr')
                                .children('td');
cells.css("width", "{{ w }}px").css("height", "{{ h }}px");
</script>
""")

def render_matrix(M, rows=None, cols=None, dtype=float,
                        min_size=30, highlight=""):
    """Build the HTML used to display matrix `M` as a Pandas table.

    Args:
      M: data passed to pd.DataFrame.
      rows: row labels (DataFrame index).
      cols: column labels.
      dtype: dtype passed to pd.DataFrame.
      min_size: if greater than 0, a script sizing each cell to
        min_size x min_size pixels is appended.
      highlight (string): if non-empty, a "Toggle highlighting" button using
        this text as the cell predicate is appended.

    Returns:
      The HTML fragments joined by newlines.
    """
    frame = pd.DataFrame(M, index=rows, columns=cols, dtype=dtype)
    fragments = [frame._repr_html_()]

    if min_size > 0:
        resize_script = RESIZE_CELLS_TMPL.render(w=min_size, h=min_size)
        fragments.append(resize_script)

    if highlight:
        button = HIGHLIGHT_BUTTON_TMPL.render(filter_cond=highlight,
                                              highlight_color="yellow")
        fragments.append(button)

    return "\n".join(fragments)
    
def pretty_print_matrix(*args, **kwargs):
    """Display a matrix in the notebook as a Pandas-rendered HTML table.

    All arguments are forwarded unchanged to render_matrix:
      M : 2D numpy array
      rows : list of row labels
      cols : list of column labels
      dtype : data type (float or int)
      min_size : minimum cell size, in pixels
      highlight (string): if non-empty, interpreted as a predicate on cell
      values, and a "Toggle highlighting" button is rendered. The button is
      an experimental piece of JavaScript meant for demonstration purposes.
    """
    display(HTML(render_matrix(*args, **kwargs)))


def pretty_timedelta(fmt="%d:%02d:%02d", since=None, until=None):
    """Pretty-print the time elapsed between two timestamps.

    Args:
      fmt (str): %-format string that receives (hours, minutes, seconds).
      since: start time in seconds; defaults to the current time when None.
      until: end time in seconds; defaults to the current time when None.

    Returns:
      `fmt` filled in with the elapsed hours, minutes and seconds.
    """
    # Compare against None explicitly so that a legitimate timestamp of 0 is
    # not silently replaced by the current time.
    if since is None:
        since = time.time()
    if until is None:
        until = time.time()
    delta_s = until - since
    hours, remainder = divmod(delta_s, 3600)
    minutes, seconds = divmod(remainder, 60)
    return fmt % (hours, minutes, seconds)


##
# Word processing functions
def canonicalize_digits(word):
    """Replace every digit of a purely non-alphabetic token with "DG".

    Tokens containing at least one alphabetic character are returned unchanged.
    If the result starts with "DG", commas (thousands separators) are removed.
    """
    if any([c.isalpha() for c in word]): return word
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    word = re.sub(r"\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True):
    """Lower-case a token and map it into the vocabulary.

    Args:
      word (str): raw token.
      wordset: container of known words (anything supporting `in`), or None to
        skip the vocabulary check.
      digits (bool): if True, tokens not already in `wordset` go through
        canonicalize_digits before the vocabulary check.

    Returns:
      The canonical token, or "<unk>" when `wordset` is given and the token is
      not in it.
    """
    word = word.lower()
    if digits:
        # Known words are returned as-is, before any digit rewriting.
        if (wordset is not None) and (word in wordset): return word
        word = canonicalize_digits(word) # try to canonicalize numbers
    if (wordset is None) or (word in wordset): return word
    else: return "<unk>" # unknown token

def canonicalize_words(words, **kw):
    """Apply canonicalize_word to every token, forwarding the keyword options.

    Returns a new list with one canonical token per input token.
    """
    canonical = []
    for token in words:
        canonical.append(canonicalize_word(token, **kw))
    return canonical

##
# Data loading functions
import nltk
import vocabulary

def get_corpus(name="brown"):
    """Return the attribute of nltk.corpus called `name` (e.g. the Brown corpus).

    Raises:
      AttributeError: if nltk.corpus has no attribute with that name.
    """
    # Use the getattr() builtin rather than calling the __getattr__ dunder
    # directly; it goes through the normal attribute lookup.
    return getattr(nltk.corpus, name)

def sents_to_tokens(sents, vocab):
    """Return one flat object array of canonical tokens for all sentences.

    Every sentence (a list of words) is wrapped in "<s>" ... "</s>" before
    flattening; each token is canonicalized against `vocab.wordset`, so
    anything not in it becomes "<unk>".
    """
    padded = (["<s>"] + sentence + ["</s>"] for sentence in sents)
    tokens = []
    for word in flatten(padded):
        tokens.append(canonicalize_word(word, wordset=vocab.wordset))
    return np.array(tokens, dtype=object)

def build_vocab(corpus, V=10000):
    """Build a Vocabulary of size `V` from the canonicalized words of `corpus`.

    `corpus` must provide a words() method. Note that this notebook defines
    build_vocab more than once; whichever cell ran last is the one in effect.
    """
    canonical_words = (canonicalize_word(word) for word in corpus.words())
    return vocabulary.Vocabulary(canonical_words, size=V)

def get_train_test_sents(corpus, split=0.8, shuffle=True):
    """Get train and test sentences.

    Args:
      corpus: nltk.corpus that supports sents() function
      split (double): fraction to use as training set
      shuffle (int or bool): seed for shuffle of input data, or False to just
      take the training data as the first xx% contiguously.
    Returns:
      train_sentences, test_sentences ( list(list(string)) ): the train and test
      splits
    """
    sentences = np.array(corpus.sents(), dtype=object)
    fmt = (len(sentences), sum(map(len, sentences)))
    print ("Loaded %d sentences (%g tokens)" % fmt)

    if shuffle:
        # `shuffle` doubles as the seed of a private RNG, so the global NumPy
        # random state is left untouched.
        rng = np.random.RandomState(shuffle)
        rng.shuffle(sentences)  # in-place
    # Honor the caller's `split`; this used to be hard-coded to 0.8.
    split_idx = int(split * len(sentences))
    train_sentences = sentences[:split_idx]
    test_sentences = sentences[split_idx:]

    fmt = (len(train_sentences), sum(map(len, train_sentences)))
    print ("Training set: %d sentences (%d tokens)" % fmt)
    fmt = (len(test_sentences), sum(map(len, test_sentences)))
    print ("Test set: %d sentences (%d tokens)" % fmt)

    return train_sentences, test_sentences

def preprocess_sentences(sentences, vocab):
    """Preprocess sentences by canonicalizing and mapping to ids.

    Args:
      sentences ( list(list(string)) ): input sentences
      vocab: Vocabulary object, already initialized
    Returns:
      ids ( array(int) ): flattened array of sentences, including boundary
      <s> and </s> tokens.
    """
    # Add sentence boundaries, canonicalize, and handle unknowns.
    # The boundary tokens must not carry surrounding whitespace: nothing strips
    # it in this function, so "<s> " / " </s>" would not equal the vocabulary's
    # "<s>" / "</s>" entries and would be canonicalized to "<unk>".
    words = flatten(["<s>"] + s + ["</s>"] for s in sentences)
    words = [canonicalize_word(w, wordset=vocab.word_to_id)
             for w in words]
    return np.array(vocab.words_to_ids(words))

##
# Use this function
def load_corpus(name, split=0.8, V=10000, shuffle=0):
    """Load a named corpus and return (vocab, train_ids, test_ids).

    The vocabulary (size `V`) is built from the whole corpus; the sentences
    are then split into train/test and each split is converted to ids.
    """
    corpus = get_corpus(name)
    vocab = build_vocab(corpus, V)
    # Split once, then preprocess the train part followed by the test part.
    train_ids, test_ids = (
        preprocess_sentences(sents, vocab)
        for sents in get_train_test_sents(corpus, split, shuffle)
    )
    return vocab, train_ids, test_ids




In [None]:
# Word processing functions
def canonicalize_digits(word):
    """Replace every digit of a purely non-alphabetic token with "DG".

    Tokens containing at least one alphabetic character are returned unchanged.
    If the result starts with "DG", commas (thousands separators) are removed.
    """
    if any([c.isalpha() for c in word]): return word
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    word = re.sub(r"\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True):
    """Lower-case a token and map it into the vocabulary.

    Args:
      word (str): raw token.
      wordset: container of known words (anything supporting `in`), or None to
        skip the vocabulary check.
      digits (bool): if True, tokens not already in `wordset` go through
        canonicalize_digits before the vocabulary check.

    Returns:
      The canonical token, or "<unk>" when `wordset` is given and the token is
      not in it.
    """
    word = word.lower()
    if digits:
        # Known words are returned as-is, before any digit rewriting.
        if (wordset is not None) and (word in wordset): return word
        word = canonicalize_digits(word) # try to canonicalize numbers
    if (wordset is None) or (word in wordset): return word
    else: return "<unk>" # unknown token

def canonicalize_words(words, **kw):
    """Apply canonicalize_word to every token, forwarding the keyword options.

    Returns a new list with one canonical token per input token.
    """
    canonical = []
    for token in words:
        canonical.append(canonicalize_word(token, **kw))
    return canonical

In [None]:
# I don't think we actually want to flatten sentences....

def sents_to_tokens(sents, vocab):
    """Return one flat object array of canonical tokens for all sentences.

    Every sentence (a list of words) is wrapped in "<s>" ... "</s>" before
    flattening; each token is canonicalized against `vocab.wordset`, so
    anything not in it becomes "<unk>".
    """
    padded = (["<s>"] + sentence + ["</s>"] for sentence in sents)
    tokens = []
    for word in flatten(padded):
        tokens.append(canonicalize_word(word, wordset=vocab.wordset))
    return np.array(tokens, dtype=object)

def flatten(list_of_lists):
    """Concatenate the items of every inner iterable into one new list.

    Only one level of nesting is removed; the inner iterables are consumed in
    order.
    """
    return [item for inner in list_of_lists for item in inner]

def build_vocab(corpus, V=10000):
    """Build a Vocabulary of size `V` from an array of sentence strings.

    Each element of `corpus` (indexed 0..corpus.shape[0]-1) is split on
    whitespace and every resulting word is canonicalized before being handed
    to vocabulary.Vocabulary.
    """
    words = [
        word
        for row in range(corpus.shape[0])
        for word in corpus[row].split()
    ]
    canonical_words = (canonicalize_word(word) for word in words)
    return vocabulary.Vocabulary(canonical_words, size=V)

def get_train_test_sents(corpus, ideo_labs, split=0.8):
    """Split sentences and their ideology labels into train and test sets.

    Args:
      corpus: 1-D array of sentence strings (indexed 0..corpus.shape[0]-1).
      ideo_labs: 1-D array of label strings aligned with `corpus`.
      split (double): fraction to use as training set; the first `split`
        fraction is taken contiguously (no shuffling happens here).
    Returns:
      train_sentences, test_sentences ( list(string) ): the sentence splits
      train_labels, test_labels ( array(float) ): numeric labels per split,
        encoded as Liberal -> 1., Conservative -> 2., anything else -> 3.
    """
    def count_tokens(sents):
        # Sentences are plain strings here, so len(sentence) would count
        # characters; count whitespace-separated tokens instead.
        return sum(len(s.split()) for s in sents)

    # Get sentences
    sentences = [corpus[i] for i in range(corpus.shape[0])]
    print ("Loaded %d sentences (%g tokens)" % (len(sentences), count_tokens(sentences)))

    # Split into test and train
    split_idx = int(split * len(sentences))
    train_sentences = sentences[:split_idx]
    test_sentences = sentences[split_idx:]

    # Map: Liberal --> 1., Conservative --> 2., everything else (Neutral) --> 3.
    label_ids = {'Liberal': 1., 'Conservative': 2.}
    labels = np.array([label_ids.get(ideo_labs[i], 3.)
                       for i in range(ideo_labs.shape[0])])
    # Labels are split at the same index as the sentences.
    train_labels = labels[:split_idx]
    test_labels = labels[split_idx:]

    print ("Training set: %d sentences (%d tokens)" % (len(train_sentences), count_tokens(train_sentences)))
    print ("Test set: %d sentences (%d tokens)" % (len(test_sentences), count_tokens(test_sentences)))

    return train_sentences, test_sentences, train_labels, test_labels

def preprocess_sentences(sentences, vocab):
    """Convert sentence strings into one flat array of vocabulary ids.

    Args:
      sentences ( list(string) ): input sentences; each is split on whitespace
      vocab: Vocabulary object, already initialized
    Returns:
      ids: array built from vocab.words_to_ids over all sentences, each one
      wrapped in "<s>" ... "</s>" boundary tokens.
    """
    # Add sentence boundaries around the whitespace-split words.
    words = []
    for sentence in sentences:
        words.append("<s>")
        words.extend(sentence.split())
        words.append("</s>")
    # Canonicalize; anything outside the vocabulary becomes "<unk>".
    canonical = []
    for word in words:
        canonical.append(canonicalize_word(word, wordset=vocab.word_to_id))
    return np.array(vocab.words_to_ids(canonical))

def process_data(data, labs, split=0.8, V=10000):
    """Build a vocabulary from `data`, split train/test, and map sentences to ids.

    Args:
      data: sentences; passed unchanged to build_vocab and get_train_test_sents.
      labs: labels for `data`; passed to get_train_test_sents.
      split: fraction of the sentences that goes to the training set.
      V: vocabulary size handed to build_vocab.
    Returns:
      (vocab, train_ids, test_ids, train_labels, test_labels)
    """
    # NOTE(review): build_vocab is defined several times in this notebook with
    # different signatures; this call needs the two-argument (data, V) variant
    # to be the one currently bound.
    vocab = build_vocab(data, V)
    # Forward the caller's split fraction (this used to be hard-coded to 0.8,
    # which silently ignored the `split` argument).
    train_sentences, test_sentences, train_labels, test_labels = get_train_test_sents(
        data, labs, split=split)
    train_ids = preprocess_sentences(train_sentences, vocab)
    test_ids = preprocess_sentences(test_sentences, vocab)
    return vocab, train_ids, test_ids, train_labels, test_labels


In [None]:
def load_corpus(name, split=0.8, V=10000, shuffle=0):
    """Fetch a named corpus, build its vocabulary, and return train/test id arrays.

    Args:
      name: corpus identifier handed to get_corpus.
      split: forwarded positionally to get_train_test_sents.
      V: vocabulary size handed to build_vocab.
      shuffle: forwarded positionally to get_train_test_sents.
    Returns:
      (vocab, train_ids, test_ids)
    """
    corpus = get_corpus(name)
    vocab = build_vocab(corpus, V)
    # NOTE(review): process_data unpacks four values from get_train_test_sents,
    # while this call unpacks two and passes (corpus, split, shuffle) - confirm
    # which variant of that helper is bound when this runs.
    train_sentences, test_sentences = get_train_test_sents(corpus, split, shuffle)
    return (
        vocab,
        preprocess_sentences(train_sentences, vocab),
        preprocess_sentences(test_sentences, vocab),
    )

In [None]:
def load_data(corpus_name, data, split=0.8, V=10000, shuffle=0):
    """Load a named corpus and split train/test along sentences.

    Args:
      corpus_name: corpus identifier handed to get_corpus.
      data: accepted but not used by the current implementation.
      split: forwarded positionally to get_train_test_sents.
      V: vocabulary size handed to build_vocab.
      shuffle: forwarded positionally to get_train_test_sents.
    Returns:
      (vocab, train_ids, test_ids)
    """
    # Use the `corpus_name` parameter; the body previously referenced an
    # undefined `name`, which raises NameError unless a global of that name
    # happens to exist.
    corpus = get_corpus(corpus_name)
    vocab = build_vocab(corpus, V)
    train_sentences, test_sentences = get_train_test_sents(corpus, split, shuffle)
    train_ids = preprocess_sentences(train_sentences, vocab)
    test_ids = preprocess_sentences(test_sentences, vocab)
    return vocab, train_ids, test_ids

In [None]:
# brown
def build_vocab(corpus, V=10000):
    """Build a Vocabulary of size V from the canonicalized words of `corpus`.

    NOTE: later cells define build_vocab again with different signatures;
    whichever definition ran last is the one in effect.

    Args:
      corpus: object whose `words()` method supplies the tokens.
      V: passed to vocabulary.Vocabulary as `size`.
    Returns:
      The constructed vocabulary.Vocabulary.
    """
    # Tokens are canonicalized lazily while the Vocabulary consumes the stream.
    return vocabulary.Vocabulary(
        (canonicalize_word(token) for token in corpus.words()),
        size=V,
    )

In [None]:
# ideo
def build_vocab(data, V=10000):
    """Build a Vocabulary of size V from whitespace-tokenized rows of `data`.

    NOTE: this rebinds the name build_vocab used by the corpus-based
    definition in an earlier cell.

    Args:
      data: indexable collection with a `.shape`; each of its first
        `data.shape[0]` entries is a string that gets `.split()`.
      V: passed to vocabulary.Vocabulary as `size`.
    Returns:
      The constructed vocabulary.Vocabulary.
    """
    # Flatten all rows into a single token list.
    tokens = [token
              for row in range(data.shape[0])
              for token in data[row].split()]
    canonical_tokens = (canonicalize_word(token) for token in tokens)
    return vocabulary.Vocabulary(canonical_tokens, size=V)

In [None]:
from shared_lib import vocabulary_new

In [None]:
# combined

def build_vocab(corpus, data, V=10000):
    """Build a CombinedVocabulary of size V from a corpus plus extra sentences.

    NOTE: this rebinds the name build_vocab used by the two-argument
    definitions in earlier cells, so calls written for those signatures will
    no longer match once this cell has run.

    Args:
      corpus: object whose `words()` method supplies the first token stream.
      data: indexable collection with a `.shape`; each of its first
        `data.shape[0]` entries is a string that gets `.split()`.
      V: passed to vocabulary_new.CombinedVocabulary as `size`.
    Returns:
      The constructed vocabulary_new.CombinedVocabulary.
    """
    # First stream: canonicalized corpus words (consumed lazily).
    corpus_stream = (canonicalize_word(token) for token in corpus.words())

    # Second stream: canonicalized tokens from every row of `data`.
    data_tokens = [token
                   for row in range(data.shape[0])
                   for token in data[row].split()]
    data_stream = (canonicalize_word(token) for token in data_tokens)

    return vocabulary_new.CombinedVocabulary(corpus_stream, data_stream, size=V)


In [None]:
# Load the corpus registered under the name 'brown'
# (presumably the Brown corpus - confirm in get_corpus).
corpus = get_corpus('brown')

In [None]:
# Build the vocabulary from both the corpus and data_all. This three-argument
# call only matches the "combined" build_vocab(corpus, data, V) definition, so
# that cell must have been the last build_vocab cell executed.
vocab = build_vocab(corpus, data_all, 10000)

In [None]:
# Show the vocabulary object as this cell's output.
vocab

## Building an LSTM Based on the CNN Code

In [None]:
# Load data
x_raw, y = clean_data_and_labels(data_all, labs_all)

# Map data into vocabulary ids.
# The padded length must be the largest *token* count over all sentences.
# The earlier form, max(data_all, key=len), selected the sentence with the most
# characters and counted its words, which can under-estimate the longest
# sentence; VocabularyProcessor trims anything longer than this length.
# NOTE(review): the length is measured on data_all while the processor is fit
# on the cleaned x_raw - confirm cleaning does not add tokens.
max_sentence_len = max(len(sentence.split()) for sentence in data_all)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_sentence_len)
x = np.array(list(vocab_processor.fit_transform(x_raw)))

# Split the id matrix and labels into train (first 90%) and dev (last 10%).
# NOTE(review): the split is positional with no shuffling - confirm data_all is
# not ordered by class, otherwise the dev set will be skewed.
split = int(0.9*x.shape[0])
x_train, x_dev = x[:split], x[split:]
y_train, y_dev = y[:split], y[split:]

In [None]:
import os

# Location of the pre-trained GoogleNews word2vec binary. Set the WORD2VEC_PATH
# environment variable to point at a local copy; the fallback is the path this
# notebook was originally authored with.
WORD2VEC_PATH = os.environ.get(
    'WORD2VEC_PATH',
    '/Users/megan/Downloads/GoogleNews-vectors-negative300.bin')

# NOTE(review): newer gensim releases expose this loader as
# KeyedVectors.load_word2vec_format - confirm against the installed version.
model = word2vec.Word2Vec.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [None]:
# Creating embeddings of pre-trained word vectors

EMBEDDING_DIM = 300  # width of every embedding row built below

# Initialize start, stop, and unk words randomly (they occupy rows 0, 1, 2)
start = np.random.rand(EMBEDDING_DIM,)
stop = np.random.rand(EMBEDDING_DIM,)
unk = np.random.rand(EMBEDDING_DIM,)

# Collect rows in a list and stack once at the end: calling np.vstack inside
# the loop re-copies the whole matrix on every iteration (quadratic work).
embedding_rows = [start, stop, unk]

# Fetch the word list once instead of on every iteration.
ordered_words = vocab.ordered_words()

# Loop through the remaining words and pull their pre-trained vectors
for i in range(3, len(ordered_words)):
    try:
        vector = model.wv[ordered_words[i]]
    except KeyError: # the word does not have a pre-trained vector
        vector = np.random.rand(EMBEDDING_DIM,) # initialize randomly
    embedding_rows.append(vector)

# NOTE(review): rows follow vocab.ordered_words() order, while the model cell
# sizes its embedding matrix with len(vocab_processor.vocabulary_) and feeds
# ids produced by vocab_processor - confirm the two vocabularies line up.
embeddings = np.vstack(embedding_rows)

embeddings.shape

In [None]:
# Define helper functions

def matmul3d(X, W):
    """Multiply a 3D tensor by a matrix along the tensor's last dimension.

    tf.matmul is applied to a 2D view of X and the result is reshaped back.
    Args:
      X: [m,n,k]
      W: [k,l]
    Returns:
      XW: [m,n,l]
    """
    x_shape = tf.shape(X)
    # Collapse the two leading axes so a plain 2D matmul applies: [m*n, k].
    flat_X = tf.reshape(X, [-1, x_shape[2]])
    flat_XW = tf.matmul(flat_X, W)
    # Restore the leading axes; the last dimension now comes from W.
    return tf.reshape(flat_XW, [x_shape[0], x_shape[1], tf.shape(W)[1]])


def MakeFancyRNNCell(H, keep_prob, num_layers=1):
    """Make a fancy RNN cell.
    Use tf.nn.rnn_cell functions to construct an LSTM cell.
    Initialize forget_bias=0.0 for better training.
    Args:
      H: hidden state size
      keep_prob: dropout keep prob (same for input and output)
      num_layers: number of cell layers
    Returns:
      (tf.nn.rnn_cell.RNNCell) multi-layer LSTM cell with dropout
    """
    def _make_layer():
        # One LSTM layer wrapped with input and output dropout.
        lstm = tf.nn.rnn_cell.BasicLSTMCell(H, forget_bias=0.0)
        return tf.nn.rnn_cell.DropoutWrapper(
            lstm, input_keep_prob=keep_prob, output_keep_prob=keep_prob)

    # Build a distinct cell object per layer. `[cell] * num_layers` repeats the
    # SAME object, which newer TensorFlow releases either reject or treat as
    # layers sharing one set of weights.
    return tf.nn.rnn_cell.MultiRNNCell([_make_layer() for _ in range(num_layers)])

In [None]:
import tensorflow as tf

# Defining the graph
class initialized_LSTM(object):
    """
    An LSTM for text classification.
    Uses an embedding layer (loaded by running `embedding_init`), followed by a
    3-layer LSTM and a softmax output layer over the last time step.

    Exposed graph pieces: input_x / input_w, input_y / target_y,
    dropout_keep_prob, embedding_placeholder, embedding_init, scores / logits,
    predictions, loss, accuracy.
    """
    def __init__(self, sequence_length, num_classes, 
                 vocab_size,embedding_size, filter_sizes, 
                 num_filters, embedding):
        """Build the graph.

        Args:
          sequence_length: unused; the time dimension is read from the input
            at run time. Kept so the constructor takes the same keyword
            arguments as the initialized_CNN call in the graph-building cell.
          num_classes: number of output classes.
          vocab_size: number of rows in the embedding matrix.
          embedding_size: embedding width; also used as the LSTM hidden size.
          filter_sizes: unused (see sequence_length).
          num_filters: unused (see sequence_length).
          embedding: unused here; the matrix is loaded by running
            `embedding_init` with `embedding_placeholder` fed.
        """
        # Placeholders for inputs.
        self.input_w = tf.placeholder(tf.int32, [None, None], name="w")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        # Aliases: `input_x` is the name the train/dev step functions feed;
        # `target_y` is the name this class used originally.
        self.input_x = self.input_w
        self.target_y = self.input_y

        # Get dynamic shape info from inputs
        with tf.name_scope("batch_size"):
            self.batch_size = tf.shape(self.input_w)[0]
        with tf.name_scope("max_time"):
            self.max_time = tf.shape(self.input_w)[1]

        # Every sequence is treated as full length (inputs are padded).
        self.ns = tf.tile([self.max_time], [self.batch_size, ], name="ns")

        with tf.device('/cpu:0'), tf.name_scope("Embedding_Layer"):
            # W starts at zero; run `embedding_init` AFTER variable
            # initialization to load the pre-trained matrix into it.
            W = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_size]),
                trainable=True, name="W")
            self.embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_size])
            self.embedding_init = W.assign(self.embedding_placeholder)
            self.embedded_chars = tf.nn.embedding_lookup(W, self.input_w)

        with tf.name_scope("Hidden_Layer"):
            # Dropout is driven by the placeholder so evaluation can feed 1.0
            # (a constant 0.5 would keep dropping units at eval time).
            self.cell = MakeFancyRNNCell(embedding_size,
                                         keep_prob=self.dropout_keep_prob,
                                         num_layers=3)
            # Size the zero state from the actual batch, not a fixed 64, so
            # short final batches and the full dev set both work.
            self.initial_h = self.cell.zero_state(batch_size=self.batch_size,
                                                  dtype=tf.float32)
            self.outputs, self.final_h = tf.nn.dynamic_rnn(cell=self.cell,
                                                           inputs=self.embedded_chars,
                                                           initial_state=self.initial_h,
                                                           sequence_length=self.ns)
            # Sentence representation: top-layer output at the last time step,
            # shape [batch, embedding_size]. The cell's dropout wrapper has
            # already applied output dropout, so none is added here.
            self.h_drop = self.outputs[:, -1, :]

        # Define outputs
        with tf.name_scope("Output_Layer"):
            self.b_out = tf.Variable(tf.zeros([num_classes,], dtype=tf.float32), name="b_out")
            self.W_out = tf.Variable(tf.random_uniform([embedding_size,num_classes],0,1.0), name="W_out")
            self.scores = tf.nn.xw_plus_b(self.h_drop, self.W_out, self.b_out, name="scores")
            self.logits = self.scores
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.name_scope("Cost_Function"):
            # Keyword arguments: positional logits/labels order differs across
            # TensorFlow releases, names do not.
            losses = tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y,
                                                             logits=self.scores)
            self.loss = tf.reduce_mean(losses)

        # Calculate Accuracy to compare to other models
        with tf.name_scope("Accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")


In [None]:
# Building the graph

with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        # NOTE(review): this section is about the LSTM, yet the model built
        # here is initialized_CNN - confirm whether initialized_LSTM was meant.
        cnn = initialized_CNN(
            sequence_length=x_train.shape[1],
            num_classes=3,
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=300,
            # A concrete list rather than a one-shot map() iterator, so the
            # model can call len() on it or iterate it more than once.
            filter_sizes=[3, 4, 5],
            num_filters=128,
            embedding = embeddings
        )

        # Optimizer: Adam on the model loss; global_step counts applied updates.
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-4)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)


        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        # Checkpointing
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")

        # Tensorflow assumes this directory already exists so we need to create it
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables())

        # Initialize every variable FIRST, then load the pre-trained embeddings.
        # In the opposite order the initializer resets the embedding variable
        # to its initial value and discards the loaded matrix.
        # NOTE(review): `embeddings` must have shape
        # [len(vocab_processor.vocabulary_), 300] to be fed here - confirm.
        sess.run(tf.global_variables_initializer())
        sess.run(cnn.embedding_init, feed_dict={cnn.embedding_placeholder: embeddings})

In [None]:
# Defining an epoch
def train_epoch(x_batch, y_batch):
    """
    Run one optimizer update on a single batch and log its summaries.

    Feeds the batch to the global model `cnn` with a dropout keep probability
    of 0.5, runs `train_op`, and writes the merged train summaries for the
    resulting global step.
    """
    fetches = [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy]
    inputs = {
        cnn.input_x: x_batch,
        cnn.input_y: y_batch,
        cnn.dropout_keep_prob: 0.5,
    }
    # Loss and accuracy are fetched but only the step and summaries are used.
    _, step, summaries, _loss, _accuracy = sess.run(fetches, inputs)
    train_summary_writer.add_summary(summaries, step)

def dev_epoch(x_batch, y_batch, writer=None):
    """
    Evaluate the global model `cnn` on one batch with keep probability 1.0.

    Prints a timestamped line with the step, loss and accuracy, writes the dev
    summaries when a `writer` is supplied, and returns the model's predictions
    for the batch.
    """
    fetches = [global_step, dev_summary_op, cnn.loss, cnn.accuracy, cnn.predictions]
    inputs = {
        cnn.input_x: x_batch,
        cnn.input_y: y_batch,
        cnn.dropout_keep_prob: 1.0,
    }
    step, summaries, loss, accuracy, predictions = sess.run(fetches, inputs)
    print("{}: step {}, loss {:g}, acc {:g}".format(
        datetime.datetime.now().isoformat(), step, loss, accuracy))
    if writer:
        writer.add_summary(summaries, step)
    return predictions

In [None]:
# Function to generate batches
def batch_generator(data, labels, batch_size, num_epochs):
    """
    Generates a batch iterator for a dataset.

    Args:
      data: sequence of examples; converted to a numpy array.
      labels: sequence of labels for `data`; converted to a numpy array.
      batch_size: maximum number of examples per batch; must be >= 1.
      num_epochs: number of passes over the data.
    Yields:
      (data_batch, labels_batch) tuples, in the original order. The last batch
      of each epoch is shorter when len(data) is not a multiple of batch_size.
      Nothing is yielded for empty data.
    Raises:
      ValueError: if batch_size is smaller than 1 (raised when iteration starts,
        since this is a generator).
    """
    if batch_size < 1:
        raise ValueError(
            "batch_size must be a positive integer, got {!r}".format(batch_size))

    data_size = len(data)
    data = np.array(data)
    labels = np.array(labels)

    for _ in range(num_epochs):
        # Stepping the start index by batch_size yields ceil(n / batch_size)
        # batches and none at all when the dataset is empty; slicing clamps the
        # end of the final, shorter batch.
        for start_index in range(0, data_size, batch_size):
            end_index = start_index + batch_size
            yield data[start_index:end_index], labels[start_index:end_index]

In [None]:
# Generate batches
batches = batch_generator(x_train, y_train, batch_size = 64, num_epochs = 5)

# How often (in global steps) to evaluate on the dev set and to checkpoint
eval_every = 50
checkpoint_every = 100

# Training loop: one optimizer step per batch, with periodic dev evaluation
# and model checkpoints keyed on the global step.
for x_batch, y_batch in batches:
    train_epoch(x_batch, y_batch)
    current_step = tf.train.global_step(sess, global_step)

    if current_step % eval_every == 0:
        print("\nEvaluation:")
        dev_epoch(x_dev, y_dev, writer=dev_summary_writer)
        print("")

    if current_step % checkpoint_every == 0:
        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
        print("Saved model checkpoint to {}\n".format(path))