In [None]:
import pandas as pd
import numpy as np
from TextCleaner import *

@inproceedings{wang2018eann,
  title={EANN: Event Adversarial Neural Networks for Multi-Modal Fake News Detection},
  author={Wang, Yaqing and Ma, Fenglong and Jin, Zhiwei and Yuan, Ye and Xun, Guangxu and Jha, Kishlay and Su, Lu and Gao, Jing},
  booktitle={Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
  pages={849--857},
  year={2018},
  organization={ACM}
}

https://arxiv.org/abs/1803.11175

In [2]:
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
df.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


In [4]:
df.shape

(20800, 5)

In [None]:
def decoder(code, n_code, phase_train):
    with tf.variable_scope("decoder"):
        with variable_scope("hidden_1"):
            hidden_1 = layer(code, [n_code, n_decoder_hidden_1], 
                            [n_decoder_hidden_1],  phase_train)
            
        with tf.variable_scope("hidden_2"):
            hidden_2 = layer(hidden_1, [n_decoder_hidden_1])

In [5]:
# -*- coding: utf-8 -*-

from __future__ import division

import tensorflow as tf
import numpy as np
import logging
import json
import os

class TextAutoencoder(object):
    """
    Class that encapsulates the encoder-decoder architecture to
    reconstruct pieces of text.
    """

    def __init__(self, lstm_units, embeddings, go, train=True,
                 train_embeddings=False, bidirectional=True):
        """
        Initialize the encoder/decoder and creates Tensor objects

        :param lstm_units: number of LSTM units
        :param embeddings: numpy array with initial embeddings
        :param go: index of the GO symbol in the embedding matrix
        :param train_embeddings: whether to adjust embeddings during training
        :param bidirectional: whether to create a bidirectional autoencoder
            (if False, a simple linear LSTM is used)
        """
        # EOS and GO share the same symbol. Only GO needs to be embedded, and
        # only EOS exists as a possible network output
        self.go = go
        self.eos = go

        self.bidirectional = bidirectional
        self.vocab_size = embeddings.shape[0]
        self.embedding_size = embeddings.shape[1]
        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        # the sentence is the object to be memorized
        self.sentence = tf.placeholder(tf.int32, [None, None], 'sentence')
        self.sentence_size = tf.placeholder(tf.int32, [None],
                                            'sentence_size')
        self.l2_constant = tf.placeholder(tf.float32, name='l2_constant')
        self.clip_value = tf.placeholder(tf.float32, name='clip')
        self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')
        self.dropout_keep = tf.placeholder(tf.float32, name='dropout_keep')

        self.decoder_step_input = tf.placeholder(tf.int32,
                                                 [None],
                                                 'prediction_step')

        name = 'decoder_fw_step_state_c'
        self.decoder_fw_step_c = tf.placeholder(tf.float32,
                                                [None, lstm_units], name)
        name = 'decoder_fw_step_state_h'
        self.decoder_fw_step_h = tf.placeholder(tf.float32,
                                                [None, lstm_units], name)
        self.decoder_bw_step_c = tf.placeholder(tf.float32,
                                                [None, lstm_units],
                                                'decoder_bw_step_state_c')
        self.decoder_bw_step_h = tf.placeholder(tf.float32,
                                                [None, lstm_units],
                                                'decoder_bw_step_state_h')

        with tf.variable_scope('autoencoder') as self.scope:
            self.embeddings = tf.Variable(embeddings, name='embeddings',
                                          trainable=train_embeddings)

            initializer = tf.glorot_normal_initializer()
            self.lstm_fw = tf.nn.rnn_cell.LSTMCell(lstm_units,
                                                   initializer=initializer)
            self.lstm_bw = tf.nn.rnn_cell.LSTMCell(lstm_units,
                                                   initializer=initializer)

            embedded = tf.nn.embedding_lookup(self.embeddings, self.sentence)
            embedded = tf.nn.dropout(embedded, self.dropout_keep)

            # encoding step
            if bidirectional:
                bdr = tf.nn.bidirectional_dynamic_rnn
                ret = bdr(self.lstm_fw, self.lstm_bw,
                          embedded, dtype=tf.float32,
                          sequence_length=self.sentence_size,
                          scope=self.scope)
            else:
                ret = tf.nn.dynamic_rnn(self.lstm_fw, embedded,
                                        dtype=tf.float32,
                                        sequence_length=self.sentence_size,
                                        scope=self.scope)
            _, self.encoded_state = ret
            if bidirectional:
                encoded_state_fw, encoded_state_bw = self.encoded_state

                # set the scope name used inside the decoder.
                # maybe there's a more elegant way to do it?
                fw_scope_name = self.scope.name + '/fw'
                bw_scope_name = self.scope.name + '/bw'
            else:
                encoded_state_fw = self.encoded_state
                fw_scope_name = self.scope

            self.scope.reuse_variables()

            # generate a batch of embedded GO
            # sentence_size has the batch dimension
            go_batch = self._generate_batch_go(self.sentence_size)
            embedded_eos = tf.nn.embedding_lookup(self.embeddings,
                                                  go_batch)
            embedded_eos = tf.reshape(embedded_eos,
                                      [-1, 1, self.embedding_size])
            decoder_input = tf.concat([embedded_eos, embedded], axis=1)

            # decoding step

            # We give the same inputs to the forward and backward LSTMs,
            # but each one has its own hidden state
            # their outputs are concatenated and fed to the softmax layer
            if bidirectional:
                outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                    self.lstm_fw, self.lstm_bw, decoder_input,
                    self.sentence_size, encoded_state_fw, encoded_state_bw)

                # concat fw and bw outputs
                outputs = tf.concat(outputs, -1)
            else:
                outputs, _ = tf.nn.dynamic_rnn(
                    self.lstm_fw, decoder_input, self.sentence_size,
                    encoded_state_fw)

            self.decoder_outputs = outputs

        # now project the outputs to the vocabulary
        with tf.variable_scope('projection') as self.projection_scope:
            # decoder_outputs has shape (batch, max_sentence_size, vocab_size)
            self.logits = tf.layers.dense(outputs, self.vocab_size)

        # tensors for running a model
        embedded_step = tf.nn.embedding_lookup(self.embeddings,
                                               self.decoder_step_input)
        state_fw = tf.nn.rnn_cell.LSTMStateTuple(self.decoder_fw_step_c,
                                                 self.decoder_fw_step_h)
        state_bw = tf.nn.rnn_cell.LSTMStateTuple(self.decoder_bw_step_c,
                                                 self.decoder_bw_step_h)
        with tf.variable_scope(fw_scope_name, reuse=True):
            ret_fw = self.lstm_fw(embedded_step, state_fw)
        step_output_fw, self.decoder_fw_step_state = ret_fw

        if bidirectional:
            with tf.variable_scope(bw_scope_name, reuse=True):
                ret_bw = self.lstm_bw(embedded_step, state_bw)
                step_output_bw, self.decoder_bw_step_state = ret_bw
                step_output = tf.concat(axis=1, values=[step_output_fw,
                                                        step_output_bw])
        else:
            step_output = step_output_fw

        with tf.variable_scope(self.projection_scope, reuse=True):
            self.projected_step_output = tf.layers.dense(step_output,
                                                         self.vocab_size)

        if train:
            self._create_training_tensors()

    def _create_training_tensors(self):
        """
        Create member variables related to training.
        """
        eos_batch = self._generate_batch_go(self.sentence_size)
        eos_batch = tf.reshape(eos_batch, [-1, 1])
        decoder_labels = tf.concat([self.sentence, eos_batch], -1)

        projection_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            scope=self.projection_scope.name)
        # a bit ugly, maybe we should improve this?
        projection_w = [var for var in projection_vars
                        if 'kernel' in var.name][0]
        projection_b = [var for var in projection_vars
                        if 'bias' in var.name][0]

        # set the importance of each time step
        # 1 if before sentence end or EOS itself; 0 otherwise
        max_len = tf.shape(self.sentence)[1]
        mask = tf.sequence_mask(self.sentence_size + 1, max_len + 1, tf.float32)
        num_actual_labels = tf.reduce_sum(mask)
        projection_w_t = tf.transpose(projection_w)

        # reshape to have batch and time steps in the same dimension
        decoder_outputs2d = tf.reshape(self.decoder_outputs,
                                       [-1, tf.shape(self.decoder_outputs)[-1]])
        labels = tf.reshape(decoder_labels, [-1, 1])
        sampled_loss = tf.nn.sampled_softmax_loss(
            projection_w_t, projection_b, labels, decoder_outputs2d, 100,
            self.vocab_size)

        masked_loss = tf.reshape(mask, [-1]) * sampled_loss
        self.loss = tf.reduce_sum(masked_loss) / num_actual_labels

        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        gradients, v = zip(*optimizer.compute_gradients(self.loss))
        gradients, _ = tf.clip_by_global_norm(gradients, self.clip_value)

        self.train_op = optimizer.apply_gradients(zip(gradients, v),
                                                  global_step=self.global_step)

    def get_trainable_variables(self):
        """
        Return all trainable variables inside the model
        """
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

    def train(self, session, save_path, train_data, valid_data,
              batch_size, epochs, learning_rate, dropout_keep,
              clip_value, report_interval):
        """
        Train the model

        :param session: tensorflow session
        :param train_data: Dataset object with training data
        :param valid_data: Dataset object with validation data
        :param batch_size: batch size
        :param learning_rate: initial learning rate
        :param dropout_keep: the probability that each LSTM input/output is kept
        :param epochs: how many epochs to train for
        :param clip_value: value to clip tensor norm during training
        :param save_path: folder to save the model
        :param report_interval: report after that many batches
        """
        saver = tf.train.Saver(self.get_trainable_variables(),
                               max_to_keep=1)

        best_loss = 10000
        accumulated_loss = 0
        batch_counter = 0
        num_sents = 0

        # get all data at once. we need all matrices with the same size,
        # or else they don't fit the placeholders
        # train_sents, train_sizes = train_data.join_all(self.go,
        #                                                self.num_time_steps,
        #                                                shuffle=True)

        # del train_data  # save memory...
        valid_sents, valid_sizes = valid_data.join_all(self.go,
                                                       shuffle=True)
        train_data.reset_epoch_counter()
        feeds = {self.clip_value: clip_value,
                 self.dropout_keep: dropout_keep,
                 self.learning_rate: learning_rate}

        while train_data.epoch_counter < epochs:
            batch_counter += 1
            train_sents, train_sizes = train_data.next_batch(batch_size)
            feeds[self.sentence] = train_sents
            feeds[self.sentence_size] = train_sizes

            _, loss = session.run([self.train_op, self.loss], feeds)

            # multiply by len because some batches may be smaller
            # (due to bucketing), then take the average
            accumulated_loss += loss * len(train_sents)
            num_sents += len(train_sents)

            if batch_counter % report_interval == 0:
                avg_loss = accumulated_loss / num_sents
                accumulated_loss = 0
                num_sents = 0

                # we can't use all the validation at once, since it would
                # take too much memory. running many small batches would
                # instead take too much time. So let's just sample it.
                sample_indices = np.random.randint(0, len(valid_data),
                                                   5000)
                validation_feeds = {
                    self.sentence: valid_sents[sample_indices],
                    self.sentence_size: valid_sizes[sample_indices],
                    self.dropout_keep: 1}

                loss = session.run(self.loss, validation_feeds)
                msg = '%d epochs, %d batches\t' % (train_data.epoch_counter,
                                                   batch_counter)
                msg += 'Avg batch loss: %f\t' % avg_loss
                msg += 'Validation loss: %f' % loss
                if loss < best_loss:
                    best_loss = loss
                    self.save(saver, session, save_path)
                    msg += '\t(saved model)'

                logging.info(msg)

    def save(self, saver, session, directory):
        """
        Save the autoencoder model and metadata to the specified
        directory.
        """
        model_path = os.path.join(directory, 'model')
        saver.save(session, model_path)
        metadata = {'vocab_size': self.vocab_size,
                    'embedding_size': self.embedding_size,
                    'num_units': self.lstm_fw.output_size,
                    'go': self.go,
                    'bidirectional': self.bidirectional
                    }
        metadata_path = os.path.join(directory, 'metadata.json')
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f)

    @classmethod
    def load(cls, directory, session, train=False):
        """
        Load an instance of this class from a previously saved one.
        :param directory: directory with the model files
        :param session: tensorflow session
        :param train: if True, also create training tensors
        :return: a TextAutoencoder instance
        """
        model_path = os.path.join(directory, 'model')
        metadata_path = os.path.join(directory, 'metadata.json')
        with open(metadata_path, 'r') as f:
            metadata = json.load(f)
        dummy_embeddings = np.empty((metadata['vocab_size'],
                                     metadata['embedding_size'],),
                                    dtype=np.float32)

        ae = TextAutoencoder(metadata['num_units'], dummy_embeddings,
                             metadata['go'], train=train,
                             bidirectional=metadata['bidirectional'])
        vars_to_load = ae.get_trainable_variables()
        if not train:
            # if not flagged for training, the embeddings won't be in
            # the list
            vars_to_load.append(ae.embeddings)

        saver = tf.train.Saver(vars_to_load)
        saver.restore(session, model_path)
        return ae

    def encode(self, session, inputs, sizes):
        """
        Run the encoder to obtain the encoded hidden state

        :param session: tensorflow session
        :param inputs: 2-d array with the word indices
        :param sizes: 1-d array with size of each sentence
        :return: a 2-d numpy array with the hidden state
        """
        feeds = {self.sentence: inputs,
                 self.sentence_size: sizes,
                 self.dropout_keep: 1}
        state = session.run(self.encoded_state, feeds)
        if self.bidirectional:
            state_fw, state_bw = state
            return np.hstack((state_fw.c, state_bw.c))
        return state.c

    def run(self, session, inputs, sizes):
        """
        Run the autoencoder with the given data

        :param session: tensorflow session
        :param inputs: 2-d array with the word indices
        :param sizes: 1-d array with size of each sentence
        :return: a 2-d array (batch, output_length) with the answer
            produced by the autoencoder. The output length is not
            fixed; it stops after producing EOS for all items in the
            batch or reaching two times the maximum number of time
            steps in the inputs.
        """
        feeds = {self.sentence: inputs,
                 self.sentence_size: sizes,
                 self.dropout_keep: 1}
        state = session.run(self.encoded_state, feeds)
        if self.bidirectional:
            state_fw, state_bw = state
        else:
            state_fw = state

        time_steps = 0
        max_time_steps = 2 * len(inputs[0])
        answer = []
        input_symbol = self.go * np.ones_like(sizes, dtype=np.int32)

        # this array control which sequences have already been finished by the
        # decoder, i.e., for which ones it already produced the END symbol
        sequences_done = np.zeros_like(sizes, dtype=np.bool)

        while True:
            # we could use tensorflow's rnn_decoder, but this gives us
            # finer control

            feeds = {self.decoder_fw_step_c: state_fw.c,
                     self.decoder_fw_step_h: state_fw.h,
                     self.decoder_step_input: input_symbol,
                     self.dropout_keep: 1}
            if self.bidirectional:
                feeds[self.decoder_bw_step_c] = state_bw.c
                feeds[self.decoder_bw_step_h] = state_bw.h

                ops = [self.projected_step_output,
                       self.decoder_fw_step_state,
                       self.decoder_bw_step_state]
                outputs, state_fw, state_bw = session.run(ops, feeds)
            else:
                ops = [self.projected_step_output,
                       self.decoder_fw_step_state]
                outputs, state_fw = session.run(ops, feeds)

            input_symbol = outputs.argmax(1)
            answer.append(input_symbol)

            # use an "additive" or in order to avoid infinite loops
            sequences_done |= (input_symbol == self.eos)

            if sequences_done.all() or time_steps > max_time_steps:
                break
            else:
                time_steps += 1

        return np.hstack(answer)

    def _generate_batch_go(self, like):
        """
        Generate a 1-d tensor with copies of EOS as big as the batch size,

        :param like: a tensor whose shape the returned embeddings should match
        :return: a tensor with shape as `like`
        """
        ones = tf.ones_like(like)
        return ones * self.go

  from ._conv import register_converters as _register_converters


In [None]:
ae = TextAutoencoder(lstm_units, embeddings, go, train=True, train_embeddings=True, bidirectional=True)

In [None]:
def word2vec(post, word_id_map, W):
    """Cited from https://github.com/yaqingwang/EANN-KDD18
    Generates word2vec embeddings.
    
    INPUTS:
    
    RETURN:
    """
    word_embedding = []
    mask = []
    for sentence in post:
        sen_embedding = []
        seq_len = len(sentence) - 1
        mask_seq = np.zeros(args.sequence_len, dtype=np.float32)
        mask_seq[:len(sentence)] = 1.0
        for i, word in enumerate(sentence):
            sen_embedding.append(word_id_map[word])

        while len(sen_embedding) < args.sequence_len:
            sen_embedding.append(0)
        word_embedding.append(copy.deepcopy(sen_embedding))
        mask.append(copy.deepcopy(mask_seq))
    return word_embedding, mask

In [None]:
import os
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

# read in data and preprocess
train_df = pd.read_csv('c:/users/washburp/documents/kaggle/arthur/data/train_df.csv')

# drop missing descriptions
train_df.dropna(inplace=True, subset=['desc_of_operations'])

# derive length of description & filter long/short ones
train_df['len_description'] = train_df.desc_of_operations.apply(len)
reasonable_sized_texts = (train_df.len_description.astype(int) >= 1) | (train_df.len_description.astype(int) <= 200)
train_df = train_df.loc[reasonable_sized_texts]

# clean up text
from text_cleanup import TextCleaner

train_df['clean_desc'] = TextCleaner().transform(train_df.desc_of_operations.values)
train_df.head()

sentences = []
for description in train_df.clean_desc.tolist():
    sentences.append(description.split())
    
sentences[:1]

import gensim

model = gensim.models.Word2Vec(iter=1, min_count=10, size=150, workers=4)
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

topn = 20

dat = model.most_similar(['salon'], topn=topn)
df = pd.DataFrame(dat, columns=['word', 'prob']).set_index('word')
df

In [8]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [13]:
from nltk.tag import DefaultTagger
help(DefaultTagger)

Help on class DefaultTagger in module nltk.tag.sequential:

class DefaultTagger(SequentialBackoffTagger)
 |  A tagger that assigns the same tag to every token.
 |  
 |      >>> from nltk.tag import DefaultTagger
 |      >>> default_tagger = DefaultTagger('NN')
 |      >>> list(default_tagger.tag('This is a test'.split()))
 |      [('This', 'NN'), ('is', 'NN'), ('a', 'NN'), ('test', 'NN')]
 |  
 |  This tagger is recommended as a backoff tagger, in cases where
 |  a more powerful tagger is unable to assign a tag to the word
 |  (e.g. because the word was not seen during training).
 |  
 |  :param tag: The tag to assign to each token
 |  :type tag: str
 |  
 |  Method resolution order:
 |      DefaultTagger
 |      SequentialBackoffTagger
 |      nltk.tag.api.TaggerI
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, tag)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  __unicode

In [14]:
help(UnigramTagger)

Help on class UnigramTagger in module nltk.tag.sequential:

class UnigramTagger(NgramTagger)
 |  Unigram Tagger
 |  
 |  The UnigramTagger finds the most likely tag for each word in a training
 |  corpus, and then uses that information to assign tags to new tokens.
 |  
 |      >>> from nltk.corpus import brown
 |      >>> from nltk.tag import UnigramTagger
 |      >>> test_sent = brown.sents(categories='news')[0]
 |      >>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
 |      >>> for tok, tag in unigram_tagger.tag(test_sent):
 |      ...     print("(%s, %s), " % (tok, tag))
 |      (The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL),
 |      (Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT),
 |      (investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ),
 |      (primary, NN), (election, NN), (produced, VBD), (``, ``),
 |      (no, AT), (evidence, NN), ('', ''), (that, CS), (any, DTI),
 |      (irregularities, NNS), (took, VBD), (place, 

In [38]:
txt = df['title'].str.split().astype(str).tolist()
txt[:5]

["['House', 'Dem', 'Aide:', 'We', 'Didn’t', 'Even', 'See', 'Comey’s', 'Letter', 'Until', 'Jason', 'Chaffetz', 'Tweeted', 'It']",
 "['FLYNN:', 'Hillary', 'Clinton,', 'Big', 'Woman', 'on', 'Campus', '-', 'Breitbart']",
 "['Why', 'the', 'Truth', 'Might', 'Get', 'You', 'Fired']",
 "['15', 'Civilians', 'Killed', 'In', 'Single', 'US', 'Airstrike', 'Have', 'Been', 'Identified']",
 "['Iranian', 'woman', 'jailed', 'for', 'fictional', 'unpublished', 'story', 'about', 'woman', 'stoned', 'to', 'death', 'for', 'adultery']"]

In [39]:
from nltk.tag import UnigramTagger, DefaultTagger
from nltk.corpus import treebank
tagger = DefaultTagger('NN')
tagger.tag(txt)

[("['House', 'Dem', 'Aide:', 'We', 'Didn’t', 'Even', 'See', 'Comey’s', 'Letter', 'Until', 'Jason', 'Chaffetz', 'Tweeted', 'It']",
  'NN'),
 ("['FLYNN:', 'Hillary', 'Clinton,', 'Big', 'Woman', 'on', 'Campus', '-', 'Breitbart']",
  'NN'),
 ("['Why', 'the', 'Truth', 'Might', 'Get', 'You', 'Fired']", 'NN'),
 ("['15', 'Civilians', 'Killed', 'In', 'Single', 'US', 'Airstrike', 'Have', 'Been', 'Identified']",
  'NN'),
 ("['Iranian', 'woman', 'jailed', 'for', 'fictional', 'unpublished', 'story', 'about', 'woman', 'stoned', 'to', 'death', 'for', 'adultery']",
  'NN'),
 ("['Jackie', 'Mason:', 'Hollywood', 'Would', 'Love', 'Trump', 'if', 'He', 'Bombed', 'North', 'Korea', 'over', 'Lack', 'of', 'Trans', 'Bathrooms', '(Exclusive', 'Video)', '-', 'Breitbart']",
  'NN'),
 ("['Life:', 'Life', 'Of', 'Luxury:', 'Elton', 'John’s', '6', 'Favorite', 'Shark', 'Pictures', 'To', 'Stare', 'At', 'During', 'Long,', 'Transcontinental', 'Flights']",
  'NN'),
 ("['Benoît', 'Hamon', 'Wins', 'French', 'Socialist', 'Par

In [43]:
import tensorflow as tf
import tensorflow_hub as hub
# import matplotlib.pyplot as plt
# import numpy as np
# import os
# import pandas as pd
# import re
# import seaborn as sns

messages = txt
def fetch_universal_sentence_embeddings(messages, verbose=0):
    """Fetches universal sentence embeddings from Google's
    research paper https://arxiv.org/pdf/1803.11175.pdf.
    
    INPUTS:
    RETURNS:
    """
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

    # Import the Universal Sentence Encoder's TF Hub module
    embed = hub.Module(module_url)

    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        message_embeddings = session.run(embed(messages))
        embeddings = list()
        for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
            if verbose:
                print("Message: {}".format(messages[i]))
                print("Embedding size: {}".format(len(message_embedding)))
                message_embedding_snippet = ", ".join(
                    (str(x) for x in message_embedding[:3]))
                print("Embedding: [{}, ...]\n".format(message_embedding_snippet))
            embeddings.append(message_embedding)
    return embeddings

embeddings = fetch_universal_sentence_embeddings(messages)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
Message: ['House', 'Dem', 'Aide:', 'We', 'Didn’t', 'Even', 'See', 'Comey’s', 'Letter', 'Until', 'Jason', 'Chaffetz', 'Tweeted', 'It']
Embedding size: 512
Embedding: [0.05387789383530617, 0.006952833849936724, -0.057719238102436066, ...]

Message: ['FLYNN:', 'Hillary', 'Clinton,', 'Big', 'Woman', 'on', 'Campus', '-', 'Breitbart']
Embedding size: 512
Embedding: [0.01003224030137062, -0.008890904486179352, -0.03757377341389656, ...]

Message: ['Why', 'the', 'Truth', 'Might', 'Get', 'You', 'Fired']
Embedding size: 512
Embedding: [0.00695530092343688, 0.043178170919418335, -0.013796758837997913, ...]

Message: ['15', 'Civilians', 'Killed', 'In', 'Single', 'US', 'Airstrike', 'Have', 'Been', 'Identified']
Embedding size: 512
Embedding: [-0.05820205435156822, -0.07245754450559616, -0.04839720577001572, ...]

Message: ['Iranian', 'woman', 'jailed', 'for', 'fictional', 'unpublished', 'story', 'about', 'woman

Message: ['Rand', 'Paul:', 'Polls', 'Showing', 'Hillary', 'Ahead', 'Are', '‘Designed', 'To', 'Suppress', 'Turnout’']
Embedding size: 512
Embedding: [-0.011787991970777512, 0.015412849374115467, -0.058861520141363144, ...]

Message: ['How', 'Voting', 'Machines', 'Are', 'Programmed', 'In', 'Order', 'To', 'Steal', 'Elections']
Embedding size: 512
Embedding: [-0.04308095946907997, -0.0435531847178936, -0.04201938956975937, ...]

Message: ['Redrawing', 'the', 'tree', 'of', 'life:', 'Scientists', 'discover', 'new', 'bacteria', 'groups,', 'stunning', 'microbial', 'diversity', 'underground']
Embedding size: 512
Embedding: [-0.02782958373427391, -0.0174778513610363, 0.008833803236484528, ...]

Message: ['Trump', 'And', 'Putin:', "'We", 'Will', 'Destroy', 'ISIS', 'Once', 'And', 'For', "All!'"]
Embedding size: 512
Embedding: [0.05349254980683327, 0.011652931571006775, -0.029931578785181046, ...]

Message: ['Russia', 'may', 'run', 'out', 'of', 'patience', 'and', 'respond', 'to', "USA's", 'rudeness

Embedding: [-0.04392406344413757, 0.022244630381464958, 0.008687704801559448, ...]

Message: ['7', 'Notable', 'Debuts', 'at', 'Art', 'Basel', 'Miami', 'Beach', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.057103924453258514, -0.008694127202033997, -0.026417279615998268, ...]

Message: ['BREAKING:', 'PLOT', 'TO', 'KILL', 'TRUMP', 'in', 'NEVADA', 'EXPOSED', '-', 'PLEASE', 'PRAY', '-', 'World', 'News', 'Politics']
Embedding size: 512
Embedding: [0.02532152459025383, -0.03661324456334114, -0.043070752173662186, ...]

Message: ['Report:', 'Tamron', 'Hall', 'Leaves', 'NBC', 'News', 'over', 'Megyn', 'Kelly', '-', 'Breitbart']
Embedding size: 512
Embedding: [-0.008304761722683907, 0.005792972166091204, -0.008437398821115494, ...]

Message: ['Toxic', 'cloud', 'continues', 'to', 'spread', 'over', 'Iraq', 'six', 'days', 'after', 'ISIS', 'sets', 'fire', 'to', 'sulfur', 'mine']
Embedding size: 512
Embedding: [0.01706034131348133, 0.037147410213947296, -0.018027259036898613,

Embedding size: 512
Embedding: [-0.04206208884716034, 0.009305217303335667, -0.025117266923189163, ...]

Message: ['News:', 'Incredibly', 'Selfish:', 'The', 'City', 'Of', 'Chicago', 'Wants', 'Its', 'Baseball', 'Team', 'To', 'Win', 'The', 'World', 'Series', 'Even', 'Though', 'It', 'Already', 'Has', 'A', 'Bunch', 'Of', 'Movie', 'Theaters', 'And', 'A', 'Zoo']
Embedding size: 512
Embedding: [0.027721034362912178, 0.03647809848189354, -0.06591811776161194, ...]

Message: ['A', 'New', 'Immigrant', 'Hopes', 'for', 'a', 'Culinary', 'Career', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.04759805649518967, -0.0351860448718071, -0.0072959670796990395, ...]

Message: ['Craig', 'Shirley', 'on', 'Memorial', 'Day:', 'World', 'War', 'II', 'Soldiers', 'and', 'Civilians', 'Made', '‘Ultimate', 'Sacrifice’', 'Without', 'Complaint']
Embedding size: 512
Embedding: [-0.06986575573682785, 0.022919610142707825, -0.015565983019769192, ...]

Message: ['Maddow', 'Raises', 'Possibility', '

Embedding size: 512
Embedding: [-0.0019071829738095403, -0.022255292162299156, -0.03739682212471962, ...]

Message: ['Egypt', 'Sends', 'Submersible', 'in', 'Search', 'for', 'EgyptAir', 'Jet’s', 'Black', 'Boxes', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [-0.033124763518571854, 0.010284188203513622, -0.020568741485476494, ...]

Message: ['Hillary', 'Releases', 'The', 'Most', 'Inspiring', 'Video', 'EVER', 'Chronicling', 'Her', 'Historic', 'Rise', 'To', 'The', 'Top']
Embedding size: 512
Embedding: [-0.05402533337473869, 0.0027141079772263765, -0.0009971905965358019, ...]

Message: nan
Embedding size: 512
Embedding: [-0.04206208884716034, 0.009305217303335667, -0.025117266923189163, ...]

Message: ['Muslims', 'Blast', 'Call', 'To', 'Prayer,', 'So', 'Infidel', 'Mayor', 'Returns', 'With', 'Nasty', 'Surprise']
Embedding size: 512
Embedding: [0.029933318495750427, -0.040036238729953766, -0.029830992221832275, ...]

Message: ['BREAKING', ':', 'Hillary', 'Campaign', 'Man

Embedding: [0.052761249244213104, 0.03968408703804016, -0.03969665616750717, ...]

Message: ['BREAKING', ':', 'HOAX', '“RAPE', 'LAWSUIT”', 'AGAINST', 'TRUMP', 'IS', 'DROPPED', '–', 'TruthFeed']
Embedding size: 512
Embedding: [0.05911078304052353, -0.04113076254725456, -0.04570404812693596, ...]

Message: ['Newsbud', 'Launches', 'Phase', '2', 'Funding', 'Drive']
Embedding size: 512
Embedding: [0.022281277924776077, -0.05143298953771591, -0.035725392401218414, ...]

Message: ['Heavy', 'Snowfall', 'Is', 'Forecast', 'Across', 'Northeastern', 'U.S.', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [-0.050002068281173706, 0.05390077456831932, -0.020918315276503563, ...]

Message: ['Clinton', 'Campaign', 'In', 'FULL', 'PANIC', 'After', 'Bill’s', 'Alleged', 'Son', 'Makes', 'DEMAND', 'That', 'Would', 'HUMILIATE', 'Them']
Embedding size: 512
Embedding: [0.0179609227925539, -0.015153626911342144, -0.041612617671489716, ...]

Message: ['Kejriwal', 'in', 'talks', 'with', 'Dr', 'M

Message: ['Survey:', '89%', 'of', 'PGA', 'Tour', 'Players', 'Would', 'Love', 'to', 'Tee', 'It', 'Up', 'with', 'Trump', '-', 'Breitbart']
Embedding size: 512
Embedding: [0.03390442579984665, -0.008628692477941513, -0.05636066570878029, ...]

Message: ['The', 'David', 'Duke', 'Show:', 'The', 'State', 'of', 'the', 'Campaign,', 'the', 'Synagogue', 'of', 'Satan']
Embedding size: 512
Embedding: [0.01711747609078884, -0.0057924408465623856, -0.016406971961259842, ...]

Message: ['Life:', '9', 'Easy', 'Things', 'You', 'Can', 'Do', 'Every', 'Day', 'To', 'Boost', 'Your…', 'Uh…', 'Cremtine']
Embedding size: 512
Embedding: [-0.014325987547636032, -0.023712454363703728, -0.003415343351662159, ...]

Message: ['Transgender', 'Politics:', 'NC', 'Keeps', 'Core', 'of', 'HB2', 'Sexual', 'Privacy', 'Law', 'While', 'Ending', 'Sports', 'Boycott', '-', 'Breitbart']
Embedding size: 512
Embedding: [-0.02592475898563862, 0.01435226108878851, -0.03839540481567383, ...]

Message: ['More', 'Than', '60,000', 'Migra

Message: ['’We', 'Are', 'Chicago’', 'Attempts', 'to', 'Show', 'Gamers', 'the', 'Trials', 'of', 'Living', 'in', 'Violence-Plagued', 'City', '-', 'Breitbart']
Embedding size: 512
Embedding: [-0.018111426383256912, 0.039069898426532745, -0.0361931212246418, ...]

Message: ['The', 'Best', 'New', 'Podcasts', 'of', '2016', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.03319210559129715, 0.028423627838492393, -0.045898258686065674, ...]

Message: ['U2', 'Needs', '’Space’', 'to', 'Reassess', 'New', 'Album', 'after', 'Trump', 'Win']
Embedding size: 512
Embedding: [0.04770002141594887, 0.030267000198364258, -0.022480472922325134, ...]

Message: ['A', 'Death', 'on', 'Staten', 'Island', 'Highlights', 'Heroin’s', 'Place', 'in', '‘Mainstream', 'Society’', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.02235669456422329, -0.009438679553568363, -0.005944756790995598, ...]

Message: ['Pelosi:', 'GOP', 'Has', '’Anti-Woman', 'Agenda,', 'LGBT', 'Agenda’', '-'

Embedding size: 512
Embedding: [0.007133130449801683, 0.043148525059223175, -0.054347433149814606, ...]

Message: ['Jury', 'acquits', 'leaders', 'of', 'Oregon', 'standoff', 'of', 'federal', 'charges']
Embedding size: 512
Embedding: [-0.04026172310113907, -0.054090481251478195, -0.04331699758768082, ...]

Message: ['Bombing', 'Suspect’s', 'Hometown', 'Is', 'a', 'Magnet', 'for', 'Immigrants', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.04885459691286087, -0.04165073484182358, -0.021776942536234856, ...]

Message: ['Pentagon', 'Releases', 'Photo', 'to', 'Rebut', 'Contention', 'It', 'Bombed', 'a', 'Mosque', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.05591295287013054, -0.01833847537636757, -0.0300139170140028, ...]

Message: ['Supreme', 'Court', 'Vacates', 'Ex-Virginia', 'Governor’s', 'Graft', 'Conviction', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.030717475339770317, -0.05634641647338867, -0.01620849035680294

Embedding size: 512
Embedding: [0.03250059857964516, 0.03254925459623337, -0.02365538664162159, ...]

Message: ['Woman', 'Uses', 'Milk', 'And', 'Gelatin', 'To', 'Remove', 'Blackheads']
Embedding size: 512
Embedding: [-0.02105371095240116, -0.055957529693841934, 0.020328279584646225, ...]

Message: ['Anti-Heroin', 'Video', 'From', 'a', 'Florida', 'Sheriff', 'Appalls', 'Critics', 'but', 'Impresses', 'Constituents', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.040217284113168716, -0.042465727776288986, -0.043492965400218964, ...]

Message: ['‘The', 'Daily’:', 'Making', 'Sense', 'of', 'the', 'Gorsuch', 'Pick', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.054759688675403595, 0.06306815147399902, -0.0164913609623909, ...]

Message: ['Yet', 'Another', 'School', 'Bombing', 'In', 'Syria:', '22', 'Children', 'And', '6', 'Teachers', 'Killed']
Embedding size: 512
Embedding: [-0.043776512145996094, -0.0005035636713728309, -0.022840389981865883, ...]

Embedding: [-0.006963335908949375, 0.03595234826207161, -0.04536598175764084, ...]

Message: ['Ryan:', 'Rand', 'Is', 'Wrong', 'and', '’Looking', 'for', 'a', 'Publicity', 'Stunt’', 'With', 'Obamacare', 'Replacement', 'Criticisms', '-', 'Breitbart']
Embedding size: 512
Embedding: [0.06964025646448135, 0.07128015905618668, -0.03379475697875023, ...]

Message: ['USA', 'Today:', 'NFL', 'Needs', 'Colin', 'Kaepernick', '-', 'Breitbart']
Embedding size: 512
Embedding: [0.06624064594507217, 0.016928348690271378, -0.07270542532205582, ...]

Message: ['NBA', 'Coach', 'Gregg', 'Popovich', 'Still', 'Triggered', 'by', 'Trump', 'Win', '-', 'Breitbart']
Embedding size: 512
Embedding: [0.04359361529350281, -0.03624536097049713, -0.06672653555870056, ...]

Message: ['Прививка', 'против', 'сорокаградусного', 'счастья']
Embedding size: 512
Embedding: [0.045678649097681046, -0.008325929753482342, -0.06925316154956818, ...]

Message: ['Ex-rep:', "'If", 'Trump', 'loses,', "I'm", 'grabbing', 'my', "musket'"]



Message: ['Contaminated', 'food', 'from', 'China', 'now', 'entering', 'the', 'U.S.', 'under', 'the', "'organic'", 'label']
Embedding size: 512
Embedding: [-0.024472082033753395, -0.060979798436164856, -0.011897845193743706, ...]

Message: ['‘Moonlight’', 'Makes', 'a', 'Strong', 'Showing', 'at', 'the', 'Gotham', 'Awards', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.07056662440299988, 0.006357748527079821, -0.013562409207224846, ...]

Message: ['[WATCH]', 'Shameless', 'CNN', 'Reporter', 'Insults', 'Trump,', 'INSTANTLY', 'Regrets', 'That', 'Decision']
Embedding size: 512
Embedding: [0.014358563348650932, 0.00519320135936141, -0.039038754999637604, ...]

Message: ['UPDATE:', 'Yahoo', 'questions', 'US', "govt's", 'alleged', 'order', 'to', 'secretly', 'scan', 'emails,', 'ACLU', 'files', 'motion']
Embedding size: 512
Embedding: [-0.035856664180755615, -0.06388024240732193, -0.04049345850944519, ...]

Message: ['Hillary', 'Clinton,', 'Donald', 'Trump,', 'Mother', 'Te

Message: ['When', 'Solar', 'Panels', 'Became', 'Job', 'Killers', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.04030551388859749, 0.06346646696329117, -0.010954121127724648, ...]

Message: ['Meet', 'Peter', 'Kadzik', '–', 'Another', 'Clinton', 'Stooge', 'at', 'the', 'Justice', 'Department']
Embedding size: 512
Embedding: [0.011443893425166607, 0.02281111292541027, -0.023990148678421974, ...]

Message: ['Giants', 'Senior', 'VP', 'of', 'Communications', 'Uses', 'Obscenity', 'in', 'Tweet', 'Referencing', 'Schilling', '-', 'Breitbart']
Embedding size: 512
Embedding: [0.05842479318380356, 0.027206845581531525, -0.07116800546646118, ...]

Message: ['Neighbors', 'Add', 'To', 'Hillary', 'Sign', 'With', 'Blunt', 'Message', 'Of', 'Their', 'Own', 'Next', 'Door']
Embedding size: 512
Embedding: [0.03629963845014572, -0.027374273166060448, -0.013541480526328087, ...]

Message: ['AIDS', '“Patient', 'Zero”', 'Not', 'the', 'Source', 'of', 'the', 'Outbreak']
Embedding size: 512
E

Message: ['Quandary', 'in', 'South', 'Sudan:', 'Should', 'It', 'Lose', 'Its', 'Hard-Won', 'Independence?', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [-0.00025005798670463264, 0.011795197613537312, -0.03632756322622299, ...]

Message: ['Woman', 'Arrested', 'On', 'Own', 'Property', 'After', 'Her', 'Land', 'Was', 'Stolen', 'By', 'DAPL']
Embedding size: 512
Embedding: [-0.013900822028517723, -0.07594763487577438, 0.002688138745725155, ...]

Message: ['Comment', 'on', 'Europe’s', 'Forgotten', '‘Hitler’', 'Killed', 'Over', '10', 'Million', 'Africans', '—', 'But', 'the', 'West', 'Erased', 'it', 'From', 'History', 'by', 'chris', 'VN']
Embedding size: 512
Embedding: [0.028071679174900055, 0.04628632217645645, -0.024533946067094803, ...]

Message: ['Dentist', 'Waiting', 'Room', 'Contains', 'Disproportionate', 'Number', 'Of', 'Boating', 'Magazines']
Embedding size: 512
Embedding: [-0.008604956790804863, 0.07349908351898193, 0.009429956786334515, ...]

Message: ['Babies', 

Embedding size: 512
Embedding: [0.005673387553542852, -0.04453141242265701, -0.028066562488675117, ...]

Message: ['Rep.', 'Mike', 'Kelly', 'Introduces', 'Bill', 'to', 'Repeal', 'Obamacare’s', 'Cadillac', 'Tax', '-', 'Breitbart']
Embedding size: 512
Embedding: [0.05102084204554558, 0.005017189774662256, -0.059751931577920914, ...]

Message: ['Whoopi', 'Goldberg:', 'Trump', 'Firing', 'Comey', '’Feels', 'Like', 'a', 'Coup’', '-', 'Breitbart']
Embedding size: 512
Embedding: [0.0488155335187912, -0.034380100667476654, -0.049136362969875336, ...]

Message: ['GaiaPortal:', 'Interwebs', 'of', 'Gaia', 'energetics', 'are', 'strengthened']
Embedding size: 512
Embedding: [0.02199569158256054, 0.056622281670570374, -0.041120495647192, ...]

Message: ['Why', 'the', 'Democrats', 'Keep', 'Losing', 'the', 'Congress']
Embedding size: 512
Embedding: [-0.03689219430088997, 0.04537336528301239, -0.06708186119794846, ...]

Message: ['Racists,', 'Misogynists', 'and', 'Homophobes', 'all', 'absolutely', 'deli

Embedding size: 512
Embedding: [-0.04079243168234825, 0.039146069437265396, -0.02878543734550476, ...]

Message: ['A', 'Positive', 'Outlook', 'May', 'Be', 'Good', 'for', 'Your', 'Health', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.07057870924472809, 0.05128025636076927, -0.025855805724859238, ...]

Message: ['US:', '800-900', 'ISIS', '‘Probably’', 'Killed', 'in', 'Mosul', 'Fighting']
Embedding size: 512
Embedding: [-0.02575410157442093, -0.02890102192759514, -0.04892577603459358, ...]

Message: ['Post-Trump', 'Liberal', 'Meltdown:', 'Counseling,', 'Cry-ins,', 'Therapy', 'Dogs', 'and', 'Poetry']
Embedding size: 512
Embedding: [0.017368417233228683, 0.041809678077697754, 0.00034607169800437987, ...]

Message: ['Baton', 'Rouge,', 'Turkey,', 'U.S.', 'Presidential', 'Race:', 'Your', 'Weekend', 'Briefing', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [-0.013767210766673088, 0.043054528534412384, -0.03389980271458626, ...]

Message: ['Tennis', 

Embedding: [0.019566619768738747, -0.01272891741245985, -0.06180369853973389, ...]

Message: ['So', 'THIS', 'Is', 'Where', 'That', 'Infamous', 'Soros', 'Video', 'Came', 'From…']
Embedding size: 512
Embedding: [0.05238419026136398, -0.030809389427304268, -0.03930330649018288, ...]

Message: ['ISIS,', 'Not', 'Russia,', 'Is', 'the', 'Enemy', 'in', 'Syria']
Embedding size: 512
Embedding: [-0.02356562949717045, 0.006479834672063589, -0.04390529543161392, ...]

Message: ['New', 'Type', 'of', 'Emissions', 'Cheating', 'Software', 'May', 'Lurk', 'in', 'Audis', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.038444824516773224, -0.022183723747730255, -0.039564263075590134, ...]

Message: ['Political', 'Party', 'of', 'Anarchists,', 'Libertarians,', 'Hackers,', 'and', 'Computer', 'Geeks', 'to', 'Take', 'Power', 'in', 'Iceland']
Embedding size: 512
Embedding: [-0.0021159162279218435, -0.0266848336905241, -0.030321361497044563, ...]

Message: ['The', 'Best', 'and', 'Worst', 'of


Message: ['Interview', '1222', '–', 'New', 'World', 'Next', 'Week', 'with', 'James', 'Evan', 'Pilato']
Embedding size: 512
Embedding: [0.017476804554462433, -0.05932709947228432, -0.030654599890112877, ...]

Message: nan
Embedding size: 512
Embedding: [-0.04206208884716034, 0.009305217303335667, -0.025117266923189163, ...]

Message: ['Texas', 'Sees', 'Bump', 'in', 'Mumps']
Embedding size: 512
Embedding: [0.06752410531044006, -0.027931712567806244, -0.0779033675789833, ...]

Message: ['The', 'Campaign', 'To', 'Elect', 'Evan', 'McMullin:', 'Is', 'the', 'CIA', 'Interfering', 'In', 'The', 'US', 'Election?']
Embedding size: 512
Embedding: [0.02484641969203949, -0.02033056505024433, -0.05565947666764259, ...]

Message: nan
Embedding size: 512
Embedding: [-0.04206208884716034, 0.009305217303335667, -0.025117266923189163, ...]

Message: ['EU', 'member', 'states', 'approve', 'Canada', 'trade', 'deal']
Embedding size: 512
Embedding: [0.02024685963988304, 0.03254198282957077, -0.0439137108623981

Message: ['Pentagon:', 'No', 'Plan', '‘So', 'Far’', 'For', 'US', 'Troops', 'to', 'Enter', 'Mosul']
Embedding size: 512
Embedding: [-0.01905245892703533, -0.03835413604974747, -0.02379104681313038, ...]

Message: ['Libertarian', 'Party', 'VP', 'insults', 'Trump,', 'practically', 'endorses', 'Clinton']
Embedding size: 512
Embedding: [0.018386252224445343, -0.004714803770184517, -0.06145317107439041, ...]

Message: ['Mortgage', 'Rates’', 'Rise', 'Catches', 'Home', 'Buyers', '—', 'and', 'Lenders', '—', 'Off', 'Guard', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.06289955973625183, -0.01625167205929756, -0.027706550434231758, ...]

Message: ['Airstrikes', 'by', 'Russia', 'Buttress', 'Turkey', 'in', 'Battle', 'vs.', 'ISIS', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.01471577025949955, -0.023414326831698418, -0.024812929332256317, ...]

Message: ['Won,', 'Now', 'What?']
Embedding size: 512
Embedding: [-0.009991065599024296, 0.045043155550956


Message: ['California', 'Lieutenant', 'Governor', 'Wants', 'to', 'Fight', 'Job-Replacing', 'Robots', '-', 'Breitbart']
Embedding size: 512
Embedding: [-0.016112012788653374, 0.008902052417397499, -0.043478112667798996, ...]

Message: ['Steven', 'Seagal', 'receives', 'Russian', 'citizenship', 'on', 'Putin’s', 'personal', 'decision']
Embedding size: 512
Embedding: [0.05580592155456543, -0.061251696199178696, -0.036190565675497055, ...]

Message: ['Surge', 'of', 'Migrants', 'Illegally', 'Crossing', 'U.S.-Mexico', 'Border', 'Ahead', 'of', 'Election']
Embedding size: 512
Embedding: [-0.05895843356847763, -0.06968709826469421, -0.04225531965494156, ...]

Message: ['Benjamin', 'Gilman,', 'a', 'New', 'York', 'Congressman', 'for', '30', 'Years,', 'Dies', 'at', '94', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.06998616456985474, 0.04289734736084938, -0.02734833024442196, ...]

Message: ['‘COLOR', 'REV’', 'AGIT', 'PROP:', 'George', 'Soros', 'MoveOn', 'Agitators', 'March

Embedding size: 512
Embedding: [0.05759740620851517, 0.0696539580821991, 0.013837717473506927, ...]

Message: ['Eight', 'Days', 'to', 'America’s', 'Armageddon']
Embedding size: 512
Embedding: [-0.05880322679877281, 0.03761445730924606, 0.005327424965798855, ...]

Message: ['Court', 'Temporarily', 'Blocks', 'Trump’s', 'Travel', 'Ban,', 'and', 'Airlines', 'Are', 'Told', 'to', 'Allow', 'Passengers', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.022130494937300682, 0.003241425845772028, -0.02298995852470398, ...]

Message: ['Chelsea', 'Clinton', 'to', 'Receive', '’Lifetime', 'Impact', 'Award’', 'from', 'Variety', 'Magazine']
Embedding size: 512
Embedding: [-0.011664018966257572, 0.06042476370930672, 0.010470409877598286, ...]

Message: ['Globalist', 'Media', 'Launches', 'Project', 'to', '’Show', 'Human', 'Side’', 'of', 'Mass', 'Migration']
Embedding size: 512
Embedding: [-0.00017120024131145328, -0.039766162633895874, -0.027238821610808372, ...]

Message: ['How', 'R

Embedding size: 512
Embedding: [0.05906444787979126, -0.0008646159549243748, -0.05332394689321518, ...]

Message: ['Geraldo', 'Rips', 'Yale', 'Student', '’Symbolic', 'Hunger', 'Strike’', '-', 'Everything', 'Wrong', 'With', '’Millennial', 'Activism,’', '’Activism', 'Without', 'Sacrifice’', '-', 'Breitbart']
Embedding size: 512
Embedding: [0.04448329284787178, 0.0261400043964386, -0.04936792701482773, ...]

Message: ['How', 'my', 'family', 'Stopped', 'a', 'Chronic', 'MRSA', 'Infection', 'When', 'Conventional', 'Medicine', 'Failed']
Embedding size: 512
Embedding: [0.027719590812921524, 0.05335047096014023, -0.043037716299295425, ...]

Message: ['It’s', 'Thrilling.', 'It’s', 'Chilling.', 'It’s', 'a', '30-Minute', 'Commercial.', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.029465829953551292, 0.039873044937849045, -0.0423554852604866, ...]

Message: ['Report:', 'Beheadings', 'and', 'and', 'Islamist', 'Stabbings', 'Up', '11-Fold', 'in', 'Five', 'Years']
Embedding siz

Embedding: [0.044653814285993576, 0.02848590351641178, -0.042675066739320755, ...]

Message: ['DELINGPOLE:', 'Why', 'I', 'Totally', 'Hate', 'Big', 'Oil', '-', 'And', 'Why', 'You', 'Should', 'Too...']
Embedding size: 512
Embedding: [0.017588848248124123, 0.06395076215267181, -0.01758665405213833, ...]

Message: ['Smiling,', 'Even', 'When', 'the', 'Oscar', 'Race', 'Is', 'a', 'Losing', 'Marathon', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [-0.01248336210846901, -0.013432234525680542, -0.017008600756525993, ...]

Message: ['Sean', 'Spicer', 'Scrambles', 'to', 'Correct', 'Hitler', 'Comments', 'in', 'White', 'House', 'Press', 'Briefing', '-', 'Breitbart']
Embedding size: 512
Embedding: [0.011265037581324577, 0.018251562491059303, -0.061512481421232224, ...]

Message: ['Duterte', 'Threatens', 'to', 'Bomb', 'Islamic', 'State', 'Hostages:', '’Better', 'Not', 'Get', 'Yourselves', 'Kidnapped’']
Embedding size: 512
Embedding: [0.049272459000349045, -0.06759849190711975, -0


Message: ['Comment', 'on', 'Tutorial:', 'Riding', 'The', 'Philippine', 'Jeepney', 'by', 'Ivan', 'Jose']
Embedding size: 512
Embedding: [0.0040449341759085655, -0.034644488245248795, -0.01367127150297165, ...]

Message: ['Comment', 'on', 'It', 'Is', 'Happening', 'Again', 'America!', 'Voting', 'Machines', 'Are', 'Switching', 'Votes', 'From', 'Donald', 'Trump', 'To', 'Hillary', 'Clinton', 'by', 'donaldbreaux']
Embedding size: 512
Embedding: [-0.005520633887499571, 0.004976568277925253, -0.05401107668876648, ...]

Message: ['Donald', 'Trump', 'Faltering?', 'Die-Hard', 'Fans', 'Refuse', 'to', 'Buy', 'It', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.07841500639915466, 0.06752937287092209, -0.054149605333805084, ...]

Message: ['Re:', '‘Master', 'jackass’?', 'Sean', 'Hannity', 'inadvertently', 'teaches', 'valuable', 'grammar', 'lesson']
Embedding size: 512
Embedding: [0.06362640112638474, 0.08113665878772736, 0.007158089429140091, ...]

Message: ['Chart', 'Of', 'The

Message: ['Report:', 'NBC', 'Holding', '’Crisis', 'Meetings’', 'Over', 'Megyn', 'Kelly-Alex', 'Jones', 'Interview']
Embedding size: 512
Embedding: [-0.05010177567601204, -0.04407579079270363, -0.03002200834453106, ...]

Message: ['FBI:', 'MS-13', 'Is', 'Most', 'Violent,', 'Organized', 'Gang', 'in', 'America']
Embedding size: 512
Embedding: [-0.008156065829098225, -0.07460743933916092, -0.032716941088438034, ...]

Message: ['Dave', 'Brat:', 'H-1B', 'Does', 'Not', 'Put', 'Americans', 'First']
Embedding size: 512
Embedding: [0.04580818861722946, 0.06904789060354233, -0.04340732470154762, ...]

Message: ['Reading', 'Fake', 'News,', 'Pakistani', 'Minister', 'Directs', 'Nuclear', 'Threat', 'at', 'Israel', '-', 'The', 'New', 'York', 'Times']
Embedding size: 512
Embedding: [0.045343879610300064, -0.01730719581246376, -0.020403463393449783, ...]

Message: ['John', 'Bolton:', 'Paris', 'Climate', 'Accord', 'Objective', 'Is', '‘Reduction', 'of', 'National', 'Sovereignty’', 'for', '‘Global', 'Gover

Message: ['Comment', 'on', 'Understanding', 'Misleading', 'Food', 'Labels:', 'An', 'Infographic', 'Breaks', 'Down', 'Which', 'Foods', 'They’re', 'Used', 'On', '&', 'What', 'They', 'Mean', 'by', 'Understanding', 'Misleading', 'Food', 'Labels:', 'An', 'Infographic', 'Breaks', 'Down', 'Which', 'Foods', 'They’re', 'Used', 'On', '&', 'What', 'They', 'Mean', '–', 'Collective', 'Evolution', '-', 'walkertecharts.com']
Embedding size: 512
Embedding: [-0.02479410730302334, -0.02250497229397297, 0.014039723202586174, ...]

Message: ['Hillary', 'Is', 'The', 'Perfection', 'of', 'a', 'Corrupt', 'System']
Embedding size: 512
Embedding: [-0.004386281594634056, 0.02293682098388672, -0.024637259542942047, ...]

Message: ['Have', 'Skin', 'Problems?', 'Take', 'Vitamins…', '[w/', 'Infograph]']
Embedding size: 512
Embedding: [-0.011072778142988682, -0.022590234875679016, 0.006181614939123392, ...]

Message: ['Earth’s', 'Magnetic', 'Field', 'is', 'Collapsing', '—', 'Are', 'the', 'Poles', 'About', 'to', 'Swit

In [44]:
embeddings[0]

[0.05387789383530617,
 0.006952833849936724,
 -0.057719238102436066,
 0.016467709094285965,
 -0.08529816567897797,
 0.007254021242260933,
 0.08881591260433197,
 0.007081518415361643,
 -0.03469545766711235,
 -0.0006816966342739761,
 -0.016090236604213715,
 -0.0009249838767573237,
 -0.0321052148938179,
 0.05761558189988136,
 0.040647782385349274,
 -0.031041890382766724,
 -0.04974964261054993,
 -0.013057475909590721,
 0.03087654896080494,
 0.08251363784074783,
 -0.009422974660992622,
 -0.0753156766295433,
 0.00040966836968436837,
 0.08792632818222046,
 -0.04237255081534386,
 0.04527401179075241,
 -0.009700613096356392,
 -0.03761744126677513,
 0.07941769063472748,
 -0.03204207494854927,
 -0.002098423894494772,
 -0.013898096047341824,
 -0.0315852165222168,
 -0.04550597816705704,
 -0.0643971860408783,
 0.005820169113576412,
 -0.08078315854072571,
 -0.018427792936563492,
 0.038507841527462006,
 -0.03612233325839043,
 -0.014783577993512154,
 0.03647780045866966,
 0.06562419980764389,
 0.083307

In [46]:
df.label.value_counts() / df.shape[0]

1    0.500625
0    0.499375
Name: label, dtype: float64

In [48]:
from sklearn.model_selection import train_test_split
X, y = embeddings, df.label.values
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.7, random_state=7)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, train_size=.5, random_state=7)



In [None]:
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, train_size=.5, random_state=7)