In [40]:
import cPickle 
import tensorflow as tf
import pandas as pd
import numpy as np
import datetime
import data_helpers
from text_cnn import TextCNN
import re

In [41]:
## read in the training data
# The pickle was written by Python 2 (hence cPickle and the "_py2" suffix in the file name).
# NOTE(review): unpickling can execute arbitrary code stored in the file -- only load
# pickles that come from a trusted source.
with open("../Data/sentence_score_conso10_py2.pk", 'rb') as f:
    data=cPickle.load(f)

In [42]:
data.columns

Index([u'Website', u'Sentence', u'Data Retention', u'Data Security',
       u'Do Not Track', u'First Party Collection/Use',
       u'International and Specific Audiences', u'Not_used', u'Policy Change',
       u'Third Party Sharing/Collection', u'User Access, Edit and Deletion',
       u'User Choice/Control'],
      dtype='object')

In [43]:
data["Sentence"].values
# Regex patterns are written as raw strings (as in cnn_helpers.clean_str) so that "\s"
# is never interpreted as a string escape.
re.split(r"\s+", ' Privacy Policy    Last Revised: April, 2011'.strip())

['Privacy', 'Policy', 'Last', 'Revised:', 'April,', '2011']

In [4]:
## testing the one-hot generation
## assuming input is a discrete-level array
test=np.array([1,2,2,3,4,2])
correction=np.min(test)
# Size the encoding by the value range rather than by the number of distinct values:
# with gaps in the levels (e.g. [1, 3]) a distinct-value count is too small and the
# column index below would run past the last column.
nlevels=int(np.max(test)-correction+1)
print(nlevels)
testOneHot=np.zeros((len(test), nlevels))
# Row i gets a 1 in column (test[i] - smallest level).
testOneHot[np.arange(len(test)), test-correction]=1
print(testOneHot)

4
[[ 1.  0.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  0.  1.]
 [ 0.  1.  0.  0.]]


In [166]:
testOneHot[1::,:]

array([[ 0.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.]])

## Helper functions

In [168]:
data.columns

Index([u'Website', u'Sentence', u'Data Retention', u'Data Security',
       u'Do Not Track', u'First Party Collection/Use',
       u'International and Specific Audiences', u'Not_used', u'Policy Change',
       u'Third Party Sharing/Collection', u'User Access, Edit and Deletion',
       u'User Choice/Control'],
      dtype='object')

In [132]:
%%writefile cnn_helpers.py
import pandas as pd
import re
import numpy as np
from tensorflow.contrib import learn
from glove import GloVe

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character outside ``A-Za-z0-9(),!?'`` and the backtick becomes a space
         (this already removes colons, so no separate colon pass is needed);
      2. the clitics 's, 've, n't, 're, 'd, 'll are split off with a leading space;
      3. ``,``, ``!``, ``(``, ``)`` and ``?`` are surrounded by spaces;
      4. runs of whitespace collapse to one space; the result is stripped and lower-cased.

    Unlike the original, the parentheses and question mark are emitted as plain
    characters: the old replacement templates " \\( ", " \\) " and " \\? " wrote a
    literal backslash in front of each of them.

    Args:
        string: the raw sentence.

    Returns:
        The cleaned, lower-cased, space-separated sentence.
    """
    #string = re.sub(r"e-mail", "email", string)
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Literal replacements, applied in the same order as the original substitutions.
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    string = re.sub(r"\s{2,}", " ", string)

    return string.strip().lower()

def padSentence(wordlist, maximumLength, pad="<PAD>"):
    """
    Join a list of tokens into one space-separated string, appending `pad`
    tokens until the list holds `maximumLength` entries.

    A list that is already `maximumLength` long or longer is joined unchanged
    (nothing is truncated).
    """
    shortfall = maximumLength - len(wordlist)
    # range() of a non-positive number is empty, so long inputs get no filler.
    filler = [pad for _ in range(shortfall)]
    return " ".join(wordlist + filler)


def processWord(scoretable, category, trainproportion, seed=120, minscore=-2, maxscore=2):
    """Turns sentences to lists of word indices, padded to maximum sentence length in the corpus.
    Use tf.contrib.learn.VocabProcessor to generate word indices and paddings
    Process all the words first, then split the data into training and development

    Args:
        scoretable: table indexable as ``scoretable[["Sentence", category]]`` and
            supporting ``dropna()`` (the notebook passes a pandas DataFrame).
        category: name of the score column to use as the label. The special value
            "all" (any case) is reserved for averaging across categories and is
            not implemented yet.
        trainproportion: fraction of the non-missing rows used for training; the
            remaining rows form the development set.
        seed: seed passed to ``np.random.seed`` before the rows are shuffled.
        minscore, maxscore: inclusive label range; labels are one-hot encoded into
            ``maxscore - minscore + 1`` columns.

    Returns:
        (trainFeatures, devFeatures, trainLabels, devLabels, processor), where
        ``processor`` is the fitted VocabularyProcessor.

    Raises:
        NotImplementedError: if ``category`` is "all".
        ValueError: if no row has both a sentence and a score, or a label falls
            outside ``[minscore, maxscore]``.
    """

    # Assuming the input label is a one-dimensional discrete-valued array
    # trainportion means the percentage of data to be used as training, the rest are used as validation
    # choose a category, get the data from the data file
    np.random.seed(seed)

    ## if select all sentences at once, scores are averaged across categories
    if category.lower()=="all":
        # This branch used to be empty, which made the whole module a syntax error.
        # The averaging rule (rounding, missing scores) is not specified anywhere,
        # so fail loudly instead of guessing.
        raise NotImplementedError("category='all' (scores averaged across categories) is not implemented")

    categoryFrame=scoretable[["Sentence", category]]
    nonmissing=categoryFrame.dropna()

    ## equal portion sampling, sample from both positive and negative group
    #shufflednonmissing=nonmissing.sample(frac=1, random_state=seed)   # shuffle the data

    nonmissingNumber=nonmissing.shape[0]
    if nonmissingNumber==0:
        raise ValueError("No rows with both a sentence and a '{}' score".format(category))

    texts=nonmissing["Sentence"].tolist()
    cleantexts=[clean_str(sent) for sent in texts]
    # clean_str splits clitics and punctuation into separate tokens, so a cleaned
    # sentence can be longer than the raw one. Take the larger of the two counts so
    # the processor never truncates a sentence that fit under the raw-text limit.
    max_length = max(max(len(raw.split(" ")), len(clean.split(" ")))
                     for raw, clean in zip(texts, cleantexts))

    processor = learn.preprocessing.VocabularyProcessor(max_length)
    features=np.array(list(processor.fit_transform(cleantexts)))

    labels=np.asarray(nonmissing[category].tolist())
    nlevels=maxscore-minscore+1

    # Column of each label in the one-hot matrix (truncates toward zero, like int()).
    labelIndices=(labels-minscore).astype(int)
    # A label below minscore would give a negative index and silently wrap around
    # to the last columns, so reject anything outside the declared range.
    if labelIndices.min()<0 or labelIndices.max()>=nlevels:
        raise ValueError("Labels for '{}' fall outside [{}, {}]".format(category, minscore, maxscore))

    labelsOneHot=np.zeros((len(labels), nlevels))
    labelsOneHot[np.arange(len(labels)), labelIndices]=1

    train_index=int(nonmissingNumber*trainproportion)
    random_indices=np.random.permutation(range(nonmissingNumber))

    shuffledFeatures=features[random_indices]
    shuffledLabels=labelsOneHot[random_indices]

    # The development set starts exactly where the training set ends; the previous
    # slice started at train_index+1 and dropped one sample from both sets.
    trainFeatures=shuffledFeatures[:train_index]
    devFeatures=shuffledFeatures[train_index:]

    trainLabels=shuffledLabels[:train_index]
    devLabels=shuffledLabels[train_index:]

    assert trainFeatures.shape[0]==trainLabels.shape[0], "Number of training features and labels don't match"
    assert devFeatures.shape[0]==devLabels.shape[0], "Number of development features and labels don't match"
    return trainFeatures, devFeatures, trainLabels, devLabels, processor
    
    
def processGlove(scoretable, category, trainproportion, gloveFile, seed=120, minscore=-2, maxscore=2):
    """
    generates training features and labels, and a GloVe embedding in the form of a numpy array
    glove is a GloVe object, defined in glove.py

    Args:
        scoretable: table indexable as ``scoretable[["Sentence", category]]`` and
            supporting ``dropna()`` (the notebook passes a pandas DataFrame).
        category: name of the score column to use as the label.
        trainproportion: fraction of the non-missing rows used for training; the
            remaining rows form the development set.
        gloveFile: path handed to ``GloVe(...)`` to load the pre-trained vectors.
        seed: seed passed to ``np.random.seed`` before the rows are shuffled.
        minscore, maxscore: inclusive label range; labels are one-hot encoded into
            ``maxscore - minscore + 1`` columns.

    Returns:
        (trainFeatures, devFeatures, trainLabels, devLabels, embeddings, processor).
        ``embeddings`` has one row per vocabulary index and ``glove.n_dim`` columns;
        ``processor`` is the fitted VocabularyProcessor.

    Raises:
        ValueError: if no row has both a sentence and a score, or a label falls
            outside ``[minscore, maxscore]``.
    """

    # Assuming the input label is a one-dimensional discrete-valued array
    # trainportion means the percentage of data to be used as training, the rest are used as validation
    # choose a category, get the data from the data file
    np.random.seed(seed)
    categoryFrame=scoretable[["Sentence", category]]
    nonmissing=categoryFrame.dropna()

    ## equal portion sampling, sample from both positive and negative group
    #shufflednonmissing=nonmissing.sample(frac=1, random_state=seed)   # shuffle the data

    nonmissingNumber=nonmissing.shape[0]
    if nonmissingNumber==0:
        raise ValueError("No rows with both a sentence and a '{}' score".format(category))

    texts=nonmissing["Sentence"].tolist()
    cleantexts=[clean_str(sent) for sent in texts]
    # clean_str splits clitics and punctuation into separate tokens, so a cleaned
    # sentence can be longer than the raw one. Take the larger of the two counts so
    # the processor never truncates a sentence that fit under the raw-text limit.
    max_length = max(max(len(raw.split(" ")), len(clean.split(" ")))
                     for raw, clean in zip(texts, cleantexts))

    processor = learn.preprocessing.VocabularyProcessor(max_length)
    features=np.array(list(processor.fit_transform(cleantexts)))
    vocabsize=len(processor.vocabulary_)

    ## make an embedding matrix
    glove=GloVe(gloveFile)
    embeddings=np.zeros((vocabsize, glove.n_dim))
    # NOTE(review): _mapping is a private attribute of the vocabulary object.
    mappings=processor.vocabulary_._mapping

    # NOTE(review): assumes glove[word] returns a vector for every vocabulary entry,
    # including the out-of-vocabulary token -- confirm against glove.py.
    for word, index in mappings.items():
        embeddings[index]=glove[word]

    ## Generate one-hot labels
    labels=np.asarray(nonmissing[category].tolist())
    nlevels=maxscore-minscore+1

    # Column of each label in the one-hot matrix (truncates toward zero, like int()).
    labelIndices=(labels-minscore).astype(int)
    # A label below minscore would give a negative index and silently wrap around
    # to the last columns, so reject anything outside the declared range.
    if labelIndices.min()<0 or labelIndices.max()>=nlevels:
        raise ValueError("Labels for '{}' fall outside [{}, {}]".format(category, minscore, maxscore))

    labelsOneHot=np.zeros((len(labels), nlevels))
    labelsOneHot[np.arange(len(labels)), labelIndices]=1

    train_index=int(nonmissingNumber*trainproportion)
    random_indices=np.random.permutation(range(nonmissingNumber))

    shuffledFeatures=features[random_indices]
    shuffledLabels=labelsOneHot[random_indices]

    # The development set starts exactly where the training set ends; the previous
    # slice started at train_index+1 and dropped one sample from both sets.
    trainFeatures=shuffledFeatures[:train_index]
    devFeatures=shuffledFeatures[train_index:]

    trainLabels=shuffledLabels[:train_index]
    devLabels=shuffledLabels[train_index:]

    assert trainFeatures.shape[0]==trainLabels.shape[0], "Number of training features and labels don't match"
    assert devFeatures.shape[0]==devLabels.shape[0], "Number of development features and labels don't match"

    return trainFeatures, devFeatures, trainLabels, devLabels, embeddings, processor
    
    
    


def batch_iter(train, label, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.

    Args:
        train: sequence of feature rows.
        label: sequence of label rows, paired position by position with `train`
            (pairing stops at the shorter of the two, as with zip).
        batch_size: maximum number of pairs per batch.
        num_epochs: number of passes over the data.
        shuffle: when True, the pairs are re-shuffled with np.random at the
            start of every epoch.

    Yields:
        Object arrays of shape (k, 2), k <= batch_size; row i holds
        (feature_row, label_row). ``zip(*batch)`` splits a batch back into
        features and labels. Nothing is yielded for empty input.
    """

    pairs = list(zip(train, label))
    data_size = len(pairs)

    # Fill an object array cell by cell. np.array(pairs) only produces this layout
    # when feature and label rows differ in length: on recent NumPy it raises for
    # such ragged input, and for equal lengths it builds a 3-D numeric array instead.
    data = np.empty((data_size, 2), dtype=object)
    for row, (features, target) in enumerate(pairs):
        data[row, 0] = features
        data[row, 1] = target

    # Ceiling division. Floor division keeps the result identical under Python 2 and
    # Python 3 and gives zero batches for empty data.
    num_batches_per_epoch = (data_size - 1) // batch_size + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]

# Running this file as a script deliberately does nothing; it only provides helpers to import.
if __name__=="__main__":
    pass

Overwriting cnn_helpers.py


In [48]:
# Fit a vocabulary on the raw (uncleaned) sentences with a maximum document length of 150.
# NOTE(review): `learn` is not imported by any visible notebook cell (only inside
# cnn_helpers.py), so this relies on earlier kernel state -- confirm on a fresh kernel.
processor = learn.preprocessing.VocabularyProcessor(150)
# The displayed value is a generator (see the output below), so the transformed rows are not materialised here.
processor.fit_transform(data["Sentence"])

<generator object transform at 0x7f2529f38a50>

In [71]:
# Show which token the fitted vocabulary maps to index 0.
# NOTE(review): _mapping is a private attribute of the vocabulary object.
mapping=processor.vocabulary_._mapping
for key, value in mapping.items():
    if value==0:
        # print() with one argument behaves the same on Python 2 and 3 and matches
        # the print(...) calls used elsewhere in this notebook.
        print(key)

<UNK>


## GloVe Embeddings

In [88]:
# Load the project-local glove module and force a re-import so that edits to
# glove.py are picked up without restarting the kernel.
from glove import GloVe
import glove
# `reload` is the Python 2 builtin. The `GloVe` name imported above still refers to
# the class object from before the reload, which is why the module-qualified
# glove.GloVe is used on the next line.
reload(glove)
# NOTE(review): absolute, machine-specific path to the embedding file.
glove100=glove.GloVe("/home/nyao/Embeddings/glove.6B.100d.mini.txt")

# Sorted list of the keys of the loaded object's `vocab` mapping.
sortedvocab=sorted(glove100.vocab.keys())

In [106]:
# Re-import cnn_helpers so the version written by the %%writefile cell is the one in use,
# then build the train/dev split and the embedding matrix for the "Data Security"
# category, with a training proportion of 0.85.
import cnn_helpers
reload(cnn_helpers)
# NOTE(review): absolute, machine-specific path to the embedding file.
train_x, dev_x, train_y, dev_y, embeddings, processor=cnn_helpers.processGlove(data, "Data Security", 0.85, "/home/nyao/Embeddings/glove.6B.100d.mini.txt")

In [107]:
len(processor.vocabulary_)

2014

### BOW vectors

In [155]:
def processBow(scoretable, category, trainproportion, regionsize, seed=120, minscore=-2, maxscore=2, gloveFile=None):
    """
    Work in progress: intended to produce bag-of-words features, but the BOW matrix is
    not built yet. For now it returns the same word-index features, one-hot labels and
    GloVe embedding matrix that processGlove returns.

    NOTE(review): this function uses clean_str, learn and GloVe from the notebook
    namespace; clean_str and learn are only imported inside cnn_helpers.py in the
    visible cells -- confirm they are available before calling.

    Args:
        scoretable: table indexable as ``scoretable[["Sentence", category]]`` and
            supporting ``dropna()`` (the notebook passes a pandas DataFrame).
        category: name of the score column to use as the label.
        trainproportion: fraction of the non-missing rows used for training; the
            remaining rows form the development set.
        regionsize: currently unused (reserved for the BOW features).
        seed: seed passed to ``np.random.seed`` before the rows are shuffled.
        minscore, maxscore: inclusive label range; labels are one-hot encoded into
            ``maxscore - minscore + 1`` columns.
        gloveFile: path handed to ``GloVe(...)``. The body previously read an
            undefined name ``gloveFile``; it is now an explicit keyword argument.

    Returns:
        (trainFeatures, devFeatures, trainLabels, devLabels, embeddings, processor).

    Raises:
        ValueError: if ``gloveFile`` is not given, or a label falls outside
            ``[minscore, maxscore]``.
    """
    if gloveFile is None:
        raise ValueError("gloveFile must be given: path to the GloVe embedding file")

    # Assuming the input label is a one-dimensional discrete-valued array
    # trainportion means the percentage of data to be used as training, the rest are used as validation
    # choose a category, get the data from the data file
    np.random.seed(seed)
    categoryFrame=scoretable[["Sentence", category]]
    nonmissing=categoryFrame.dropna()

    ## equal portion sampling, sample from both positive and negative group
    #shufflednonmissing=nonmissing.sample(frac=1, random_state=seed)   # shuffle the data

    nonmissingNumber=nonmissing.shape[0]

    texts=nonmissing["Sentence"].tolist()
    max_length = max([len(x.split(" ")) for x in texts])
    cleantexts=[clean_str(sent) for sent in texts]

    processor = learn.preprocessing.VocabularyProcessor(max_length)
    features=np.array(list(processor.fit_transform(cleantexts)))
    vocabsize=len(processor.vocabulary_)

    ## make an embedding matrix
    glove=GloVe(gloveFile)
    embeddings=np.zeros((vocabsize, glove.n_dim))
    mappings=processor.vocabulary_._mapping

    for word, index in mappings.items():
        embeddings[index]=glove[word]

    ## Generate one-hot labels
    labels=np.asarray(nonmissing[category].tolist())
    nlevels=maxscore-minscore+1

    # Column of each label in the one-hot matrix (truncates toward zero, like int()).
    labelIndices=(labels-minscore).astype(int)
    # A label below minscore would give a negative index and silently wrap around
    # to the last columns, so reject anything outside the declared range.
    if len(labelIndices) and (labelIndices.min()<0 or labelIndices.max()>=nlevels):
        raise ValueError("Labels for '{}' fall outside [{}, {}]".format(category, minscore, maxscore))

    labelsOneHot=np.zeros((len(labels), nlevels))
    labelsOneHot[np.arange(len(labels)), labelIndices]=1

    train_index=int(nonmissingNumber*trainproportion)
    random_indices=np.random.permutation(range(nonmissingNumber))

    shuffledFeatures=features[random_indices]
    shuffledLabels=labelsOneHot[random_indices]

    # TODO: generate a BOW matrix (regionsize is reserved for this step).

    # The development set starts exactly where the training set ends; the previous
    # slice started at train_index+1 and dropped one sample from both sets.
    trainFeatures=shuffledFeatures[:train_index]
    devFeatures=shuffledFeatures[train_index:]

    trainLabels=shuffledLabels[:train_index]
    devLabels=shuffledLabels[train_index:]

    assert trainFeatures.shape[0]==trainLabels.shape[0], "Number of training features and labels don't match"
    assert devFeatures.shape[0]==devLabels.shape[0], "Number of development features and labels don't match"

    return trainFeatures, devFeatures, trainLabels, devLabels, embeddings, processor

### Basic CNN Implementation

In [264]:
# Check what tf.expand_dims(X, -1) does to a batch of 3 rows of 5 token ids:
# it appends a trailing axis of size 1 (the extra leading 1 in the printed shape
# comes from wrapping the single fetched tensor in a list).
test=np.array([[1,2,3,4,2],[2,3,4,22,1], [3,4,2,3,2]])
X=tf.placeholder(tf.int32, [None, 5])
y=tf.expand_dims(X, -1)
print(test.shape)
# The context manager closes the session when the block ends; previously the
# session was created but never closed.
with tf.Session() as testsess:
    out=testsess.run([y], feed_dict={X:test})
print(np.array(out).shape)

(3, 5)
(1, 3, 5, 1)


In [190]:
%%writefile cnn_net_glove.py
import tensorflow as tf
import numpy as np

class CNN(object):
    """
    Convolutional neural net object for label classification.
    Uses pre-trained GloVe embeddings: the embedding matrix is a frozen variable that
    is filled once by running `embedding_init` with `embedding_placeholder` fed.

    Args:
        sequence_length: number of token ids per input row.
        num_classes: number of output classes (width of the one-hot labels).
        vocab_size: number of rows in the embedding matrix.
        embedding_size: number of columns in the embedding matrix.
        filter_sizes: iterable of convolution window heights (in tokens); one
            conv + max-pool branch is built per size.
        num_filters: number of filters per branch.
        l2_reg_lambda: weight of the L2 penalty on the output layer.

    Attributes set on the instance:
        input_x, input_y, dropout_keep_prob: input placeholders.
        embedding_placeholder, embedding_init: placeholder and assign op used to load
            the pre-trained embedding matrix.
        xw_out: unnormalised class scores; predictions: argmax of the scores.
        loss: mean softmax cross-entropy plus the L2 penalty; accuracy: mean accuracy.
    """
    def __init__(
      self, sequence_length, num_classes, vocab_size,
      embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):

        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        #self.input_suprise = tf.placeholder(tf.float32, [None, 1], name="input_surprise")

        # L2 loss function
        l2_loss = tf.constant(0.0)

        # Embedding layer
        #with tf.device('/cpu:0'), tf.name_scope("embedding"):
        with tf.name_scope("embedding"):
            # Non-trainable: the pre-trained vectors are loaded once and kept fixed.
            W = tf.Variable(
                tf.constant(0.0, shape=[vocab_size, embedding_size]),
                name="Embedding_Weights", trainable=False)
            self.embedding_placeholder=tf.placeholder(tf.float32, shape=[vocab_size, embedding_size])
            self.embedding_init=W.assign(self.embedding_placeholder)
            self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x)
            # conv2d needs a channel axis: [batch, sequence, embedding] -> [batch, sequence, embedding, 1]
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # One convolution + max-pool branch per filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer: each filter spans filter_size tokens and the full embedding width
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],        #strides
                    padding="VALID",
                    name="conv")

                # Relu
                relu = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")

                # Max pooling over every window position (worked better than average
                # pooling, i.e. tf.nn.avg_pool with the same ksize/strides/padding).
                pooled = tf.nn.max_pool(
                    relu,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")

                # This append had been commented out together with the average-pooling
                # alternative; without it pooled_outputs stays empty and the concat
                # below has nothing to join.
                pooled_outputs.append(pooled)

        # Join the branches along the filter axis and flatten to [batch, num_filters_total]
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        #features = tf.concat(1, [self.input_suprise, self.h_pool])

        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Initial prediction
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            # Only the output layer (weights and bias) is L2-penalised.
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)

            self.xw_out = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.xw_out, 1, name="predictions")


        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.xw_out, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss


        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

Overwriting cnn_net_glove.py


In [186]:
%%writefile cnn_glove_train.py
from __future__ import division
import tensorflow as tf
import cPickle
import argparse
import numpy as np
import os
import time
import datetime
import data_helpers
from text_cnn import TextCNN
from tensorflow.contrib import learn
from sklearn.metrics import f1_score
from cnn_net_glove import CNN
import cnn_helpers
from sklearn.metrics import average_precision_score
import logging
import json
import sys

# Unix timestamp of the launch; its last four digits are appended to the model name.
timestamp = str(int(time.time()))

# ---- Command-line interface -------------------------------------------------
parser=argparse.ArgumentParser(description="Arguments for CNN training")
parser.add_argument("InputData", help="Path to the input pickle file", action="store")
parser.add_argument("Category", help="Name of the category to focus on")
# NOTE(review): this positional is required, so default=None never takes effect.
parser.add_argument("embedding_File", default=None, action="store",help="Path to the embedding file")

parser.add_argument("--train-proportion",default=0.85, type=float, dest="train_proportion", help="Proportion of the data to be used as training set, the rest is used as development set, default is 0.85")
parser.add_argument("--filter-size", type=int, default=[2,3,5],dest="filter_sizes", nargs="+",help="Sizes of the filters, can specify mutiple sizes for multiple channels. Default is [2,3,5]")
parser.add_argument("--model-name", default="CNN", dest="model_name", help="Name the model, this will be the folder name under which everything is saved. Default is \"CNN\" followed by category code and timestamp")
# type=int was missing here: a value given on the command line arrived as a string
# and was then used as a dimension of the convolution filter shape.
parser.add_argument("--num-filters", default=100, type=int, dest="num_filters", help="Number of filters per channel, default 100")
parser.add_argument("--dropout-keep-prob", default=0.5, dest="dropout_keep_prob",type=float, help="Drop out keep probability, default 0.5")
parser.add_argument("--l2", default=1.0, type=float, dest="l2", help="L2 regularization parameter, default 1.0")
parser.add_argument("--batch-size", default=100, type=int, dest="batch_size", help="Batch size, default 100")
parser.add_argument("--num-epochs", default=100, type=int, dest="num_epoch", help="Number of epochs, default 100")
parser.add_argument("--print-every", default=100, type=int, dest="print_every", help="Print after how many steps, default is 100")
parser.add_argument("--save-every", default=500, type=int, dest="save_every", help="Save a checkpoint after every N steps, default is 500. A final checkpoint will also be saved after training is finished")
parser.add_argument("--eval-every", default=500, type=int, dest="eval_every", help="Evalute the model after every N steps, default is 500")
parser.add_argument("--descriptor", default="CNN", dest="descriptor", help="A short descriptor for the model, will be used in the performace records")



args=parser.parse_args()

# ---- Unpack the parsed arguments ---------------------------------------------
InputData=args.InputData
Category=args.Category  # category to focus on
train_proportion=args.train_proportion

# Short code per category, used in the run/model name.
categoryCode={'Data Retention':"DR", 'Data Security':"DS", 
              'Do Not Track':"DNT", 'First Party Collection/Use':"FP", 'International and Specific Audiences':"INT",
              'Not_used':"NU", 'Policy Change':"PC", 'Third Party Sharing/Collection':"TP", 
              'User Access, Edit and Deletion':"UA",'User Choice/Control':"UC"}

#CNN  paramters

embeddingFile=args.embedding_File  ## path to the embedding file
FS = args.filter_sizes #Filter Sizes
Num_F = args.num_filters  #Number of filters per filter size
dropout_keep_prob = args.dropout_keep_prob
L = args.l2
batch_size = args.batch_size
num_epoch = args.num_epoch
print_every=args.print_every
save_every=args.save_every
eval_every=args.eval_every

# naming
try:
    catCode=categoryCode[Category]
except KeyError:
    print("Input category does not exist")
    # Exit with a non-zero status so shell scripts can detect the failure
    # (a bare sys.exit() reports success).
    sys.exit(1)
#
model_name=args.model_name+"_"+catCode+"_"+timestamp[-4::]

descriptor=args.descriptor
## define logging control

if embeddingFile is None:
    print("No embedding file found, please enter the path")
    # On Python 2, input() evaluates whatever is typed as an expression; read the
    # path as plain text instead.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    embeddingFile=read_line("Enter full path to embedding file : ")
    
# Everything produced by this run goes under ./runs/<model_name>/
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", model_name))

if not os.path.exists(out_dir):
    os.makedirs(out_dir)
    
# Log DEBUG and above to <out_dir>/<model_name>.log and INFO and above to the console.
logger = logging.getLogger(model_name)
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler(os.path.join(out_dir, model_name+".log"))
fh.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter1 = logging.Formatter('%(asctime)s %(levelname)s : %(message)s')
formatter2 = logging.Formatter('%(asctime)s %(levelname)s : %(message)s')

ch.setFormatter(formatter1)
fh.setFormatter(formatter2)

logger.addHandler(fh)
logger.addHandler(ch)

logger.info("CNN classifier using GloVe embeddings, with AdamOptimizer")
logger.info("Reading data from {}".format(InputData))
logger.info("Training models to predict scores the category of \"{}\" ".format(Category))

logger.info("Filter sizes : {}".format(FS))
logger.info("Number of filtes : {}" .format(Num_F))
logger.info("Dropout keep probability : {}" .format(dropout_keep_prob))
logger.info("L2 regularization constant : {}" .format(L))
logger.info("Batch size : {}" .format(batch_size))
logger.info("Number of epochs : {}".format(num_epoch))


# NOTE(review): unpickling can execute arbitrary code stored in the file -- only
# pass input files that come from a trusted source.
with open(InputData, 'rb') as f:
    data=cPickle.load(f)
    
## Generate training data
logger.info("Preparing data and embeddings")
train_x, dev_x, train_y, dev_y, embeddings, processor=cnn_helpers.processGlove(data,Category , train_proportion, embeddingFile)
vocabsize=len(processor.vocabulary_)
M=embeddings.shape[1]   # embedding size

logger.info("Embedding matrix shape : {}".format(embeddings.shape))
logger.info("Vocabulary size : {}".format(vocabsize))
logger.info("Training sample size : {}".format(train_x.shape[0]))
logger.info("Development sample size : {}".format(dev_x.shape[0]))

# Total optimisation steps: the batch iterator yields ceil(n / batch_size) batches
# per epoch, so count whole batches rather than dividing the sample count.
batches_per_epoch=(train_x.shape[0]+batch_size-1)//batch_size
totalsteps=batches_per_epoch*num_epoch

processorfile=os.path.join(out_dir, model_name+"_vocab.pk")
logger.info("Saving vocabulary processor to {}".format(processorfile))
# Write to the path that was just logged; the quoted literal "processorfile" used
# before created a file of that name in the current directory instead.
with open(processorfile, "wb") as fv:
    cPickle.dump(processor, fv)

with tf.Graph().as_default():

    # Used as a context manager, the session becomes the default session and is
    # closed when the block ends (it was never closed before).
    with tf.Session() as sess:
        cnn = CNN(
            sequence_length=train_x.shape[1],
            num_classes=train_y.shape[1],
            vocab_size=vocabsize,
            embedding_size = M,
            filter_sizes = FS,
            num_filters = Num_F,
            l2_reg_lambda = L)

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        
        optimizer = tf.train.AdamOptimizer(1e-3) 
        
        # AdamOptimizer achieves much better results than Adadelta
        #optimizer = tf.train.AdadeltaOptimizer(0.001)
        
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        
        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
        
        # Output directory for checkpoints and summaries
        
        logger.info("Writing to {}\n".format(out_dir))
       
        # Train Summaries

        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        init_op=tf.global_variables_initializer()
        saver = tf.train.Saver()
        
        # Initialize all variables
        sess.run(init_op)
        # feed the embedding matrix
        sess.run(cnn.embedding_init, feed_dict={cnn.embedding_placeholder: embeddings})
        
        def train_step(x_batch, y_batch, printevery, logger=logger):
            """
            A single training step
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              
              cnn.dropout_keep_prob: dropout_keep_prob
            }
            _, step,loss, accuracy = sess.run(
                [train_op, global_step, cnn.loss, cnn.accuracy],
                feed_dict)
            if step % printevery == 0:
                logger.info("step {} / {}, loss {:g}, acc {:g}".format( step, int(totalsteps), loss, accuracy))


        def dev_step(x_batch_dev, y_batch_dev, logger=logger):
            """
            Evaluates model on a dev set and returns the micro-averaged F1 score
            """
        
            print("\nDev Set Evaluation:")
            feed_dict = {
              cnn.input_x: x_batch_dev,
              cnn.input_y: y_batch_dev,
              # Keep every unit during evaluation; feeding the training keep
              # probability here randomly dropped units and made the dev score noisy.
              cnn.dropout_keep_prob: 1.0
            }
            step, loss, scores,prediction = sess.run(
                [global_step, cnn.loss,cnn.xw_out, cnn.predictions],
                feed_dict)
            
            f1=f1_score(y_batch_dev.argmax(axis=1), prediction, average='micro')
            logger.info("step {} / {}, loss {:g}, dev f1-score {}".format(step,int(totalsteps), loss, f1))
            return f1


        performancefile=os.path.join(out_dir, model_name+"_performance.json")

        def save_performance(dev_f1):
            """
            Write the run's hyper-parameters and the given dev F1 to performancefile
            """
            performance={}
            performance["Embedding size"]=M
            performance["Filter number"]=Num_F
            performance["Filter sizes"]=",".join([str(x) for x in FS])
            performance["Dropout keep probability"]=dropout_keep_prob
            performance["Regularization"]=L
            performance["Batch size"]=batch_size
            performance["Development f1"]=dev_f1
            performance["Model Type"]=descriptor
            performance["Model Name"]=model_name

            logger.info("Saving performances to {}".format(performancefile))
            with open(performancefile, "w") as fp:
                json.dump(performance, fp)


        # Generating batches
        batches = cnn_helpers.batch_iter(
            train_x, train_y, batch_size, num_epoch)
        
        # Training executions
        # Latest dev F1; stays None until the first evaluation so that a checkpoint
        # taken before any evaluation no longer fails on an undefined name.
        dev_f1=None
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch, printevery=print_every)
            
            current_step = tf.train.global_step(sess, global_step)
            if current_step % eval_every == 0:
                #log.write("\nDev Set Evaluation:\n")
                dev_f1=dev_step(dev_x, dev_y)
                
            if current_step % save_every==0:
                
                logger.info("Saving checkpoint to {}/model_{}.ckpt".format(checkpoint_dir, current_step))
                # Same "model_<step>.ckpt" name as the log line above and the final save below.
                save_path = saver.save(sess, "{}/model_{}.ckpt".format(checkpoint_dir, current_step))
                save_performance(dev_f1)


        # Final evaluation, checkpoint and performance record. The record is rebuilt
        # with the final dev F1; before, this step re-wrote a stale dictionary and
        # failed if no save step had run during training.
        current_step = tf.train.global_step(sess, global_step)
        dev_f1=dev_step(dev_x, dev_y)
        logger.info("Saving checkpoint to {}/model_{}.ckpt".format(checkpoint_dir, current_step))
        save_path = saver.save(sess, "{}/model_{}.ckpt".format(checkpoint_dir, current_step))
        save_performance(dev_f1)



Overwriting cnn_glove_train.py


In [191]:
%%writefile train.sh
# Training runs for cnn_glove_train.py.
# Positional arguments: input pickle, category name, GloVe embedding file.
# NOTE(review): the embedding path is absolute and machine-specific.

# Earlier "Data Security" run, kept for reference (disabled):
# python cnn_glove_train.py "../Data/sentence_score_conso10_py2.pk" "Data Security" \
# "/home/nyao/Embeddings/glove.6B.100d.txt"  --descriptor "CNN_glove" --num-epochs 100 --print-every 50 --eval-every 100 \
# --model-name "CNN_glove_Adam_"

# Run 1: 'User Choice/Control', filter sizes 3 4 5, 500 epochs, evaluate and checkpoint every 600 steps.
python cnn_glove_train.py "../Data/sentence_score_conso10_py2.pk" 'User Choice/Control' \
"/home/nyao/Embeddings/glove.6B.100d.txt"  --descriptor "CNN_glove" --num-epochs 500 --print-every 200 \
--eval-every 600  --model-name "CNN_glove" --filter-size 3 4 5  --dropout-keep-prob 0.8 --save-every 600
    
# Run 2: same settings with filter sizes 2 3 4 5.
python cnn_glove_train.py "../Data/sentence_score_conso10_py2.pk" 'User Choice/Control' \
"/home/nyao/Embeddings/glove.6B.100d.txt"  --descriptor "CNN_glove" --num-epochs 500 --print-every 200 \
--eval-every 600  --model-name "CNN_glove" --filter-size 2 3 4 5   --dropout-keep-prob 0.8 --save-every 600

Overwriting train.sh


In [172]:
list(data.columns)

[u'Website',
 u'Sentence',
 u'Data Retention',
 u'Data Security',
 u'Do Not Track',
 u'First Party Collection/Use',
 u'International and Specific Audiences',
 u'Not_used',
 u'Policy Change',
 u'Third Party Sharing/Collection',
 u'User Access, Edit and Deletion',
 u'User Choice/Control']

###  GloVe embeddings