In [1]:
import sys
import os
import numpy as np
import pandas as pd
import re
import itertools
import tensorflow as tf
import string
from io import BytesIO
from tensorflow.contrib import learn
from collections import Counter
from time import time
import datetime
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

In [2]:
d = pd.read_csv("C:/Users/student/Downloads/consumer_complaints2.csv", 
                usecols=('product','consumer_complaint_narrative'),
                dtype={'consumer_complaint_narrative': object})
# Only interested in data with consumer complaints
d=d[d['consumer_complaint_narrative'].notnull()]
d=d[d['product'].notnull()]
d.reset_index(drop=True,inplace=True)
u = d['product']

In [3]:
print ("Data dimensions:", d.shape)
print (d.head())

# Let's see a table of how many examples we have of each product
print ("\nList of Products       Occurrences\n")
print (d["product"].value_counts())

Data dimensions: (66806, 2)
               product                       consumer_complaint_narrative
0     Customer Service  XXXX has claimed I owe them {$27.00} for XXXX ...
1     Customer Payment  Due to inconsistencies in the amount owed that...
2  Internet Connection  In XX/XX/XXXX my wages that I earned at my job...
3  Internet Connection  I have an open and current Internet Connection...
4  Internet Connection  XXXX was submitted XX/XX/XXXX. At the time I s...

List of Products       Occurrences

Billing Issues                20455
Customer Service              17552
Internet Connection           14919
Comcast account or service     5711
Customer Payment               3678
Slow Internet                  2128
Prepaid card                    861
Payday Internet                 726
Money transfers                 666
Other financial service         110
Name: product, dtype: int64


In [4]:
def clean_str(string):
    """
    Tokenization/string cleaning (partially modified)
    """
    string = re.sub(r"[^A-Za-z0-9()!?\'\`%$]", " ", string) # keep also %$ but removed comma
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\$", " $ ", string) #yes, isolate $
    string = re.sub(r"\%", " % ", string) #yes, isolate %
    string = re.sub(r"\s{2,}", " ", string)
    
    # fixing XXX and xxx like as word
    string = re.sub(r'\S*(x{2,}|X{2,})\S*',"xxx",string)
    # removing non ascii
    string = re.sub(r'[^\x00-\x7F]+', "", string) 
    
    return string.strip().lower()

In [5]:
word_data=[]
t0 = time()

for message in d['consumer_complaint_narrative']:
    word_data.append(clean_str(message))

In [6]:
an_example = 38
print ("Note: the reference product is",d ['product'][an_example])
print ("\n** Before cleaning ** \n")
print (d['consumer_complaint_narrative'][an_example])
print ("** After cleaning ** \n")
print (word_data [an_example])

Note: the reference product is Customer Service

** Before cleaning ** 

After retaining counsel in XXXX of XXXX due to contact by Javitch & Co. a collection agency in XXXX OH for judgement on a Billing Issues debt I could no longer pay on due to having lost my job in XXXX of XXXX ... said collection agency sent me on consecutive months from XXXX till XXXX, requests fro payments ... Attorney XXXX XXXX XXXX OH contacted said collection firm and asked to set up a payment schedule - debited directly from my Comcast account. Attorney XXXX NEVER had a call returned from XXXX XXXX the contact person on record for this action with Javitch & Co. in XXXX OH. I am on Social Security, as my only source of income. 
Being a seasonal employee at XXXX XXXX in XXXX - I was contacted by XXXX XXXX that my wages would be garnished at 25 % of net pay, when I worked for XXXX XXXX - working for XXXX XXXX from XXXX XXXX to XXXX XXXX, XXXX - I had {$2800.00} garnished from my paychecks - a MAJOR setback for m

In [7]:
max_document_length = max([len(x.split(" ")) for x in word_data])
print ("Max_document_length:",max_document_length)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
num_data = np.array(list(vocab_processor.fit_transform(word_data)))
print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))

Max_document_length: 910
Vocabulary Size: 52925


In [8]:
print ("Check my variables:")
print ("\n* word_data length:", len(word_data))
print ("* num_data length: ", len(num_data)) # once words are numbers

#Create the list of products
product_labels = list(set(d['product']))
print ("\nProducts:")
print ("* data length: ",len(product_labels))
print ("* labels:\n",product_labels)

Check my variables:

* word_data length: 66806
* num_data length:  66806

Products:
* data length:  10
* labels:
 ['Internet Connection', 'Payday Internet', 'Money transfers', 'Slow Internet', 'Customer Payment', 'Prepaid card', 'Customer Service', 'Billing Issues', 'Other financial service', 'Comcast account or service']


In [9]:
np.random.seed(57)
shuffle_indices = np.random.permutation(np.arange(len(num_data)))
x_shuffled = num_data[shuffle_indices]
y_shuffled = d['product'][shuffle_indices]
print ("* x shuffled:", x_shuffled.shape)
print ("* y shuffled:", y_shuffled.shape)

* x shuffled: (66806, 910)
* y shuffled: (66806,)


In [10]:
features_dummy, x_test, labels_dummy, test_labels = model_selection.train_test_split(x_shuffled, y_shuffled, test_size=0.20, random_state= 23)
x_train, x_valid, train_labels, valid_labels = model_selection.train_test_split(features_dummy, labels_dummy, test_size=0.25, random_state= 34)

print('Training set  ',   x_train.shape, train_labels.shape)
print('Validation set',   x_valid.shape, valid_labels.shape)
print('Test set      ',    x_test.shape,  test_labels.shape)

# free some memory
del num_data, d 
del x_shuffled, y_shuffled, labels_dummy, features_dummy

Training set   (40083, 910) (40083,)
Validation set (13361, 910) (13361,)
Test set       (13362, 910) (13362,)


In [11]:
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [12]:
class TextCNN(object):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.
    """
    def __init__(
      self, sequence_length, num_classes, vocab_size,
      embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):

        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)
        
        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-128"):
                # Convolution Layer
                filter_shape = [int(filter_size), embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, int(sequence_length) - int(filter_size) + 1, 1, 1],
                    strides=[1, 1, 1, 1], 
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        print(pooled_outputs)
        self.h_pool = tf.concat(pooled_outputs,3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores") 
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # CalculateMean cross-entropy loss
        with tf.name_scope("loss"):
            print (self.scores)
            print (self.input_y)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

In [13]:
def oneHot(dummy_labels):
    le = preprocessing.LabelEncoder()
    enc = OneHotEncoder()
    
    le.fit (dummy_labels)
    y_dummy = le.fit_transform(dummy_labels)
    y_dummy = y_dummy.reshape(-1, 1)
    enc.fit(y_dummy)
    y_dummy = enc.transform(y_dummy).toarray()
    y_dummy = y_dummy.astype('float32')
    print ("\n * OneHot example")
    print (y_dummy)
    return (y_dummy)
        
y_train = oneHot(train_labels)
y_valid = oneHot(valid_labels)
y_test  = oneHot( test_labels)


 * OneHot example
[[ 1.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]

 * OneHot example
[[ 0.  0.  0. ...,  0.  0.  1.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]]

 * OneHot example
[[ 1.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]]


In [14]:
# Training
# ==================================================

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=False)
    sess = tf.Session(config=session_conf)
    
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x_train.shape[1],
            num_classes=len(product_labels),
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=128,
            filter_sizes="234",
            num_filters = 128,
            l2_reg_lambda= 0.0)

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        #Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)

        # Output directory for models and summaries (if needed)
        
        #timestamp = str(int(time()))
        #out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        #print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
        #train_summary_dir = os.path.join(out_dir, "summaries", "train")
        #train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        #dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        #dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory (if needed)
        # Tensorflow assumes this directory already exists so we need to create it
        
        #checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        #checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        #if not os.path.exists(checkpoint_dir):
        #    os.makedirs(checkpoint_dir)
        #saver = tf.train.Saver(tf.all_variables())

        # Write vocabulary (if needed)
        #vocab_processor.save(os.path.join(out_dir, "vocab"))

        # Initialize all variables
        sess.run(tf.initialize_all_variables())

        def train_step(x_batch, y_batch):
            """
            A single training step
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 0.5
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            
            # Uncomment next print if interested in batch results 
            #print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            
            #train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 0.5
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            return loss, accuracy, summaries

        # Generate batches
        batches = batch_iter(
            list(zip(x_train, y_train)), 64, 1)
        
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            
# Validating
# ==================================================
            if current_step % 100 == 0:
                #print("\nEvaluation:")
                
                # Generate batches
                batches_valid = batch_iter(
                    list(zip(x_valid, y_valid)), 64, 1)
                
                loss_valid = 0.
                acc_valid = 0.
                len_batches = 0.
                
                for batch_valid in batches_valid:  
                    
                    x_batch_valid, y_batch_valid = zip(*batch_valid)
                    #aLoss, anAcc, aSummary = dev_step(x_batch_valid, y_batch_valid, writer=dev_summary_writer)
                    aLoss, anAcc, aSummary = dev_step(x_batch_valid, y_batch_valid)
                    loss_valid += aLoss 
                    acc_valid  += anAcc
                    len_batches += 1.
                
                loss_valid = loss_valid / len_batches
                acc_valid  = acc_valid  / len_batches 
                time_str = datetime.datetime.now().isoformat()
                print("Validation set: {}, step {}, loss {:g}, acc {:g}".format(time_str, current_step, loss_valid, acc_valid))
                #dev_summary_writer.add_summary(aSummary, current_step)
                #print("")
                
            #if current_step % FLAGS.checkpoint_every == 0:
            #    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
            #    print("Saved model checkpoint to {}\n".format(path))
    
        
# Testing
# ==================================================
        if True:
            print("\n\nTest set:")
            
            # Generate batches
            batches_test = batch_iter(
                list(zip(x_test, y_test)), 64, 1)
        
            loss_test = 0.
            acc_test  = 0.
            len_batches = 0.
            
            for batch_test in batches_test:  
                    
                    x_batch_test, y_batch_test = zip(*batch_test)
                    #aLoss, anAcc, aSummary = dev_step(x_batch_test, y_batch_test, writer=dev_summary_writer)
                    aLoss, anAcc, aSummary = dev_step(x_batch_test, y_batch_test)
                    loss_test += aLoss 
                    acc_test  += anAcc
                    len_batches += 1.
                
            loss_test = loss_test / len_batches
            acc_test  = acc_test  / len_batches 
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, current_step, loss_test, acc_test))
            #dev_summary_writer.add_summary(aSummary, current_step)
            print("")



[<tf.Tensor 'conv-maxpool-128/pool:0' shape=(?, 1, 1, 128) dtype=float32>, <tf.Tensor 'conv-maxpool-128_1/pool:0' shape=(?, 1, 1, 128) dtype=float32>, <tf.Tensor 'conv-maxpool-128_2/pool:0' shape=(?, 1, 1, 128) dtype=float32>]
Tensor("output/scores:0", shape=(?, 10), dtype=float32)
Tensor("input_y:0", shape=(?, 10), dtype=float32)
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

INFO:tensorflow:Summary name embedding/W:0/grad/hist is illegal; using embedding/W_0/grad/hist instead.
INFO:tensorflow:Summary name embedding/W:0/grad/sparsity is illegal; using embedding/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-128/W:0/grad/hist is illegal; using conv-maxpool-128/W_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-128/W:0/grad/sparsity is illegal; using conv-maxpool-128/W_0/grad/sparsity instead.
INFO:tensorflow:Su