### Sentence Classification using CNN

In [25]:
import collections
import random
import numpy as np
import tensorflow as tf

### Reading data

In [26]:
fname = 'finegrained.txt'


def read_data(file):
    """Load labelled reviews from a whitespace-tokenised text file.

    Every line is lower-cased and split on whitespace. A line is kept only
    when its first token is one of 'pos', 'neg' or 'neu'; that token becomes
    the label and the remaining tokens become the review.

    Args:
        file: path of the text file to read.

    Returns:
        A ``(sentiment, review)`` pair of parallel lists: the label strings
        and the token lists, in file order.
    """
    valid_labels = ['pos', 'neg', 'neu']
    sentiment = []
    review = []

    with open(file, 'r') as handle:
        for line in handle:
            tokens = line.lower().split()
            # Blank lines produce no tokens; lines without a known label are ignored.
            if not tokens or tokens[0] not in valid_labels:
                continue
            label, *words = tokens
            sentiment.append(label)
            review.append(words)

    return sentiment, review


In [27]:
# Read the labelled reviews and print a small sample as a sanity check.
sent, rev = read_data(fname)

longest_review = max(rev, key=len)
print('max length of sentence:', len(longest_review))
print('\nsentiment:', sent[:5])
print('\nreview:\n', rev[:5])

max length of sentence: 155

sentiment: ['neg', 'neg', 'neg', 'neg', 'neg']

review:
 [['the', 'book', 'is', 'disproportionally', 'focused', 'on', 'single', 'and', 'multilayer', 'feedforward', 'networks.'], ['and', 'though', 'the', 'book', 'puts', 'great', 'emphasis', 'on', 'mathematics', 'and', 'even', 'includes', 'a', 'big', 'section', 'on', 'important', 'mathematical', 'background', 'knowledge,', 'it', 'contains', 'to', 'many', 'errors', 'in', 'the', 'mathematical', 'formulas,', 'so', 'they', 'are', 'of', 'little', 'use.'], ['the', 'author', "hasn't", 'even', 'taken', 'the', 'trouble', 'to', 'put', 'up', 'an', 'errata', 'list.'], ['finally,', 'for', 'the', 'beginner', 'there', 'are', 'not', 'enough', 'conceptual', 'clues', 'on', 'what', 'is', 'actually', 'going', 'on', 'and', 'it', 'is', 'hard', 'to', 'form', 'any', 'mental', 'model', 'of', 'the', 'underlying', 'processes.'], ['there', 'are', 'better', 'books.']]


### Padding or truncating every review to a fixed length (`threshold`) so that all reviews have equal length

In [28]:
threshold = 75  # default number of tokens every review is padded/truncated to


def normalizing_reviews(reviews, max_len=None, pad_token='PAD'):
    """Pad or truncate every review to exactly ``max_len`` tokens.

    Args:
        reviews: iterable of token sequences.
        max_len: target length. When omitted, the module-level ``threshold``
            is looked up at call time.
        pad_token: token appended to reviews shorter than ``max_len``.

    Returns:
        A new list holding one new token list per review, each of length
        ``max_len``. Input lists are never returned as-is, so mutating the
        result cannot alter ``reviews``.
    """
    if max_len is None:
        max_len = threshold

    # Slicing caps long reviews; for them the multiplier is <= 0, which
    # yields an empty padding list.
    return [
        list(review[:max_len]) + [pad_token] * (max_len - len(review))
        for review in reviews
    ]

In [29]:
# Bring every review to the common length and show a sample.
norm_rev = normalizing_reviews(rev)

first_review = norm_rev[0]
print('review length:', len(first_review))
print('\nreviews:\n', norm_rev[:5])

review length: 75

reviews:
 [['the', 'book', 'is', 'disproportionally', 'focused', 'on', 'single', 'and', 'multilayer', 'feedforward', 'networks.', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'], ['and', 'though', 'the', 'book', 'puts', 'great', 'emphasis', 'on', 'mathematics', 'and', 'even', 'includes', 'a', 'big', 'section', 'on', 'important', 'mathematical', 'background', 'knowledge,', 'it', 'contains', 'to', 'many', 'errors', 'in', 'the', 'mathematical', 'formulas,', 'so', 'they', 'are', 'of', 'little', 'use.', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 

### Creating Dictionary

In [30]:
vocab_size = 10000  # cap on the dictionary size, including the 'UNK' entry


def create_dict(normalized_reviews):
    """Build word <-> id lookup tables from tokenised reviews.

    Reads the module-level ``vocab_size``: the dictionary holds 'UNK' (id 0)
    followed by the ``vocab_size - 1`` most frequent tokens, numbered in
    descending frequency order.

    Args:
        normalized_reviews: iterable of token lists.

    Returns:
        ``(dictionary, rev_dictionary, count)`` where ``dictionary`` maps
        token -> id, ``rev_dictionary`` maps id -> token, and ``count`` is
        ``[['UNK', -1]]`` followed by ``(token, frequency)`` tuples.
    """
    # Count tokens in one pass instead of building a flattened word list
    # and constructing two separate Counters from it.
    word_counts = collections.Counter()
    for review in normalized_reviews:
        word_counts.update(review)

    print('length of words:', sum(word_counts.values()))
    print('words in the vocabulary: %d' % len(word_counts))

    count = [['UNK', -1]]
    count.extend(word_counts.most_common(vocab_size - 1))

    # Ids are assigned in insertion order, so 'UNK' gets 0.
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    rev_dictionary = {idx: word for word, idx in dictionary.items()}

    return dictionary, rev_dictionary, count

In [31]:
dictionary, rev_dictionary, count = create_dict(norm_rev)

# Converting a dict to a list yields its keys: words for `dictionary`,
# integer ids for `rev_dictionary`.
print('dictionary', list(dictionary)[:10])
print('reverse dictionary', list(rev_dictionary)[:10])
print('most common words:', count[:5])
print('len of dictionary:', len(dictionary))

length of words: 188625
words in the vocabulary: 11162
dictionary ['1994', 'stickers,', 'herself', 'who', '"unfold"', 'zil...or', 'performance.', 'foreign', 'diminish', 'overall,']
reverse dictionary [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
most common words: [['UNK', -1], ('PAD', 140018), ('the', 2729), ('and', 1311), ('a', 1181)]
len of dictionary: 10000


### Converting str to int

In [32]:
def str_to_int(normalized_reviews, dictionary):
    """Replace every token of every review with its integer id.

    Args:
        normalized_reviews: iterable of token lists.
        dictionary: mapping token -> id; tokens missing from it are encoded
            with the id stored under 'UNK'.

    Returns:
        A list of lists of ids with the same nesting as the input.
    """
    def encode(token):
        # 'UNK' is only looked up for tokens that are not in the dictionary.
        return dictionary[token] if token in dictionary else dictionary['UNK']

    return [[encode(token) for token in review] for review in normalized_reviews]

In [33]:
# Encode the padded reviews as integer ids and show the first two.
rev_int = str_to_int(norm_rev, dictionary)

print('review int[0]:\n', rev_int[0])
print('\nreview int[1]:\n', rev_int[1])

review int[0]:
 [2, 74, 7, 8755, 3065, 17, 318, 3, 3936, 5036, 2354, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

review int[1]:
 [3, 255, 2, 74, 793, 66, 1499, 17, 9868, 3, 57, 870, 4, 244, 1002, 17, 1000, 3039, 1022, 9145, 10, 1551, 5, 91, 1690, 11, 2, 3039, 9041, 29, 30, 18, 6, 92, 808, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


### Train and Test data

In [34]:
# Hold out 500 randomly chosen reviews for testing; everything else is used
# for training.
# NOTE(review): no random seed is set in the visible cells, so the split
# presumably differs between runs — confirm whether that is intended.
test_indices = random.sample(range(len(rev_int)), 500)

# Set membership is O(1); testing against the list rescanned all 500
# indices for every review.
held_out = set(test_indices)

train_rev = [review for idx, review in enumerate(rev_int) if idx not in held_out]
test_rev = [rev_int[idx] for idx in test_indices]

train_sent = [label for idx, label in enumerate(sent) if idx not in held_out]
test_sent = [sent[idx] for idx in test_indices]

In [35]:
# Show one encoded review and the first 25 labels of each split.
print('train reviews:', train_rev[0])
print('\ntest reviews:', test_rev[0])
print('\ntrain sentiments:', train_sent[:25])
print('\ntest sentiments:', test_sent[:25])

train reviews: [2, 74, 7, 8755, 3065, 17, 318, 3, 3936, 5036, 2354, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

test reviews: [9, 379, 22, 369, 16, 22, 4, 274, 2812, 37, 2, 354, 4790, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

train sentiments: ['neg', 'neg', 'neg', 'neg', 'neg', 'neu', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos']

test sentiments: ['neu', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'neu', 'neg', 'pos', 'neg', 'pos', 'neg', 'neu', 'pos', 'neg', 'neg', 'pos', 'neg', 'pos', 'neg', 'neg']


### Generate batches of data

In [36]:
# Width of the one-hot word axis: one column per dictionary entry.
vocab_size = len(dictionary)
# Class names; a label's one-hot column is its position in this list.
sentiments = ['pos', 'neg', 'neu']
# Derived from the class list so the two cannot drift apart (still 3).
num_labels = len(sentiments)
# Number of tokens per review after padding/truncation.
sent_length = threshold

def generate_batch(train_review, train_sentiment, batch_size, batch_no):
    """Build one-hot inputs and labels for batch number ``batch_no``.

    Reads the module-level ``vocab_size``, ``num_labels``, ``sent_length``
    and ``sentiments``.

    Args:
        train_review: sequence of reviews, each a sequence of integer word ids.
        train_sentiment: sequence of label strings parallel to ``train_review``.
        batch_size: number of rows in the returned arrays.
        batch_no: zero-based batch index; rows are taken from
            ``[batch_no * batch_size, (batch_no + 1) * batch_size)``.

    Returns:
        ``(inputs, labels)`` as float32 arrays of shape
        ``(batch_size, sent_length, vocab_size)`` and
        ``(batch_size, num_labels)``. When the slice holds fewer than
        ``batch_size`` items, the trailing rows stay all-zero.
    """
    # float32 stores 0/1 exactly and halves the memory of this dense one-hot
    # tensor compared with NumPy's default float64.
    inputs = np.zeros((batch_size, sent_length, vocab_size), dtype=np.float32)
    labels = np.zeros((batch_size, num_labels), dtype=np.float32)

    start = batch_no * batch_size
    stop = start + batch_size
    batch_reviews = train_review[start:stop]
    batch_sentiments = train_sentiment[start:stop]

    for row, review in enumerate(batch_reviews):
        word_ids = np.asarray(review, dtype=np.intp)
        # One vectorised assignment sets the entry (position, word id) for
        # every token position of this review.
        inputs[row, np.arange(len(word_ids)), word_ids] = 1

    for row, label in enumerate(batch_sentiments):
        labels[row, sentiments.index(label)] = 1

    return inputs, labels

In [37]:
# Build batch number 1 of each split as a quick check of the batch generator.
inputs, labels = generate_batch(train_rev, train_sent, 32, 1)
test_inputs, test_labels = generate_batch(test_rev, test_sent, 32, 1)

print('train labels for a batch:', labels.argmax(axis = 1))
print('train labels:\n', labels)

print('test labels for a batch:', test_labels.argmax(axis = 1))
print('test labels:\n', test_labels)

train labels for a batch: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1]
train labels:
 [[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]
test labels for a batch: [1 0 2 0 1 2 0 1 1 1 1 1 1 0 0 0 2 0 1 0 2 2 0 1 0 1 1 1 1 1 1 1]
test labels:
 [[0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


### Sentence classification using CNN
#### There will be 3 parallel branches in 1 convolution layer, followed by a fully connected layer

### Hyperparameters and inputs

In [38]:
# Number of passes over the training data.
epochs = 50
# Kept equal to the 0.001 step size hard-coded in the Adam optimizer cell,
# which does not read this variable.
learning_rate = 0.001
# Number of reviews per batch; also fixes the placeholder batch dimension.
batch_size = 32
# filter sizes in a single convolutional layer
filter_sizes = [3, 5, 7]

### Inputs

In [39]:
# One-hot encoded reviews for one batch.
train_inputs = tf.placeholder(
    tf.float32,
    shape = [batch_size, sent_length, vocab_size],
    name = 'train_inputs',
)
# One-hot encoded sentiment labels for the same batch.
train_labels = tf.placeholder(
    tf.float32,
    shape = [batch_size, num_labels],
)

### Model parameters

#### weights and biases for each parallel layer

In [40]:
# One convolution filter per parallel branch, shaped
# [filter_width, vocab_size, 1], plus a single scalar bias for each.
w1 = tf.Variable(tf.truncated_normal([filter_sizes[0], vocab_size, 1], stddev = 0.02, dtype = tf.float32), name = 'weights_1')
b1 = tf.Variable(tf.random_uniform([1], 0, 0.01, dtype = tf.float32), name = 'bias_1')

w2 = tf.Variable(tf.truncated_normal([filter_sizes[1], vocab_size, 1], stddev = 0.02, dtype = tf.float32), name = 'weights_2')
b2 = tf.Variable(tf.random_uniform([1], 0, 0.01, dtype = tf.float32), name = 'bias_2')

w3 = tf.Variable(tf.truncated_normal([filter_sizes[2], vocab_size, 1], stddev = 0.02, dtype = tf.float32), name = 'weights_3')
# Named 'bias_3' (not a second 'bias_2') so every variable has a distinct,
# matching graph name.
b3 = tf.Variable(tf.random_uniform([1], 0, 0.01, dtype = tf.float32), name = 'bias_3')

#### weights and biases for fully connected layer

In [41]:
# The fully connected layer maps one pooled feature per filter size to the
# class scores.
fc_shape = [len(filter_sizes), num_labels]
w_fc1 = tf.Variable(
    tf.truncated_normal(fc_shape, stddev = 0.5, dtype = tf.float32),
    name = 'weights_fc1',
)
b_fc1 = tf.Variable(
    tf.random_uniform([num_labels], 0, 0.01, dtype = tf.float32),
    name = 'bias_fc1',
)

### Layer computations or Inference

#### convolution part

In [42]:
# Each branch: 1-D convolution over the token axis, add the bias, apply ReLU.
conv1 = tf.nn.conv1d(train_inputs, w1, stride = 1, padding = 'SAME')
h_conv1 = tf.nn.relu(conv1 + b1)

conv2 = tf.nn.conv1d(train_inputs, w2, stride = 1, padding = 'SAME')
h_conv2 = tf.nn.relu(conv2 + b2)

conv3 = tf.nn.conv1d(train_inputs, w3, stride = 1, padding = 'SAME')
h_conv3 = tf.nn.relu(conv3 + b3)

#### maxpooling part

In [43]:
# Max over axis 1 of every branch output.
h_pool1, h_pool2, h_pool3 = [
    tf.reduce_max(branch, axis=1) for branch in (h_conv1, h_conv2, h_conv3)
]

# Join the pooled branch outputs side by side along axis 1.
h_pool = tf.concat([h_pool1, h_pool2, h_pool3], axis = 1)

### Logits

In [44]:
# Unnormalised class scores: pooled features times the FC weights, plus bias.
fc_product = tf.matmul(h_pool, w_fc1)
logits = fc_product + b_fc1

### Loss

In [45]:
# Softmax cross-entropy per example, averaged over the batch.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels = train_labels, logits = logits)
loss = tf.reduce_mean(per_example_loss)

### Optimizer

In [46]:
# Adam update op that minimises `loss`.
# NOTE(review): the step size is hard-coded to 0.001 here; the `learning_rate`
# hyperparameter defined earlier in the notebook is not referenced — confirm
# which value is intended.
optimizer = tf.train.AdamOptimizer(learning_rate = 0.001).minimize(loss)

### Predictions

In [47]:
# Predicted class id per example. Softmax is monotonic, so taking the argmax
# of the raw logits selects the same class without the extra op and without
# the risk of saturated probabilities collapsing into ties.
predictions = tf.argmax(logits, axis = 1)

### Run model to classify sentences

In [48]:
# Open an interactive session and initialise every graph variable in it.
session = tf.InteractiveSession()

init_op = tf.global_variables_initializer()
session.run(init_op)

def accuracy(labels,preds):
    """Return the fraction of examples whose label id equals the prediction.

    Args:
        labels: 2-D one-hot array with one row per example.
        preds: predicted class ids, compared element-wise with the label ids.

    Returns:
        Number of matches divided by the number of label rows.
    """
    true_ids = np.argmax(labels, axis=1)
    num_correct = np.sum(true_ids == preds)
    return num_correct / labels.shape[0]


# Number of full batches in each split. Integer division already discards a
# trailing partial batch; subtracting 1 first would also skip the last full
# batch whenever the data size is an exact multiple of batch_size.
num_train_batches = len(train_rev) // batch_size
num_test_batches = len(test_rev) // batch_size

for epoch in range(epochs):
    # --- training: one optimizer step per full batch ---
    avg_loss = []

    for step in range(num_train_batches):
        batch_inputs, batch_labels = generate_batch(train_rev, train_sent, batch_size, step)

        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        l, _ = session.run([loss, optimizer], feed_dict = feed_dict)

        avg_loss.append(l)

    print('Train Loss at Epoch %d: %.2f'%(epoch,np.mean(avg_loss)))

    # --- evaluation: the optimizer op is not fetched, so weights stay fixed ---
    test_accuracy = []

    for step in range(num_test_batches):
        batch_test_inputs, batch_test_labels = generate_batch(test_rev, test_sent, batch_size, step)

        feed_dict = {train_inputs: batch_test_inputs, train_labels: batch_test_labels}

        # Fetch the tensor itself rather than a one-element list, so `preds`
        # is a flat array of class ids instead of a list wrapping that array.
        preds = session.run(predictions, feed_dict = feed_dict)

        test_accuracy.append(accuracy(batch_test_labels, preds))

    print('Test accuracy at Epoch %d: %.3f'%(epoch,np.mean(test_accuracy)*100.0))



Train Loss at Epoch 0: 1.08
Test accuracy at Epoch 0: 53.542
Train Loss at Epoch 1: 1.03
Test accuracy at Epoch 1: 53.542
Train Loss at Epoch 2: 0.99
Test accuracy at Epoch 2: 53.542
Train Loss at Epoch 3: 0.95
Test accuracy at Epoch 3: 53.542
Train Loss at Epoch 4: 0.91
Test accuracy at Epoch 4: 53.542
Train Loss at Epoch 5: 0.89
Test accuracy at Epoch 5: 53.125
Train Loss at Epoch 6: 0.86
Test accuracy at Epoch 6: 53.542
Train Loss at Epoch 7: 0.84
Test accuracy at Epoch 7: 53.542
Train Loss at Epoch 8: 0.82
Test accuracy at Epoch 8: 53.750
Train Loss at Epoch 9: 0.80
Test accuracy at Epoch 9: 55.000
Train Loss at Epoch 10: 0.78
Test accuracy at Epoch 10: 56.250
Train Loss at Epoch 11: 0.76
Test accuracy at Epoch 11: 55.833
Train Loss at Epoch 12: 0.73
Test accuracy at Epoch 12: 55.208
Train Loss at Epoch 13: 0.71
Test accuracy at Epoch 13: 55.000
Train Loss at Epoch 14: 0.69
Test accuracy at Epoch 14: 55.000
Train Loss at Epoch 15: 0.67
Test accuracy at Epoch 15: 55.000
Train Loss a