### Sentence Classification using CNN

In [148]:
import collections
import random
import numpy as np

### Reading data

In [45]:
fname = 'finegrained.txt'

def read_data(file):
    
    with open(file, 'r') as f:
    
        sent = ['pos', 'neg', 'neu']

        sentiment = []
        review = []

        for sen in f:
            if len(sen.strip()) != 0 :

                row = sen.strip().split()

                if row[0] in sent:

                    sentiment.append(row[0])
                    review.append(row[1:])
    
    return sentiment, review


In [88]:
sent, rev = read_data(fname)

print('max length of sentence:', len(max(rev, key = len)))
print('\nsentiment:',sent[:5])
print('\nreview:\n', rev[:5])

max length of sentence: 155

sentiment: ['neg', 'neg', 'neg', 'neg', 'neg']

review:
 [['The', 'book', 'is', 'disproportionally', 'focused', 'on', 'single', 'and', 'multilayer', 'feedforward', 'networks.'], ['And', 'though', 'the', 'book', 'puts', 'great', 'emphasis', 'on', 'mathematics', 'and', 'even', 'includes', 'a', 'big', 'section', 'on', 'important', 'mathematical', 'background', 'knowledge,', 'it', 'contains', 'to', 'many', 'errors', 'in', 'the', 'mathematical', 'formulas,', 'so', 'they', 'are', 'of', 'little', 'use.'], ['The', 'author', "hasn't", 'even', 'taken', 'the', 'trouble', 'to', 'put', 'up', 'an', 'errata', 'list.'], ['Finally,', 'for', 'the', 'beginner', 'there', 'are', 'not', 'enough', 'conceptual', 'clues', 'on', 'what', 'is', 'actually', 'going', 'on', 'and', 'it', 'is', 'hard', 'to', 'form', 'any', 'mental', 'model', 'of', 'the', 'underlying', 'processes.'], ['There', 'are', 'better', 'books.']]


### Padding and shortening the sentences with certain threshold and making the reviews of equal length

In [74]:
threshold = 50

def normalizing_reviews(reviews):
    norm_review = []
    
    for review in reviews:
        if len(review) < threshold:
            norm_review.append(review + ['PAD']*(threshold - len(review)))
        
        elif len(review) > threshold:
            norm_review.append(review[:threshold])
            
        else:
            norm_review.append(review)
            
    return norm_review

In [89]:
norm_rev = normalizing_reviews(rev)

print('review length:', len(norm_rev[0]))
print('\nreviews:\n', norm_rev[:5])

review length: 50

reviews:
 [['The', 'book', 'is', 'disproportionally', 'focused', 'on', 'single', 'and', 'multilayer', 'feedforward', 'networks.', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'], ['And', 'though', 'the', 'book', 'puts', 'great', 'emphasis', 'on', 'mathematics', 'and', 'even', 'includes', 'a', 'big', 'section', 'on', 'important', 'mathematical', 'background', 'knowledge,', 'it', 'contains', 'to', 'many', 'errors', 'in', 'the', 'mathematical', 'formulas,', 'so', 'they', 'are', 'of', 'little', 'use.', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'], ['The', 'author', "hasn't", 'even', 'taken', 'the', 'trouble', 'to', 'put', 'up', 'an', 'errata', 'list.', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD

### Creating Dictionary

In [113]:
vocab_size = 10000

def create_dict(normalized_reviews):
    global vocab_size
    words = []
    
    for review in normalized_reviews:
        words.extend(review)
    print('length of words:', len(words))
    print('words in the vocabulary: %d'%len(collections.Counter(words).most_common()))

    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocab_size - 1))

    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    rev_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    return dictionary, rev_dictionary, count

In [114]:
dictionary, rev_dictionary, count = create_dict(norm_rev)

print('dictionary', list(dictionary)[:10])
print('reverse dictionary', list(rev_dictionary)[:10])
print('most common words:', count[0:5])
print('len of dictionary:', len(dictionary))

length of words: 125750
words in the vocabulary: 11564
dictionary ['marriage', 'policy', 'Martin.', 'expansion', 'punk', 'summarizes', 'artifact', 'diskette,', 'dimensional.', 'on,']
reverse dictionary [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
most common words: [['UNK', -1], ('PAD', 78662), ('the', 2262), ('and', 1230), ('to', 1101)]
len of dictionary: 10000


### Converting str to int

In [133]:
def str_to_int(normalized_reviews, dictionary):
    
    review_int = []
    
    for review in normalized_reviews:
        norm_rev_int = []
        
        for word in review:
            if word in dictionary:
                norm_rev_int.append(dictionary[word])
            else:
                norm_rev_int.append(dictionary['UNK'])
        
        review_int.append(norm_rev_int)
    
    return review_int

In [143]:
rev_int = str_to_int(norm_rev, dictionary)

print('review int[0]:\n', rev_int[0])
print('\nreview int[1]:\n', rev_int[1])

review int[0]:
 [14, 69, 7, 8014, 2103, 17, 361, 3, 7278, 7111, 2745, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

review int[1]:
 [129, 321, 2, 69, 881, 71, 1453, 17, 7691, 3, 60, 1008, 5, 277, 1753, 17, 978, 2329, 956, 0, 11, 1437, 4, 99, 1838, 9, 2, 2329, 4662, 30, 33, 18, 6, 98, 887, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


### Train and Test data

In [152]:
test_indices = random.sample(list(range(len(rev_int))), 500)

train_rev = [rev_int[idx] for idx in range(len(rev_int)) if idx not in test_indices]
test_rev = [rev_int[idx] for idx in test_indices]
             
train_sent = [sent[idx] for idx in range(len(rev_int)) if idx not in test_indices]
test_sent = [sent[idx] for idx in test_indices]

In [157]:
print('train reviews:', train_rev[0])
print('test reviews:', test_rev[0])
print('\ntrain sentiments:', train_sent[:5])
print('test sentiments:', train_sent[0:5])

train reviews: [14, 69, 7, 8014, 2103, 17, 361, 3, 7278, 7111, 2745, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
test reviews: [148, 7, 98, 397, 12, 7, 8586, 31, 1897, 6463, 239, 12, 7920, 2, 1235, 3, 1761, 104, 4, 395, 2962, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

train sentiments: ['neg', 'neg', 'neg', 'neu', 'neg']
test sentiments: ['neg', 'neg', 'neg', 'neu', 'neg']
