Text Classification Using CNN

References:
1. Deep Learning for NLP - Jason Brownlee
2. https://richliao.github.io/supervised/classification/2016/11/26/textclassifier-convolutional/

In [26]:
from nltk.corpus import stopwords
import string
import re
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.visualize_util import plot
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from collections import Counter

In [27]:
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: path of the file to read (opened in text mode, read only).

    Returns:
        The full text of the file.

    Raises:
        IOError / OSError: if the file cannot be opened or read.
    """
    # the context manager closes the handle even when read() raises,
    # which the previous open()/read()/close() sequence did not guarantee
    with open(filename, 'r') as file:
        return file.read()


# turn a doc into clean tokens
def clean_doc(doc, vocab):
    """Reduce a document to the vocabulary words it contains.

    The text is split on whitespace, every punctuation character is removed
    from each token, and only tokens that are members of `vocab` are kept.

    Args:
        doc: raw document text.
        vocab: container of allowed tokens (tested with `in`).

    Returns:
        The surviving tokens joined by single spaces, in original order.
    """
    # character class matching any character from string.punctuation
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    kept = []
    for word in doc.split():
        stripped = punctuation.sub('', word)
        # membership is checked after stripping, so "good," matches "good"
        if stripped in vocab:
            kept.append(stripped)
    return ' '.join(kept)

In [28]:
# load all docs in a directory
def process_docs(directory, vocab, is_train):
    """Load and clean the documents of one directory for a single split.

    Files whose name starts with 'cv9' form the test split; every other
    file belongs to the training split.

    Args:
        directory: folder containing the document files.
        vocab: allowed tokens, forwarded to clean_doc.
        is_train: truthy to collect the training files, falsy to collect
            the 'cv9*' test files.

    Returns:
        A list with one cleaned, space-joined token string per selected
        file, in the order listdir yields them.
    """
    cleaned = list()
    for name in listdir(directory):
        held_out = name.startswith('cv9')
        # skip test files when training and non-test files when testing
        if bool(is_train) == held_out:
            continue
        # build the full path, then load and clean the document
        text = load_doc(directory + '/' + name)
        cleaned.append(clean_doc(text, vocab))
    return cleaned

In [29]:
# load and clean a dataset
def load_clean_dataset(vocab, is_train):
    """Load the negative and positive reviews of one split with labels.

    Args:
        vocab: allowed tokens, forwarded to process_docs.
        is_train: selects the training or the test split (see process_docs).

    Returns:
        A tuple (docs, labels): docs lists the negative documents followed
        by the positive ones; labels is a numpy array aligned with docs,
        holding 0 for each negative and 1 for each positive document.
    """
    negative_docs = process_docs('review_polarity/txt_sentoken/neg', vocab, is_train)
    positive_docs = process_docs('review_polarity/txt_sentoken/pos', vocab, is_train)
    # labels follow the same neg-then-pos order as the concatenated docs
    labels = array([0] * len(negative_docs) + [1] * len(positive_docs))
    return negative_docs + positive_docs, labels

# fit a tokenizer
def create_tokenizer(lines):
    """Create a Tokenizer with default settings and fit it on `lines`.

    Args:
        lines: the texts passed to Tokenizer.fit_on_texts.

    Returns:
        The fitted Tokenizer instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# integer encode and pad documents
def encode_docs(tokenizer, max_length, docs):
    """Convert documents to integer sequences padded to a common length.

    Args:
        tokenizer: object providing texts_to_sequences (e.g. the result of
            create_tokenizer).
        max_length: value passed to pad_sequences as maxlen.
        docs: the documents to encode.

    Returns:
        The result of pad_sequences on the encoded documents, using
        padding='post'.
    """
    sequences = tokenizer.texts_to_sequences(docs)
    return pad_sequences(sequences, maxlen=max_length, padding='post')

In [36]:
# define the model
def define_model(vocab_size, max_length):
    """Build and compile the CNN classifier, then summarise and plot it.

    The network stacks an embedding, one 1-D convolution, max pooling, a
    flatten step, a small dense layer and a single sigmoid output unit.

    Side effects: prints the model summary and writes the architecture
    diagram to 'model.png'.

    Args:
        vocab_size: first argument of the Embedding layer.
        max_length: input_length of the Embedding layer.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    # Conv1D and MaxPooling1D receive their sizes positionally: 32 filters
    # with a kernel of 8, and a pool size of 2. The keyword spellings
    # (filters= / kernel_size= / pool_size=) were previously disabled here,
    # presumably for Keras-version compatibility -- TODO confirm.
    stack = [
        Embedding(vocab_size, 100, input_length=max_length),
        Conv1D(32, 8, activation='relu'),
        MaxPooling1D(2),
        Flatten(),
        Dense(10, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    for layer in stack:
        model.add(layer)
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize defined model
    model.summary()
    plot(model, to_file='model.png', show_shapes=True)
    return model

In [38]:
# load the vocabulary as a set of whitespace-separated tokens
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())

# load training data
train_docs, ytrain = load_clean_dataset(vocab, True)

# create the tokenizer from the training documents only
tokenizer = create_tokenizer(train_docs)

# define vocabulary size (one more than the number of known words;
# presumably index 0 is reserved for padding -- TODO confirm)
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary size: %d' % vocab_size)

# calculate the maximum sequence length, in tokens, over the training docs
max_length = max(len(doc.split()) for doc in train_docs)
print('Maximum length: %d' % max_length)

# encode data
Xtrain = encode_docs(tokenizer, max_length, train_docs)

# define model
model = define_model(vocab_size, max_length)

# fit network
model.fit(Xtrain, ytrain, nb_epoch=10, verbose=2)

# save the model
model.save('model.h5')

Vocabulary size: 25768
Maximum length: 1317
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_5 (Embedding)          (None, 1317, 100)     2576800     embedding_input_5[0][0]          
____________________________________________________________________________________________________
convolution1d_4 (Convolution1D)  (None, 1310, 32)      25632       embedding_5[0][0]                
____________________________________________________________________________________________________
maxpooling1d_3 (MaxPooling1D)    (None, 655, 32)       0           convolution1d_4[0][0]            
____________________________________________________________________________________________________
flatten_3 (Flatten)              (None, 20960)         0           maxpooling1d_3[0][0]             
_______________________________________________