In [5]:
import numpy as np
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, concatenate, Input, Reshape
from keras.layers import LSTM, Conv1D, Conv2D, MaxPooling1D, GRU, Dropout, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import csv
import os
import errno
import operator
import sys
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.models import load_model
import pickle

In [6]:
'''with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)'''

"with open('tokenizer.pickle', 'wb') as handle:\n    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)"

#### stemming

In [7]:
stem_words = False

#### stop words

In [8]:
remove_stop_words = False

In [302]:
stop_words = set()

with open('stopwords.txt', 'rt') as stopfile:
    counter = 0
    for row in stopfile:
        stop_words.add(row.rstrip()) # if already in there, won't add

#### train tokenizer

In [9]:
vocab_size = 5000 # 5000

In [10]:
tokenizer = Tokenizer(num_words=vocab_size, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ", char_level=False)
# tokenizer = Tokenizer(num_words=vocab_size, filters='', lower=True, split=" ", char_level=False)

In [11]:
train_text = []
with open('data/training_nolabel.txt', 'rt') as trainfile:
    for idx, row in enumerate(trainfile):
        words = []
        for word in row.split():
            if stem_words:
                word = stem(word)
                
            if remove_stop_words and word not in stop_words:
                words.append(word)
            else:
                words.append(word)
        # print(' '.join(words))
        train_text.append(' '.join(words))

In [12]:
tokenizer.fit_on_texts(train_text) # around 30 seconds

In [13]:
X_train = []
y_train = []
with open('data/training_label.txt', 'rt') as trainfile:
    reader = csv.reader(trainfile, delimiter=' ')
    for idx, row in enumerate(reader):
        words = []
        for word in row[2:]:
            if stem_words:
                word = stem(word)
            
            if remove_stop_words and word not in stop_words:
                words.append(word)
            else:
                words.append(word)
        words = ' '.join(words)
        X_train.append(words)
        y_train.append(row[0])

### if use word embedding model

In [547]:
X_train = tokenizer.texts_to_sequences(X_train)

In [548]:
X_train[1]

[3, 140, 11, 3, 2502, 8097, 11, 30]

In [549]:
# truncate and pad input sequences
max_review_length = 40 # max lengths -- training_label:39    testing_data:39
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
print(X_train[0].shape) # sanity check

(40,)


#### CNN

In [552]:
dirs = {'model_dir': 'single_test', 'tensorboard_dir': 'single_test'}
file_name = 'larger_vocab_cnn'

for dir_key in dirs:
    if not os.path.exists(os.getcwd() + '/' + dirs[dir_key]):
        try:
            os.makedirs(os.getcwd() + '/' + dirs[dir_key])
        except OSError as exc: # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

embedding_vector_length = 20
batch_size = 64

main_input = Input(shape=(None,), dtype='int32', name='main_input')
x = Embedding(input_dim=vocab_size, output_dim=embedding_vector_length, input_length=40)(main_input)

convs_flattened = []
for i in range(3,8):
    tower = Conv1D(filters=100, kernel_size=i, activation='relu')(x)
    # print(tower._keras_shape)
    tower = MaxPooling1D(tower._keras_shape[1])(tower)
    tower = Flatten()(tower)
    convs_flattened.append(tower)

conv_results = concatenate(convs_flattened)
output = Dropout(0.5)(conv_results)
output = Dense(50, activation='relu')(output)
output = Dropout(0.5)(output)
output = Dense(1, activation='sigmoid')(output)

model = Model(inputs=main_input, outputs=output)


model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

save_model = ModelCheckpoint(dirs['model_dir'] + '/' + file_name + '-{epoch:02d}-{val_acc:.2f}.hdf5', monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1)
# tensorboard = TensorBoard(log_dir=dirs['tensorboard_dir'], histogram_freq=0, batch_size=batch_size, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)

print(model.summary())
model.fit(X_train, y_train, validation_split=0.1, epochs=10, batch_size=batch_size, callbacks=[save_model])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, None)         0                                            
__________________________________________________________________________________________________
embedding_110 (Embedding)       (None, 40, 20)       200000      main_input[0][0]                 
__________________________________________________________________________________________________
conv1d_96 (Conv1D)              (None, 38, 100)      6100        embedding_110[0][0]              
__________________________________________________________________________________________________
conv1d_97 (Conv1D)              (None, 37, 100)      8100        embedding_110[0][0]              
__________________________________________________________________________________________________
conv1d_98 

KeyboardInterrupt: 

#### GRU

In [561]:
# create the model
dirs = {'model_dir': 'single_test', 'tensorboard_dir': 'single_test'}
file_name = 'big_conv_pool_GRU_highdrop'

for dir_key in dirs:
    if not os.path.exists(os.getcwd() + '/' + dirs[dir_key]):
        try:
            os.makedirs(os.getcwd() + '/' + dirs[dir_key])
        except OSError as exc: # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

batch_size = 64
embedding_vector_length = 20
model = Sequential()

# input an integer matrix of size (batch, input_length)
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_vector_length, input_length=max_review_length))

model.add(Conv1D(filters=32, kernel_size=8, padding='same', activation='relu'))
model.add(Dropout(0.2))
model.add(Conv1D(filters=32, kernel_size=8, padding='same', activation='relu'))
model.add(MaxPooling1D(2))
model.add(Conv1D(filters=32, kernel_size=8, padding='same', activation='relu'))
model.add(Dropout(0.2))
model.add(Conv1D(filters=32, kernel_size=8, padding='same', activation='relu'))
model.add(MaxPooling1D(2))
# model.add(Dropout(0.5))

# model.add(Bidirectional(GRU(100, dropout=0.2, recurrent_dropout=0.2)))
model.add(CuDNNGRU(100, return_sequences=True))
model.add(Dropout(0.2))
model.add(CuDNNGRU(100, return_sequences=True))
model.add(Dropout(0.2))
model.add(CuDNNGRU(100, return_sequences=True))
model.add(Dropout(0.2))
model.add(CuDNNGRU(100))

model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(100, activation='relu'))

model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

save_model = ModelCheckpoint(dirs['model_dir'] + '/' + file_name + '-{epoch:02d}-{val_acc:.2f}.hdf5', monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1)
# tensorboard = TensorBoard(log_dir=dirs['tensorboard_dir'], histogram_freq=0, batch_size=batch_size, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)

print(model.summary())
model.fit(X_train, y_train, validation_split=0.1, epochs=10, batch_size=batch_size, callbacks=[save_model])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_115 (Embedding)    (None, 40, 20)            200000    
_________________________________________________________________
conv1d_117 (Conv1D)          (None, 40, 32)            5152      
_________________________________________________________________
dropout_82 (Dropout)         (None, 40, 32)            0         
_________________________________________________________________
conv1d_118 (Conv1D)          (None, 40, 32)            8224      
_________________________________________________________________
max_pooling1d_100 (MaxPoolin (None, 20, 32)            0         
_________________________________________________________________
conv1d_119 (Conv1D)          (None, 20, 32)            8224      
_________________________________________________________________
dropout_83 (Dropout)         (None, 20, 32)            0         
__________

KeyboardInterrupt: 

### if use BOW model

In [119]:
X_train = tokenizer.texts_to_matrix(X_train, mode='count')
print(X_train.shape) # sanity check

(200000, 5000)


In [121]:
# create the model
batch_size = 64
model = Sequential()
epochs = 10

# CNN before LSTM layer
# model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# model.add(MaxPooling1D(pool_size=2))

# total param should be around 213,301 to compare with LSTM
model.add(Dense(43, input_shape=X_train[0].shape))
model.add(Dense(43, input_shape=X_train[0].shape))
model.add(Dense(1, activation='sigmoid', input_shape=X_train[0].shape))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

save_model = ModelCheckpoint('punct_BOW_models/{epoch:02d}-{val_acc:.2f}.hdf5', monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1)
tensorboard = TensorBoard(log_dir='./punct_bow_logs', histogram_freq=0, batch_size=batch_size, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)

print(model.summary())
model.fit(X_train, y_train, validation_split=0.1, epochs=epochs, batch_size=batch_size, callbacks=[save_model, tensorboard])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_31 (Dense)             (None, 43)                215043    
_________________________________________________________________
dense_32 (Dense)             (None, 43)                1892      
_________________________________________________________________
dense_33 (Dense)             (None, 1)                 44        
Total params: 216,979
Trainable params: 216,979
Non-trainable params: 0
_________________________________________________________________
None
Train on 180000 samples, validate on 20000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f590907ca90>

## load model

#### single model

In [29]:
ensemble = False

model_name = '06-0.78.hdf5'
loaded_model = load_model(model_name)
print(model_name, 'loaded!')

06-0.78.hdf5 loaded!


In [30]:
# custom input
max_review_length = 40
x_submission = tokenizer.texts_to_matrix(['today is a good day, but it is hot', 'today is hot, but it is a good day'])   
pred = loaded_model.predict(x_submission)
print(pred)

[[ 0.66471052]
 [ 0.66471052]]


#### ensemble

In [534]:
ensemble = True

models = []
model_names = ['single_test/cnn-03-0.80.hdf5', 'single_test/cnn_GRU-03-0.80.hdf5',
               'single_test/cnn_pool_GRU-02-0.80.hdf5', 'single_test/cudnnGRU-05-0.80.hdf5',
              'single_test/four_layer_GRU-04-0.80.hdf5', 'single_test/two_layer_GRU-05-0.80.hdf5']
for model_name in model_names:
    loaded_model = load_model(model_name)
    models.append(loaded_model)
    print('finished loading', model_name)
print('FINISHED LOADING ALL MODELS!')

finished loading single_test/cnn-03-0.80.hdf5
finished loading single_test/cnn_GRU-03-0.80.hdf5
finished loading single_test/cnn_pool_GRU-02-0.80.hdf5
finished loading single_test/cudnnGRU-05-0.80.hdf5
finished loading single_test/four_layer_GRU-04-0.80.hdf5
finished loading single_test/two_layer_GRU-05-0.80.hdf5
FINISHED LOADING ALL MODELS!


## append semi-supervised data to training data

In [None]:
train_text = tokenizer.texts_to_sequences(train_text) 
train_text = sequence.pad_sequences(train_text, maxlen=max_review_length)

In [153]:
print(train_text.shape)

(1178614, 40)


In [155]:
preds = loaded_model.predict(train_text, batch_size=10000, verbose=1)



In [177]:
margin = 0.2 # 0.2:0.6

In [179]:
semi_supervised_X = []
semi_supervised_y = []
for idx, pred in enumerate(preds):
    if pred[0] > 1-margin:
        semi_supervised_X.append(train_text[idx])
        semi_supervised_y.append(1)
    elif pred[0] < margin:
        semi_supervised_X.append(train_text[idx])
        semi_supervised_y.append(0)
print('finished appending!')

finished appending!


In [186]:
print(np.array(semi_supervised_X).shape)
print(X_train.shape)

(712673, 40)
(200000, 40)


In [192]:
X_train = np.append(X_train, semi_supervised_X, axis=0) # 200,000 --> 912,673
y_train = np.append(y_train, semi_supervised_y)

In [195]:
y_train.shape

(912673,)

## read test file, generate prediction

In [519]:
x_submission = []

with open('data/testing_data.txt', 'rt') as testfile:
    reader = csv.reader(testfile, delimiter=',')
    next(reader) # skip headings
    for row in reader:
        # print(''.join(row[1:]))
        x_submission.append(''.join(row[1:]))
print('finished reading file')

finished reading file


In [520]:
max_review_length = 40
x_submission = tokenizer.texts_to_sequences(x_submission)   
x_submission = sequence.pad_sequences(x_submission, maxlen=max_review_length)
print(x_submission.shape) # sanity check

(200000, 40)


In [568]:
x_submission[100]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   51,   10,   51,   60,
        732,    9,   31,    2,   15, 1722,   35,  237,  630,   38, 1722,
          1,   65,    3,  675,    6,   75, 3513], dtype=int32)

In [569]:
if ensemble:
    preds = np.zeros((x_submission.shape[0], 1))
    for loaded_model in models:
        preds += loaded_model.predict(x_submission, batch_size=1024, verbose=1) / len(models)
        # print('added prediction from', loaded_model)
    print('FINISHED PREDICTION!')
else:
    preds = loaded_model.predict(x_submission, batch_size=1024, verbose=1)
    print('generated predictions with ', loaded_model)

generated predictions with  <keras.models.Sequential object at 0x7f568f4f0dd8>


In [570]:
print(preds)

[[ 0.07925569]
 [ 0.13937001]
 [ 0.05694776]
 ..., 
 [ 0.65190792]
 [ 0.09418858]
 [ 0.16846545]]


## write submission file

In [571]:
with open('biggie_net.csv', 'wt') as outfile:
    test_writer = csv.writer(outfile)
    test_writer.writerow(['id','label'])
    
    counter = 0
    for i in (preds > 0.5):
        test_writer.writerow([counter, int(i)])
        counter += 1
    
print('finished writing submission!')

finished writing submission!
