In [200]:
import csv
import operator
import os
import pickle
import sys

import numpy as np
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.layers import Dense
from keras.layers import LSTM, CuDNNLSTM
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.models import load_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

In [202]:
# Disabled snippet, kept as a bare string literal so it is NOT executed: if enabled it
# would pickle the `tokenizer` object to 'tokenizer.pickle'. Because the string is the
# last expression of the cell, the notebook simply echoes it back as the cell output.
'''with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)'''

"with open('tokenizer.pickle', 'wb') as handle:\n    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)"

In [2]:
# Vocabulary cap shared by the Tokenizer (`num_words`) and the Embedding layer (`input_dim`).
vocab_size = 5000

In [122]:
# Word-level tokenizer capped at `vocab_size` words: lower-cases the text, splits on a
# single space and removes the punctuation / tab / newline characters listed in `filters`.
tokenizer = Tokenizer(num_words=vocab_size, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ", char_level=False)
# Alternative configuration with an empty filter list (punctuation is kept):
# tokenizer = Tokenizer(num_words=vocab_size, filters='', lower=True, split=" ", char_level=False)

In [133]:
# Read the unlabelled corpus: one sentence per line, trailing whitespace removed.
train_text = []
with open('data/training_nolabel.txt', 'rt') as unlabeled_file:
    train_text.extend(line.rstrip() for line in unlabeled_file)

In [124]:
# Build the tokenizer vocabulary from the unlabelled sentences in `train_text`.
# NOTE: this must run while `train_text` still holds raw strings; a later cell overwrites
# `train_text` with padded integer sequences.
tokenizer.fit_on_texts(train_text) # around 30 seconds

In [149]:
# Load the labelled training set.
# Each line is space-separated: field 0 is the label, field 1 is skipped (presumably a
# separator token -- TODO confirm against the data file), and the remaining fields are
# the words of the sentence.
X_train = []  # raw sentences (str)
y_train = []  # integer class labels
with open('data/training_label.txt', 'rt') as trainfile:
    # QUOTE_NONE: the file is raw text, not real CSV. With the default quoting a '"' at
    # the start of a token would be interpreted as a CSV quote and could be stripped or
    # even swallow the following lines into one row.
    reader = csv.reader(trainfile, delimiter=' ', quoting=csv.QUOTE_NONE)
    for row in reader:
        if not row:
            continue  # tolerate blank lines instead of failing on row[0]
        X_train.append(' '.join(row[2:]))
        # Store the label as an int so Keras receives numeric targets, not strings.
        y_train.append(int(row[0]))

### Option A: if using the word-embedding (LSTM) model

In [150]:
# Replace every raw sentence in X_train with its sequence of integer word ids.
# NOTE: X_train is overwritten, so this cell is not idempotent -- re-run the cell that
# loads the labelled data before running this one again.
X_train = tokenizer.texts_to_sequences(X_train)

In [151]:
# truncate and pad input sequences
# Every sequence is forced to exactly `max_review_length` ids so the batch is a
# rectangular matrix; the same length is reused for the unlabelled and test data.
max_review_length = 40 # max lengths -- training_label:39    testing_data:39
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
print(X_train.shape) # sanity check

(200000, 40)


In [198]:
# Build and train the Embedding + LSTM binary classifier.
# Expects X_train to be the padded integer matrix (n_samples, max_review_length) and
# y_train to hold one 0/1 label per sample.
batch_size = 64
embedding_vector_length = 32
checkpoint_dir = 'punct_models'

model = Sequential()
# input an integer matrix of size (batch, input_length)
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_vector_length, input_length=max_review_length))
# model.add(Dropout(0.2))

# CNN before LSTM layer (disabled; would also need Conv1D / MaxPooling1D imports)
# model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# model.add(MaxPooling1D(pool_size=2))

# dropout for configuring the input dropout and recurrent_dropout for configuring the recurrent dropout
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# model.add(Dropout(0.2))

model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# The checkpoint callback writes one file per epoch into `checkpoint_dir`; create the
# directory up front so the first save does not fail on a missing path.
os.makedirs(checkpoint_dir, exist_ok=True)
save_model = ModelCheckpoint(checkpoint_dir + '/{epoch:02d}-{val_acc:.2f}.hdf5', monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1)
tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0, batch_size=batch_size, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)

# summary() prints the table itself and returns None, so it must not be wrapped in
# print() (that produced a stray "None" line). Author's note from a previous run: loss 0.1768
model.summary()

# Labels may arrive as a Python list or as strings read from the text file; hand Keras
# an explicit numeric array without modifying y_train itself.
train_labels = np.asarray(y_train, dtype='float32')
model.fit(X_train, train_labels, validation_split=0.1, epochs=3, batch_size=batch_size, callbacks=[save_model, tensorboard])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 40, 32)            160000    
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_34 (Dense)             (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
Train on 821405 samples, validate on 91268 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f5d79d4dcc0>

### Option B: if using the bag-of-words (BOW) model

In [119]:
# Bag-of-words alternative: turn each sentence into a vector of per-word counts.
# NOTE(review): this expects X_train to still contain raw sentences (i.e. the
# word-embedding cells above were NOT run on it) and overwrites X_train with a dense
# matrix -- (200000, 5000) in the recorded run, which can be very memory hungry.
X_train = tokenizer.texts_to_matrix(X_train, mode='count')
print(X_train.shape) # sanity check

(200000, 5000)


In [121]:
# Build and train the bag-of-words baseline: a small fully connected network on the
# per-word count vectors produced by tokenizer.texts_to_matrix.
batch_size = 64
epochs = 10
checkpoint_dir = 'punct_BOW_models'

model = Sequential()

# CNN before LSTM layer (disabled; not applicable to the BOW input)
# model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# model.add(MaxPooling1D(pool_size=2))

# total param should be around 213,301 to compare with LSTM
# The hidden layers need a non-linearity: Dense defaults to a linear activation, and a
# stack of purely linear layers collapses into a single linear map (i.e. plain logistic
# regression). Adding 'relu' does not change the parameter count.
# Only the first layer needs input_shape; Keras infers it for the following layers.
model.add(Dense(43, activation='relu', input_shape=X_train[0].shape))
model.add(Dense(43, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Create the checkpoint directory up front so the per-epoch save cannot fail on a
# missing path.
os.makedirs(checkpoint_dir, exist_ok=True)
save_model = ModelCheckpoint(checkpoint_dir + '/{epoch:02d}-{val_acc:.2f}.hdf5', monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1)
tensorboard = TensorBoard(log_dir='./punct_bow_logs', histogram_freq=0, batch_size=batch_size, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)

# summary() prints the table itself and returns None; print() around it only adds "None".
model.summary()

# Hand Keras an explicit numeric label array (y_train may be a list and/or hold strings).
train_labels = np.asarray(y_train, dtype='float32')
model.fit(X_train, train_labels, validation_split=0.1, epochs=epochs, batch_size=batch_size, callbacks=[save_model, tensorboard])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_31 (Dense)             (None, 43)                215043    
_________________________________________________________________
dense_32 (Dense)             (None, 43)                1892      
_________________________________________________________________
dense_33 (Dense)             (None, 1)                 44        
Total params: 216,979
Trainable params: 216,979
Non-trainable params: 0
_________________________________________________________________
None
Train on 180000 samples, validate on 20000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f590907ca90>

## load model

In [126]:
# Load a previously saved checkpoint for inference / pseudo-labelling.
# NOTE(review): the path points at 'models/', whereas the training cells in this
# notebook write their checkpoints to 'punct_models/' and 'punct_BOW_models/' -- confirm
# this is the intended file.
model_name = 'models/03-0.80.hdf5'
loaded_model = load_model(model_name)
print(model_name, 'loaded!')

models/03-0.80.hdf5 loaded!


## Append pseudo-labelled (semi-supervised) data to the training data

In [None]:
# Encode and pad the unlabelled sentences exactly like the labelled training data.
# NOTE: `train_text` is overwritten (raw strings -> padded id matrix), so this cell is not
# idempotent and the tokenizer can no longer be fitted on `train_text` afterwards.
train_text = tokenizer.texts_to_sequences(train_text) 
train_text = sequence.pad_sequences(train_text, maxlen=max_review_length)

In [153]:
# Sanity check: (number of unlabelled sentences, max_review_length).
print(train_text.shape)

(1178614, 40)


In [155]:
# Score every unlabelled sentence with the loaded model; `preds` is consumed by the
# pseudo-labelling loop below, which reads the score as pred[0] for each row.
preds = loaded_model.predict(train_text, batch_size=10000, verbose=1)



In [177]:
# Confidence margin for pseudo-labelling: a prediction above (1 - margin) is taken as
# class 1, a prediction below margin as class 0, anything in between is discarded.
# (The trailing "0.2:0.6" is the author's own note; its meaning is not evident from the code.)
margin = 0.2 # 0.2:0.6

In [179]:
# Keep only the unlabelled samples the model is confident about and give them a hard
# pseudo-label: 1 when the score exceeds (1 - margin), 0 when it is below margin.
upper_threshold = 1-margin
semi_supervised_X, semi_supervised_y = [], []
for sample, pred in zip(train_text, preds):
    score = pred[0]
    if score > upper_threshold:
        pseudo_label = 1
    elif score < margin:
        pseudo_label = 0
    else:
        # Not confident enough either way: leave this sample out.
        continue
    semi_supervised_X.append(sample)
    semi_supervised_y.append(pseudo_label)
print('finished appending!')

finished appending!


In [186]:
# Compare the number of pseudo-labelled samples with the size of the labelled set;
# both must have the same number of columns before they can be stacked.
print(np.array(semi_supervised_X).shape)
print(X_train.shape)

(712673, 40)
(200000, 40)


In [192]:
# Extend the labelled training set with the confident pseudo-labelled samples.
# NOTE: this cell is not idempotent -- running it twice appends the samples twice.
pseudo_X = np.asarray(semi_supervised_X)
pseudo_y = np.asarray(semi_supervised_y, dtype=int)
if len(pseudo_y) > 0:
    # Guarded because stacking an empty selection onto a 2-D matrix raises a
    # dimension-mismatch error. Recorded run: 200,000 --> 912,673 rows.
    X_train = np.concatenate([X_train, pseudo_X], axis=0)
# Force an integer dtype on both sides: the original labels may be strings read from the
# text file, and mixing them with int pseudo-labels would yield a string-typed array.
y_train = np.concatenate([np.asarray(y_train, dtype=int), pseudo_y])

In [195]:
# Sanity check: number of labels after appending the pseudo-labelled samples.
y_train.shape

(912673,)

## read test file, generate prediction

In [91]:
# Read the test sentences. The first line is a header; in every following line the
# first comma-separated field is skipped (presumably the sample id -- TODO confirm) and
# the rest of the line is the sentence.
x_submission = []

with open('data/testing_data.txt', 'rt') as testfile:
    # QUOTE_NONE: the sentences are raw text, so '"' must not be treated as a CSV quote.
    reader = csv.reader(testfile, delimiter=',', quoting=csv.QUOTE_NONE)
    next(reader, None) # skip headings (tolerates an empty file)
    for row in reader:
        # Sentences may themselves contain commas; re-join with ',' so they are restored
        # rather than dropped. Joining with '' would glue e.g. "1,000" into "1000", which
        # the tokenizer then treats differently from the training text.
        # Blank lines are kept as empty sentences so row order stays aligned with the ids.
        x_submission.append(','.join(row[1:]))
print('finished reading file')

finished reading file


In [92]:
# Encode and pad the test sentences the same way as the training data.
# `max_review_length` is re-declared here with the same value (40) used for training; it
# must match the input length the model was built with.
max_review_length = 40
x_submission = tokenizer.texts_to_sequences(x_submission)   
x_submission = sequence.pad_sequences(x_submission, maxlen=max_review_length)

In [93]:
# Spot-check one encoded test sentence (row 20) to see the padding and the word ids.
x_submission[20]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0, 1181,    8,  314,   12,    1,   68,    5,  208,
          1,  812, 1347,  141,  250,  800,   61], dtype=int32)

In [94]:
# Sanity check: (number of test sentences, max_review_length).
x_submission.shape

(200000, 40)

In [95]:
# Score the test sentences with the loaded model.
# NOTE: this reuses the name `preds`, replacing the predictions computed earlier for the
# unlabelled data.
preds = loaded_model.predict(x_submission, batch_size=1024, verbose=1)
print('generated predictions with ', loaded_model)

generated predictions with  <keras.models.Sequential object at 0x7f59198abe80>


In [111]:
# Ad-hoc probe: model output for the single-character input '#'.
# NOTE(review): '#' is listed in the tokenizer's `filters` above, so with that tokenizer
# the encoded input is presumably all padding and this shows the model's score for an
# empty sentence -- confirm before drawing conclusions from the value.
yeh = tokenizer.texts_to_sequences(['#'])
yeh = sequence.pad_sequences(yeh, maxlen=max_review_length)
loaded_model.predict(yeh, batch_size=1024, verbose=1)



array([[ 0.58626825]], dtype=float32)

## write submission file

In [96]:
# Write the submission file: a header row followed by one "id,label" row per test
# sample, where id is the 0-based row index and label is 1 when the score exceeds 0.5.
# Assumes `preds` holds one score per sample (a single output column); the scores are
# flattened so each label is a plain Python int rather than a 1-element array.
labels = (np.asarray(preds).reshape(-1) > 0.5).astype(int).tolist()

# newline='' is what the csv module requires for files it writes to; without it an
# extra '\r' is emitted on platforms that translate line endings.
with open('punct_first_model.csv', 'wt', newline='') as outfile:
    test_writer = csv.writer(outfile)
    test_writer.writerow(['id','label'])
    test_writer.writerows(enumerate(labels))

print('finished writing submission!')

finished writing submission!
