In [9]:
import string
import numpy as np
import scipy as sc
import pandas as pd
import pickle
import csv
import random
import matplotlib.pyplot as plt
from sklearn import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.metrics import *
from sklearn.externals import *
from sklearn.utils import shuffle
from sklearn.naive_bayes import *
import tensorflow as tf
from tensorflow import keras
from nltk.corpus import stopwords
from nltk import *
from textblob import TextBlob, Word
import subprocess
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text

from helpers import getTokens, preprocessData, create_csv_submission

In [4]:
## Global Variables

## Hyper-Parameters -- everything that defines one training run
FEATURE_SPACE = 10 ## size of each embedding vector
CNN_LAYERS = 4 ## number of stacked convolution layers
EPOCHS = 2
BATCH_SIZE = 128
FILTERS = 30
KERNEL_SIZE = 1
DENSE_UNITS = 250
ACTIVATION_FUNCTION = "sigmoid" ## name of the activation, looked up when the model is built
CLEAN = False ## whether the dataset is cleaned with the cleaning function we have defined

## Run switches
## TRAIN_MODEL: True -> fit the model in this notebook, False -> load the saved model to generate predictions
TRAIN_MODEL = True
## USE_VALID: True -> use validation data
USE_VALID = True

## Input file paths
DATA_POS = 'data/train_pos_full.txt'
DATA_NEG = 'data/train_neg_full.txt'
VOCAB = 'tools/vocab.pkl'
TEST = 'data/test_data.txt'

## Output file paths
## All three names embed the same hyper-parameter values, so the argument tuple is built once.
_RUN_PARAMS = (FILTERS, KERNEL_SIZE, DENSE_UNITS, FEATURE_SPACE, CNN_LAYERS, EPOCHS, BATCH_SIZE, ACTIVATION_FUNCTION, CLEAN)
PREDICTION_PATH = "CNN_pred_F%d_K%d_D%d_F%d_L%d_E%d_B%d_%s_c%s.csv" % _RUN_PARAMS
MODEL_PATH = "CNN_model_F%d_K%d_D%d_F%d_L%d_E%d_B%d_%s_c%s.h5" % _RUN_PARAMS
HISTORY_PATH = "CNN_hist_F%d_K%d_D%d_F%d_L%d_E%d_B%d_%s_c%s.pkl" % _RUN_PARAMS

## Convolutional Neural Network with TensorFlow

In [7]:
## Load the pickled vocabulary.
## NOTE: pickle.load can execute arbitrary code -- only point VOCAB at a file you trust.
with open(VOCAB, 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

## Tokenise the positive, negative and test files against that vocabulary (same order as before)
pos_tok, neg_tok, test_data = (
    getTokens(path, vocab) for path in (DATA_POS, DATA_NEG, TEST)
)

In [10]:
## Prepare data
## preprocessData comes from helpers.py; presumably it merges the positive and negative
## token lists into one data set with matching labels -- confirm against helpers.py
train_data, train_labels = preprocessData(pos_tok, neg_tok)

if USE_VALID:
    ## Hold out part of the training data as a validation set (default split size).
    ## A fixed random_state makes the split -- and therefore the reported validation
    ## metrics -- identical from run to run.
    ## NOTE(review): any shuffling done inside preprocessData is not covered by this seed.
    SPLIT_SEED = 42
    train_data, validation_data, train_labels, validation_labels = train_test_split(
        train_data, train_labels, random_state=SPLIT_SEED)

In [13]:
## Length of the longest sequence in the training split; every input is brought to this length
max_length = max(len(tokens) for tokens in train_data)

## Add padding for sequences
## Padding is prepended so that all inputs share the same length
train_data = sequence.pad_sequences(train_data, maxlen=max_length)

if USE_VALID:
    validation_data = sequence.pad_sequences(validation_data, maxlen=max_length)

test_data = sequence.pad_sequences(test_data, maxlen=max_length)

In [14]:
## Build Model

## Define activation functions we may use
FUNCS = {"sigmoid":tf.nn.sigmoid, "relu":tf.nn.relu}

## Activation for the hidden (convolution and dense) layers, chosen by ACTIVATION_FUNCTION
hidden_activation = FUNCS[ACTIVATION_FUNCTION]

## Embedding layer: Embedding is included as part of the NN instead of a pre-processing step
## NOTE(review): input_dim is len(vocab), so every token id must lie in [0, len(vocab) - 1];
## confirm that getTokens never emits a larger id.
model = keras.Sequential()
model.add(keras.layers.Embedding(len(vocab), FEATURE_SPACE))

## Add CNN-POOL-DENSE layer
for _layer in range(CNN_LAYERS):
    model.add(keras.layers.Convolution1D(filters = FILTERS, kernel_size = KERNEL_SIZE, activation=hidden_activation))

model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(DENSE_UNITS, activation=hidden_activation))

## Output layer: always sigmoid, independent of ACTIVATION_FUNCTION.
## The model is trained with binary_crossentropy and its predictions are thresholded at 0.5,
## both of which need an output in [0, 1]; a relu output would be unbounded.
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 10)          634850    
_________________________________________________________________
conv1d (Conv1D)              (None, None, 30)          330       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 30)          930       
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 30)          930       
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 30)          930       
_________________________________________________________________
global_average_pooling1d (Gl (None, 30)                0         
_________________________________________________________________
dense (Dense)                (None, 250)               7750      
__________

In [14]:
## Compile model
## Adam optimiser with its default settings, binary cross-entropy loss, accuracy as the tracked metric
adam_optimizer = tf.train.AdamOptimizer()
model.compile(
    loss='binary_crossentropy',
    optimizer=adam_optimizer,
    metrics=['accuracy'],
)

In [16]:
## Train the model if requested, otherwise load a previously saved one
if TRAIN_MODEL:
    history = model.fit(train_data, train_labels, epochs=EPOCHS, batch_size=BATCH_SIZE)
    model.save(MODEL_PATH)
    ## pickle.dump expects a writable binary file object, not a path string;
    ## passing HISTORY_PATH directly raised a TypeError after training had finished.
    with open(HISTORY_PATH, 'wb') as history_file:
        pickle.dump(history.history, history_file)
else:
    model = keras.models.load_model(MODEL_PATH)
    ## Re-compile with the same settings used for training
    ## (presumably because the TF optimizer is not restored from the .h5 file -- confirm)
    model.compile(optimizer=tf.train.AdamOptimizer(), loss='binary_crossentropy', metrics=['accuracy'])

Epoch 1/2
 137088/1875000 [=>............................] - ETA: 7:05 - loss: 0.6937 - acc: 0.5044

KeyboardInterrupt: 

In [None]:
## Validation metrics
## Only possible when a validation split was held out (USE_VALID);
## prints whatever model.evaluate returns for the compiled loss and metrics
if USE_VALID:
    results = model.evaluate(x=validation_data, y=validation_labels)
    print(results)

In [None]:
## Generate predictions
## Raw model outputs for the padded test sequences; they are thresholded in the next cell
y_pred = model.predict(x=test_data)

In [None]:
## Generate submission
## Threshold the model outputs in one pass: > 0.5 -> 1, everything else -> 0 (dtype preserved)
y_pred = (y_pred > 0.5).astype(y_pred.dtype)

## Ids are 1-based, one per prediction. Deriving the range from y_pred replaces the
## hard-coded range(1,10001), which only matched a test set of exactly 10000 rows.
## NOTE(review): the meaning of the final False argument is defined in helpers.py -- confirm there.
submission_ids = range(1, len(y_pred) + 1)
create_csv_submission(submission_ids, y_pred, PREDICTION_PATH, False)
print("Predict at", PREDICTION_PATH)