## Create the same train/test sets as Yoon Kim


In [None]:
import pickle
import numpy as np


def get_idx_from_sent(sent, word_idx_map, max_l=51, k=300, filter_h=5):
    """
    Map a whitespace-separated sentence to a zero-padded list of word indices.

    The result starts with ``filter_h - 1`` zeros, continues with the index
    of every token present in ``word_idx_map`` (tokens that are missing from
    the map are dropped), and is then right-padded with zeros until it holds
    ``max_l + 2 * (filter_h - 1)`` entries. Sentences that are already longer
    than that are returned as-is, without truncation. ``k`` is accepted for
    signature compatibility but is not used.
    """
    margin = filter_h - 1
    indices = [0] * margin
    indices.extend(
        word_idx_map[token] for token in sent.split() if token in word_idx_map
    )
    shortfall = max_l + 2 * margin - len(indices)
    indices.extend([0] * shortfall)  # no-op when shortfall <= 0
    return indices

def make_idx_data_cv(revs, word_idx_map, cv, max_l=51, k=300, filter_h=5):
    """
    Build the train and test index matrices for one cross-validation fold.

    Every entry of ``revs`` is converted with ``get_idx_from_sent`` and its
    label ``rev["y"]`` is appended as the last element of the row. Entries
    whose ``rev["split"]`` equals ``cv`` go to the test matrix, all others
    go to the train matrix. Returns ``[train, test]`` as integer arrays.
    """
    held_out, training = [], []
    for rev in revs:
        row = get_idx_from_sent(rev["text"], word_idx_map, max_l, k, filter_h) + [rev["y"]]
        destination = held_out if rev["split"] == cv else training
        destination.append(row)
    return [np.array(training, dtype="int"), np.array(held_out, dtype="int")]

# revs  = [ {
#    num_words: number of words in sentence
#    split: which split of the cv
#    text: text to classify
#    y: 1 for positive, 0 for negative
#}]

# W = Google pretrained word2vec matrix; each row represents a word
# W2 = randomly generated embedding matrix; each row represents a word

# word_idx_map: word -> row index in the embedding matrices
# vocab = set of all words in dataset
# NOTE(review): unpickling can execute arbitrary code; only load a
# data/mr.p file that comes from a trusted source.
# The context manager closes the file handle as soon as the data is loaded.
with open("data/mr.p", "rb") as mr_file:
    mr = pickle.load(mr_file)
revs, W, W2, word_idx_map, vocab = mr[0], mr[1], mr[2], mr[3], mr[4]
print( "data loaded!" )

In [None]:
from sklearn import preprocessing
# L2-normalised copy of W (norm='l2'). Presumably each row, i.e. each word
# vector, is scaled to unit length (sklearn's default axis) -- TODO confirm.
W3 = preprocessing.normalize(W, norm='l2')

In [None]:
import keras
import os

# Configure CUDA device selection through environment variables.
# An empty CUDA_VISIBLE_DEVICES presumably hides every GPU so the run stays
# on the CPU -- confirm against the backend in use.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "",
})

In [None]:
from keras.layers import Input, Dense, Flatten, Embedding
from keras.models import Model
from keras.layers import Conv1D, GlobalMaxPooling1D, Input, Add, Dropout
from keras.layers.merge import Concatenate

# Seed NumPy's global RNG so repeated runs draw the same random numbers.
np.random.seed(2017)

# The three candidate embedding matrices must have identical shapes so that
# any of them can be passed as the initial embedding.
assert W.shape == W2.shape and W2.shape == W3.shape
(vocab_size, vec_size) = W.shape

# Number of token indices per input row (matches train_X.shape[1]).
sent_length = 64

# Convolution settings: one Conv1D branch per kernel size.
num_filters = 100
kernel_sizes = [3, 4, 5]

# Training hyper-parameters.
batch_size = 50
# NOTE(review): dropout_rate and l2_constraint are declared but do not appear
# to be applied by the model code visible in this file.
dropout_rate = 0.5
l2_constraint = 3

# TODO: check all layer initializers

def conv(x):
    """
    Apply one Conv1D + global-max-pooling branch per entry of ``kernel_sizes``
    to ``x`` and concatenate the pooled outputs of all branches.
    """
    def branch(width):
        # "valid" padding: the convolution adds no padding of its own.
        feature_map = Conv1D(
            filters=num_filters,
            kernel_size=width,
            padding="valid",
            activation="relu",
            strides=1,
        )(x)
        return GlobalMaxPooling1D()(feature_map)

    pooled = [branch(width) for width in kernel_sizes]
    return Concatenate()(pooled)

def get_model( static=True, initial_embedding=W ):
    """
    Build and compile the single-channel CNN sentence classifier.

    Parameters:
        static: when True the embedding layer is frozen (``trainable=False``);
            when False the embedding weights are updated during training.
        initial_embedding: matrix used to initialise the embedding layer;
            it is passed as the layer's only weight array.

    Returns:
        A compiled Keras ``Model`` that takes rows of ``sent_length`` word
        indices and outputs a single sigmoid probability.
    """
    # Take the input width from sent_length (instead of a second literal 64)
    # so it cannot drift from the Embedding's input_length below.
    inputs = Input(shape=(sent_length,))
    x = Embedding( input_dim=vocab_size, 
              output_dim=vec_size, 
              input_length=sent_length, 
              weights=[ initial_embedding ], 
              trainable=(not static) )(inputs)
    x = conv( x )
    # TODO: weight regularisation is currently disabled
    # (was: kernel_regularizer=regularizers.l2(3.)).
    predictions = Dense(1, kernel_initializer='normal', 
            activation='sigmoid')(x)
    model = Model( inputs=inputs, outputs=predictions )
    model.compile( optimizer='Adadelta',
              loss='binary_crossentropy',
              metrics=['binary_accuracy'] )
    return model

def multi_chnnel_conv( x1, x2 ):
    """
    Two-channel variant of the convolution stack.

    For every entry of ``kernel_sizes`` a single Conv1D layer is created and
    applied to both ``x1`` and ``x2`` (so the two channels share filter
    weights). Each result is global-max-pooled, the two pooled tensors are
    summed, and the per-kernel-size sums are concatenated.
    """
    merged = []
    for width in kernel_sizes:
        shared = Conv1D(
            filters=num_filters,
            kernel_size=width,
            padding="valid",
            activation="relu",
            strides=1,
        )
        # Convolve both channels first, then pool each feature map.
        feature_maps = [shared(channel) for channel in (x1, x2)]
        pooled = [GlobalMaxPooling1D()(fm) for fm in feature_maps]
        merged.append(Add()(pooled))
    return Concatenate()(merged)

def get_multi_channel_model( initial_embedding=W ):
    """
    Build and compile the two-channel CNN sentence classifier.

    Both channels are embedding layers initialised from the same
    ``initial_embedding`` matrix: one is frozen (``trainable=False``), the
    other is trainable. Their outputs go through ``multi_chnnel_conv`` and
    a single sigmoid unit.

    Parameters:
        initial_embedding: matrix used to initialise both embedding layers.

    Returns:
        A compiled Keras ``Model`` that takes rows of ``sent_length`` word
        indices and outputs a single sigmoid probability.
    """
    # Take the input width from sent_length (instead of a second literal 64)
    # so it cannot drift from the Embeddings' input_length below.
    inputs = Input(shape=(sent_length,))
    x_static = Embedding( input_dim=vocab_size, 
              output_dim=vec_size, 
              input_length=sent_length, 
              weights=[ initial_embedding ], 
              trainable=False )(inputs)
    
    x_non_static = Embedding( input_dim=vocab_size, 
              output_dim=vec_size, 
              input_length=sent_length, 
              weights=[ initial_embedding ], 
              trainable=True )(inputs)
    
    x = multi_chnnel_conv( x_static, x_non_static )
    # TODO: weight regularisation is currently disabled
    # (was: kernel_regularizer=regularizers.l2(3.)).
    predictions = Dense(1, kernel_initializer='normal', 
            activation='sigmoid')(x)
    model = Model( inputs=inputs, outputs=predictions )
    model.compile( optimizer='Adadelta',
              loss='binary_crossentropy',
              metrics=['binary_accuracy'] )
    return model

In [None]:
# Cross-validation over folds 0..9: fold i is held out as the test set and
# the remaining reviews are used for training.
scores = []
for i in range(10):
    train, test = make_idx_data_cv(revs, word_idx_map, i, max_l=56, k=300, filter_h=5)
    
    # X has 64 columns: pad + 56 + pad, where pad = filter_h - 1 = 5 - 1 = 4.
    # The last column of each matrix is the label.
    train_X, train_y = train[:,:-1], train[:,-1]
    test_X, test_y = test[:,:-1], test[:,-1]
    
    # A fresh model is built for every fold so no weights carry over.
    model = get_multi_channel_model( initial_embedding=W )
    # Use the shared batch_size setting rather than a second literal 50.
    # NOTE: validation_data is the held-out fold itself, so the per-epoch
    # validation figures are computed on the test fold.
    model.fit( train_X, train_y, batch_size=batch_size, epochs=25, validation_data=(test_X, test_y))
    score = model.evaluate( test_X, test_y )
    # score[0] is the loss; score[1] is the compiled binary_accuracy metric.
    scores.append( score[1] )
print('final cross validation scores: {}'.format(sum(scores)/len(scores)))