In [1]:
import keras

Using TensorFlow backend.


In [2]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt 
%matplotlib inline

from keras.models import Model, Sequential
from keras.layers import Dense, Activation, Conv1D, MaxPooling1D, Flatten, BatchNormalization
from gensim.models import KeyedVectors
import gensim.downloader as api

In [5]:
# Load the training questions, naming the three columns explicitly.
# NOTE(review): with header=None, a header line in train.csv (if it has one) is read as
# data row 0 — the subset cell starts its slice at row 1, presumably for that reason; confirm.
column_names = ['qid', 'question_text', 'target']
df = pd.read_csv(
    './train.csv',
    names=column_names,
    header=None,
    na_values='?',
    low_memory=False,
)

In [6]:
# Load the pre-trained word2vec vectors (binary format) used to embed each word.
path_to_embeddings = "./GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"
embeddings_index = KeyedVectors.load_word2vec_format(
    path_to_embeddings,
    binary=True,
)

In [7]:
# Work on a 2,000-question subset. Row 0 is skipped: the CSV is loaded with header=None, so
# row 0 presumably holds the file's header line rather than a question — TODO confirm.
# Labels are cast to int explicitly so the model never receives string/object targets
# (a header row parsed as data leaves the whole column as text).
Y = np.array(df['target'][1:2001], dtype=int)
X = np.array(df['question_text'][1:2001])

# 75/25 split with a fixed seed so the partition is reproducible.
X_words, X_test_words, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

In [21]:
# Sanity-check the split sizes and the size of the full frame.
# NOTE(review): `df.head(6)` is evaluated but never rendered — a notebook cell only
# displays its last expression, which here is `df.shape`.
df.head(6)
for subset in (X_words, X_test_words):
    print(subset.shape)
df.shape

(1500,)
(500,)


(1306123, 3)

In [9]:
def fix_matrix(resulting_array, max_row_length):
    """Widen a 2-D matrix to ``max_row_length`` columns with centred zero padding.

    Parameters
    ----------
    resulting_array : numpy.ndarray
        Matrix whose rows are to be widened. An empty array with no rows
        (for example ``np.array([])``) is accepted as the "nothing yet" case.
    max_row_length : int
        Number of columns the result should have.

    Returns
    -------
    numpy.ndarray
        * shape ``(0, max_row_length)`` when the input holds no rows;
        * the input itself, untouched, when it is already at least
          ``max_row_length`` wide (same rule as ``fix_row_length``);
        * otherwise a zero-padded copy with ``max_row_length`` columns. When
          the number of missing columns is odd, the extra zero goes on the left.
    """
    # A 2-D array with rows but zero columns also has size 0; it must be padded,
    # not reshaped, or its rows would silently disappear.
    has_zero_width_rows = resulting_array.ndim == 2 and resulting_array.shape[0] > 0
    if resulting_array.size == 0 and not has_zero_width_rows:
        return np.reshape(resulting_array, (0, max_row_length))

    diff = max_row_length - resulting_array.shape[1]
    if diff <= 0:
        # Already wide enough; np.pad would raise on a negative width.
        return resulting_array

    right = diff // 2
    left = diff - right  # odd difference: the extra column goes on the left
    return np.pad(resulting_array, ((0, 0), (left, right)), 'constant', constant_values=(0, 0))

In [10]:
def fix_row_length(array, max_row_length):
    """Zero-pad a vector, or the rows of a matrix, up to ``max_row_length``.

    ``array`` must be a NumPy array. For a 1-D array its own length is padded;
    otherwise the second axis is padded. Padding is split across both ends, with
    the extra zero on the left when the shortfall is odd. Input that is already
    long enough is returned unchanged.
    """
    is_flat = len(array.shape) < 2
    current_length = array.shape[0] if is_flat else array.shape[1]

    missing = max_row_length - current_length
    if missing <= 0:
        return array

    right = missing // 2
    left = missing - right
    pad_width = (left, right) if is_flat else ((0, 0), (left, right))
    return np.pad(array, pad_width, 'constant', constant_values=(0, 0))

In [11]:
# Turn each question into one fixed-width row:
#   1. look up the embedding of every word that the embedding index knows,
#   2. concatenate those embeddings, in word order, into one flat vector,
#   3. zero-pad every vector on both sides to the length of the longest one.

def vectorize_questions(words):
    """Embed each question as a zero-padded concatenation of its word vectors.

    Question marks are stripped, the text is split on whitespace, and words
    missing from the module-level ``embeddings_index`` are skipped.

    Parameters
    ----------
    words : iterable of str
        The questions to embed, one string per question.

    Returns
    -------
    numpy.ndarray
        Float matrix of shape ``(number of questions, longest vector length)``.
        Shorter rows are centred with zeros; when the shortfall is odd the
        extra zero goes on the left. A question with no known word becomes an
        all-zero row. Empty input yields shape ``(0, 0)``.

    Notes
    -----
    All rows are collected first and the matrix is allocated once. Padding is
    therefore computed once per row against the final width, so a row's
    placement does not depend on the order of the questions. For embeddings of
    even size (such as 300) this matches step-by-step re-padding exactly.
    """
    rows = []
    for sentence in words:
        tokens = sentence.replace("?", "").split()
        known_vectors = [
            np.asarray(embeddings_index[token])
            for token in tokens
            if token in embeddings_index
        ]
        # No known word at all: keep the question as an empty vector so it still
        # gets a (zero) row and the output stays aligned with the labels.
        rows.append(np.concatenate(known_vectors) if known_vectors else np.zeros(0))

    if not rows:
        return np.zeros((0, 0))

    max_row_length = max(row.shape[0] for row in rows)
    resulting_array = np.zeros((len(rows), max_row_length))
    for index, row in enumerate(rows):
        missing = max_row_length - row.shape[0]
        left = missing - missing // 2  # odd shortfall: the extra zero goes on the left
        resulting_array[index, left:left + row.shape[0]] = row
    return resulting_array


In [12]:
# Pre-process the training questions into a zero-padded embedding matrix.
X_train = vectorize_questions(X_words)

In [13]:
# Same pre-processing for the held-out questions. The resulting width can differ from
# X_train's, because each call pads to the longest question it was given.
X_test = vectorize_questions(X_test_words)

In [14]:
# Bring the train and test matrices to a common width so both fit the same input layer.
print(X_train.shape)
print(X_test.shape)

# Compare column counts only. Comparing the full shape tuples is lexicographic, so the
# row counts would decide which matrix gets padded and the narrower one could be left as is.
target_width = max(X_train.shape[1], X_test.shape[1])
X_train = fix_row_length(X_train, target_width)  # returned unchanged if already wide enough
X_test = fix_row_length(X_test, target_width)

print(X_train.shape)
print(X_test.shape)

(1500, 13200)
(500, 11400)
(1500, 13200)
(500, 13200)


In [15]:
# Start from a clean Keras backend session before building the model.
from keras import backend as K
K.clear_session()

In [16]:
# Baseline classifier: one sigmoid hidden layer feeding a two-way softmax.
# (An earlier Conv1D/MaxPooling1D stack was tried here and left disabled; it is not
# part of this model.)
input_shape = X_train.shape
nin = input_shape[1]  # width of one padded question vector
nh = 100              # hidden units
nout = 2              # output classes

model = Sequential([
    Dense(nh, input_shape=(nin,), activation='sigmoid', name='hiden'),
    Dense(nout, activation='softmax', name='output'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hiden (Dense)                (None, 100)               1320100   
_________________________________________________________________
output (Dense)               (None, 2)                 202       
Total params: 1,320,302
Trainable params: 1,320,302
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Adam with an explicit learning rate (its other hyperparameters are not set here).
# The sparse loss takes one class index per sample rather than one-hot rows.
from keras import optimizers

opt = optimizers.Adam(lr=0.001)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [18]:
# Train for 10 epochs in mini-batches of 100; the held-out split doubles as validation data.
model.fit(
    X_train,
    Y_train,
    batch_size=100,
    epochs=10,
    validation_data=(X_test, Y_test),
)

Train on 1500 samples, validate on 500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2b1cee6b45c0>

In [19]:
# Model outputs for the held-out questions (the output layer is a 2-unit softmax).
yhat = model.predict(X_test)

In [20]:
# Loss and accuracy on the held-out split (the model was compiled with the 'accuracy' metric).
# NOTE(review): accuracy alone can be misleading if the target classes are imbalanced —
# check the label distribution before reading much into this number.
score, acc = model.evaluate(X_test, Y_test, verbose=0)
print("accuracy = %f" % acc)

accuracy = 0.938000
