In [1]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

import sys
# Python 2 hack: make implicit str/unicode conversions use UTF-8.
# Neither the builtin `reload` nor `sys.setdefaultencoding` exists on
# Python 3 (whose default encoding is already UTF-8), so only run it on 2.x.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

########################################
## set directories and parameters
########################################

# --- file locations, all built from BASE_DIR ---
BASE_DIR = 'data/'
EMBEDDING_FILE = BASE_DIR + 'cache/GoogleNews-vectors-negative300.bin'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'

# --- text / embedding sizes ---
MAX_SEQUENCE_LENGTH = 35   # maxlen handed to pad_sequences
MAX_NB_WORDS = 200000      # num_words handed to the Tokenizer
EMBEDDING_DIM = 300        # width of each embedding-matrix row
VALIDATION_SPLIT = 0.1

# --- model hyper-parameters ---
# Values are pinned; the trailing comments keep the random ranges that could
# be sampled instead.
num_lstm = 200          # np.random.randint(175, 275)
num_dense = 200         # np.random.randint(100, 150)
rate_drop_lstm = 0.2    # 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.1   # 0.15 + np.random.rand() * 0.25

act = 'relu'
# whether to re-weight classes to fit the 17.5% share in test set
re_weight = True

# Tag that encodes the hyper-parameters chosen above.
STAMP = 'lstm_%d_%d_%.2f_%.2f' % (
    num_lstm, num_dense, rate_drop_lstm, rate_drop_dense)


Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is enabled with initial size: 95.0% of memory, cuDNN 5103)


In [2]:
########################################
## index word vectors
########################################
print('Indexing word vectors')

# Read the vectors at EMBEDDING_FILE as a binary word2vec file.
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

# Report how many entries the loaded vocabulary holds.
print('Found %s word vectors of word2vec' % len(word2vec.vocab))

In [3]:
# Cache file names. Most of these names are reassigned further down in this
# notebook; LABEL_TRAINING_DATA_FILE is the one read by the label-loading cell.

# Training-side arrays (paths include the data/cache/ prefix).
Q1_TRAINING_DATA_FILE, Q2_TRAINING_DATA_FILE = (
    'data/cache/q1_train.npy',
    'data/cache/q2_train.npy',
)
LABEL_TRAINING_DATA_FILE = 'data/cache/label_train.npy'

# Embedding matrix and vocabulary-size artefacts.
WORD_EMBEDDING_MATRIX_FILE = 'data/cache/word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'data/cache/nb_words.json'

# Test-side arrays.
# NOTE(review): unlike the names above these carry no directory prefix -- confirm intended.
Q1_TESTING_DATA_FILE, Q2_TESTING_DATA_FILE = (
    'q1_test.npy',
    'q2_test.npy',
)


In [4]:
# Read the "vB" cleaned question tables: train first, then test.
df_train_cleaned = pd.read_csv(
    "data/train_clean_vB.csv")
df_test_cleaned = pd.read_csv(
    'data/test_clean_vB.csv')


In [5]:
# Pull the two cleaned question columns of the training frame out as
# plain Python lists.
texts_1 = df_train_cleaned['q1_clean_vB'].tolist()
texts_2 = df_train_cleaned['q2_clean_vB'].tolist()


In [11]:
# Load the cached training labels. The path is given directly to np.load so
# NumPy opens and closes the file itself; the earlier open(...) handle was
# never closed.
labels = np.load(LABEL_TRAINING_DATA_FILE)


In [6]:
# Same extraction as for the training frame, applied to the test frame.
test_texts_1 = df_test_cleaned['q1_clean_vB'].tolist()
test_texts_2 = df_test_cleaned['q2_clean_vB'].tolist()

# Number of rows taken from the first question column.
print('Found %s texts in test.csv' % len(test_texts_1))

In [7]:
# Keep the test_id column as a NumPy array.
test_ids = np.array(df_test_cleaned.test_id)

In [12]:
# %%time

# Fit one tokenizer on all four question lists (train and test together).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

# Convert each list of texts to integer sequences, in the same order as before:
# train q1, train q2, test q1, test q2.
sequences_1, sequences_2, test_sequences_1, test_sequences_2 = map(
    tokenizer.texts_to_sequences,
    (texts_1, texts_2, test_texts_1, test_texts_2),
)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

# Pad the training sequences to MAX_SEQUENCE_LENGTH.
data_1, data_2 = (
    pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    for seqs in (sequences_1, sequences_2)
)



In [13]:
labels = np.array(labels)
# Format into a single string: on Python 2 (no print_function import in this
# notebook) `print(a, b)` would print the repr of a tuple instead of the
# intended "label: shape" line. This form prints the same text on 2 and 3.
print('Shape of data tensor: %s' % (data_1.shape,))
print('Shape of label tensor: %s' % (labels.shape,))

# Pad the test sequences to the same length as the training ones.
test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)

In [23]:
########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')

# One row per kept word index plus one extra row (index 0 is never assigned
# in the loop below and stays all-zero).
nb_words = min(MAX_NB_WORDS, len(word_index))+1
unknown_words = []
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= nb_words:
        # word_index lists every token the tokenizer saw, not only the
        # MAX_NB_WORDS most frequent ones, so once the vocabulary is larger
        # than MAX_NB_WORDS some indices have no row in the matrix. Skip them
        # instead of raising IndexError.
        continue
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
    else:
        # Remember tokens that have no pretrained vector; their row stays zero.
        unknown_words.append(word)
# Counts rows whose components sum to zero (this includes the unused row 0).
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))


In [24]:
# Number of embedding rows whose components sum to exactly zero.
(embedding_matrix.sum(axis=1) == 0).sum()

62104

In [25]:
# Display the tokens collected as having no word2vec vector.
# NOTE(review): this dumps the whole list; a slice such as unknown_words[:50]
# would keep the saved output small.
unknown_words

['murwara',
 'daiict',
 'utnapishtim',
 'sonja',
 'devonians',
 'hsv',
 'fawx',
 'paiza',
 'turram',
 'lottoland',
 '5987',
 'chanthaburi',
 'googlenet',
 'shadowdancer',
 'rickman',
 'canem',
 'suggesion',
 'ecsta',
 'tourister',
 'kenpachi',
 'zandikhohlisa',
 'gazatted',
 'freesteamgifts',
 'broward',
 'naturopathix',
 'jrpg',
 'throwaround',
 'kaushal',
 'vologda',
 'sericin',
 'ctk3200',
 '330ml',
 'girokonto',
 'thrace',
 'targu',
 '0051',
 '270',
 '271',
 '272',
 '273',
 '274',
 '275',
 '276',
 '277',
 '278',
 '279',
 'starboy',
 'krampusnacht',
 'naache',
 '4cell',
 '2300mhz',
 'movietv',
 'trivikram',
 '27i',
 '10700',
 'bhubaneshwar',
 'oct14',
 't250si',
 't250sl',
 'motorolla',
 'phillipine',
 '1670s',
 'controverse',
 '19395',
 'imporatnace',
 'istributed',
 'diomand',
 'mclaurin',
 'yaghan',
 'sterk',
 'ohhio',
 'inlisted',
 'sneha',
 'issual',
 'dne',
 'dnd',
 'thamirabarani',
 'dnn',
 'jspatch',
 'lme',
 'pgdhrm',
 'ronalds',
 'unlearnable',
 'twenty19',
 'kayasthas',
 

In [17]:
# Project root used for the cache files written below. The machine-specific
# default is kept for backward compatibility, but it can be overridden with
# the QUORA_HOME environment variable instead of editing the notebook.
path = os.environ.get('QUORA_HOME', '/home/ubuntu/quora/')
if not path.endswith('/'):
    # Later cells build paths by plain string concatenation.
    path += '/'
data_home = path +"data/"

In [18]:
# Bare file names (no directory part) for the artefacts built with the
# GoogleNews vectors; the saving cells prepend the cache directory.

# Padded question arrays, train then test.
Q1_TRAINING_DATA_FILE, Q2_TRAINING_DATA_FILE = (
    'q1_train_google.npy',
    'q2_train_google.npy',
)
Q1_TESTING_DATA_FILE, Q2_TESTING_DATA_FILE = (
    'q1_test_google.npy',
    'q2_test_google.npy',
)

# Embedding matrix and vocabulary size.
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix_google.npy'
NB_WORDS_DATA_FILE = 'nb_words_google.json'

In [19]:
import json

In [21]:
# Persist the padded training sequences and the embedding matrix.
# The target path is passed directly to np.save so NumPy opens and closes
# each file itself; the earlier open(..., 'wb') handles were never closed.
# (np.save only appends ".npy" when the name lacks it; these names end in .npy.)
np.save(data_home+"cache/"+Q1_TRAINING_DATA_FILE, data_1)
np.save(data_home+"cache/"+Q2_TRAINING_DATA_FILE, data_2)
# NOTE(review): LABEL_TRAINING_DATA_FILE already contains a 'data/cache/'
# prefix, so the disabled line below would point at a nested path if re-enabled.
# np.save(data_home+"cache/"+LABEL_TRAINING_DATA_FILE, labels)
np.save(data_home+"cache/"+WORD_EMBEDDING_MATRIX_FILE, embedding_matrix)

# Store the embedding-matrix row count next to the arrays.
with open(data_home+"cache/"+NB_WORDS_DATA_FILE, 'w') as f:
    json.dump({'nb_words': nb_words}, f)

In [22]:
# Persist the padded test sequences. Paths are passed directly to np.save so
# no file handle is left open (the earlier open(..., 'wb') objects were never
# closed). The file names end in ".npy", so np.save does not alter them.
np.save(data_home+"cache/"+Q1_TESTING_DATA_FILE, test_data_1)
np.save(data_home+"cache/"+Q2_TESTING_DATA_FILE, test_data_2)