# Additional Features

The goal of this notebook is to develop some additional features for the deep learning model, as well as generate the input files.

(note: each feature includes a unit test or two at the bottom, which can be modified to better understand the feature usage)

In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import wordnet
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.models import KeyedVectors
import re
from difflib import SequenceMatcher
import csv
import sys

from sklearn.feature_extraction.text import TfidfVectorizer

from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

pd.set_option('display.max_colwidth', -1)

Using TensorFlow backend.


Some text cleaning functions

In [4]:
LEMMATIZER = WordNetLemmatizer()
LEMMATIZED_WORDS = {}

LEMMATIZER_CACHE = {}

STOP_WORDS = set(stopwords.words('english'))

def quick_lemmatize(word, tag):
    global LEMMATIZER_CACHE
    wntag = penn_to_wn(tag)
    if wntag is None:
        return word
    word_tag_tuple = (word, wntag)
    if word_tag_tuple not in LEMMATIZER_CACHE:
        LEMMATIZER_CACHE[word_tag_tuple] = LEMMATIZER.lemmatize(word_tag_tuple[0], word_tag_tuple[1])
    return LEMMATIZER_CACHE[word_tag_tuple]

def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']

def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']

def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']

def penn_to_wn(tag):
    if is_adjective(tag):
        return 'a'
    elif is_noun(tag):
        return 'n'
    elif is_adverb(tag):
        return 'r'
    elif is_verb(tag):
        return 'v'
    return None

def clean_text(text):
    if (type(text) != type('aa')):
        return ''
    text = text.lower()
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    text = ' '.join([w for w in text.split(' ') if w not in STOP_WORDS])
    
    return text

def do_lemmatization(text):
    text = list(filter(lambda r: len(r) > 0 and r not in STOP_WORDS, text.split(' '))) #pos_tag requires list...
    pos_tagged = nltk.pos_tag(text)
    lemmatized = ' '.join(map(lambda w: quick_lemmatize(w[0], w[1]), pos_tagged))
    
    return lemmatized

print(clean_text('I am loving it!'))
print(do_lemmatization('loving'))

loving ! 
love


Question type feature. This feature looks for the words 'who', 'what', 'where', 'when', 'why', and 'how' in a question. The output is a bit vector with one dimension for each word; the field is 1 if the word is in the question, 0 otherwise. 

In [5]:
QUESTION_TYPES = ['who', 'what', 'where', 'when', 'why', 'how']

def get_question_type(text, postfix):
    if type(text) != type('aa') or len(text) == 0:
        text = ''
    text = re.sub(r"[^A-Za-z0-9^]", " ", text.lower())
    text = text.split(' ')
    data = {}
    column_format = 'is_{0}_question_{1}'
    for i in range(0, len(QUESTION_TYPES), 1):
        if len(text) > 0 and QUESTION_TYPES[i] in text:
            data[column_format.format(QUESTION_TYPES[i], postfix)] = 1.0
        else:
            data[column_format.format(QUESTION_TYPES[i], postfix)] = 0.0
        
    return pd.Series(data)

print(get_question_type('why do I use HDInsight?', '1'))

is_how_question_1      0.0
is_what_question_1     0.0
is_when_question_1     0.0
is_where_question_1    0.0
is_who_question_1      0.0
is_why_question_1      1.0
dtype: float64


Token IoU. Count the number of tokens that are the same in the words, and divide by the total distinct tokens in the words.

In [6]:
def intersection_over_union(text1, text2):
    text1_tokens = set(text1.split(' '))
    text2_tokens = set(text2.split(' '))
    
    intersect_count = len(text1_tokens.intersection(text2_tokens))
    union_count = len(text1_tokens.union(text2_tokens))
    
    return intersect_count / union_count

intersection_over_union('a b b c', 'b d c')

0.5

Longest common substring. Find the longest common token sequence in the text, divide by the minimum number of tokens in the two input sequences.

In [7]:
def longest_common_substring(text1, text2):
    text1_tokens = text1.split(' ')
    text2_tokens = text2.split(' ')
    mat = [[0 for i in range(0, len(text2_tokens), 1)] for j in range(0, len(text1_tokens))]
    max_substring_length = 0
    for i in range(0, len(text1_tokens), 1):
        for j in range(0, len(text2_tokens), 1):
            if text1_tokens[i] == text2_tokens[j]:
                if i > 0 and j > 0:
                    mat[i][j] = mat[i-1][j-1] + 1.0
                else:
                    mat[i][j] = 1.0
                if (mat[i][j] > max_substring_length):
                    max_substring_length = mat[i][j]
                    
    return max_substring_length / float(min(len(text1_tokens), len(text2_tokens)))

longest_common_substring('foo bar baz qux', 'bar baz cat')

0.6666666666666666

Question length. Number of tokens in the question.

In [8]:
def get_question_length(text):
    return len(text.split(' '))

get_question_length('foo bar baz')

3

Tf/Idf features.

The usage is to call clear_tfidf_model(), followed by compute_tf_idf_model() on the entire corpus. Then, calling get_tf_idf_sequence() on a sequence of tokens will return the tf/idf score for each token in the sequence.

In [9]:
TF_IDF_MODEL = None
def clear_tfidf_model():
    global TF_IDF_MODEL
    TF_IDF_MODEL = None

def compute_tf_idf_model(text):
    global TF_IDF_MODEL
    TF_IDF_MODEL = TfidfVectorizer(max_df=0.95, #min_df=5,
                                   max_features=200000,
                                   stop_words='english', use_idf=True)
    
    TF_IDF_MODEL.fit_transform(text)
    
def get_tf_idf_seq(texts, pad_length):
    global TF_IDF_MODEL
    seqs = []
    for text in texts:
        if type(text) != type('s'):
            text = ''
        tf_idf_scores = TF_IDF_MODEL.transform([text])
        seq = []
        tokens = text.split(' ')
        tokens = tokens[-1*pad_length:]
        for i in range(0, pad_length - len(tokens), 1):
            seq.append([0.0])
        for i in range(0, len(tokens), 1):
            word = tokens[i]
            if word in TF_IDF_MODEL.vocabulary_:
                seq.append([tf_idf_scores[0, TF_IDF_MODEL.vocabulary_[word]]])
            else:
                seq.append([0.0])
        seqs.append(seq)
    return np.array(seqs)

clear_tfidf_model()
compute_tf_idf_model(['This is a really nice wii you have', 'I want to buy a sandwich'])
print(get_tf_idf_seq(['wii a sandwich', 'astrology : capricorn sun cap moon cap rising say'], 10).shape)
        

(2, 10, 1)


This cell attempts to remove words not found in the Word2Vec model from the word index. It has limited success in its current form, not really contributing much to the model's accuracy. 

In [10]:
def reassign_word_index(word_index, embedding_model):
    new_word_index = {}
    new_index = 1
    for word in word_index:
        if word in embedding_model.vocab:
            new_word_index[word] = new_index
            new_index += 1
    return new_word_index

def make_sequences(texts1, texts2, word_index, maxlen):
    if len(texts1) != len(texts2):
        raise ValueError
    texts1_seq = []
    texts2_seq = []
    max_size = 0
    for i in range(0, len(texts1), 1):
        if type(texts1[i] != type('s')):
            texts1[i] = ''
        if type(texts2[i] != type('s')):
            texts2[i] = ''
        text1_tokens = texts1[i].split()
        text2_tokens = texts2[i].split()
        max_index = len(word_index) + 1
        additional_mappings = {}
        text1_seq = []
        text2_seq = []
        for i in range(0, maxlen-len(text1_tokens), 1):
            text1_seq.append(0)
            
        text1_tokens = text1_tokens[-1*maxlen:]
        for word in text1_tokens:
            if word in word_index:
                text1_seq.append(word_index[word])
            else:
                if word not in additional_mappings:
                    additional_mappings[word] = max_index
                    max_index += 1
                text1_seq.append(additional_mappings[word])

        for i in range(0, maxlen-len(text2_tokens), 1):
            text2_seq.append(0)

        text2_tokens = text2_tokens[-1*maxlen:]
        for word in text2_tokens:
            if word in word_index:
                text2_seq.append(word_index[word])
            else:
                if word not in additional_mappings:
                    additional_mappings[word] = max_index
                    max_index += 1
                text2_seq.append(additional_mappings[word])

        texts1_seq.append(text1_seq)
        texts2_seq.append(text2_seq)
        if max_index > max_size:
            max_size = max_index
        
    return np.array(texts1_seq), np.array(texts2_seq), max_size 


class dummy_embedding_model:
    def __init__(self):
        self.vocab = {'a', 'b', 'c'}

de = dummy_embedding_model()
wi = {'a': 1, 'd': 2, 'b': 3, 'c': 4, 'e': 5}
r = reassign_word_index(wi, de)
s1, s2, sz = make_sequences(['a b d e'], ['a c d f g'], r, 6)
print(r)
print(s1)
print(s2) 
print(sz)
print(type(s2))

{'a': 1, 'b': 2, 'c': 3}
[[0 0 0 0 0 0]]
[[0 0 0 0 0 0]]
4
<class 'numpy.ndarray'>


The start of the pipeline. Reads in the data.

The following cells compute all the features. They take some time to run. Go make a sandwich.

In [7]:
TRAIN_DATA_FILE = '../data/augmented.csv'
TEST_DATA_FILE = '../data/test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
TRAIN_PERCENTAGE = 0.90

print('Loading data and models...')
train_data = pd.read_csv('../data/augmented.csv')
test_data = pd.read_csv('../data/test.csv')
print(train_data.shape)

Loading data and models...
(417040, 6)


In [8]:
print('Determing question types...')
dummy = train_data.apply(lambda r: get_question_type(r['question1'], '1'), axis=1)
print(dummy.shape)
print(train_data.shape)
train_data = pd.concat([train_data, dummy], axis=1)
print(train_data.shape)
print(train_data.shape)
print('Processed q1 of train.')
dummy2 = train_data.apply(lambda r: get_question_type(r['question2'], '2'), axis=1)
train_data = pd.concat([train_data, dummy2], axis=1)
print(train_data.shape)
print('Processed q2 of train.')
dummy3 = test_data.apply(lambda r: get_question_type(r['question1'], '1'), axis=1)
test_data = pd.concat([test_data, dummy3], axis=1)
print('Processed q1 of test.')
dummy4 = test_data.apply(lambda r: get_question_type(r['question2'], '2'), axis=1)
test_data = pd.concat([test_data, dummy4], axis=1)
print('Processed q2 of test.')

Determing question types...
(417040, 6)
(417040, 6)
(417040, 12)
(417040, 12)
Processed q1 of train.
(417040, 18)
Processed q2 of train.
Processed q1 of test.
Processed q2 of test.


In [9]:
print('Cleaning q1 of train')
train_data['model_cleaned_question_1'] = train_data.apply(lambda r: clean_text(r['question1']), axis=1)
print(train_data.shape)
print('Lemmatizing q1 of train')
train_data['cleaned_question_1'] = train_data.apply(lambda r: do_lemmatization(r['model_cleaned_question_1']), axis=1)
print(train_data.shape)

print('Cleaning q2 of train')
train_data['model_cleaned_question_2'] = train_data.apply(lambda r: clean_text(r['question2']), axis=1)
print(train_data.shape)
print('Lemmatizing q2 of train')
train_data['cleaned_question_2'] = train_data.apply(lambda r: do_lemmatization(r['model_cleaned_question_2']), axis=1)
print(train_data.shape)

print('Cleaning q1 of test')
test_data['model_cleaned_question_1'] = test_data.apply(lambda r: clean_text(r['question1']), axis=1)
print('Lemmatizing q1 of test')
test_data['cleaned_question_1'] = test_data.apply(lambda r: do_lemmatization(r['model_cleaned_question_1']), axis=1)

print('Cleaning q2 of test')
test_data['model_cleaned_question_2'] = test_data.apply(lambda r: clean_text(r['question2']), axis=1)
print('Lemmatizing q2 of test')
test_data['cleaned_question_2'] = test_data.apply(lambda r: do_lemmatization(r['model_cleaned_question_2']), axis=1)

Cleaning q1 of train
(417040, 19)
Lemmatizing q1 of train
(417040, 20)
Cleaning q2 of train
(417040, 21)
Lemmatizing q2 of train
(417040, 22)
Cleaning q1 of test
Lemmatizing q1 of test
Cleaning q2 of test
Lemmatizing q2 of test


In [10]:
print('Computing Longest Common Substring scores...')
train_data['longest_common_substring'] = train_data.apply(lambda r: longest_common_substring(r['cleaned_question_1'], r['cleaned_question_2']), axis=1)
print(train_data.shape)
test_data['longest_common_substring'] = test_data.apply(lambda r: longest_common_substring(r['cleaned_question_1'], r['cleaned_question_2']), axis=1)

Computing Longest Common Substring scores...
(417040, 23)


In [11]:
print('Computing IoU scores...')
train_data['IoU'] = train_data.apply(lambda r: intersection_over_union(r['cleaned_question_1'], r['cleaned_question_2']), axis=1)
print(train_data.shape)
test_data['IoU'] = test_data.apply(lambda r: intersection_over_union(r['cleaned_question_1'], r['cleaned_question_2']), axis=1)

Computing IoU scores...
(417040, 24)


In [12]:
print('Computing question lengths...')
train_data['question_1_length'] = train_data.apply(lambda r: get_question_length(r['cleaned_question_1']), axis=1)
print(train_data.shape)
train_data['question_2_length'] = train_data.apply(lambda r: get_question_length(r['cleaned_question_2']), axis=1)
print(train_data.shape)
test_data['question_1_length'] = test_data.apply(lambda r: get_question_length(r['cleaned_question_1']), axis=1)
test_data['question_2_length'] = test_data.apply(lambda r: get_question_length(r['cleaned_question_2']), axis=1)
train_data.shape

Computing question lengths...
(417040, 25)
(417040, 26)


(417040, 26)

In [13]:
train_data.head(11)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,is_how_question_1,is_what_question_1,is_when_question_1,is_where_question_1,...,is_who_question_2,is_why_question_2,model_cleaned_question_1,cleaned_question_1,model_cleaned_question_2,cleaned_question_2,longest_common_substring,IoU,question_1_length,question_2_length
0,0,11,12,Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?,"I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?",1,0.0,1.0,0.0,0.0,...,0.0,0.0,astrology : capricorn sun cap moon cap rising say,astrology : capricorn sun cap moon cap rise say,triple capricorn sun moon ascendant capricorn say,triple capricorn sun moon ascendant capricorn say,0.222222,0.4,9,7
1,1,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1,1.0,0.0,0.0,0.0,...,0.0,0.0,good geologist,good geologist,great geologist,great geologist,0.5,0.333333,2,2
2,2,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1,1.0,0.0,0.0,0.0,...,0.0,0.0,read find youtube comments,read find youtube comment,see youtube comments,see youtube comment,0.5,0.4,4,3
3,3,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1,0.0,1.0,0.0,0.0,...,0.0,0.0,make physics easy learn,make physic easy learn,make physics easy learn,make physic easy learn,1.0,1.0,4,4
4,4,25,114035,What can make Physics easy to learn?,Is there a way to make learning physics easier?,1,0.0,1.0,0.0,0.0,...,0.0,0.0,make physics easy learn,make physic easy learn,way make learning physics easier,way make learn physic easier,0.2,0.5,4,5
5,5,26,114035,How can you make physics easy to learn?,Is there a way to make learning physics easier?,1,1.0,0.0,0.0,0.0,...,0.0,0.0,make physics easy learn,make physic easy learn,way make learning physics easier,way make learn physic easier,0.2,0.5,4,5
6,6,27,28,What was your first sexual experience like?,What was your first sexual experience?,1,0.0,1.0,0.0,0.0,...,0.0,0.0,first sexual experience like,first sexual experience like,first sexual experience,first sexual experience,0.75,0.75,4,3
7,7,27,50277,What was your first sexual experience like?,What is your first sexual experience?,1,0.0,1.0,0.0,0.0,...,0.0,0.0,first sexual experience like,first sexual experience like,first sexual experience,first sexual experience,0.75,0.75,4,3
8,8,28,50277,What was your first sexual experience?,What is your first sexual experience?,1,0.0,1.0,0.0,0.0,...,0.0,0.0,first sexual experience,first sexual experience,first sexual experience,first sexual experience,1.0,1.0,3,3
9,9,6937,38502,How will Trump’s presidency affect international students in the US?,What does Trump’s victory mean for international students?,1,1.0,0.0,0.0,0.0,...,0.0,0.0,trump presidency affect international students us,trump presidency affect international student us,trump victory mean international students,trump victory mean international student,0.333333,0.375,6,5


Checkpoint the datset.

In [14]:
print(train_data.shape)
train_data.to_csv('../data/train_data_with_features.csv', index=False, quoting=csv.QUOTE_ALL)
test_data.to_csv('../data/test_data_with_features.csv', index=False, quoting=csv.QUOTE_ALL)

(417040, 26)


Prepare the deep learning input files. Convert the token sequences to one-hot vectors. Also save the tf/idf sequences.

In [17]:
train_data = pd.read_csv('../data/train_data_with_features.csv', index=False, quoting=csv.QUOTE_ALL)
test_data = pd.read_csv('../data/test_data_with_features.csv', index=False, quoting=csv.QUOTE_ALL)

EMBEDDING_FILE = '../models/GoogleNews-vectors-negative300.bin'
EMBEDDING_DIM = 300
TRAIN_DATA_FILE = '../data/augmented.csv'
TEST_DATA_FILE = '../data/test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
VALIDATION_SPLIT = 0.1

print('Loading word2vec model...')
sys.stdout.flush()
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

print('Tokenizing..')
sys.stdout.flush()
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(\
    list(train_data['model_cleaned_question_1'])+\
    list(train_data['model_cleaned_question_2'])+\
    list(test_data['model_cleaned_question_1'])+\
    list(test_data['model_cleaned_question_2']))

sequences_1 = tokenizer.texts_to_sequences(list(train_data['model_cleaned_question_1']))
sequences_2 = tokenizer.texts_to_sequences(list(train_data['model_cleaned_question_2']))
test_sequences_1 = tokenizer.texts_to_sequences(list(test_data['model_cleaned_question_1']))
test_sequences_2 = tokenizer.texts_to_sequences(list(test_data['model_cleaned_question_2']))

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))
sys.stdout.flush()

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(train_data['is_duplicate'])
ids = np.array(train_data['id'])
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)
sys.stdout.flush()

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_data['test_id'])

print('Preparing embedding matrix')
sys.stdout.flush()

nb_words = min(MAX_NB_WORDS, len(word_index))+1

not_found_words = set()
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
    else:
        not_found_words.add(word)
print('Null word embeddings: {0}'.format(len(not_found_words)))
sys.stdout.flush()    

with open('null_words.txt', 'w') as f:
    for word in not_found_words:
        f.write('{0}\n'.format(word))

    
question_1_feature_columns = [\
    'is_who_question_1',\
    'is_what_question_1',\
    'is_where_question_1',\
    'is_when_question_1',\
    'is_why_question_1',\
    'is_how_question_1',\
    'question_1_length'\
]

question_2_feature_columns = [\
    'is_who_question_2',\
    'is_what_question_2',\
    'is_where_question_2',\
    'is_when_question_2',\
    'is_why_question_2',\
    'is_how_question_2',\
    'question_2_length'
]

mutual_feature_columns = [\
    'longest_common_substring',\
    'IoU'\
]
    
print('Dividing up train/val sets')
sys.stdout.flush()
np.random.seed(1234)
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

question_1_train_features = np.vstack((np.array(train_data[question_1_feature_columns].ix[idx_train]), np.array(train_data[question_2_feature_columns].ix[idx_train])))
question_2_train_features = np.vstack((np.array(train_data[question_2_feature_columns].ix[idx_train]),np.array(train_data[question_1_feature_columns].ix[idx_train])))
mutual_train_features = np.vstack((np.array(train_data[mutual_feature_columns].ix[idx_train]), np.array(train_data[mutual_feature_columns].ix[idx_train])))

data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))

ids_val_final = np.concatenate((ids[idx_val], ids[idx_val]))

question_1_val_features = np.vstack((np.array(train_data[question_1_feature_columns].ix[idx_val]), np.array(train_data[question_2_feature_columns].ix[idx_val])))
question_2_val_features = np.vstack((np.array(train_data[question_2_feature_columns].ix[idx_val]), np.array(train_data[question_1_feature_columns].ix[idx_val])))
mutual_val_features = np.vstack((np.array(train_data[mutual_feature_columns].ix[idx_val]), np.array(train_data[mutual_feature_columns].ix[idx_val])))

test_question_1_features = np.array(test_data[question_1_feature_columns])
test_question_2_features = np.array(test_data[question_2_feature_columns])
test_mutual_features = np.array(test_data[mutual_feature_columns])

print('Saving to binary files...')
sys.stdout.flush()
np.save('../scratch/data_1_train.npy', data_1_train)
np.save('../scratch/data_2_train.npy', data_2_train)
np.save('../scratch/labels_train.npy', labels_train)
np.save('../scratch/question_1_train_features.npy', question_1_train_features)
np.save('../scratch/question_2_train_features.npy', question_2_train_features)
np.save('../scratch/mutual_train_features.npy', mutual_train_features)

np.save('../scratch/data_1_val.npy', data_1_val)
np.save('../scratch/data_2_val.npy', data_2_val)
np.save('../scratch/labels_val.npy', labels_val)
np.save('../scratch/question_1_val_features.npy', question_1_val_features)
np.save('../scratch/question_2_val_features.npy', question_2_val_features)
np.save('../scratch/mutual_val_features.npy', mutual_val_features)

np.save('../scratch/ids_val.npy', ids_val_final)


np.save('../scratch/test_data_1.npy', test_data_1)
np.save('../scratch/test_data_2.npy', test_data_2)
np.save('../scratch/test_ids.npy', test_ids)
np.save('../scratch/test_question_1_features.npy', test_question_1_features)
np.save('../scratch/test_question_2_features.npy', test_question_2_features)
np.save('../scratch/test_mutual_features.npy', test_mutual_features)

np.save('../scratch/embedding_matrix.npy', embedding_matrix)

with open('../scratch/semaphore.txt', 'w') as f:
    f.write('a')

print('Done!')

Loading word2vec model...
Tokenizing..
Found 120376 unique tokens
Shape of data tensor: (417040, 30)
Shape of label tensor: (417040,)
Preparing embedding matrix
Null word embeddings: 61775
Dividing up train/val sets
Saving to binary files...
Done!


An alternative implementation that also saves tf/idf features and does out-of-vocabulary word removal

In [57]:
print('Reading features file...')
train_data = pd.read_csv('../data/train_data_with_features.csv', index_col=False, quoting=csv.QUOTE_ALL)
test_data = pd.read_csv('../data/test_data_with_features.csv', index_col=False, quoting=csv.QUOTE_ALL)

EMBEDDING_FILE = '../models/GoogleNews-vectors-negative300.bin'
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
VALIDATION_SPLIT = 0.1

print('Loading word2vec model...')
sys.stdout.flush()
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

print('Tokenizing..')
sys.stdout.flush()
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
all_text = list(train_data['model_cleaned_question_1'])+\
    list(train_data['model_cleaned_question_2'])+\
    list(test_data['model_cleaned_question_1'])+\
    list(test_data['model_cleaned_question_2'])
all_text = list(map(lambda r: '' if type(r) != type('s') or len(r) < 1 else r, all_text))
tokenizer.fit_on_texts(all_text)

print('Crunching index...')
new_word_index = reassign_word_index(tokenizer.word_index, word2vec)

print('Generating sequences...')
train_sequences_1_f, train_sequences_2_f, sz1 = make_sequences(list(train_data['model_cleaned_question_1']), list(train_data['model_cleaned_question_2']), new_word_index, MAX_SEQUENCE_LENGTH)
train_sequences_1_b, train_sequences_2_b, sz2 = make_sequences(list(train_data['model_cleaned_question_2']), list(train_data['model_cleaned_question_1']), new_word_index, MAX_SEQUENCE_LENGTH)
test_sequences_1_f, test_sequences_2_f, sz3 = make_sequences(list(test_data['model_cleaned_question_1']), list(test_data['model_cleaned_question_2']), new_word_index, MAX_SEQUENCE_LENGTH)
test_sequences_1_b, test_sequences_2_b, sz4 = make_sequences(list(test_data['model_cleaned_question_1']), list(test_data['model_cleaned_question_2']), new_word_index, MAX_SEQUENCE_LENGTH)

print(train_sequences_1_f.shape)

print('Preparing embedding matrix')
sys.stdout.flush()

nb_extra_words = max(max(max(sz4, sz3), sz2), sz1) - len(new_word_index)
nb_std_words = len(new_word_index)

not_found_words = set()
embedding_matrix = np.zeros((nb_std_words + nb_extra_words, EMBEDDING_DIM + nb_extra_words))

for word, i in new_word_index.items():
    embedding_matrix[i, 0:300] = word2vec.word_vec(word)

for i in range(0, nb_extra_words, 1):
    embedding_matrix[i+nb_std_words, i+EMBEDDING_DIM] = 1.0

print(embedding_matrix.shape)
    
print('Training TF/IDF model')
clear_tfidf_model()
compute_tf_idf_model(all_text)

print('Generating tf/idf sequences')
tf_idf_train_sequences_1 = get_tf_idf_seq(list(train_data['model_cleaned_question_1']), MAX_SEQUENCE_LENGTH)
print('...')
tf_idf_train_sequences_2 = get_tf_idf_seq(list(train_data['model_cleaned_question_2']), MAX_SEQUENCE_LENGTH)
print('...')
tf_idf_test_sequences_1 = get_tf_idf_seq(list(test_data['model_cleaned_question_1']), MAX_SEQUENCE_LENGTH)
print('...')
tf_idf_test_sequences_2 = get_tf_idf_seq(list(test_data['model_cleaned_question_2']), MAX_SEQUENCE_LENGTH)
print('...')

print(tf_idf_train_sequences_1.shape)

labels = np.array(train_data['is_duplicate'])
ids = np.array(train_data['id'])
test_ids = np.array(test_data['test_id'])
    
print('Dividing up train/val sets')
sys.stdout.flush()
np.random.seed(1234)
perm = np.random.permutation(train_data.shape[0])
idx_train = perm[:int(train_data.shape[0]*(1-VALIDATION_SPLIT))]
idx_val = perm[int(train_data.shape[0]*(1-VALIDATION_SPLIT)):]

data_1_train = np.vstack((train_sequences_1_f[idx_train], train_sequences_1_b[idx_train]))
data_2_train = np.vstack((train_sequences_2_f[idx_train], train_sequences_2_b[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))
tfidf_1_train = np.vstack((tf_idf_train_sequences_1[idx_train], tf_idf_train_sequences_2[idx_train]))
tfidf_2_train = np.vstack((tf_idf_train_sequences_2[idx_train], tf_idf_train_sequences_1[idx_train]))

data_1_val = np.vstack((train_sequences_1_f[idx_val], train_sequences_1_b[idx_val]))
data_2_val = np.vstack((train_sequences_2_f[idx_val], train_sequences_2_b[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))
tfidf_1_val = np.vstack((tf_idf_train_sequences_1[idx_val], tf_idf_train_sequences_2[idx_val]))
tfidf_2_val = np.vstack((tf_idf_train_sequences_2[idx_val], tf_idf_train_sequences_1[idx_val]))

ids_val_final = np.concatenate((ids[idx_val], ids[idx_val]))


print('Saving to binary files...')
sys.stdout.flush()
np.save('../scratch/data_1_train.npy', data_1_train)
np.save('../scratch/data_2_train.npy', data_2_train)
np.save('../scratch/labels_train.npy', labels_train)
np.save('../scratch/data_1_tfidf_train.npy', tfidf_1_train)
np.save('../scratch/data_2_tfidf_train.npy', tfidf_2_train)

np.save('../scratch/data_1_val.npy', data_1_val)
np.save('../scratch/data_2_val.npy', data_2_val)
np.save('../scratch/labels_val.npy', labels_val)
np.save('../scratch/data_1_val_tfidf.npy', tfidf_1_val)
np.save('../scratch/data_2_val_tfidf.npy', tfidf_2_val)

np.save('../scratch/ids_val.npy', ids_val_final)


np.save('../scratch/test_data_1.npy', test_sequences_1_f)
np.save('../scratch/test_data_2.npy', test_sequences_2_f)
np.save('../scratch/test_ids.npy', test_ids)
np.save('../scratch/test_data_tfidf_1.npy', tf_idf_test_sequences_1)
np.save('../scratch/test_data_tfidf_2.npy', tf_idf_test_sequences_2)

np.save('../scratch/embedding_matrix.npy', embedding_matrix)

with open('../scratch/semaphore.txt', 'w') as f:
    f.write('a')

print('Done!')

Reading features file...
Loading word2vec model...
Tokenizing..
Crunching index...
Generating sequences...
(417040, 30)
Preparing embedding matrix
(58602, 301)
Training TF/IDF model
Generating tf/idf sequences
...
...
...
...
(417040, 30, 1)
Dividing up train/val sets
Saving to binary files...
Done!
