## 5-2 Data Preprocessing for TensorFlow

In [7]:
import pandas as pd
import numpy as np
import re

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

import json

## Preprocessing the Training Set

In [2]:
# Vocabulary cap handed to the Tokenizer as num_words.
MAX_NB_WORDS = 200000
# Fixed sequence length handed to pad_sequences as maxlen.
MAX_SEQUENCE_LENGTH = 25

# Output files this notebook writes for the training split.
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
# JSON file that receives the vocabulary and its size.
NB_WORDS_DATA_FILE = 'nb_words.json'

In [3]:
# Directory prefix for the competition CSVs; 'train.csv' and 'test.csv' are
# appended to it when the data is read.
DEFAULT_PATH = '~/.kaggle/competitions/quora-question-pairs/'

In [4]:
# Read the training pairs and discard every row containing a missing value.
df_train = pd.read_csv(
    DEFAULT_PATH + 'train.csv',
    encoding='utf-8',
).dropna()

In [5]:
# Non-null count per column after the dropna() above.
df_train.count()

id              404287
qid1            404287
qid2            404287
question1       404287
question2       404287
is_duplicate    404287
dtype: int64

In [11]:
# Punctuation characters to strip from the question text; the pattern is a
# single character class wrapped in a capture group.
FILTERS = "([~.,!?\"':;)(])"

changeFilter = re.compile(FILTERS)

# Both question columns concatenated: all of question1 first, then question2.
questions = list(df_train['question1']) + list(df_train['question2'])

# Remove the filtered punctuation and lowercase each question.
# NOTE(review): the Tokenizer cell further down is fitted on the raw question
# columns, not on filtered_questions -- confirm whether that is intended.
filtered_questions = [changeFilter.sub("", q).lower() for q in questions]

In [12]:
# Peek at the first cleaned question.
filtered_questions[0]

'what is the step by step guide to invest in share market in india'

In [19]:
# Per-column non-null counts of df_train, displayed as the cell output.
df_train.count()

id              404287
qid1            404287
qid2            404287
question1       404287
question2       404287
is_duplicate    404287
dtype: int64

In [5]:
# Fit the vocabulary on both question columns of the training frame.
questions = df_train['question1'].tolist() + df_train['question2'].tolist()
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
word_index = tokenizer.word_index

# Encode each question column with the fitted tokenizer (question1 first).
question1_word_sequences, question2_word_sequences = (
    tokenizer.texts_to_sequences(df_train[column])
    for column in ('question1', 'question2')
)

print("Words in index: %d" % len(word_index))

Words in index: 95595


In [6]:
# Encoded form of the first question1 entry.
question1_word_sequences[0]

[2, 3, 1, 1222, 57, 1222, 2581, 7, 576, 8, 763, 383, 8, 35]

In [7]:
# Duplicate-flag column; converted to the label array in the padding cell.
is_duplicate = df_train['is_duplicate']

In [8]:
# Pad at the end (padding='post') so every encoded question has
# MAX_SEQUENCE_LENGTH entries.
q1_data, q2_data = (
    pad_sequences(word_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    for word_sequences in (question1_word_sequences, question2_word_sequences)
)
labels = np.array(is_duplicate, dtype=int)

# Report the shape of each array that will be saved.
for _caption, _array in (
    ('Shape of question1 data tensor:', q1_data),
    ('Shape of question2 data tensor:', q2_data),
    ('Shape of label tensor:', labels),
):
    print(_caption, _array.shape)

Shape of question1 data tensor: (404287, 25)
Shape of question2 data tensor: (404287, 25)
Shape of label tensor: (404287,)


In [9]:
# Preprocessing metadata saved next to the arrays:
#   'vocab'      -> the fitted tokenizer's word_index
#   'vocab_size' -> number of entries in the tokenizer's word_counts
# NOTE(review): presumably consumers sizing an embedding must account for a
# padding index on top of vocab_size -- verify against the model code.
data_prepro_configs = {
    'vocab': tokenizer.word_index,
    'vocab_size': len(tokenizer.word_counts),
}

In [10]:
# Persist the padded training questions and the labels as .npy files.
# Paths are passed directly so NumPy opens and closes each file itself
# (no dangling file handles). The names already end in '.npy', so NumPy
# does not alter them.
np.save(Q1_TRAINING_DATA_FILE, q1_data)
np.save(Q2_TRAINING_DATA_FILE, q2_data)
np.save(LABEL_TRAINING_DATA_FILE, labels)

In [11]:
# Write the vocabulary metadata to disk as JSON.
with open(NB_WORDS_DATA_FILE, 'w') as config_file:
    json.dump(data_prepro_configs, config_file)

## Preprocessing the Test Set

In [22]:
# Output files this notebook writes for the padded test questions.
TEST_Q1_DATA_FILE = 'q1_test.npy'
TEST_Q2_DATA_FILE = 'q2_test.npy'

In [23]:
# Read the test pairs and discard every row containing a missing value.
# NOTE(review): rows removed here mean the saved arrays may no longer line up
# one-to-one with the rows of test.csv -- confirm the prediction step
# accounts for that.
df_test = pd.read_csv(
    DEFAULT_PATH + 'test.csv',
    encoding='utf-8',
).dropna()

  interactivity=interactivity, compiler=compiler, result=result)


In [24]:
# Preview the first five rows of the test frame.
df_test.head(5)

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [25]:
# Encode the test questions with the tokenizer fitted on the training
# questions; it is not re-fitted here.
test_question1_word_sequences, test_question2_word_sequences = (
    tokenizer.texts_to_sequences(df_test[column])
    for column in ('question1', 'question2')
)

In [26]:
# Pad the encoded test questions at the end, using the same maxlen as the
# training data.
test_q1_data, test_q2_data = (
    pad_sequences(word_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    for word_sequences in (test_question1_word_sequences, test_question2_word_sequences)
)

In [27]:
# Report the shapes of the padded test arrays before saving them.
print('Shape of test question1 data tensor:', test_q1_data.shape)
print('Shape of test question2 data tensor:', test_q2_data.shape)

Shape of test question1 data tensor: (3563466, 25)
Shape of test question2 data tensor: (3563466, 25)


In [28]:
# Persist the padded test questions as .npy files.
# Paths are passed directly so NumPy opens and closes each file itself
# (no dangling file handles). The names already end in '.npy', so NumPy
# does not alter them.
np.save(TEST_Q1_DATA_FILE, test_q1_data)
np.save(TEST_Q2_DATA_FILE, test_q2_data)