In [20]:
import pandas as pd
import numpy as np

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

import os
from os.path import expanduser

import json

from sklearn.model_selection  import train_test_split

In [28]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
DEFAULT_PATH = '~/.kaggle/competitions/quora-question-pairs/'
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 25

TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'

TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'

df_train = pd.read_csv(DEFAULT_PATH + 'train.csv', encoding='utf-8')

In [22]:
DATA_IN_PATH = expanduser('~/.kaggle/competitions/quora-question-pairs/')
train_data = os.path.join(DATA_IN_PATH, 'train.csv')
test_data = os.path.join(DATA_IN_PATH, 'test.csv')

In [23]:
train_data = pd.read_csv(train_data, encoding='utf-8')
train_data = train_data.dropna() #drop empty

In [24]:
train_questions = list(train_data['question1']) + list(train_data['question2'])
train_tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
train_tokenizer.fit_on_texts(train_questions)
word_index = train_tokenizer.word_index

train_question1_word_sequences = train_tokenizer.texts_to_sequences(train_data.question1)
train_question2_word_sequences = train_tokenizer.texts_to_sequences(train_data.question2)

print("Words in index: %d" % len(word_index))

Words in index: 95595


In [25]:
is_duplicateTrain = train_data.is_duplicate

In [26]:
train_q1_data = pad_sequences(train_question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
train_q2_data = pad_sequences(train_question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
labels = np.array(is_duplicateTrain, dtype=int)
print('Shape of question1 data tensor:', train_q1_data.shape)
print('Shape of question2 data tensor:', train_q2_data.shape)

Shape of question1 data tensor: (404287, 25)
Shape of question2 data tensor: (404287, 25)


In [29]:
np.save(open(DATA_IN_PATH + TRAIN_Q1_DATA_FILE, 'wb'), train_q1_data)
np.save(open(DATA_IN_PATH + TRAIN_Q2_DATA_FILE, 'wb'), train_q2_data)
np.save(open(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE, 'wb'), labels)

In [30]:
test_df = pd.read_csv(test_data, encoding='utf-8')
test_df = test_df.dropna() #drop empty

In [31]:
test_questions = list(test_df['question1']) + list(test_df['question2'])
test_tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
test_tokenizer.fit_on_texts(test_questions)
word_index = test_tokenizer.word_index

test_question1_word_sequences = test_tokenizer.texts_to_sequences(test_df.question1)
test_question2_word_sequences = test_tokenizer.texts_to_sequences(test_df.question2)

print("Words in index: %d" % len(word_index))

Words in index: 101312


In [32]:
test_q1_data = pad_sequences(test_question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
test_q2_data = pad_sequences(test_question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
print('Shape of question1 data tensor:', test_q1_data.shape)
print('Shape of question2 data tensor:', test_q2_data.shape)

Shape of question1 data tensor: (2345790, 25)
Shape of question2 data tensor: (2345790, 25)


In [33]:
np.save(open(DATA_IN_PATH + TEST_Q1_DATA_FILE, 'wb'), test_q1_data)
np.save(open(DATA_IN_PATH + TEST_Q2_DATA_FILE, 'wb'), test_q2_data)