# Quora question pairs: data preparation

## Import packages

In [4]:
from __future__ import print_function

import numpy as np
import csv, json
from zipfile import ZipFile
from os.path import expanduser, exists

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file



import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm


import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline



import spacy

import utils 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from __future__ import absolute_import
from __future__ import print_function
from IPython.lib.display import FileLink


import pickle 

## Initialize global variables

In [5]:
KERAS_DATASETS_DIR = expanduser('~/.keras/datasets/')
QUESTION_PAIRS_FILE_URL = 'http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv'
QUESTION_PAIRS_FILE = 'quora_duplicate_questions.tsv'
GLOVE_ZIP_FILE_URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
GLOVE_ZIP_FILE = 'glove.840B.300d.zip'
GLOVE_FILE = 'glove.840B.300d.txt'
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 25
EMBEDDING_DIM = 300

## Download and extract questions pairs data

In [6]:

path = '/home/ubuntu/quora/'
data_home = path +"data/"

In [22]:
df_train = pd.read_csv(data_home+'train.csv' , encoding='utf-8'   )


In [23]:
df_train = df_train.fillna("empty")
df_train['question1'] = df_train['question1'].apply(lambda x: x.encode('utf-8'))
df_train['question2'] = df_train['question2'].apply(lambda x: x.encode('utf-8'))
question1 = df_train.question1
question2 = df_train.question2
is_duplicate = df_train.is_duplicate

In [24]:

print('Question pairs: %d' % len(question1))

Question pairs: 404290


## Build tokenized word index

In [26]:
str(questions[10])

'Method to find separation of slits using fresnel biprism?What are some of the things technicians can tell about the durability and reliability of Laptops and its components?'

In [27]:
questions = question1 + question2
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
question1_word_sequences = tokenizer.texts_to_sequences(question1)
question2_word_sequences = tokenizer.texts_to_sequences(question2)
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))

Words in index: 96500


In [42]:
# for i in question1_word_sequences[0]:
#     print (tokenizer.word_docs[i])
question1_word_sequences[0:4]

[[2, 3, 1, 1223, 57, 1223, 2584, 7, 576, 8, 764, 383, 8, 35],
 [2, 3, 1, 559, 10, 14608, 13672, 5, 21817, 4572],
 [4, 13, 5, 217, 1, 440, 10, 17, 361, 1828, 200, 146, 6, 2778],
 [16, 72, 5, 2780, 312, 2762, 4, 13, 5, 649, 19]]

In [43]:
question1[0:4]

0    What is the step by step guide to invest in sh...
1    What is the story of Kohinoor (Koh-i-Noor) Dia...
2    How can I increase the speed of my internet co...
3    Why am I mentally very lonely? How can I solve...
Name: question1, dtype: object

## Download and process GloVe embeddings

In [44]:
if not exists(data_home+"cache/" + GLOVE_ZIP_FILE):
    zipfile = ZipFile(get_file(GLOVE_ZIP_FILE, GLOVE_ZIP_FILE_URL))
    zipfile.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR)
    
print("Processing", GLOVE_FILE)

embeddings_index = {}
with open(KERAS_DATASETS_DIR + GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings: %d' % len(embeddings_index))

Downloading data from http://nlp.stanford.edu/data/glove.840B.300d.zip
 244654080/2176768927 [==>...........................] - ETA: 170s

IOError: [Errno 28] No space left on device

## Prepare word embedding matrix

In [6]:
nb_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))

Null word embeddings: 29273


## Prepare training data tensors

In [7]:
q1_data = pad_sequences(question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
q2_data = pad_sequences(question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(is_duplicate, dtype=int)
print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of question1 data tensor: (404351, 25)
Shape of question2 data tensor: (404351, 25)
Shape of label tensor: (404351,)


## Persist training and configuration data to files

In [8]:
np.save(open(Q1_TRAINING_DATA_FILE, 'wb'), q1_data)
np.save(open(Q2_TRAINING_DATA_FILE, 'wb'), q2_data)
np.save(open(LABEL_TRAINING_DATA_FILE, 'wb'), labels)
np.save(open(WORD_EMBEDDING_MATRIX_FILE, 'wb'), word_embedding_matrix)
with open(NB_WORDS_DATA_FILE, 'w') as f:
    json.dump({'nb_words': nb_words}, f)