# Fiction utterance pairs: data preparation

## Import packages

In [1]:
from __future__ import print_function

import numpy as np
import csv, json
from zipfile import ZipFile
from os.path import expanduser, exists

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file

Using TensorFlow backend.


## Initialize global variables

In [2]:
# Attach Google Drive to the Colab runtime; the dataset directory used by the
# rest of the notebook lives underneath this mount point.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# --- Working directory (inside the mounted Google Drive) ---------------------
# Other cells build paths by plain string concatenation, so the trailing slash
# must stay.
KERAS_DATASETS_DIR = expanduser('/content/gdrive/My Drive/tf2.0/')

# --- Input: labelled utterance pairs ----------------------------------------
# QUESTION_PAIRS_FILE_URL = 'http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv'
UTTER_PAIRS_FILE = 'labeled-utterance-pair-final-020519.csv'

# --- Input: pre-trained GloVe vectors ---------------------------------------
GLOVE_FILE = 'glove.840B.300d.txt'
GLOVE_ZIP_FILE = 'glove.840B.300d.zip'
GLOVE_ZIP_FILE_URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'

# --- Preprocessing limits ---------------------------------------------------
EMBEDDING_DIM = 300          # columns of the word embedding matrix
MAX_SEQUENCE_LENGTH = 25     # `maxlen` handed to pad_sequences
MAX_NB_WORDS = 200000        # `num_words` handed to the Tokenizer

# --- Output files (written into KERAS_DATASETS_DIR) -------------------------
U1_TRAINING_DATA_FILE = 'u1_train.npy'
U2_TRAINING_DATA_FILE = 'u2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'

## Load utterance pairs data

In [0]:
# NOTE: this entire cell is disabled (every line is commented out).
# It is a loader that fetched a file with get_file and read it row by row as
# tab-separated data through csv.DictReader. It refers to QUESTION_PAIRS_FILE,
# which the configuration cell of this notebook does not define, so it cannot
# be re-enabled as-is. The utterance pairs are read with pandas in the next cell.
# if not exists(KERAS_DATASETS_DIR + QUESTION_PAIRS_FILE):
#     get_file(QUESTION_PAIRS_FILE, QUESTION_PAIRS_FILE_URL)

# print("Processing", QUESTION_PAIRS_FILE)

# question1 = []
# question2 = []
# is_duplicate = []
# with open(KERAS_DATASETS_DIR + QUESTION_PAIRS_FILE, encoding='utf-8') as csvfile:
#     reader = csv.DictReader(csvfile, delimiter='\t')
#     for row in reader:
#         print(row)
#         question1.append(row['utter_1'])
#         question2.append(row['utter_2'])
#         is_duplicate.append(row['label'])



In [0]:
import pandas as pd

# Read the labelled utterance-pair CSV, then pull both utterance columns out
# as plain Python lists (first utterance -> words1, second -> words2).
my_csv = pd.read_csv(KERAS_DATASETS_DIR + UTTER_PAIRS_FILE, encoding='utf-8')
words1, words2 = (my_csv[column].tolist() for column in ('utter_1', 'utter_2'))

In [5]:
# Remove every double-quote character from both utterance columns.
utter1 = [text.replace('"', '') for text in words1]
utter2 = [text.replace('"', '') for text in words2]

# Display only a short preview: echoing the full list would flood the cell
# output with every utterance in the dataset.
utter2[:10]

['But it is, [N] for Mrs Long has just been here, and she told       me all about it.',
 'Do you not want to know who has taken it? [N]',
 'You want to tell me, and I have no objection to hearing it.',
 'Why, my dear, you must know, Mrs Long says that Netherfield is taken by       a young man of large fortune from the north of England; that he came down       on Monday in a chaise and four to see the place, and was so much delighted       with it, that he agreed with Mr Morris immediately; that he is to take       possession before Michaelmas, and some of his servants are to be in the       house by the end of next week.',
 'What is his name?',
 'Bingley.',
 'Is he married or single?',
 'Oh! Single, my dear, to be sure! A single man of large fortune; four or       five thousand a year. What a fine thing for our girls!',
 'How so? How can it affect them?',
 'My dear Mr Bennet, [N] how can you be so tiresome! You       must know that I am thinking of his marrying one of them.',
 'Is that

In [6]:
# Report how many utterance pairs were loaded (utter1 and utter2 come from the same rows).
print('Question pairs: {:d}'.format(len(utter1)))

Question pairs: 1289


In [7]:
# Give each distinct label string an integer id, numbered in the (sorted)
# order in which np.unique returns the distinct values.
labels = my_csv['label'].tolist()
label2idx = {name: idx for idx, name in enumerate(np.unique(labels))}
label2idx


{'not_pair': 0, 'part': 1, 'response': 2}

In [0]:
# Encode every row's label string as its integer class id.
y = [label2idx[name] for name in labels]

## Build tokenized word index

In [9]:
# Fit a single tokenizer on both utterance columns so they share one word index.
utterances = utter1 + utter2
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(utterances)

# Encode each column as lists of word ids using the fitted tokenizer.
utter1_word_sequences, utter2_word_sequences = (
    tokenizer.texts_to_sequences(column) for column in (utter1, utter2)
)
word_index = tokenizer.word_index

print("Words in index: {:d}".format(len(word_index)))

Words in index: 4187


## Download and process GloVe embeddings

In [13]:
glove_path = KERAS_DATASETS_DIR + GLOVE_FILE

# Only the extracted text file is read below, so its presence alone decides
# whether anything has to be fetched. get_file is called without `cache_dir`,
# so the archive is stored in Keras' own cache rather than KERAS_DATASETS_DIR;
# also requiring the zip to exist in KERAS_DATASETS_DIR would therefore re-run
# the download/extract step on every execution.
if not exists(glove_path):
    print(glove_path)
    # The context manager closes the archive handle once extraction is done.
    with ZipFile(get_file(GLOVE_ZIP_FILE, GLOVE_ZIP_FILE_URL)) as glove_zip:
        glove_zip.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR)

print("Processing", GLOVE_FILE)

# Map each token to its embedding vector (float32 array of EMBEDDING_DIM values).
embeddings_index = {}
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        # Split from the right, at most EMBEDDING_DIM times: the last
        # EMBEDDING_DIM fields are the vector and everything before them is
        # the token, so a token that itself contains a space cannot leak
        # text into the numeric fields.
        values = line.rstrip('\n').rsplit(' ', EMBEDDING_DIM)
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings: %d' % len(embeddings_index))

/content/gdrive/My Drive/tf2.0/glove.840B.300d.txt
Processing glove.840B.300d.txt
Word embeddings: 2196016


## Prepare word embedding matrix

In [14]:
# One row per word id (row 0 is never assigned from word_index, whose ids the
# print below shows starting above 0 only implicitly — it stays all-zero here),
# one column per embedding dimension.
nb_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))

for token, row in word_index.items():
    # Ids beyond the vocabulary cap are skipped; tokens without a GloVe vector
    # keep their all-zero row.
    vector = embeddings_index.get(token) if row <= MAX_NB_WORDS else None
    if vector is not None:
        word_embedding_matrix[row] = vector

# Count the rows whose components sum to exactly zero.
null_rows = (word_embedding_matrix.sum(axis=1) == 0).sum()
print('Null word embeddings: %d' % null_rows)

Null word embeddings: 367


## Prepare training data tensors

In [15]:
# Bring both encoded utterance columns to a fixed width of MAX_SEQUENCE_LENGTH.
u1_data, u2_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (utter1_word_sequences, utter2_word_sequences)
)

# NOTE: this rebinds `labels` from the list of label strings to an integer
# array of class ids built from `y`.
labels = np.array(y, dtype=int)

for caption, tensor in (
    ('Shape of question1 data tensor:', u1_data),
    ('Shape of question2 data tensor:', u2_data),
    ('Shape of label tensor:', labels),
):
    print(caption, tensor.shape)

Shape of question1 data tensor: (1289, 25)
Shape of question2 data tensor: (1289, 25)
Shape of label tensor: (1289,)


## Persist training and configuration data to files

In [0]:
import os

# Write each array to its own .npy file inside KERAS_DATASETS_DIR.
for file_name, array in (
    (U1_TRAINING_DATA_FILE, u1_data),
    (U2_TRAINING_DATA_FILE, u2_data),
    (LABEL_TRAINING_DATA_FILE, labels),
    (WORD_EMBEDDING_MATRIX_FILE, word_embedding_matrix),
):
    np.save(os.path.join(KERAS_DATASETS_DIR, file_name), array)

# Store the vocabulary size next to the arrays as a small JSON document.
nb_words_path = os.path.join(KERAS_DATASETS_DIR, NB_WORDS_DATA_FILE)
with open(nb_words_path, 'w') as f:
    json.dump({'nb_words': nb_words}, f)