## 5.2 Data Preprocessing for TensorFlow

In [1]:
import pandas as pd
import numpy as np
import re
import os
import random

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

import json

  return f(*args, **kwds)


## Preprocessing for the Training Set

In [2]:
# Vocabulary cap passed to the Keras Tokenizer as `num_words`.
MAX_NB_WORDS = 200000
# Fixed length used as `maxlen` when padding/truncating every question.
MAX_SEQUENCE_LENGTH = 13 # TODO1: the sequence length still needs to be settled

# Directory the CSVs are read from; the preprocessed outputs are written here too.
DATA_IN_PATH = './data_in/'
# File names for the preprocessed training arrays and the vocabulary config.
TRAIN_Q1_DATA = 'train_q1.npy'
TRAIN_Q2_DATA = 'train_q2.npy'
TRAIN_LABEL_DATA = 'train_label.npy'
NB_WORDS_DATA = 'nb_words.json'

In [3]:
# Load the raw training pairs and discard every row that has a missing value.
train_df = pd.read_csv(DATA_IN_PATH + 'train.csv', encoding='utf-8').dropna()

#### Balancing the classes by downsampling the negative (non-duplicate) examples

In [6]:
# Balance the two classes by randomly downsampling the negative
# (is_duplicate == 0) pairs to the number of positive (is_duplicate == 1) pairs.
BALANCE_SEED = 42  # fixed seed so a fresh Restart & Run All draws the same sample

train_pos_df = train_df.loc[train_df['is_duplicate'] == 1]
train_neg_df = train_df.loc[train_df['is_duplicate'] == 0]

class_diff_count = train_neg_df.shape[0] - train_pos_df.shape[0]

# Ask for an exact row count instead of a float fraction: no rounding is
# involved, and when the negatives are not the majority the step is skipped
# rather than requesting a fraction > 1 (which pandas rejects without
# replacement) or dividing by an empty negative set.
if class_diff_count > 0:
    train_neg_df = train_neg_df.sample(n=train_pos_df.shape[0], random_state=BALANCE_SEED)

train_df = pd.concat([train_neg_df, train_pos_df])

In [7]:
# Report the number of rows in each class (negatives first, then positives).
print(train_neg_df['question1'].shape[0])
print(train_pos_df['question1'].shape[0])

149263
149263


In [8]:
# Show the number of non-null entries in every column of the concatenated frame.
train_df.count()

id              298526
qid1            298526
qid2            298526
question1       298526
question2       298526
is_duplicate    298526
dtype: int64

In [9]:
# Character class of punctuation to strip: ~ . , ! ? " ' : ; ) (
FILTERS = "([~.,!?\"':;)(])"

changeFilter = re.compile(FILTERS)

# All question1 texts followed by all question2 texts, each coerced to str.
questions = [str(s) for s in train_df['question1']] + [str(s) for s in train_df['question2']]
# Delete the filtered characters from each question, then lowercase it.
filtered_questions = [changeFilter.sub("", q).lower() for q in questions]

In [10]:
# Inspect the first cleaned question to eyeball the effect of the filter.
filtered_questions[0]

'parents what do you think about inclusive education by mainstreaming special education students in a regular classroom for part of or al of the day'

In [11]:
# Display the per-column non-null counts of train_df again.
train_df.count()

id              298526
qid1            298526
qid2            298526
question1       298526
question2       298526
is_duplicate    298526
dtype: int64

In [12]:
# Fit a single tokenizer over both question columns so they share one vocabulary.
# NOTE(review): this uses the raw columns, not `filtered_questions` built in the
# earlier cell — confirm that is intended.
questions = train_df['question1'].tolist() + train_df['question2'].tolist()
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)

# Convert each column into lists of integer token ids.
question1_word_sequences, question2_word_sequences = (
    tokenizer.texts_to_sequences(train_df[column])
    for column in ('question1', 'question2')
)

# Both names refer to the tokenizer's word -> id mapping.
word_index = tokenizer.word_index
word_vocab = word_index
print("Words in index: {}".format(len(word_vocab)))

Words in index: 76882


In [13]:
# Inspect the token-id sequence produced for the first question1 entry.
question1_word_sequences[0]

[453,
 2,
 9,
 15,
 74,
 46,
 15171,
 449,
 57,
 34343,
 1053,
 449,
 289,
 8,
 7,
 1689,
 6000,
 14,
 552,
 10,
 26,
 2564,
 10,
 1,
 123]

In [26]:
# Label column of the training frame; converted to an int array in the next cell.
is_duplicate = train_df['is_duplicate']

In [27]:
# Integer label vector taken from the is_duplicate column.
labels = np.array(is_duplicate, dtype=int)

# Bring every question to exactly MAX_SEQUENCE_LENGTH ids, padding at the end.
q1_data, q2_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    for sequences in (question1_word_sequences, question2_word_sequences)
)

# Report the resulting array shapes.
print('Shape of question1 data tensor: {}'.format(q1_data.shape))
print('Shape of question2 data tensor:{}'.format(q2_data.shape))
print('Shape of label tensor: {}'.format(labels.shape))

Shape of question1 data tensor: (298526, 13)
Shape of question2 data tensor:(298526, 13)
Shape of label tensor: (298526,)


In [28]:
# Vocabulary metadata that is saved alongside the preprocessed arrays.
data_configs = {}
data_configs['vocab'] = word_vocab
# The Keras Tokenizer numbers words from 1 and never assigns 0, which is the
# value pad_sequences fills with. The largest token id is therefore
# len(word_vocab), and any lookup table sized from this field needs one extra
# row to cover ids 0..len(word_vocab).
data_configs['vocab_size'] = len(word_vocab) + 1

In [29]:
# Persist the preprocessed training arrays.
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(DATA_IN_PATH, exist_ok=True)

# Open each target in a `with` block so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(DATA_IN_PATH + TRAIN_Q1_DATA, 'wb') as q1_file:
    np.save(q1_file, q1_data)
with open(DATA_IN_PATH + TRAIN_Q2_DATA, 'wb') as q2_file:
    np.save(q2_file, q2_data)
with open(DATA_IN_PATH + TRAIN_LABEL_DATA, 'wb') as label_file:
    np.save(label_file, labels)

In [30]:
# Serialise the vocabulary config to JSON in the data directory.
config_path = DATA_IN_PATH + NB_WORDS_DATA
with open(config_path, 'w') as config_file:
    json.dump(data_configs, config_file)

## Preprocessing for the Test Set

In [38]:
# File names for the padded test-set question arrays (written under DATA_IN_PATH).
TEST_Q1_DATA = 'test_q1.npy'
TEST_Q2_DATA = 'test_q2.npy'

In [44]:
# Load the test-set question pairs from the data directory.
test_df = pd.read_csv(DATA_IN_PATH + 'test.csv', encoding='utf-8')
# De-duplication is left disabled, so every row of test.csv is kept.
# test_df = test_df.drop_duplicates()

In [45]:
# Number of rows in the test frame.
len(test_df)

2345796

In [36]:
# Preview the first five test rows.
test_df.head(5)

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [37]:
# Tokenise the test questions with the tokenizer fitted on the training set.
# Missing questions are replaced by '' first: str(NaN) would otherwise yield the
# literal text 'nan', which would be looked up as if it were a real word.
# No rows are dropped, so there is still one sequence per test row.
test_question1_word_sequences = tokenizer.texts_to_sequences(
    test_df['question1'].fillna('').astype(str).tolist()
)
test_question2_word_sequences = tokenizer.texts_to_sequences(
    test_df['question2'].fillna('').astype(str).tolist()
)

KeyboardInterrupt: 

In [None]:
# Pad the test sequences to MAX_SEQUENCE_LENGTH with the same 'post' padding
# used for the training arrays.
test_q1_data, test_q2_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    for sequences in (test_question1_word_sequences, test_question2_word_sequences)
)

In [None]:
# Report the shapes of the padded test arrays.
print('Shape of test question1 data tensor:', test_q1_data.shape)
print('Shape of test question2 data tensor:', test_q2_data.shape)

In [None]:
# Persist the padded test arrays. Each file is opened in a `with` block so the
# handle is flushed and closed deterministically rather than left open.
with open(DATA_IN_PATH + TEST_Q1_DATA, 'wb') as test_q1_file:
    np.save(test_q1_file, test_q1_data)
with open(DATA_IN_PATH + TEST_Q2_DATA, 'wb') as test_q2_file:
    np.save(test_q2_file, test_q2_data)