## 5.2 Data Preprocessing for TensorFlow

In [None]:
import pandas as pd
import numpy as np
import re
import os
import random

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

import json

## Preprocessing for the Training Set

In [None]:
# Limits handed to the Keras helpers: Tokenizer(num_words=...) and
# pad_sequences(maxlen=...).
MAX_SEQUENCE_LENGTH = 40
MAX_NB_WORDS = 200000

# Directory the CSV files are read from and the preprocessed files are
# written to.
DATA_IN_PATH = './data_in/'

# File names (relative to DATA_IN_PATH) of the preprocessed training output.
TRAIN_Q1_DATA = 'train_q1.npy'
TRAIN_Q2_DATA = 'train_q2.npy'
TRAIN_LABEL_DATA = 'train_label.npy'
NB_WORDS_DATA = 'nb_words.json'

In [None]:
# Load the training CSV and discard every row that contains a missing value.
train_df = pd.read_csv(DATA_IN_PATH + 'train.csv', encoding='utf-8').dropna()

#### Balancing the classes by downsampling the negative (non-duplicate) examples

In [None]:
# Split the training rows by label: is_duplicate == 1 (positive) versus
# is_duplicate == 0 (negative).
train_pos_df = train_df.loc[train_df['is_duplicate'] == 1]
train_neg_df = train_df.loc[train_df['is_duplicate'] == 0]

# How many more negative rows than positive rows there are.
class_diff_count = train_neg_df.shape[0] - train_pos_df.shape[0]

# Fraction of the negative rows to keep so both classes end up the same size.
# Downsample only when negatives really outnumber positives: otherwise the
# fraction would exceed 1 (DataFrame.sample raises ValueError without
# replace=True) or, with no negative rows at all, divide by zero.
sample_frac = 1.0
if class_diff_count > 0:
    sample_frac = 1 - (class_diff_count / train_neg_df.shape[0])
    # NOTE(review): no random_state is passed, so a different subset of
    # negatives is drawn on every run.
    train_neg_df = train_neg_df.sample(frac=sample_frac)

# Recombine: (possibly downsampled) negatives first, then all positives.
train_df = pd.concat([train_neg_df, train_pos_df])

In [None]:
# Print the row count of each class: negatives first, then positives.
for class_df in (train_neg_df, train_pos_df):
    print(len(class_df['question1']))

In [None]:
# Show the number of non-null values in each column of train_df.
train_df.count()

In [None]:
# Regex character class of the punctuation removed from every question.
FILTERS = "([~.,!?\"':;)(])"

changeFilter = re.compile(FILTERS)

# Text of both question columns (all of question1 followed by all of
# question2), with every value coerced to str.
questions = list(map(str, train_df['question1'])) + list(map(str, train_df['question2']))

# Strip the filtered punctuation from each question, then lower-case it.
filtered_questions = [changeFilter.sub("", q).lower() for q in questions]

In [None]:
# Inspect the first filtered question.
filtered_questions[0]

In [None]:
# Show the number of non-null values in each column of train_df.
train_df.count()

In [None]:
# Build the vocabulary from the text of both question columns.
# NOTE(review): the tokenizer is fit on the raw column text; the
# `filtered_questions` list built in an earlier cell is not used here --
# confirm this is intended.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
questions = list(train_df['question1']) + list(train_df['question2'])
tokenizer.fit_on_texts(questions)

# Convert each question column into lists of word indices.
question1_word_sequences = tokenizer.texts_to_sequences(train_df['question1'])
question2_word_sequences = tokenizer.texts_to_sequences(train_df['question2'])

# The tokenizer's word index doubles as the vocabulary that is saved later.
word_index = tokenizer.word_index
word_vocab = word_index
print("Words in index: {}".format(len(word_vocab)))

In [None]:
# Inspect the word-index sequence produced for the first question1 entry.
question1_word_sequences[0]

In [None]:
# Keep the label column; it is converted to a NumPy array in the next cell.
is_duplicate = train_df['is_duplicate']

In [None]:
# Bring every index sequence to MAX_SEQUENCE_LENGTH entries, padding at the
# end ('post').
q1_data, q2_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    for sequences in (question1_word_sequences, question2_word_sequences)
)

# Labels as an integer NumPy array.
labels = np.array(is_duplicate, dtype=int)

# Report the shape of each array that will be saved.
for caption, tensor in (
    ('Shape of question1 data tensor: {}', q1_data),
    ('Shape of question2 data tensor:{}', q2_data),
    ('Shape of label tensor: {}', labels),
):
    print(caption.format(tensor.shape))

In [None]:
# Metadata stored next to the arrays: the tokenizer vocabulary and its size.
data_configs = {
    'vocab': word_vocab,
    'vocab_size': len(word_vocab),  # vocabulary size
}

In [None]:
# Create the output directory if it is missing.
if not os.path.exists(DATA_IN_PATH):
    os.makedirs(DATA_IN_PATH)

# Write the padded question arrays and the labels as .npy files. Each file is
# opened in a `with` block so the handle is flushed and closed as soon as the
# array has been written, instead of being left open until garbage collection.
with open(DATA_IN_PATH + TRAIN_Q1_DATA, 'wb') as q1_file:
    np.save(q1_file, q1_data)
with open(DATA_IN_PATH + TRAIN_Q2_DATA, 'wb') as q2_file:
    np.save(q2_file, q2_data)
with open(DATA_IN_PATH + TRAIN_LABEL_DATA, 'wb') as label_file:
    np.save(label_file, labels)

In [None]:
# Persist the vocabulary metadata as JSON in the data directory.
with open(DATA_IN_PATH + NB_WORDS_DATA, 'w') as config_file:
    config_file.write(json.dumps(data_configs))

## Preprocessing for the Test Set

In [None]:
# File names (relative to DATA_IN_PATH) of the preprocessed test-set arrays.
TEST_Q1_DATA = 'test_q1.npy'
TEST_Q2_DATA = 'test_q2.npy'

In [None]:
# Load the test CSV and remove rows that are exact duplicates of another row.
test_df = pd.read_csv(DATA_IN_PATH + 'test.csv', encoding='utf-8').drop_duplicates()

In [None]:
# Preview the first five rows of the test set.
test_df.head(5)

In [None]:
# Convert the test questions to word-index sequences with the tokenizer that
# was fit on the training questions; every value is coerced to str first.
test_q1_texts = list(map(str, test_df['question1']))
test_q2_texts = list(map(str, test_df['question2']))

test_question1_word_sequences = tokenizer.texts_to_sequences(test_q1_texts)
test_question2_word_sequences = tokenizer.texts_to_sequences(test_q2_texts)

In [None]:
# Bring every test sequence to MAX_SEQUENCE_LENGTH entries, padding at the
# end ('post'), using the same settings as for the training data.
test_q1_data, test_q2_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    for sequences in (test_question1_word_sequences, test_question2_word_sequences)
)

In [None]:
# Report the shape of each padded test array.
for caption, tensor in (
    ('Shape of test question1 data tensor:', test_q1_data),
    ('Shape of test question2 data tensor:', test_q2_data),
):
    print(caption, tensor.shape)

In [None]:
# Write the padded test arrays as .npy files. Each file is opened in a `with`
# block so the handle is flushed and closed as soon as the array has been
# written, instead of being left open until garbage collection.
with open(DATA_IN_PATH + TEST_Q1_DATA, 'wb') as q1_file:
    np.save(q1_file, test_q1_data)
with open(DATA_IN_PATH + TEST_Q2_DATA, 'wb') as q2_file:
    np.save(q2_file, test_q2_data)