In [1]:
import candidates_listing as cl
import findDocuments as fd
import json
import numpy as np
import os
import preprocessing as prep
import rnn_compare_twotext as rc
import sentence_vectorization as sv

from keras.layers import Input
from keras.models import Model
from pprint import pprint

Using TensorFlow backend.


In [90]:
def get_original_token_positions(document_id, documents_path):
    """Read one document's JSON file and return two fields of its cleaned form.

    The file ``<documents_path>/<document_id>.json`` is parsed as UTF-8
    (undecodable bytes are dropped) and handed to ``prep.remove_noise``.

    Args:
        document_id: Identifier used as the file stem; converted with ``str``.
        documents_path: Directory that contains the per-document JSON files.

    Returns:
        A 2-tuple holding items ``[1]`` and ``[2]`` of the value returned by
        ``prep.remove_noise``. The caller in this notebook names them
        ``original_index`` and ``original_lengths`` -- presumably per-token
        positions and lengths in the raw text; confirm against
        ``prep.remove_noise``.
    """
    json_name = '{}.json'.format(str(document_id))
    with open(os.path.join(documents_path, json_name), 'r', encoding='utf-8', errors='ignore') as handle:
        cleaned = prep.remove_noise(json.load(handle))

    return cleaned[1], cleaned[2]

def get_sentence_vectorization_layer(model, idx=5):
    """Build a sub-model that maps ``model``'s input to one of its inner layers.

    Args:
        model: A Keras model exposing ``input`` and ``get_layer(index=...)``.
        idx: Position of the layer whose output becomes the sub-model's
            output. Defaults to 5.

    Returns:
        A ``keras.models.Model`` that shares ``model``'s input and ends at
        the output of the layer at position ``idx``.
    """
    from keras.models import Model

    # Keras 2 spells these keywords `inputs` / `outputs` (the singular forms
    # are the deprecated Keras 1 spelling); the model-assembly cell of this
    # notebook already uses the plural form.
    return Model(inputs=model.input, outputs=model.get_layer(index=idx).output)

def load_json_doc_ids(path='D:/Users/Patdanai/Workspace/th-qa-system-261491/data/'):
    """Parse the JSON file at ``path`` and return its content unchanged.

    Args:
        path: Location of a JSON file holding candidate document ids.
            NOTE(review): the default value ends with a path separator and so
            appears to name a folder rather than a file -- callers presumably
            always pass an explicit file path; confirm.

    Returns:
        Whatever object ``json.load`` produces for the file.
    """
    with open(path, 'r') as id_file:
        return json.load(id_file)
    
def load_corpus_word_vectors(path='D:/Users/Patdanai/th-qasys-db/word_vectors_model/word2vec.model'):
    """Load a saved gensim Word2Vec model and return its ``wv`` attribute.

    Args:
        path: File path of the model saved by gensim.

    Returns:
        The ``wv`` attribute of the loaded ``Word2Vec`` instance (the
        word-to-vector lookup used by the rest of this notebook).
    """
    from gensim.models import Word2Vec

    return Word2Vec.load(path).wv

def load_document_word_vectors(document_ids, wv_path='D:/Users/Patdanai/th-qasys-db/preprocessed_corpus_wv'):
    """Load the ``.npy`` array stored for one document.

    Args:
        document_ids: Identifier of a single document (despite the plural
            name); converted with ``str`` to form the file stem.
        wv_path: Directory containing ``<id>.npy`` files. A trailing path
            separator is optional.

    Returns:
        The array returned by ``np.load`` for ``<wv_path>/<id>.npy``.
    """
    # os.path.join inserts the separator only when it is missing, so both the
    # default directory (no trailing slash) and a caller-supplied directory
    # ending in '/' resolve to the same file. Plain string concatenation
    # produced '...preprocessed_corpus_wv<id>.npy' for the default value.
    return np.load(os.path.join(wv_path, str(document_ids) + '.npy'))

def load_document_tokens(document_id, wv_path='D:/Users/Patdanai/th-qasys-db/preprocessed_corpus_wv'):
    """Load the JSON file stored for one document.

    Args:
        document_id: Identifier of the document; may be an int or a string
            and is converted with ``str`` to form the file stem.
        wv_path: Directory containing ``<id>.json`` files. A trailing path
            separator is optional.

    Returns:
        The object parsed from ``<wv_path>/<id>.json``. The file is decoded
        as UTF-8 and undecodable bytes are dropped.
    """
    # str() lets integer ids through (concatenating an int to a str raised
    # TypeError), and os.path.join supplies the separator that the default
    # directory lacks.
    json_path = os.path.join(wv_path, str(document_id) + '.json')
    with open(json_path, encoding='utf-8', errors='ignore') as f:
        return json.load(f)

def load_sentence_vectorization_model(model_path):
    """Load a saved Keras model, print its summary, and return it.

    Args:
        model_path: Path handed to ``keras.models.load_model``.

    Returns:
        The loaded model. As a side effect its layer summary is written to
        stdout via ``summary()``.
    """
    from keras.models import load_model

    loaded = load_model(model_path)
    loaded.summary()
    return loaded

def load_tokenized_questions(path):
    """Load the tokenized questions stored in a JSON file.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A 2-tuple ``(questions, count)`` where ``questions`` is the parsed
        JSON content and ``count`` is ``len(questions)``.
    """
    # Decode explicitly as UTF-8, like the other JSON loaders in this
    # notebook. Without it open() falls back to the platform locale encoding,
    # which can mis-decode non-ASCII tokens; the mangled tokens would then
    # silently miss every word-vector lookup.
    with open(path, 'r', encoding='utf-8') as f:
        questions = json.load(f)

    return questions, len(questions)

def calculate_distance(vectorized_question, vectorized_sentences):
    """Compute the Euclidean distance from a question to every candidate sentence.

    Args:
        vectorized_question: Vector of the question; it is subtracted from
            each sentence vector, so it must be broadcast-compatible with one.
        vectorized_sentences: Sequence with one entry per candidate document;
            each entry is a sequence of sentence vectors.

    Returns:
        A list with one 1-D float array per document. Element ``j`` of array
        ``i`` is ``np.linalg.norm(vectorized_sentences[i][j] - vectorized_question)``.
        A document without sentences yields an empty array.
    """
    distance_matrices = []
    for sentence_vectors in vectorized_sentences:
        # Read from the `vectorized_sentences` argument. The earlier version
        # indexed a global named `candidate_sentence_vectors`, which is not
        # defined in this notebook and raised NameError.
        distances = np.array(
            [np.linalg.norm(sentence_vector - vectorized_question) for sentence_vector in sentence_vectors],
            dtype=float,
        )
        distance_matrices.append(distances)

    return distance_matrices

def sort_distances(distance_matrices, max_num_candidate=8):
    """Rank each document's sentences by ascending distance and keep the closest.

    Args:
        distance_matrices: Sequence with one 1-D array of distances per
            candidate document.
        max_num_candidate: Maximum number of sentences kept per document.

    Returns:
        A 2-tuple ``(min_distance_indexes, ordered_distance_matrices)``:

        * ``min_distance_indexes`` -- per document, the indexes of the kept
          sentences in ascending-distance order. It is a regular 2-D integer
          array when every document keeps the same number of sentences, and
          otherwise a 1-D object array holding one index array per document.
        * ``ordered_distance_matrices`` -- list of arrays with the matching
          distances, in the same order.
    """
    min_distance_indexes = []
    ordered_distance_matrices = []
    for distances in distance_matrices:
        distances = np.asarray(distances)
        # A slice already clamps to the available length, so documents with
        # fewer than `max_num_candidate` sentences need no separate branch.
        order = distances.argsort()[:max_num_candidate]
        min_distance_indexes.append(order)
        # Reuse the ranking instead of sorting the same values a second time.
        ordered_distance_matrices.append(distances[order])

    kept_counts = {len(order) for order in min_distance_indexes}
    if len(kept_counts) <= 1:
        return np.asarray(min_distance_indexes), ordered_distance_matrices

    # Documents kept different numbers of sentences. np.asarray on such a
    # ragged list raises ValueError on recent NumPy releases, so build the
    # object array explicitly, one index array per slot.
    ragged_indexes = np.empty(len(min_distance_indexes), dtype=object)
    for position, order in enumerate(min_distance_indexes):
        ragged_indexes[position] = order
    return ragged_indexes, ordered_distance_matrices

def locate_plain_text_characters(sentence_ranges, min_distance_indexes, original_tokens_ranges):
    """Map the ranked candidate sentences of each document to original token entries.

    Args:
        sentence_ranges: Per document, a sequence of ``(start, stop)`` token
            index pairs, one per candidate sentence (``stop`` is exclusive).
        min_distance_indexes: Per document, the indexes of the selected
            sentences, in ranking order.
        original_tokens_ranges: Per document, a sequence indexed by token
            position. Its entries are copied into the result as-is --
            presumably character positions of each token in the plain text;
            confirm against the caller.

    Returns:
        A 2-tuple ``(plain_text_character_positions, sentence_indexes)``:

        * ``plain_text_character_positions[i][j]`` -- list of the
          ``original_tokens_ranges[i]`` entries for every token of the
          ``j``-th ranked sentence of document ``i``. A token index beyond
          the end of the document falls back to the document's last entry.
        * ``sentence_indexes[i][j]`` -- the ``(start, stop)`` pair of that
          sentence.
    """
    plain_text_character_positions = []
    sentence_indexes = []
    for doc_idx, ranked_sentences in enumerate(min_distance_indexes):
        doc_positions = []
        doc_ranges = []
        for sentence_idx in ranked_sentences:
            sentence_range = sentence_ranges[doc_idx][sentence_idx]
            sentence_positions = []
            for token_idx in range(sentence_range[0], sentence_range[1]):
                try:
                    position = original_tokens_ranges[doc_idx][token_idx]
                except IndexError:
                    # The sentence window runs past the last token: reuse the
                    # final entry. Only IndexError is handled so that any
                    # other failure surfaces instead of being masked.
                    position = original_tokens_ranges[doc_idx][-1]
                sentence_positions.append(position)
            doc_positions.append(sentence_positions)
            doc_ranges.append(sentence_range)
        plain_text_character_positions.append(doc_positions)
        sentence_indexes.append(doc_ranges)
    return plain_text_character_positions, sentence_indexes

def locate_candidate_answers(vectorized_question, vectorized_sentences, sentence_ranges,
                                original_tokens_ranges, max_num_candidate=8):
    """Run the distance -> ranking -> token-lookup pipeline for one question.

    Args:
        vectorized_question: Question vector passed to ``calculate_distance``.
        vectorized_sentences: Per-document sentence vectors passed to
            ``calculate_distance``.
        sentence_ranges: Per-document sentence token ranges passed to
            ``locate_plain_text_characters``.
        original_tokens_ranges: Per-document token entries passed to
            ``locate_plain_text_characters``.
        max_num_candidate: Maximum number of sentences kept per document.

    Returns:
        A 3-tuple: the first result of ``locate_plain_text_characters``, the
        sorted distances from ``sort_distances``, and the second result of
        ``locate_plain_text_characters``.
    """
    distances = calculate_distance(vectorized_question, vectorized_sentences)
    ranked_indexes, ranked_distances = sort_distances(distances, max_num_candidate=max_num_candidate)
    character_positions, ranked_sentence_ranges = locate_plain_text_characters(
        sentence_ranges, ranked_indexes, original_tokens_ranges)

    return character_positions, ranked_distances, ranked_sentence_ranges

def vectorize_question_tokens(tokenized_question, word_vectors, embedded_question=None, embedding_shape=(100, ), words_per_sentence=20):
    """Embed a tokenized question as a fixed-length sequence of word vectors.

    Args:
        tokenized_question: Sequence of tokens of one question.
        word_vectors: Mapping-like object indexed by token; a ``KeyError``
            is treated as "token unknown".
        embedded_question: Optional vectors to place before the question's
            own tokens. It is copied, never modified. Defaults to none.
        embedding_shape: Shape of the zero vector used for unknown tokens
            and for padding.
        words_per_sentence: Length of the returned sequence.

    Returns:
        ``np.ndarray`` built from exactly ``words_per_sentence`` vectors.
        Shorter questions are padded with zero vectors at the front; longer
        ones keep only their first ``words_per_sentence`` vectors.
    """
    # Start from a fresh list on every call. The former `[]` default was one
    # shared list object, so vectors from earlier questions leaked into later
    # calls.
    embedded = [] if embedded_question is None else list(embedded_question)

    for token in tokenized_question:
        try:
            embedded.append(word_vectors[token])
        except KeyError:
            # Out-of-vocabulary token: represent it with a zero vector.
            embedded.append(np.zeros(embedding_shape))

    missing = words_per_sentence - len(embedded)
    if missing > 0:
        # Left-pad so the question's tokens sit at the end of the sequence.
        embedded = [np.zeros(embedding_shape) for _ in range(missing)] + embedded
    else:
        embedded = embedded[:words_per_sentence]

    return np.asarray(embedded)

In [None]:
# --- Configuration -----------------------------------------------------------
# NOTE(review): the D:/ locations are absolute paths on one machine; point them
# at your own copy of the data before running.
DOCUMENTS_PATH = 'D:/Users/Patdanai/th-qasys-db/tokenized_wiki_corpus/'
MODEL_PATH = '../models/compare_model_v3/191-0.7933.h5'
SV_PATH = 'D:/Users/Patdanai/th-qasys-db/corpus_sv/'
WV_PATH = 'D:/Users/Patdanai/th-qasys-db/preprocessed_corpus_wv/'
WV_MODEL_PATH = 'D:/Users/Patdanai/th-qasys-db/word_vectors_model/word2vec.model'
questions_path = '../results/question_sentence_tokens/question_sentence_tokens.json'
# Alternative question sets:
# questions_path = 'C:/Users/Patdanai/Workspace/th-qa-system-261491/data/ThaiQACorpus-EvaluationDataset-tokenize.json'
# questions_path = 'C:/Users/Patdanai/Workspace/th-qa-system-261491/data/new_sample_questions_tokenize.json'

WORDS_PER_SENTENCE = 20
OVERLAPPING_WORDS = WORDS_PER_SENTENCE // 2

word_vectors = load_corpus_word_vectors(path=WV_MODEL_PATH)

# --- Model assembly ----------------------------------------------------------
# Both sub-models read their weights from the same checkpoint, matched by
# layer name.
qsv_model = rc.sentenceVector()
qsv_model.load_weights(MODEL_PATH, by_name=True)
# print(qsv_model.summary())

# The question enters as a sequence of word vectors and is turned into a
# sentence vector by qsv_model; the candidate sentence enters as a vector of
# size rc.rnn_size.
question_wv_seq = Input(shape=(rc.sentence_length, rc.word_vector_length,))
vectorized_qs = qsv_model(question_wv_seq)
candidate_sentence_seq = Input(shape=(rc.rnn_size,))

sc_model = rc.sentenceCompare()
sc_model.load_weights(MODEL_PATH, by_name=True)
# print(sc_model.summary())

similarity = sc_model([candidate_sentence_seq, vectorized_qs])

model = Model(inputs=[candidate_sentence_seq, question_wv_seq], outputs=similarity)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()

In [85]:
# Load the tokenized questions; `questions_num` is the number of questions.
tokenized_questions, questions_num = load_tokenized_questions(questions_path)
print(np.array(tokenized_questions).shape, questions_num)

begin_question = 0

# Candidate document ids, read from a JSON file through the notebook's own
# loader (same open/json.load as before).
candidate_document_ids = load_json_doc_ids('../data/test_doc_ids.json')

candidate_answers = []
part = 0
print('Part: %d' % part)

(4000,) 4000
Part: 0


In [89]:
# One iteration per question. `i` is kept as the loop variable because the
# following cells read it after this loop finishes.
for i in range(len(candidate_document_ids)):
    print('Processing question [' + str(i) + '/' + str(len(candidate_document_ids)) + '] candidate documents. \r')

    # Per-question accumulators, one entry per candidate document.
    documents_index, documents_lengths = [], []  # values from get_original_token_positions
    array_of_wvs, tokenized_docs = [], []

    for j, document_id in enumerate(candidate_document_ids[i]):
        original_index, original_lengths = get_original_token_positions(document_id, DOCUMENTS_PATH)
        # Arrays stored under SV_PATH as 's-<id>.npy' -- presumably the
        # precomputed sentence vectors of the document; confirm.
        sentence_vectors = np.load(SV_PATH + 's-' + str(document_id) + '.npy')
        array_of_wvs.append(sentence_vectors)
        documents_index.append(original_index)
        documents_lengths.append(original_lengths)
        tokenized_docs.append(load_document_word_vectors(document_id, WV_PATH))

    array_of_wvs = np.array(array_of_wvs)
    print(array_of_wvs.shape)

    # Split the documents into windows of WORDS_PER_SENTENCE entries that
    # overlap by half a window; item 0 holds the windows, item 1 their index
    # ranges.
    m_words = prep.m_words_separate(WORDS_PER_SENTENCE, tokenized_docs, overlapping_words=WORDS_PER_SENTENCE//2)
    m_words_sentences = m_words[0]
    m_words_index_ranges = m_words[1]

    print(m_words_index_ranges)

Processing question [0/1] candidate documents. 
(1, 5, 16)
[[[ 0 20]
  [10 30]
  [20 40]
  [30 50]
  [40 60]
  [50 70]
  [60 80]
  [63 83]
  [63 83]]]


In [64]:
    # Embed the tokens of question `i` (the index left over from the loop
    # cell) as a batch holding a single sequence of word vectors.
    token_vectors = []
    for token in tokenized_questions[i]['sentence_tokens']:
        try:
            token_vectors.append(word_vectors[token])
        except KeyError:
            # Out-of-vocabulary token: use a zero vector.
            token_vectors.append(np.zeros((rc.word_vector_length, )))
    question_wv = np.array([token_vectors])
    print(question_wv.shape)

    # Keep at most rc.sentence_length tokens. A longer question used to make
    # the padding length negative, which np.zeros rejects.
    question_wv = question_wv[:, :rc.sentence_length]

    # Left-pad with zero vectors up to rc.sentence_length (np.zeros already
    # yields zeros, so no extra fill is needed).
    left_padding = np.zeros((question_wv.shape[0], rc.sentence_length - question_wv.shape[1], rc.word_vector_length))
    question_wv = np.concatenate((left_padding, question_wv), axis=1)

    # Repeat the question along the batch axis array_of_wvs.shape[1] times so
    # that it pairs with every row of one document's array.
    question_wv = np.repeat(question_wv, array_of_wvs.shape[1], axis=0)

    print(question_wv.shape)

(1, 37, 100)
(5, 40, 100)


In [65]:
        # Score each candidate document's array against the repeated question
        # and print the model output for it.
        for j, document_vectors in enumerate(array_of_wvs):
            prediction = model.predict([document_vectors, question_wv])
            print(prediction)

[[0.7976213]
 [0.7983011]
 [0.8012408]
 [0.7928511]
 [0.7936725]]
