In [1]:
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, concatenate, Input, Reshape, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.models import load_model
import csv
import os
import errno
import operator
import sys
import pickle
import json
from pprint import pprint
import re

Using TensorFlow backend.


#### parsing train file

In [24]:
# Load the training set. The file holds Chinese text, so the encoding is
# pinned instead of relying on the platform's default text encoding.
with open('data/train-v1.1.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Show only the top-level keys and the number of entries; echoing the whole
# parsed dataset floods the notebook output.
list(data.keys()), len(data['data'])

#### parsing test file

In [4]:
# Load the test set. The encoding is pinned instead of relying on the
# platform's default text encoding.
# NOTE: this rebinds `data`, replacing whatever an earlier load cell stored
# there; run only the load cell that matches `processing_train_data`.
with open('data/test-v1.1.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

## start preprocessing

In [25]:
# Flatten the dataset: each entry of data['data'] carries a list of
# paragraphs, and every paragraph dict (one context with its qas) is
# collected in order.
context_qas_list = [
    paragraph
    for article in data['data']
    for paragraph in article['paragraphs']
]

In [26]:
# Peek at a single paragraph record to inspect its structure.
context_qas_list[10]

{'context': '作為近代革命發源地之一，廣州自中華民國代時就是中國社會運動的中心之一。每次全國性的社會運動都有廣州民眾的響應和參與。以廣州為中心的較具規模的社會運動，最早有1925年至1926年在廣州和香港同時舉辦的省港大罷工。\n廣州市民在1989年更發起活動聲援天安門民主運動，百萬人聚集海珠廣場圍繞廣州解放紀念碑集會。初期廣州媒體以「愛國運動」名義支持。流血事件發生後，民眾暴動，省市政府機關均被衝擊，所有廣州媒體隨即被禁言，亦干擾和封鎖香港電視台。省政府出動軍警鎮壓遊行群眾，大批參與者偷渡至香港、台灣及海外。各企事業單位亦派員審查各部門職工是否有組織或參與集會。\n1999年的全國性反美活動，有數十萬群眾、學生在市內遊行示威，抗議北約轟炸中國駐南斯拉夫大使館，同時駐廣州美國領事館也受到部分激進示威人士破壞。廣州媒體對此進行全程跟進，但對廣州美國領館破壞情況則完全沒有提及。\n2005年的全國性反日示威，也有數十萬人在主幹道遊行，不過廣州封鎖消息，大學和中學也禁止學生遊行，否則開除學籍。',
 'qas': [{'answers': [{'answer_start': 93, 'text': '香港'}],
   'id': 'e5d3c55d-41cd-42c6-93f9-e90186c38a7f',
   'question': '省港大罷工是除了廣州以外還有哪個地區參與？'},
  {'answers': [{'answer_start': 161, 'text': '愛國運動'}],
   'id': '22a717ee-6026-4e1e-818b-1e1678b8947f',
   'question': '廣州媒體聲援天安門民主運動一開始以甚麼名義？'},
  {'answers': [{'answer_start': 311, 'text': '北約轟炸中國駐南斯拉夫大使館'}],
   'id': '0897c133-c395-484f-b37d-d057ea8323a3',
   'question': '在全國性反日示威的6年前的抗議活動是在抗議甚麼？'}]}

In [80]:
# --- Configuration -----------------------------------------------------------
# True (better F1 score): count question characters that occur in the segment.
# False: count segment characters that occur in the question.
find_question_chars_in_segment = True
processing_train_data = True
print_questions_and_segments = False
append_another_segment_after = True  # train mode only: append the sentence after the best one
append_another_segment_before = False  # train mode only: prepend the sentence before the best one


def _count_char_overlaps(question, segment, question_chars_in_segment=True):
    """Count the characters shared by a question and a segment.

    Args:
        question: question string.
        segment: candidate segment string.
        question_chars_in_segment: when True, count each character of
            `question` (repeats included) that occurs in `segment`; when
            False, count each character of `segment` that occurs in `question`.

    Returns:
        The overlap count as an int.
    """
    if question_chars_in_segment:
        lookup, scanned = set(segment), question
    else:
        lookup, scanned = set(question), segment
    return sum(1 for char in scanned if char in lookup)


def _find_most_similar_segment(question, segments, question_chars_in_segment=True):
    """Pick the segment with the highest character overlap with `question`.

    Args:
        question: question string.
        segments: non-empty list of segments obtained by splitting a context
            on '。'.
        question_chars_in_segment: forwarded to `_count_char_overlaps`.

    Returns:
        A tuple `(index, start_offset, text)` where `start_offset` is the
        position of the segment inside the unsplit context. Ties go to the
        earliest segment. When no segment overlaps at all, the first segment
        is returned so that text, index and offset always describe the same
        sentence.
    """
    best_idx = 0
    best_start = 0
    best_overlaps = 0
    start = 0
    for idx, segment in enumerate(segments):
        overlaps = _count_char_overlaps(question, segment, question_chars_in_segment)
        if overlaps > best_overlaps:
            best_idx, best_start, best_overlaps = idx, start, overlaps
        start += len(segment) + 1  # +1 for the '。' removed by the split
    return best_idx, best_start, segments[best_idx]


# --- Output lists (parallel: one entry per question) --------------------------
question_text = []
segment_start_position = []
segment_text = []
if processing_train_data:  # train file
    answer_start_position = []
    answer_text = []
else:  # test file
    question_id = []

# TODO: maybe try other similarity measures than raw character overlap.
# TODO: maybe remove question characters from the selected segment.
for context_qas in context_qas_list:
    context = context_qas['context']
    context_segments = context.split('。')  # prev: ，|。

    for qa in context_qas['qas']:
        qa_question = qa['question']
        segment_idx, segment_start, segment = _find_most_similar_segment(
            qa_question, context_segments, find_question_chars_in_segment)

        if print_questions_and_segments:
            print('question: ', qa_question)
            print('answer: ', segment)
            print('should be same as answer: ', context[segment_start:segment_start + len(segment)])

        if not processing_train_data:
            # NOTE(review): test mode keeps the single best sentence, while
            # train mode may add a neighbouring sentence - confirm this
            # asymmetry is intended.
            question_id.append(qa['id'])
            question_text.append(qa_question)
            segment_text.append(segment)
            segment_start_position.append(segment_start)
            continue

        # Answers are only read in train mode, the only mode that uses them.
        first_answer = qa['answers'][0]
        question_text.append(qa_question)
        answer_start_position.append(first_answer['answer_start'])
        answer_text.append(first_answer['text'])

        if append_another_segment_after and segment_idx + 1 < len(context_segments):
            segment = segment + '。' + context_segments[segment_idx + 1]
        elif append_another_segment_before and segment_idx >= 1:
            previous_segment = context_segments[segment_idx - 1]
            segment = previous_segment + '。' + segment
            # The stored text now begins one sentence earlier, so the start
            # offset has to move back with it.
            segment_start -= len(previous_segment) + 1
        segment_text.append(segment)
        segment_start_position.append(segment_start)
print('finished')

finished


#### check test file results

In [15]:
# Print the entry at one position of each test-mode output list.
peek_idx = 5
for values in (question_id, question_text, segment_text, segment_start_position):
    print(values[peek_idx])

d08d2f0a-e8a2-4f5d-980c-5bad4a2b434b
黃河為海河流域以及哪個流域的分水嶺?
由於黃河泥沙量大，下遊河段長期淤積形成舉世聞名的「地上河」，黃河約束在大堤內成為海河流域與淮河流域的分水嶺
93


#### check train file results

In [81]:
# Print the entry at one position of each train-mode output list, labelled.
peek_idx = 111
labelled_lists = (
    ('question_text:', question_text),
    ('answer_start_position:', answer_start_position),
    ('answer_text:', answer_text),
    ('segment_start_position:', segment_start_position),
    ('segment_text:', segment_text),
)
for label, values in labelled_lists:
    print(label, values[peek_idx])

question_text: 位於廣州的白雲山有甚麼之稱？
answer_start_position: 99
answer_text: 市肺
segment_start_position: 22
segment_text: 廣州11個市轄區總面積7434.4平方公里。地勢東北高、西南低，背山面海，北部是森林集中的丘陵山區，最高峰為北部從化區與惠州市龍門縣交界處的天堂頂，海拔為1210米；東北部為中低山地，市區有被稱為「市肺」的白雲山；中部是丘陵盆地，南部為沿海沖積平原，為珠江三角洲的組成部分


In [82]:
# Number of answers collected (one per question in train mode).
len(answer_text)

14611

In [83]:
# Flag every example whose answer span has no overlap with its selected
# segment. Spans are compared as inclusive [first, last] character ranges
# within the context.
bad_idx = []
for idx in range(len(answer_text)):
    answer_first = answer_start_position[idx]
    answer_last = answer_first + len(answer_text[idx]) - 1
    segment_first = segment_start_position[idx]
    segment_last = segment_first + len(segment_text[idx]) - 1
    spans_overlap = answer_first <= segment_last and answer_last >= segment_first
    if not spans_overlap:
        bad_idx.append(idx)
count = len(bad_idx)
print('finished')

finished


In [84]:
# Fraction of examples whose answer was flagged as not covered by the
# selected segment. Values noted by the author for the different settings:
# 0.164 of 1-sentence segments do not cover correct answer
# 0.127 of 2-sentence (extra sentence after) segments do not cover correct answer
# 0.138 of 2-sentence (extra sentence before) segments do not cover correct answer
print(count / len(answer_text))

0.13845732667168573


In [75]:
# Drop the examples flagged in bad_idx from all five parallel lists.
# Membership is tested against a set: with the plain list every `in` check
# was a linear scan (the original note on this cell said it took 10 seconds).
# NOTE: bad_idx holds positions in the unfiltered lists, so do not re-run this
# cell without recomputing bad_idx first.
bad_idx_lookup = set(bad_idx)
question_text = [value for i, value in enumerate(question_text) if i not in bad_idx_lookup]
answer_start_position = [value for i, value in enumerate(answer_start_position) if i not in bad_idx_lookup]
answer_text = [value for i, value in enumerate(answer_text) if i not in bad_idx_lookup]
segment_start_position = [value for i, value in enumerate(segment_start_position) if i not in bad_idx_lookup]
segment_text = [value for i, value in enumerate(segment_text) if i not in bad_idx_lookup]

In [77]:
# Number of questions left after filtering.
len(question_text)

12751

In [78]:
# Check that bad training data have been removed by re-running the overlap
# test on the filtered lists. Leftovers go into their own list: their indices
# refer to the filtered lists, so appending them to bad_idx (positions in the
# unfiltered lists) would mix two index spaces.
remaining_bad_idx = []
for idx in range(len(answer_text)):
    answer_first = answer_start_position[idx]
    answer_last = answer_first + len(answer_text[idx]) - 1
    segment_first = segment_start_position[idx]
    segment_last = segment_first + len(segment_text[idx]) - 1
    if answer_first > segment_last or answer_last < segment_first:
        remaining_bad_idx.append(idx)
        print('question:', question_text[idx])
        print('answer:', answer_text[idx])
        print('segment:', segment_text[idx])

# Only report success when nothing was found.
if remaining_bad_idx:
    print(len(remaining_bad_idx), 'bad training examples remain')
else:
    print('all clear!')

all clear!


#### save test file segments

In [22]:
# Spot-check one question after converting the list to a NumPy array.
np.array(question_text)[250]

'戴維提出了氮氣以及哪個單質在常溫常壓下為氣體?'

In [None]:
# Pickle each test-mode list as a NumPy array, one file per list.
# NOTE: question_text.pickle, segment_text.pickle and
# segment_start_position.pickle are also the file names used when saving the
# train-file segments, so whichever save cell runs last overwrites them.
test_outputs = (
    ('question_id.pickle', question_id),
    ('question_text.pickle', question_text),
    ('segment_text.pickle', segment_text),
    ('segment_start_position.pickle', segment_start_position),
)
for file_name, values in test_outputs:
    with open(file_name, 'wb') as handle:
        pickle.dump(np.array(values), handle, protocol=pickle.HIGHEST_PROTOCOL)

#### save train file segments

In [79]:
# Pickle each train-mode list as a NumPy array, one file per list.
# NOTE: question_text.pickle, segment_text.pickle and
# segment_start_position.pickle are also the file names used when saving the
# test-file segments, so whichever save cell runs last overwrites them.
train_outputs = (
    ('question_text.pickle', question_text),
    ('answer_start_position.pickle', answer_start_position),
    ('answer_text.pickle', answer_text),
    ('segment_start_position.pickle', segment_start_position),
    ('segment_text.pickle', segment_text),
)
for file_name, values in train_outputs:
    with open(file_name, 'wb') as handle:
        pickle.dump(np.array(values), handle, protocol=pickle.HIGHEST_PROTOCOL)

## write submission file

In [8]:
# Build the submission rows from the test-mode outputs so this cell runs on a
# fresh kernel. Each answer is the space-separated list of character indices
# covered by the selected segment.
# NOTE(review): no visible cell defines submission_id / submission_answer, so
# they are derived here - confirm this is the expected answer format.
submission_id = question_id
submission_answer = [
    ' '.join(str(i) for i in range(start, start + len(text)))
    for start, text in zip(segment_start_position, segment_text)
]

# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' end up with blank rows between records.
with open('char_overlap_sentence.csv', 'wt', newline='') as outfile:
    test_writer = csv.writer(outfile)
    test_writer.writerow(['id', 'answer'])
    test_writer.writerows(zip(submission_id, submission_answer))

print('finished writing submission!')

finished writing submission!
