In [None]:
import json

In [None]:
with open('../dev-v2.0.json', 'r') as inp_reader:
    data = json.load(inp_reader)["data"]

In [None]:
data[0]['paragraphs'][0]['context']

In [None]:
data[0]

### Converting Wefox page xml to SQuad format

In [90]:
import pagexmltools
import os
import pagexml
import pandas as pd
import traceback
import json
import multiprocessing as mp
import uuid as ud
import numpy as np

In [5]:
inp_dir = '/Users/nischal/code/wefox/'

In [6]:
pxml = pagexml.PageXML()

In [7]:
def read_merged_xml_get_entities_text(merged_xml):
    pxml.loadXml(merged_xml)
    out_words_entities = []
    entities = []
    for page in pxml.select('//_:Page'):
        page_number = pxml.getPageNumber(page)
        #print("Processing page %s" %(page_number,))
        textlines = pxml.select('.//_:TextLine', page)
        idx_textlines = pxml.getLeftRightTopBottomReadingOrder(textlines, fake_baseline=True)
        for text_idx in idx_textlines[0]:
            words = pxml.select('_:Word', textlines[text_idx])
            for word in words:
                current_entity = pxml.getPropertyValue(word, 'entity')
                word_entity = {}
                word_entity['word'] = pxml.getTextEquiv(word)
                word_entity['entity'] = current_entity
                word_entity['page'] = page_number
                word_entity['file_name'] = merged_xml
                out_words_entities.append(word_entity)
               
    df = pd.DataFrame(out_words_entities)
    return df
    

In [15]:
def generate_squad_per_page(df):
    page_numbers = df.page.unique()
    data_squad = {}
    paragraphs = []
    only_paragraphs = []
    for page_number in page_numbers:
        #print("Processing page for squad conversion %d" %(page_number,))
        page_df = df.loc[(df.page == page_number)]
        paragraph = page_df.word.str.cat(sep=' ')
        questions, answers, answer_start_index = merge_entity_values(page_df)
        question_answer_df = pd.DataFrame([questions, answers, answer_start_index]).T
        question_answer_df.columns = ['question', 'text', 'answer_start']
        questions_answers = generate_questions_answers_in_squad_form(question_answer_df)
        context_qas = add_paragraphs_in_squad_form(paragraph=paragraph, question_answers=questions_answers)
        paragraphs.append(context_qas)
        only_paragraphs.append(paragraph)
    
    data_squad['title'] = df['file_name'][0] 
    data_squad['paragraphs'] = paragraphs
    return data_squad, only_paragraphs
        
        
        

In [16]:
def merge_entity_values(df):
    try:
        questions = []
        answers = []
        answer_start_index = []
        if df.shape[0] > 1:
            i = 0
            
            while i < df.shape[0] - 1:
                entity = df.iloc[i]['entity']
                if entity == 'O':
                    if i == df.shape[0] - 1:
                        break
                    i += 1
                    continue
                else:
                    text_concat = ''
                    j = i
                    #print("j  %d" %(j,))
                    answer_start = j
                    while j < df.shape[0] - 1:
                        temp_entity = df.iloc[j]['entity']
                        if temp_entity != entity:
                            questions.append("What is the %s ?" %(entity,))
                            answers.append(text_concat)
                            answer_start_index.append(answer_start)
                            break
                        else:
                            text_concat = text_concat + ' ' + df.iloc[j]['word']
                        j +=1

                    i = j
                    #print("i is %d"%(i,))
                i +=1
        else:
            entity = df.iloc[0]['entity']
            if entity != 'O':
                questions.append("What is the %s ?" %(entity,))
                answers.append(df.iloc[0]['word'])
                answer_start_index.append(df.index[0])

        return questions, answers, answer_start_index
    except Exception as e:
        print(traceback.format_exc())

In [17]:
def generate_questions_answers_in_squad_form(df):
    questions_answers = []
    questions = df['question'].unique()
    for question in questions:
        answers_index_df = df.loc[(df['question'] == question)][['text', "answer_start"]]
        answers_index_df = answers_index_df.drop_duplicates(subset=['text'], keep='first')
        num_of_answers = answers_index_df.shape[0]
        total_answers_required = 3
        if num_of_answers < total_answers_required:
            difference = total_answers_required - num_of_answers
            answers_index_df = answers_index_df.append([answers_index_df]*difference, ignore_index=True)
        else:
            answers_index_df = answers_index_df.iloc[:3]
        
        question_answer_dict = {}
        question_answer_dict['question'] = question
        question_answer_dict['id'] = str(ud.uuid4())
        question_answer_dict['answers'] = json.loads(answers_index_df.to_json(orient='records'))
        
        questions_answers.append(question_answer_dict)
    return questions_answers

In [18]:
def add_paragraphs_in_squad_form(paragraph, question_answers):
    context_qas = {}
    context_qas['qas'] = question_answers
    context_qas['context'] = paragraph
    return context_qas


In [19]:
test_merged_xml = '/Users/nischal/code/wefox/41bbf415-e8e1-4ba5-b6c8-1c2f069de9e3/merged.xml'
text_entity_df = read_merged_xml_get_entities_text(test_merged_xml)
data_in_squad_form, para = generate_squad_per_page(text_entity_df)

In [21]:
uuid_issues = []
to_process_uuids_xml = []
for uuid in os.listdir(inp_dir):
    uuid_path = os.path.join(inp_dir, uuid)
    if os.path.isdir(uuid_path) and uuid != '.git':
        print("Processing directory %s" %(uuid,))
        merged_xml = os.path.join(uuid_path, 'merged.xml')
        if os.path.exists(merged_xml)  :
            to_process_uuids_xml.append(merged_xml)
        else:
            print("Something went wrong for directory %s"%(uuid,))
            uuid_issues.append(uuid)


Processing directory 3869e076-29e5-413d-b507-4b1c728584b3
Processing directory 08155078-dc01-4ef9-b031-ac75e263a75d
Processing directory b9cc447d-f0d9-410e-bc2b-d4b4052798ea
Processing directory acacc997-5bf9-4453-a8fe-e044a423a185
Processing directory 51c48b55-4699-4a02-bd1a-94469d327e33
Processing directory 20ac9dd0-6565-43da-aa75-ccc4bf33add1
Processing directory 0cd3e399-272c-4cab-8fdf-5f62256074c1
Processing directory 16d45d5d-7490-4ad5-bf62-6cb45d771564
Processing directory cc149776-5dda-4a63-a22c-603b952f9371
Processing directory ae2bd990-ee0a-45a0-9564-7f2781045453
Processing directory bf5e417e-1cda-4628-8060-96e487cc9e13
Processing directory 0196dbce-9044-40a5-b274-0b086a28d95c
Processing directory 777acb7b-0506-4c6e-b662-e30b25ad7574
Processing directory 7f3459f2-bbb2-405c-86f3-4fa947e7f579
Processing directory b39816a0-e742-48a3-a95a-6afbc06b1eaa
Processing directory 2c07cb34-2a65-48fe-b1f9-1dc7679134e6
Processing directory 1b10223d-5b5a-4b2b-bbd5-25745eae250b
Processing dir

In [22]:
len(to_process_uuids_xml)

599

In [23]:
def process_uuid(uuid_xml):
    text_entity_df = read_merged_xml_get_entities_text(merged_xml)
    data_in_squad_form, paragraphs = generate_squad_per_page(text_entity_df)
    return data_in_squad_form, paragraphs

In [25]:
pool = mp.Pool(processes=16)
results = [pool.apply_async(process_uuid, args=(xml,)) for xml in to_process_uuids_xml[:len(to_process_uuids_xml) - 50]]
squad_dataset_data_train = [process.get() for process in results]


results_predict = [pool.apply_async(process_uuid, args=(xml,)) for xml in to_process_uuids_xml[len(to_process_uuids_xml) - 50:len(to_process_uuids_xml)]]
squad_dataset_data_predict = [process.get() for process in results_predict]

In [26]:
print(len(squad_dataset_data_train), len(squad_dataset_data_predict))

549 50


In [29]:
squad_dataset_data_train[0][1]

["Versicherungsschein ERGO Hausratversicherung hR-SV 92726492.7-88195-0157 Versicherungsnehmer Frau Susen Molter Müggel seedarnm 176 12587 Berlin Verslcherungsort Müggelseedamm 176, 12587 Berlin Wohnfläche 100 qm Die Wohnung befindet sich im Obergeschoss. Pie Wohnung befindet sich in der 2. Ftage. Versichert Ist ' irr« Rahmen der Allgemeinen Hausse L-Versicherungsbed ngungen - (VB6 2006 Stand 1.7.2006) der Hausrat in einer ständig bewohnten Wohnung in einem Mehrfamilienhaus zum Wiederbeschaffungswert. Versicherungsumfang Wertpaket Deckung B Versicherungssumme: 65.000 EUR Versicherte Gefahren; Feuer Einbruchdiebstahl, Raub, Vandalismus nach Eirbrucl oder Raub Leitungswasser Sturm und Hagel W'eitere E' emehtargefabrgn Elementargefahnen: Für «Te weiteren £iementargefahren gilt je Versicherungsfall eine Seibstnetei- ligung von 500 EUR (§27. Nr. 6 VHB 2006). Der Versicherungsschutz enthält zusätzlich: - - Überspannungsschäden durch Elitz bis 10% der Versicherungssumme Klausel HR0G40 - - Sen

In [33]:
squad_dataset_data_train_qas = []
squad_dataset_data_predict_qas = []
paragraphs_fasttext = []
for i in range(len(squad_dataset_data_train)):
    squad_dataset_data_train_qas.append(squad_dataset_data_train[i][0])
    paragraphs_fasttext.extend(squad_dataset_data_train[i][1])
    
for i in range(len(squad_dataset_data_predict)):
    squad_dataset_data_predict_qas.append(squad_dataset_data_predict[i][0])
    paragraphs_fasttext.extend(squad_dataset_data_predict[i][1])

In [34]:
squad_dataset_final_train = {}
squad_dataset_final_train['data'] = squad_dataset_data_train_qas
squad_dataset_final_train['version'] = 'v1.0'


squad_dataset_final_predict = {}
squad_dataset_final_predict['data'] = squad_dataset_data_predict_qas
squad_dataset_final_predict['version'] = 'v1.0'

In [35]:
paragraphs_fasttext

["Versicherungsschein ERGO Hausratversicherung hR-SV 92726492.7-88195-0157 Versicherungsnehmer Frau Susen Molter Müggel seedarnm 176 12587 Berlin Verslcherungsort Müggelseedamm 176, 12587 Berlin Wohnfläche 100 qm Die Wohnung befindet sich im Obergeschoss. Pie Wohnung befindet sich in der 2. Ftage. Versichert Ist ' irr« Rahmen der Allgemeinen Hausse L-Versicherungsbed ngungen - (VB6 2006 Stand 1.7.2006) der Hausrat in einer ständig bewohnten Wohnung in einem Mehrfamilienhaus zum Wiederbeschaffungswert. Versicherungsumfang Wertpaket Deckung B Versicherungssumme: 65.000 EUR Versicherte Gefahren; Feuer Einbruchdiebstahl, Raub, Vandalismus nach Eirbrucl oder Raub Leitungswasser Sturm und Hagel W'eitere E' emehtargefabrgn Elementargefahnen: Für «Te weiteren £iementargefahren gilt je Versicherungsfall eine Seibstnetei- ligung von 500 EUR (§27. Nr. 6 VHB 2006). Der Versicherungsschutz enthält zusätzlich: - - Überspannungsschäden durch Elitz bis 10% der Versicherungssumme Klausel HR0G40 - - Sen

In [36]:
paragraphs_df = pd.DataFrame(paragraphs_fasttext)

In [37]:
paragraphs_df.head()

Unnamed: 0,0
0,Versicherungsschein ERGO Hausratversicherung h...
1,Versicherungsschein ERGO Seite 2 zum Versicher...
2,Versicherungsschein ERGO Hausratversicherung h...
3,Versicherungsschein ERGO Seite 2 zum Versicher...
4,Versicherungsschein ERGO Hausratversicherung h...


In [38]:
paragraphs_df.to_csv('WeFox_paragraphs_fastText.csv', header=None, index=None, sep="\t")

In [39]:
from fastText import train_unsupervised

In [43]:
fast_text_model = train_unsupervised('WeFox_paragraphs_fastText.csv', model='skipgram', dim=100, minCount=0)

In [45]:
distinct_words = {}
for para in paragraphs_fasttext:
    para_split = para.split(' ')
    for word in para_split:
        if word in distinct_words.keys():
            count = distinct_words[word]
            count = count + 1
            distinct_words[word] = count
        else:
            distinct_words[word] = 1

In [46]:
len(distinct_words.keys())

267

In [47]:
distinct_words

{'Versicherungsschein': 2396,
 'ERGO': 2396,
 'Hausratversicherung': 599,
 'hR-SV': 599,
 '92726492.7-88195-0157': 599,
 'Versicherungsnehmer': 1198,
 'Frau': 599,
 'Susen': 599,
 'Molter': 599,
 'Müggel': 599,
 'seedarnm': 599,
 '176': 599,
 '12587': 1198,
 'Berlin': 1198,
 'Verslcherungsort': 599,
 'Müggelseedamm': 599,
 '176,': 599,
 'Wohnfläche': 599,
 '100': 599,
 'qm': 599,
 'Die': 1198,
 'Wohnung': 1797,
 'befindet': 1198,
 'sich': 2396,
 'im': 599,
 'Obergeschoss.': 599,
 'Pie': 599,
 'in': 2396,
 'der': 6589,
 '2.': 599,
 'Ftage.': 599,
 'Versichert': 599,
 'Ist': 599,
 "'": 599,
 'irr«': 599,
 'Rahmen': 599,
 'Allgemeinen': 599,
 'Hausse': 599,
 'L-Versicherungsbed': 599,
 'ngungen': 599,
 '-': 7188,
 '(VB6': 599,
 '2006': 599,
 'Stand': 599,
 '1.7.2006)': 599,
 'Hausrat': 599,
 'einer': 1198,
 'ständig': 599,
 'bewohnten': 599,
 'einem': 599,
 'Mehrfamilienhaus': 599,
 'zum': 2396,
 'Wiederbeschaffungswert.': 599,
 'Versicherungsumfang': 599,
 'Wertpaket': 599,
 'Deckung': 5

In [None]:
with open('WeFox_Squad_dataset_train.json', 'w') as outFile:
    json.dump(squad_dataset_final_train, outFile)

with open('WeFox_Squad_dataset_predict.json', 'w') as outFile:
    json.dump(squad_dataset_final_predict, outFile)

In [124]:
word_vector_list = []
np.set_printoptions(suppress=True)
for word in fast_text_model.get_words():
    word_vector = {}
    vector_list = fast_text_model.get_word_vector(word)
    dim_string = ''
    for i in range(len(vector_list)):
        dim_string = dim_string + ' ' + str(vector_list[i])
    
   
    word_vector['word_vector'] = word.lower().strip() + '' + dim_string
    word_vector_list.append(word_vector)  

In [125]:
word_vector_df = pd.DataFrame(word_vector_list, columns=['word_vector'])

In [126]:
word_vector_df.head()

Unnamed: 0,word_vector
0,- -0.15759918 0.24406612 -0.31633258 0.0260743...
1,der 0.046573844 0.16251187 -0.31708872 -0.2508...
2,dem -0.5551913 -0.39107883 0.15612249 0.247046...
3,ablauf -0.37883052 0.6158036 0.2578795 -0.0243...
4,versicherung -0.14566478 0.063038945 -0.063312...


In [127]:
word_vector_df.iloc[1]['word_vector']

'der 0.046573844 0.16251187 -0.31708872 -0.2508458 -0.1003648 -0.15456484 0.11574552 -0.016841821 0.19387129 0.23188803 -0.07396908 0.008859005 0.6393118 -0.10937612 0.044619877 -0.17440408 -0.05710945 0.475031 -0.057807237 0.06652439 -0.24297076 0.02833321 0.16775289 -0.32250044 -0.033533268 -0.28480178 -0.26019105 -0.4239969 0.5709709 0.2066691 -0.3614196 0.089651994 -0.26159316 -0.00017568257 -0.4502225 0.026802352 -0.062141966 -0.13981026 -0.24432075 0.30694795 -0.06125808 0.018587178 0.34674424 -0.0048086774 -0.00022722782 0.11030903 0.15599875 -0.045676712 -0.38200882 -0.2697578 -0.16372749 -0.04252585 -0.53887033 0.21303162 0.03352768 0.030619375 -0.22974372 0.41768113 0.0076583456 -0.030838916 -0.46383873 0.04661597 -0.19288106 0.38698003 -0.08583952 0.056157455 0.184459 0.3019709 -0.08597002 0.22293574 -0.22152421 0.16694838 -0.037887823 -0.14088254 0.26594162 -0.2859562 -0.32647577 0.25932485 0.34706506 0.18762794 -0.0138805555 0.05435461 0.7168487 0.6878826 -0.25216016 -0.16

In [128]:
word_vector_df.to_csv('WeFox_Vectors.txt', sep=" ", index=None, header=None)