In [None]:
# importing required modules
from PyPDF2 import PdfReader

# creating a pdf reader object
reader = PdfReader('../data/medicine_books/Harrison’s Principles of Internal Medicine-MAIN-TEXTBOOK.pdf')

# printing number of pages in pdf file
print(len(reader.pages))

# getting a specific page from the pdf file
page = reader.pages[7852]

# extracting text from page
text = page.extract_text()
print(text)

### Testing fitz

In [None]:
import fitz
filename = "../data/medicine_books/Harrison’s Principles of Internal Medicine-MAIN-TEXTBOOK.pdf"
doc = fitz.open(filename)
# start,end = 7852,8162
start,end = 7852, 7854
f = open('sample_block.txt',"w+")
for page in doc.pages(start,end):
    text = page.get_text("blocks")
    f.write(str(text))

f.close()

## Trying to separate headers and paras

In [None]:
# filename = "../data/medicine_books/Harrison’s Principles of Internal Medicine-MAIN-TEXTBOOK.pdf"
# start,end = 7852,7854

filename = "../data/medicine_books/Davidsons Principles and Practice of Medicine.pdf"
start,end = 666,700

In [None]:
doc = fitz.open(filename)

In [None]:
# f = open('sample_block.txt',"w+")
# text = []
for page in doc.pages(start,end):
    text = page.get_text("blocks", sort = True)
    for block in text:
        print('<block>')
        # print(block[4])
        # print(block["lines"])
        for line in block[4].split('\n'):
            print('<l>',line)
        print('</block>')

    # f.write(str(text))

# f.close()

### Annotating text into different sizes of text

In [None]:
import fitz

In [3]:
import fitz

def fonts(doc, granularity=False, top_k = None):
    """Extracts fonts and their usage in PDF documents.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc.pages(start,end):
        blocks = page.get_text("dict", sort = True)["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    if top_k:
        font_counts = font_counts[:top_k]

    return font_counts, styles

In [None]:
def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.
    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag 
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag


In [None]:
# final line-separated text with size tags

def headers_para_tag(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc.pages(start,end):
        blocks = page.get_text("dict", sort = True)["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        try:
                            if s['size'] in size_tag:
                                size = size_tag[s['size']]
                                if s['text'].strip():  # removing whitespaces:
                                    if first:
                                        # print("first")
                                        previous_s = s
                                        first = False
                                        block_string = {size : s['text']}
                                    else:
                                        if s['size'] == previous_s['size']:
                                            # print("a")

                                            if block_string and all((c == "|") for c in block_string):
                                                # print("b")
                                                # block_string only contains pipes
                                                block_string = {size : s['text']}
                                            if block_string == "":
                                                # print('c')
                                                # new block has started, so append size tag
                                                block_string = {size : s['text']}
                                            else:  # in the same block, so concatenate strings
                                                # print('d')
                                                block_string[size] += " " + s['text']

                                        elif len(block_string) > 0:
                                            # print('e', block_string)

                                            header_para.append(block_string)
                                            # print("s['size'] = ",s['size'])
                                            block_string = {size : s['text']}
                                            # print(block_string)

                                        previous_s = s
                        except:
                            continue

                if type(block_string) == dict and len(block_string) > 0:
                    header_para.append(block_string)

    return header_para
    # return 0


In [None]:
# final concatenated text without size tags

def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = ""  # string with paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc.pages(start,end):
        blocks = page.get_text("dict", sort = True)["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        try:
                            # modified the code to include chemical formula numbers
                            # chem formulas have numbers below a certain font size, using '6' for now
                            if s['size'] in size_tag or s['size'] <= 6:
                                if s['text'].strip():  # removing whitespaces:
                                    if first:
                                        # print("first")
                                        previous_s = s
                                        first = False
                                        block_string = s['text']
                                    else:
                                        if s['size'] == previous_s['size']:
                                            # print("a")

                                            if block_string and all((c == "|") for c in block_string):
                                                # print("b")
                                                # block_string only contains pipes
                                                block_string = s['text']
                                            if block_string == "":
                                                # print('c')
                                                # new block has started, so append size tag
                                                block_string = s['text']
                                            else:  # in the same block, so concatenate strings
                                                # print('d')
                                                block_string += " " + s['text']

                                        elif len(block_string) > 0:
                                            # print('e', block_string)

                                            header_para += block_string
                                            # print("s['size'] = ",s['size'])
                                            block_string = s['text']
                                            # print(block_string)

                                        previous_s = s
                        except:
                            continue

                if len(block_string) > 0:
                    header_para += block_string + " \n"

    return header_para
    # return 0


In [None]:
# filename = "../data/medicine_books/Harrison’s Principles of Internal Medicine-MAIN-TEXTBOOK.pdf"
# start,end = 7852,8162

# filename = "../data/medicine_books/Davidsons Principles and Practice of Medicine.pdf"
# start,end = 664,754

filename = "../data/medicine_books/Harrisons-Principles-Of-Internal-Medicine-SELF-ASSESSMENT.pdf"
start,end = 492, 558

In [None]:
doc = fitz.open(filename)
font_counts, styles = fonts(doc, granularity=False, top_k=3)
print("font_counts : ",font_counts, "\n")
print("styles : ",styles, "\n")
size_tag = font_tags(font_counts,styles)
print("size_tag : ",size_tag, "\n")
final_text = headers_para(doc,size_tag)
print("FINAL TEXT ---------------------------- \n")
print(final_text)


In [None]:
textbook = "Harrison_qa"
format = "text"
top_k = "top3"

In [None]:
# Save as text

with open("../extracted_text/{}_{}_{}.txt".format(textbook,format,top_k),"w+") as f:
    f.write(final_text)

In [None]:
# Save as json
import json

with open("../extracted_text/{}_{}_{}.txt".format(textbook,format,top_k),"w+") as f:
    json.dump(final_text, f, indent = 4)


In [None]:
print(10)

In [15]:
a = list(map(int,"[b,c,d,e]".strip("[]").split(',')))

ValueError: invalid literal for int() with base 10: 'b'

### Converting both Harrison's and Davidson's textbooks for respiratory system from pdf to plain text form (without annotations)

In [None]:
import fitz
filename = "../data/medicine_books/Harrison's Principles of Internal Medicine-MAIN-TEXTBOOK.pdf"
doc = fitz.open(filename)
start,end = 7852,8162
# start,end = 7852, 7854
f = open('../extracted_text/Harrisons_text.txt',"w+")
for page in doc.pages(start,end):
    text = page.get_text("text", sort = True)
    f.write(str(text))

f.close()


filename = "../data/medicine_books/Davidsons Principles and Practice of Medicine.pdf"
doc = fitz.open(filename)
start,end = 664,754
# start,end = 7852, 7854
f = open('../extracted_text/Davidsons_text.txt',"w+")
for page in doc.pages(start,end):
    text = page.get_text("text", sort = True)
    f.write(str(text))

f.close()

### Converting both 250-short-cases and Kumar_&_clark's textbooks for respiratory and endocrine system from pdf to plain text form (without annotations)

In [None]:
import fitz
filename = "../data/medicine_books/short_cases_medicine.pdf"
doc = fitz.open(filename)
start,end = 7852,8162
# start,end = 7852, 7854
f = open('../extracted_text/short_cases_medicine.txt',"w+")
for page in doc.pages(start,end):
    text = page.get_text("text", sort = True)
    f.write(str(text))

f.close()


filename = "../data/medicine_books/Davidsons Principles and Practice of Medicine.pdf"
doc = fitz.open(filename)
start,end = 664,754
# start,end = 7852, 7854
f = open('../extracted_text/Davidsons_text.txt',"w+")
for page in doc.pages(start,end):
    text = page.get_text("text", sort = True)
    f.write(str(text))

f.close()

#### Extract Harrison_QA dataset

In [None]:
# extracting the QA self-assessment book

import fitz
filename = "../data/medicine_books/Harrisons-Principles-Of-Internal-Medicine-SELF-ASSESSMENT.pdf"
doc = fitz.open(filename)
start,end = 492, 558
# start,end = 7852, 7854
# f = open('../extracted_text/Harrisons_text.txt',"w+")

final = ""
for page in doc.pages(start,end):
    text = page.get_text("text", sort = True)
    # print("BLOCK---------------------------------------")
    # print(text)
    final += text


    # f.write(str(text))

In [None]:
ques,ans = final.split("ANSWERS")
print(ques)
print(ans)

In [None]:
print(ans)

In [None]:
# Saved "final" to "mcqs_only.txt" and removed everything that was not mcqs manually

f = open("../extracted_text/Harrison_qa/mcqs_only.txt", "r")
text = f.readlines()[0]
text = text.replace("\\n",'\n')
f.close()

In [None]:
ques, ans = text.split("ANSWERS")

#### Extracting questions

In [None]:
q_split = q_split[1:]
print(q_split)

In [None]:
a_split = a_split[1:]
print(a_split)

In [None]:
for i in q_split:
    print(i.split("\n"))

In [None]:
#perfect

len(q_split)

In [None]:
len(a_split)

In [None]:
q_json = []
for i in range(int(len(q_split)/2)):
    q_dict = {}
    q_dict['q_no'] = q_split[2*i]
    pattern = r"\n[A-Z].\s\s\s"
    # remove \n chars from every element of the split
    ques_options = [x.replace("\n"," ") for x in re.split(pattern, q_split[2*i+1])]
    
    q_dict['question'], q_dict['options'] = ques_options[0], ques_options[1:]
    q_dict['answer'] = ""
    q_dict['explanation'] = ""

    q_json.append(q_dict)

In [None]:
q_json

In [None]:
import json
with open("../extracted_text/Harrison_qa/qa.json",'w+') as f:
    json.dump(q_json,f, indent = 4)

#### extracting answers

In [2]:
import json
f = open("../extracted_text/Harrison_qa/qa.json","r+")
data = json.load(f)
f.close()
data[:5]

[{'q_no': '1',
  'question': 'All of the following are typically characterized as an obstructive lung disease EXCEPT:',
  'options': ['Asbestosis',
   'Asthma',
   'Bronchiectasis',
   'Chronic bronchitis',
   'Emphysema'],
  'answer': 'A',
  'explanation': 'is a lung disease caused by the\ninhalation of asbestos fibers. It is a fibrotic lung disease and typically manifests with a restrictive\nventilator defect and gas transfer defect (reduced diffusion capacity for carbon monoxide\n[DLCO]) on pulmonary function testing.'},
 {'q_no': '2',
  'question': 'A 25-year-old man is brought to the emergency department by ambulance after his family found him unresponsive at home. He has a history of intravenous drug abuse and human immunodeficiency virus (HIV) with medical noncompliance. His last CD4 count was <200/μL. On initial evaluation, his blood pressure is 120/75 mmHg, heart rate is 105 bpm, respiratory rate 8 breaths/min, oxygen saturation (SaO2) of 83%, and temperature of 36.0°C. A bloo

In [None]:
a_split

In [None]:
j = 0
for i in range(int(len(a_split)/2)):
# for i in range(3):
    while j < len(data):
        # print(data[j])
        # print(data[j]['q_no'])
        # print(a_split[2*i])
        if data[j]['q_no'] == a_split[2*i]:
            answer = a_split[2*i + 1].strip()
            pattern = r'^The\sanswer\sis\s[A-E].'
            if re.search(pattern,answer):
                ans_letter = answer.split()[3][0]
                # print(ans_letter) 
                # add answer letter to data
                # print(ans_letter)
                data[j]['answer'] = ans_letter
                pattern = r'\(Chap.\s([0-9][0-9][0-9]|[0-9]|[0-9][0-9])*.\)\s'
                exp = re.split(pattern,answer)
                # print(exp[-1])
                data[j]['explanation'] = exp[-1]
            # print("j = ",j)
            # print(data[j]['q_no'],a_split[2*i])
            j += 1
            break
        else:
            # print("j = ",j)
            # print(data[j]['q_no'],a_split[2*i])
            j += 1

In [None]:
data

In [None]:
a_split

In [3]:
keywords = ["Figure","figure","Table","table"]

In [57]:
for i,que in enumerate(data):
    if any(keyword in que['question'] or keyword in que['explanation'] for keyword in keywords):
        del data[i]

In [58]:
len(data)

68

In [7]:
data[:5]

[{'q_no': '1',
  'question': 'All of the following are typically characterized as an obstructive lung disease EXCEPT:',
  'options': ['Asbestosis',
   'Asthma',
   'Bronchiectasis',
   'Chronic bronchitis',
   'Emphysema'],
  'answer': 'A',
  'explanation': 'is a lung disease caused by the\ninhalation of asbestos fibers. It is a fibrotic lung disease and typically manifests with a restrictive\nventilator defect and gas transfer defect (reduced diffusion capacity for carbon monoxide\n[DLCO]) on pulmonary function testing.'},
 {'q_no': '2',
  'question': 'A 25-year-old man is brought to the emergency department by ambulance after his family found him unresponsive at home. He has a history of intravenous drug abuse and human immunodeficiency virus (HIV) with medical noncompliance. His last CD4 count was <200/μL. On initial evaluation, his blood pressure is 120/75 mmHg, heart rate is 105 bpm, respiratory rate 8 breaths/min, oxygen saturation (SaO2) of 83%, and temperature of 36.0°C. A bloo

In [10]:
unans = []
for que in data:
    if(que['answer'] == ""):
        unans.append(que['q_no'])
        
print(unans)

['21', '22', '30', '31', '41', '43', '44', '65', '93', '94', '95', '96', '98', '99']


In [None]:
needed = [40,64]

In [15]:
data[32]

{'q_no': '41',
 'question': 'What is the recommended treatment for the patient in Question VI-40?',
 'options': ['Azathioprine 125 mg daily plus prednisone 60 mg daily',
  'Cyclophosphamide 100 mg daily',
  'Nintedanib 150 mg twice a day',
  'Prednisone 60 mg daily',
  'No therapy is effective for treatment of idiopathic pulmonary fibrosis.'],
 'answer': '',
 'explanation': ''}

In [20]:
data[53]

{'q_no': '65',
 'question': 'If a lung biopsy were to be taken 4 days after admission in the patient described in Question VI- 64, which statement correctly identifies the expected findings?',
 'options': ['Diffuse alveolar damage with hyaline membranes and protein-rich edema fluid in alveoli',
  'Extensive eosinophil-rich infiltrate with protein-rich edema fluid',
  'Extensive fibrosis of the alveolar ducts with development of bullae',
  'Homogeneous infiltrate of neutrophils and leukocytes affecting all alveolar spaces',
  'Proliferation of type II pneumocytes and presence of a lymphocyte-rich pulmonary infiltrate'],
 'answer': '',
 'explanation': ''}

In [21]:
del data[32]
del data[53]

In [23]:
idx_data = {}
for que in data:
    idx_data[que['q_no']] = que

In [None]:
idx_data['20']

### Printing to final text format

In [55]:
def check_is_ans(ans):
    if ans != "":
        return "Option (" + str(ord(ans) - 64) + ")"
    else:
        return ""

final_text = ""
    
for que in idx_data:
    # if(int(que) > 5):
    #     break
    options = ""
    for num,opt in enumerate(idx_data[que]['options'],start=1):
        options += str(num) + ". " + opt + "\n"
    final_text += f'''Question : {idx_data[que]['question']}
Options :
{options}   
Answer : {check_is_ans(idx_data[que]['answer'])}
Explanation : {idx_data[que]['explanation']}

'''

In [None]:
print(final_text)

##### saving

## Multidoc2dial

### Extracting documents

In [4]:
import json
with open("../data/multidoc2dial/multidoc2dial_doc.json") as f:
    text = json.load(f)

In [None]:
text['doc_data']['dmv']

In [2]:
for domain_name in text['doc_data']:
    domain = text['doc_data'][domain_name]
    documents = [domain[document]['doc_text'] for document in domain]
    print(domain_name)
    print("Total words :", sum(([len(doc) for doc in documents])))

# dmv has most words

ssa
Total words : 460662
va
Total words : 595772
dmv
Total words : 734866
studentaid
Total words : 523313


### Converting json file to text

In [37]:
import numpy as np
from sklearn.model_selection import train_test_split
import re

In [11]:
final  = ""
for domain_name in text['doc_data']:
    domain = text['doc_data'][domain_name]
    # documents = [domain[document]['doc_text'] for document in domain]
    for document in domain:
        doc_text = domain[document]['doc_text'].replace("\n\n","\n")
        final += f"title : {domain[document]['title']}\ndocument : {doc_text}\n\n"
    print(domain_name)

ssa
va
dmv
studentaid


In [33]:
with open("../extracted_text/multidoc2dial/title_doc.txt","w+") as f:
    f.write(final)

In [60]:
train_text  = ""
test_text = ""
for domain_name in text['doc_data']:
    domain = text['doc_data'][domain_name]

    data = []
    data += (domain[item] for item in domain)
    
    train, test = train_test_split(data, test_size = 0.15)
    # print(train[5])
    for document in train:
        doc_text = re.sub(r"(\n)+",r" ",document['doc_text'])
        train_text += f"title : {document['title']}    Document : {doc_text}\n"
    for document in test:
        doc_text = re.sub(r"(\n)+",r" ",document['doc_text'])
        test_text += f"title : {document['title']}    Document : {doc_text}\n"
    print(domain_name)

ssa
va
dmv
studentaid


In [61]:
with open("../extracted_text/multidoc2dial/title_doc_train.txt","w+") as f:
    f.write(train_text)

In [62]:
with open("../extracted_text/multidoc2dial/title_doc_test.txt","w+") as f:
    f.write(test_text)

### Extracting QA

In [None]:
qa_doc = "train"
qa_doc = "test"
qa_doc = "validation"

In [5]:
import json
with open("../data/multidoc2dial/multidoc2dial_dial_test.json") as f:
    text = json.load(f)

dialogs = text['dial_data']

In [12]:
import json
with open("../data/multidoc2dial/multidoc2dial_doc.json") as f:
    docs = json.load(f)

doc_data = docs['doc_data']

#### Only conversations

In [7]:
all_utterances = {}
for domain_name in dialogs:
    domain_utterances = []
    domain = dialogs[domain_name]
    for i in range(len(domain)):
        turn_utterances = []
        turns = domain[i]['turns']
        role = ""
        # doc_names = set()
        for turn in turns:
            # doc = turn['references'][0]['doc_id']
            # for refs in turn['references']:
                # doc_names.add(refs['doc_id'])
            utterance = turn['utterance']
            if role == turn['role']:
                turn_utterances[-1] += ". " + utterance
            else:
                role = turn['role']
                turn_utterances.append(role + " : " + utterance)
                 
        domain_utterances.append(turn_utterances)
    all_utterances[domain_name] = domain_utterances

all_utterances

{'dmv': [['user : What can I do if I forgot to update my address?',
   "agent : Don't forget to that, there are branches you can go to do this.. Please refrence local Maps or our website to find a local branch.",
   'user : What if iI already moved?',
   'agent : Unfortunately, no relevant information is found.',
   'user : Please go to a local branch with a piece of mail and update your address.',
   'agent : Please go to a local branch and update your address.',
   'user : Ok, now about appearing at the hearing, what if I requested a hearing but did not appear or send my affidavit?',
   'agent : The DMV will then suspend your registration immediately.',
   'user : How can I dispute these allegations that I failed to pay toll fees?',
   'agent : You will need to contact the tolling authority that is listed on your Notice of Registration Suspension.',
   'user : Does the tolling authority issue toll violations?',
   'agent : Yes, the tolling authority is responsible for toll violations

#### Dialogs in QA form with docs

In [8]:
# suppose there two consecutive turns taken my an individual, then since we are concatenating such utterances into a single one,
# we are considering the concatenation of both the documents for the whole concatenated utterance, if they are different.

all_utterances = {}
for domain_name in dialogs:
    domain_utterances = []
    domain = dialogs[domain_name]
    for i in range(len(domain)):
        turn_utterances = []
        turns = domain[i]['turns']
        role = ""
        doc = set()
        # doc_names = set()
        for turn in turns:
            # doc += [turn['references'][0]['doc_id']]
            # for refs in turn['references']:
                # doc_names.add(refs['doc_id'])
            utterance = turn['utterance']
            if role == turn['role']:
                turn_utterances[-1]['utterance'] += ". " + utterance
                turn_utterances[-1]['doc'].add(turn['references'][0]['doc_id'])
            else:
                role = turn['role']
                doc = set({turn['references'][0]['doc_id']})
                turn_utterances.append({'utterance' : role + " : " + utterance, 'doc' : doc})
                 
        domain_utterances.append(turn_utterances)
    all_utterances[domain_name] = domain_utterances

all_utterances

{'dmv': [[{'utterance': 'user : What can I do if I forgot to update my address?',
    'doc': {'Top 5 DMV Mistakes and How to Avoid Them#3_0'}},
   {'utterance': "agent : Don't forget to that, there are branches you can go to do this.. Please refrence local Maps or our website to find a local branch.",
    'doc': {'Top 5 DMV Mistakes and How to Avoid Them#3_0'}},
   {'utterance': 'user : What if iI already moved?',
    'doc': {'Top 5 DMV Mistakes and How to Avoid Them#3_0'}},
   {'utterance': 'agent : Unfortunately, no relevant information is found.',
    'doc': {'Top 5 DMV Mistakes and How to Avoid Them#3_0'}},
   {'utterance': 'user : Please go to a local branch with a piece of mail and update your address.',
    'doc': {'Top 5 DMV Mistakes and How to Avoid Them#3_0'}},
   {'utterance': 'agent : Please go to a local branch and update your address.',
    'doc': {'Top 5 DMV Mistakes and How to Avoid Them#3_0'}},
   {'utterance': 'user : Ok, now about appearing at the hearing, what if I 

In [11]:
all_utterances['dmv'][0]

[{'utterance': 'user : What can I do if I forgot to update my address?',
  'doc': {'Top 5 DMV Mistakes and How to Avoid Them#3_0'}},
 {'utterance': "agent : Don't forget to that, there are branches you can go to do this.. Please refrence local Maps or our website to find a local branch.",
  'doc': {'Top 5 DMV Mistakes and How to Avoid Them#3_0'}},
 {'utterance': 'user : What if iI already moved?',
  'doc': {'Top 5 DMV Mistakes and How to Avoid Them#3_0'}},
 {'utterance': 'agent : Unfortunately, no relevant information is found.',
  'doc': {'Top 5 DMV Mistakes and How to Avoid Them#3_0'}},
 {'utterance': 'user : Please go to a local branch with a piece of mail and update your address.',
  'doc': {'Top 5 DMV Mistakes and How to Avoid Them#3_0'}},
 {'utterance': 'agent : Please go to a local branch and update your address.',
  'doc': {'Top 5 DMV Mistakes and How to Avoid Them#3_0'}},
 {'utterance': 'user : Ok, now about appearing at the hearing, what if I requested a hearing but did not a

In [29]:
doc_dial = {}
labels = {}

for domain_name in all_utterances:
    doc_dial_dom = {}
    labels_dom = []
    domain = all_utterances[domain_name]
    for dialog in domain:
        labels_dom_dial = []
        dial_text = ""
        for i in range(len(dialog)//2):
            doc_text = ""
            for doc_name in dialog[2*i + 1]['doc']:
                doc_text += doc_data[domain_name][doc_name]['doc_text'] + '. \n '
            if(i > 0):
                dial_text += dialog[2*i - 1]['utterance'] + '. \n '
            dial_text += dialog[2*i]['utterance'] + '. \n '
            # doc_dial_dom += "Document :\n" + doc_text + "Dialog : \n" + dial_text + "agent : "
            doc_dial_dom['doc'] = doc_text
            doc_dial_dom['dial'] = dial_text
            labels_dom_dial.append((dialog[2*i + 1]['utterance'].strip("agent : ") + '. \n '))
        labels_dom.append(labels_dom_dial)
    labels[domain_name] = labels_dom
    doc_dial[domain_name] = doc_dial_dom

In [30]:
import json
with open("../extracted_text/multidoc2dial/doc_dial_input.json", "w+") as f:
    json.dump(doc_dial, f, indent = 4)

In [31]:
with open("../extracted_text/multidoc2dial/labels.json", "w+") as f:
    json.dump(labels, f, indent = 4)

##### debugging

In [8]:
dialogs = text['dial_data']

In [None]:
dialogs['dmv'][0]

In [23]:
dialogs['dmv'][0]['turns'][0]

{'da': 'query_condition',
 'references': [{'label': 'precondition',
   'id_sp': '4',
   'doc_id': 'Top 5 DMV Mistakes and How to Avoid Them#3_0'}],
 'role': 'user',
 'turn_id': 1,
 'utterance': 'What can I do if I forgot to update my address?'}

In [None]:
all_utterances

## Trying things out

In [None]:
import fitz
filename = "../data/medicine_books/short_cases_medicine.pdf"
doc = fitz.open(filename)
# start,end = 7852,8162
start,end = 177, 178
# f = open('sample_block.txt',"w+")
for page in doc.pages(start,end):
    text = page.get_text("blocks")
    print(text)
#     f.write(str(text))

# f.close()

In [20]:
import fitz
from operator import itemgetter
filename = "../data/medicine_books/short_cases_medicine.pdf"
doc = fitz.open(filename)
# start,end = 7852,8162
start,end = 247,248

In [21]:
styles = {}
font_counts = {}

for page in doc.pages(start,end):
    blocks = page.get_text("dict", sort = True)["blocks"]
    # questions = []
    # q = False
    # a = False
    for b in blocks:  # iterate through the text blocks
        if b['type'] == 0:  # block contains text
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans

                    print(s)


                    # if "Bold" in s['font']:
                    #     if "QUESTION" in s['text']:
                    #         q = True
                    

                    # identifier = "{0}".format(s['size'])
                    # styles[identifier] = {'size': s['size'], 'font': s['font']}

                    # font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

# font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

# if len(font_counts) < 1:
#     raise ValueError("Zero discriminating fonts found!")


{'size': 12.0, 'flags': 0, 'font': 'ArialMT', 'color': 0, 'ascender': 1.0750000476837158, 'descender': -0.29899999499320984, 'text': "·    Poor fixation on lateral gaze - Suker's sign.", 'origin': (85.9800033569336, 82.97998046875), 'bbox': (85.9800033569336, 70.0799789428711, 330.3839416503906, 86.56797790527344)}
{'size': 12.0, 'flags': 0, 'font': 'ArialMT', 'color': 0, 'ascender': 1.0750000476837158, 'descender': -0.29899999499320984, 'text': "·    Dilatation of pupil with weak adrenaline solution - Loewi's sign.", 'origin': (85.9800033569336, 96.0), 'bbox': (85.9800033569336, 83.0999984741211, 433.7519226074219, 99.58799743652344)}
{'size': 12.0, 'flags': 0, 'font': 'ArialMT', 'color': 0, 'ascender': 1.0750000476837158, 'descender': -0.29899999499320984, 'text': "·    Jerky pupillary contraction to consensual light - Cowen's sign.", 'origin': (85.9800033569336, 109.02001953125), 'bbox': (85.9800033569336, 96.1200180053711, 429.0719299316406, 112.60801696777344)}
{'size': 12.0, 'fla

In [7]:
font_counts

[('7.999899864196777', 56), ('9.999799728393555', 4), ('12.0', 2)]