In [1]:
import os
import importlib
import logging
importlib.reload(logging)
import framework
importlib.reload(framework)
import bert_qa
importlib.reload(bert_qa)
import infer_bert_qa
importlib.reload(infer_bert_qa)
import bert_utils
importlib.reload(bert_utils)
import pandas as pd
from framework import DataCuration, FeatureEngineering
from bert_qa import TaskQA, FeatureEngineeringQA, BERTQA
from bert_maskedLM import BERTMaskedLM

# Define some constants and configurations
logging.getLogger().setLevel(logging.INFO)

# Credentials must not live in the notebook source: read the token from the
# ACCESS_TOKEN environment variable instead. Falls back to an empty string so
# the name is always a str.
# NOTE(review): the token that was previously committed here should be treated
# as leaked and revoked.
ACCESS_TOKEN = os.environ.get('ACCESS_TOKEN', '')
if not ACCESS_TOKEN:
    logging.warning('ACCESS_TOKEN environment variable is not set; '
                    'calls that need the token may fail')

Using TensorFlow backend.


Set up the task details. This notebook handles Question Answering for the CARTA dataset.

Example

context = "New Zealand (Māori: Aotearoa) is a sovereign island country in the southwestern Pacific Ocean. It has a total land area of 268,000 square kilometres (103,500 sq mi), and a population of 4.9 million. New Zealand's capital city is Wellington, and its most populous city is Auckland."

questions = "How many people live in New Zealand?", "What's the largest city?"


In [2]:
# Dataset label for this run (original author's note: "supports w2 and resume").
DATASET = 'carta'

# Task description handed to TaskQA; 'qa' selects the question-answering task.
TASK_CONFIG = dict(task='qa')

task = TaskQA(TASK_CONFIG)

Set the paths for the dataset and the goldens (local or ib paths both work),
then specify the loading configurations.

In [3]:
# Root folder of the annotated CARTA samples. Set the CARTA_ROOT environment
# variable to point at your own copy; the default is the original author's
# local location, so behavior is unchanged when the variable is unset.
CARTA_ROOT = os.environ.get(
    'CARTA_ROOT',
    '/Users/ahsaasbajaj/Documents/Data/CARTA/Annotated Samples'
)

# Both settings are lists of paths; each currently holds a single entry.
CARTA_DATA = [
    CARTA_ROOT + '/out/s1_process_files'
]
CARTA_GOLDEN = [
    CARTA_ROOT + '/golden/output.csv'
]

# Where and how to read the golden (expected answers) CSV.
GOLDEN_CONFIG = {
    'path': CARTA_GOLDEN,
    'is_local': True,
    'index_field_name': 'filename',  # golden rows are indexed by this column
    'file_type': 'csv',
    'identifier': 'file'
}

# Where and how to read the input documents.
DATASET_CONFIG = {
    'path': CARTA_DATA,
    'is_local': True,
    'file_type': 'ibocr',
    # Document id = base file name with everything from the first '.ibocr' onward removed.
    'identifier': lambda path: os.path.basename(path).split('.ibocr')[0],
    'convert2txt': True  # presumably converts the IBOCR files to plain text — confirm in DataCuration
}

data = DataCuration(ACCESS_TOKEN, DATASET_CONFIG, GOLDEN_CONFIG)

INFO:root:Loading dataset from /Users/ahsaasbajaj/Documents/Data/CARTA/Annotated Samples/out/s1_process_files
INFO:root:4 files loaded
INFO:root:Loading goldens from /Users/ahsaasbajaj/Documents/Data/CARTA/Annotated Samples/golden/output.csv
INFO:root:Total files Goldens: (4, 9)
INFO:root:Total files found in the source with unique index: (4, 9)
INFO:root:Processing 4 IBOCR files to txt


In [4]:
# Display the `golden` attribute of the DataCuration object as this cell's output.
data.golden

Unnamed: 0_level_0,Number of authorized shares / share class,Number of authorized shares / preferred share type,Cumulative dividends,Dividend rate,Original Issue Price,Liquidation preference / preferred share type,Seniority (Preferred share class),Participation (Preferred share class),Conversion price (Preferred share class)
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
annotated_AOI_2.pdf,"Common Stock: 15,442,630 shares; Preferred Sto...","Series A Preferred Stock: 3,899,551 shares; Se...",False,Series A Preferred Stock: 6% per annum; Series...,Series A Preferred Stock: $1.649 per share; Se...,Series A Preferred Stock: $1.649 per share; Se...,Series A Preferred Stock: 1; Series A-1 Prefer...,,Series A Preferred Stock: $1.649 per share; Se...
annotated_AOI_3.pdf,"Common Stock: 13,000,000 shares; Preferred Sto...","Series Seed-1 Preferred Stock: 910,000 shares;...",False,Series Seed-1 preferred stock: $0.0264 per sha...,Series Seed-1 preferred stock: $0.65 per share...,Series Seed-1 preferred stock: $0.33 per share...,Series Seed-1 preferred stock: 1; Series Seed-...,,Series Seed-1 preferred stock: $0.65 per share...
annotated_AOI_4.pdf,"Common Stock: 16,000,000 shares; Preferred Sto...","Series Seed Preferred Stock: 1,820,119 shares;...",False,Series A Preferred Stock: $0.1044 per annum pe...,Series Seed Preferred Stock: $0.795 per share;...,Series Seed Preferred Stock: $0.795 per share;...,Series Seed Preferred Stock: 1; Series A Prefe...,Series A Preferred Stock: $2.6098 per share,Series Seed Preferred Stock: $0.795 per share;...
annotated_AOI_5.pdf,"Common Stock: 18,527,000 shares; Preferred Sto...","Series Seed Preferred Stock: 2,575,871 shares;...",False,Not defined,Series Seed Preferred Stock: $ 1.0676 per shar...,Series Seed Preferred Stock: $ 1.0676 per shar...,Series Seed Preferred Stock: 1; Series A Prefe...,,Series Seed Preferred Stock: $ 1.0676 per shar...


In [5]:
# Queries phrased as questions.
open_queries = [
    "Who is incorporating the company?",
    "How many shares are being created?",
    "What are the Common stocks?",
    "What are the Preferred stocks?",
    "What are the Non-cumulative dividends?",
    "What is the Dividend rate per annum per preferred share type?",
    "Number of authorized shares or share class?",
    "Original Issue Price per share?",
    "Liquidation preference or preferred share type?",
]

# The same nine queries phrased as sentence prefixes to be completed,
# listed in the same order as `open_queries`.
closed_queries = [
    "The company is incorporated by",
    "The number of shares being created are",
    "The common stocks are",
    "The Preferred stocks are",
    "The Non-cumulative dividends are",
    "The Dividend rate per annum per preferred share type are",
    "The number of authorized shares are",
    "The Original Issue Price per share is",
    "The Liquidation preference is",
]

In [6]:
# Inputs shared by both model wrappers: the task, the curated dataset, and a
# switch that selects which model (and which query phrasing) is used.
DATA_ARGS = {
    'task': task,
    'dataset': data,
    'is_closed_query': False  # if False, then use BERTQA, otherwise use BERTMaskedLM
}

if DATA_ARGS['is_closed_query']:
    # Question Answering using Masked Language Model.
    # Fix: the closed queries were previously overwritten by `open_queries`
    # on the very next line, so this branch never used them.
    queries = closed_queries
    TRAINING_ARGS = {
        'model_file_or_path': "bert-large-uncased",  # base checkpoint name (not the SQuAD-finetuned one)
        'gpu': False,
        'output_dir': '../outputs/bert_maskedLM'
    }

    model = BERTMaskedLM(DATA_ARGS, TRAINING_ARGS)
    output = model.predict(queries)
else:
    # Standard Question Answering Model
    queries = open_queries
    TRAINING_ARGS = {
        'model_file_or_path': "bert-large-uncased-whole-word-masking-finetuned-squad",  # finetuned checkpoint available directly
        'gpu': False,
        'output_dir': '../outputs/bert_qa'
    }

    model = BERTQA(DATA_ARGS, TRAINING_ARGS)
    output = model.predict(queries)


INFO:root: Total number of Files: 4
INFO:root:File name: annotated_AOI_4.pdf
Empty DataFrame
Columns: [filename, Who is incorporating the company?, How many shares are being created?, What are the Common stocks?, What are the Preferred stocks?, What are the Non-cumulative dividends?, What is the Dividend rate per annum per preferred share type?, Number of authorized shares or share class?, Original Issue Price per share?, Liquidation preference or preferred share type?]
Index: []
convert squad examples to features: 100%|██████████| 9/9 [00:28<00:00,  3.16s/it]
add example index and unique id: 100%|██████████| 9/9 [00:00<00:00, 17339.80it/s]


KeyboardInterrupt: 