In [1]:
import pandas as pd
import numpy as np
import pickle, logging, spacy, sys, os, json, requests
import matplotlib.pyplot as plt

from helpers.classes import Collection
from tqdm import tqdm
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
# DATASETS = os.listdir('datasets')

# with open('pickles/collection_20210624_194932.pkl', 'rb') as f:
#     collection = pickle.load(f)



# # I should completely remove the other bulletins
# # for now this has to do...
# bulletin_names = list(collection.bulletins.keys())
# # target bulletins are those with usable related datasets and main points
# target_bulletins = []
# dictionary = dict()
# for bulletin in bulletin_names:
#     if len(collection.bulletins.get(bulletin).get('main-points')) and len(collection.bulletins.get(bulletin).get('related-datasets')) > 0:
#         target_bulletins.append(bulletin)

In [5]:
from helpers.cloze_generation import generate_clozes_from_point, named_entity_answer_generator as ne_answer_generator, noun_phrase_answer_generator as np_answer_generator

# Load the cached dataset and the previously generated clozes from disk.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code - only load pickle files that come from a trusted source.
df = pd.read_pickle('pickles/dataset_20210625_184837.pkl')
# NOTE(review): clozes_df is not referenced by any other cell visible here.
clozes_df = pd.read_json('pickles/clozes_20210715_212425.json')

In [6]:
# Preview the first rows of the loaded dataset as the cell output.
df.head()

Unnamed: 0,bulletin,type,point,data
0,businessindustryandtrade/business/businessserv...,date_and_percent,"In 2019, approximate gross value added at basi...",[/businessindustryandtrade/business/businessse...
1,businessindustryandtrade/business/businessserv...,date_and_percent,"The non-financial services sector, which accou...",[/businessindustryandtrade/business/businessse...
2,businessindustryandtrade/business/businessserv...,date_and_percent,Total turnover and purchases of the UK non-fin...,[/businessindustryandtrade/business/businessse...
3,businessindustryandtrade/business/businessserv...,date_and_percent,"Out of the 12 UK regions, 8 regions experience...",[/businessindustryandtrade/business/businessse...
4,businessindustryandtrade/business/businessserv...,date_and_percent,"West Midlands, Yorkshire and The Humber, Scotl...",[/businessindustryandtrade/business/businessse...


## BERT

In [None]:
from transformers import BertForMaskedLM, BertTokenizer, pipeline
from helpers.configs import CLOZE_MASKS
# Load the pretrained 'bert-base-uncased' masked-language model and its tokenizer.
# The names `model` and `tokenizer` are rebound by each later model section, so
# this cell has to be re-run before the BERT RUN cell is executed again.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Set RUN = True to regenerate the BERT results; leave it False to reuse the
# files already saved under results/.
# For every cloze this section stores the first prediction returned by
# check_model (presumably the most confident one) and also keeps track of all
# unique (answer text, answer type) entities.
# NOTE(review): check_model is not defined or imported in the cells visible
# here - confirm it is in scope before enabling RUN.
RUN = False

if RUN:
    # Create the output directory first so the loop below cannot finish and
    # then fail on the first write.
    os.makedirs('results', exist_ok=True)

    results = []
    entity_set = set()
    # Walk the points by position so the stored row number does not depend on
    # the index labels of df.
    for row, point in enumerate(tqdm(df['point'])):
        clozes = list(generate_clozes_from_point(point, ne_answer_generator))
        entity_set.update((c.answer_text, c.answer_type) for c in clozes)

        row_result = []
        for cloze in clozes:
            result = check_model(model, tokenizer, cloze.cloze_text)

            # remove every space from the predicted token text
            answer_given = result[0].get('token_str').replace(' ', '')
            confidence = result[0].get('score')
            answer_true = cloze.answer_text

            # saves
            # prediction, confidence score, truth, dataframe row, cloze id
            row_result.append((answer_given, confidence, answer_true, row, cloze.cloze_id))

        results.append(row_result)

    with open('results/bert_base_check_model_july2.json', 'w') as f:
        json.dump(results, f)
    with open('results/bert_base_entity_set_july2.pickle', 'wb') as f:
        pickle.dump(entity_set, f)

    # Group the unique entity texts by category (MONEY, PERCENT and so on).
    # Iterating in sorted order keeps the saved JSON identical between runs
    # (set iteration order is not stable), and the key is built with str() in
    # one place so creating and filling a category always use the same key.
    entities = dict()
    for text, category in sorted(entity_set, key=lambda pair: (str(pair[1]), str(pair[0]))):
        entities.setdefault(str(category), []).append(text)

    with open('results/bert_base_entity_dictionary.json', 'w') as f:
        json.dump(entities, f)

In [None]:
# load back if you didn't run them
# Reads the saved BERT predictions, entity set and entity dictionary from
# results/. Entries loaded from JSON come back as lists, not tuples.
# NOTE(review): pickle.load can execute arbitrary code - only open a pickle
# file that comes from a trusted source.

with open('results/bert_base_check_model_july2.json', 'r') as f:
    results = json.load(f)
with open('results/bert_base_entity_set_july2.pickle', 'rb') as f:
    entity_set = pickle.load(f)
with open('results/bert_base_entity_dictionary.json', 'r') as f:
    entities = json.load(f)

In [None]:
# Score the BERT predictions held in `results`: an entry counts as correct when
# the predicted text (entry[0]) is exactly equal to the true answer (entry[2]).
count_correct, count_wrong = 0, 0
correct_preds, wrong_preds = [], []

for row in results:
    for entry in row:
        if entry[0] == entry[2]:
            count_correct += 1
            correct_preds.append(entry[0])
        else:
            count_wrong += 1
            # keep (prediction, truth) so mistakes can be inspected later
            wrong_preds.append((entry[0], entry[2]))

total = count_correct + count_wrong
# Report 0.0 when there are no examples, which would otherwise divide by zero.
percent_correct = np.round((count_correct / total) * 100, 3) if total else 0.0

print(f'Total Examples: {total}')
print(f'Correct: {count_correct}, Incorrect: {count_wrong}')
print(f'Percentage Correct: {percent_correct}%')

In [None]:
# Label the output, then show the first 50 correctly predicted answers.
print('BertBase Correct Predictions Snippet')
correct_preds[:50]

In [None]:
# Label the output, then show the first 50 wrong (prediction, truth) pairs.
print('BertBase Incorrect Predictions Snippet')
wrong_preds[:50]

## RoBERTa

In [None]:
from transformers import RobertaTokenizer, RobertaForMaskedLM
# Load the pretrained 'roberta-base' masked-language model and its tokenizer.
# This rebinds `model` and `tokenizer`, replacing whatever the previous model
# section loaded under the same names.
model = RobertaForMaskedLM.from_pretrained('roberta-base')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
# Set RUN = True to regenerate the RoBERTa results; leave it False to reuse the
# files already saved under results/.
# For every cloze this section stores the first prediction returned by
# check_model (presumably the most confident one) and also keeps track of all
# unique (answer text, answer type) entities.
# NOTE(review): check_model is not defined or imported in the cells visible
# here - confirm it is in scope before enabling RUN.
RUN = False

if RUN:
    # Create the output directory first so the loop below cannot finish and
    # then fail on the first write.
    os.makedirs('results', exist_ok=True)

    results = []
    entity_set = set()
    # Walk the points by position so the stored row number does not depend on
    # the index labels of df.
    for row, point in enumerate(tqdm(df['point'])):
        clozes = list(generate_clozes_from_point(point, ne_answer_generator))
        entity_set.update((c.answer_text, c.answer_type) for c in clozes)

        row_result = []
        for cloze in clozes:
            result = check_model(model, tokenizer, cloze.cloze_text)

            # remove every space from the predicted token text
            answer_given = result[0].get('token_str').replace(' ', '')
            confidence = result[0].get('score')
            answer_true = cloze.answer_text

            # saves
            # prediction, confidence score, truth, dataframe row, cloze id
            row_result.append((answer_given, confidence, answer_true, row, cloze.cloze_id))

        results.append(row_result)

    with open('results/roberta_base_check_model_july2.json', 'w') as f:
        json.dump(results, f)
    with open('results/roberta_base_entity_set_july2.pickle', 'wb') as f:
        pickle.dump(entity_set, f)

    # Group the unique entity texts by category (MONEY, PERCENT and so on).
    # Iterating in sorted order keeps the saved JSON identical between runs
    # (set iteration order is not stable), and the key is built with str() in
    # one place so creating and filling a category always use the same key.
    entities = dict()
    for text, category in sorted(entity_set, key=lambda pair: (str(pair[1]), str(pair[0]))):
        entities.setdefault(str(category), []).append(text)

    with open('results/roberta_base_entity_dictionary.json', 'w') as f:
        json.dump(entities, f)

In [None]:
# load back if you didn't run them
# Reads the saved RoBERTa predictions, entity set and entity dictionary from
# results/. Entries loaded from JSON come back as lists, not tuples.
# NOTE(review): pickle.load can execute arbitrary code - only open a pickle
# file that comes from a trusted source.

with open('results/roberta_base_check_model_july2.json', 'r') as f:
    results = json.load(f)
with open('results/roberta_base_entity_set_july2.pickle', 'rb') as f:
    entity_set = pickle.load(f)
with open('results/roberta_base_entity_dictionary.json', 'r') as f:
    entities = json.load(f)

In [None]:
# Score the RoBERTa predictions held in `results`: an entry counts as correct
# when the predicted text (entry[0]) is exactly equal to the true answer
# (entry[2]).
count_correct, count_wrong = 0, 0
correct_preds, wrong_preds = [], []

for row in results:
    for entry in row:
        if entry[0] == entry[2]:
            count_correct += 1
            correct_preds.append(entry[0])
        else:
            count_wrong += 1
            # keep (prediction, truth) so mistakes can be inspected later
            wrong_preds.append((entry[0], entry[2]))

total = count_correct + count_wrong
# Report 0.0 when there are no examples, which would otherwise divide by zero.
percent_correct = np.round((count_correct / total) * 100, 3) if total else 0.0

print(f'Total Examples: {total}')
print(f'Correct: {count_correct}, Incorrect: {count_wrong}')
print(f'Percentage Correct: {percent_correct}%')

In [None]:
# Label the output, then show the first 50 correctly predicted answers.
print('Roberta Base Correct Predictions Snippet')
correct_preds[:50]

In [None]:
# Label the output, then show the first 50 wrong (prediction, truth) pairs.
print('Roberta Base Incorrect Predictions Snippet')
wrong_preds[:50]

## ELECTRA

In [None]:
from transformers import ElectraForMaskedLM, ElectraTokenizer
# Load the ELECTRA *generator* checkpoint for masked-token prediction.
# Of ELECTRA's two networks only the generator is trained on the masked
# language modelling task; loading the discriminator checkpoint into
# ElectraForMaskedLM leaves the prediction head without trained weights.
# Result files saved from a discriminator run need to be regenerated
# (RUN = True in the next cell) to reflect this checkpoint.
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')

In [None]:
# Set RUN = True to regenerate the ELECTRA results; leave it False to reuse the
# files already saved under results/.
# For every cloze this section stores the first prediction returned by
# check_model (presumably the most confident one) and also keeps track of all
# unique (answer text, answer type) entities.
# NOTE(review): check_model is not defined or imported in the cells visible
# here - confirm it is in scope before enabling RUN.
RUN = False

if RUN:
    # Create the output directory first so the loop below cannot finish and
    # then fail on the first write.
    os.makedirs('results', exist_ok=True)

    results = []
    entity_set = set()
    # Walk the points by position so the stored row number does not depend on
    # the index labels of df.
    for row, point in enumerate(tqdm(df['point'])):
        clozes = list(generate_clozes_from_point(point, ne_answer_generator))
        entity_set.update((c.answer_text, c.answer_type) for c in clozes)

        row_result = []
        for cloze in clozes:
            result = check_model(model, tokenizer, cloze.cloze_text)

            # remove every space from the predicted token text
            answer_given = result[0].get('token_str').replace(' ', '')
            confidence = result[0].get('score')
            answer_true = cloze.answer_text

            # saves
            # prediction, confidence score, truth, dataframe row, cloze id
            row_result.append((answer_given, confidence, answer_true, row, cloze.cloze_id))

        results.append(row_result)

    with open('results/electra_base_check_model_july2.json', 'w') as f:
        json.dump(results, f)
    with open('results/electra_base_entity_set_july2.pickle', 'wb') as f:
        pickle.dump(entity_set, f)

    # Group the unique entity texts by category (MONEY, PERCENT and so on).
    # Iterating in sorted order keeps the saved JSON identical between runs
    # (set iteration order is not stable), and the key is built with str() in
    # one place so creating and filling a category always use the same key.
    entities = dict()
    for text, category in sorted(entity_set, key=lambda pair: (str(pair[1]), str(pair[0]))):
        entities.setdefault(str(category), []).append(text)

    with open('results/electra_base_entity_dictionary.json', 'w') as f:
        json.dump(entities, f)

In [None]:
# load back if you didn't run them
# Reads the saved ELECTRA predictions, entity set and entity dictionary from
# results/. Entries loaded from JSON come back as lists, not tuples.
# NOTE(review): pickle.load can execute arbitrary code - only open a pickle
# file that comes from a trusted source.

with open('results/electra_base_check_model_july2.json', 'r') as f:
    results = json.load(f)
with open('results/electra_base_entity_set_july2.pickle', 'rb') as f:
    entity_set = pickle.load(f)
with open('results/electra_base_entity_dictionary.json', 'r') as f:
    entities = json.load(f)

In [None]:
# Score the ELECTRA predictions held in `results`: an entry counts as correct
# when the predicted text (entry[0]) is exactly equal to the true answer
# (entry[2]).
count_correct, count_wrong = 0, 0
correct_preds, wrong_preds = [], []

for row in results:
    for entry in row:
        if entry[0] == entry[2]:
            count_correct += 1
            correct_preds.append(entry[0])
        else:
            count_wrong += 1
            # keep (prediction, truth) so mistakes can be inspected later
            wrong_preds.append((entry[0], entry[2]))

total = count_correct + count_wrong
# Report 0.0 when there are no examples, which would otherwise divide by zero.
percent_correct = np.round((count_correct / total) * 100, 3) if total else 0.0

print(f'Total Examples: {total}')
print(f'Correct: {count_correct}, Incorrect: {count_wrong}')
print(f'Percentage Correct: {percent_correct}%')

## ALBERT

In [None]:
from transformers import AlbertForMaskedLM, AlbertTokenizer
# Load the pretrained 'albert-base-v2' masked-language model and its tokenizer.
# This rebinds `model` and `tokenizer`, replacing whatever the previous model
# section loaded under the same names.
model = AlbertForMaskedLM.from_pretrained('albert-base-v2')
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

In [None]:
# Set RUN = True to regenerate the ALBERT results; leave it False to reuse the
# files already saved under results/.
# For every cloze this section stores the first prediction returned by
# check_model (presumably the most confident one) and also keeps track of all
# unique (answer text, answer type) entities.
# NOTE(review): check_model is not defined or imported in the cells visible
# here - confirm it is in scope before enabling RUN.
RUN = False

if RUN:
    # Create the output directory first so the loop below cannot finish and
    # then fail on the first write.
    os.makedirs('results', exist_ok=True)

    results = []
    entity_set = set()
    # Walk the points by position so the stored row number does not depend on
    # the index labels of df.
    for row, point in enumerate(tqdm(df['point'])):
        clozes = list(generate_clozes_from_point(point, ne_answer_generator))
        entity_set.update((c.answer_text, c.answer_type) for c in clozes)

        row_result = []
        for cloze in clozes:
            result = check_model(model, tokenizer, cloze.cloze_text)

            # remove every space from the predicted token text
            answer_given = result[0].get('token_str').replace(' ', '')
            confidence = result[0].get('score')
            answer_true = cloze.answer_text

            # saves
            # prediction, confidence score, truth, dataframe row, cloze id
            row_result.append((answer_given, confidence, answer_true, row, cloze.cloze_id))

        results.append(row_result)

    with open('results/albert_base_check_model_july2.json', 'w') as f:
        json.dump(results, f)
    with open('results/albert_base_entity_set_july2.pickle', 'wb') as f:
        pickle.dump(entity_set, f)

    # Group the unique entity texts by category (MONEY, PERCENT and so on).
    # Iterating in sorted order keeps the saved JSON identical between runs
    # (set iteration order is not stable), and the key is built with str() in
    # one place so creating and filling a category always use the same key.
    entities = dict()
    for text, category in sorted(entity_set, key=lambda pair: (str(pair[1]), str(pair[0]))):
        entities.setdefault(str(category), []).append(text)

    with open('results/albert_base_entity_dictionary.json', 'w') as f:
        json.dump(entities, f)

In [None]:
# load back if you didn't run them
# Reads the saved ALBERT predictions, entity set and entity dictionary from
# results/. Entries loaded from JSON come back as lists, not tuples.
# NOTE(review): pickle.load can execute arbitrary code - only open a pickle
# file that comes from a trusted source.

with open('results/albert_base_check_model_july2.json', 'r') as f:
    results = json.load(f)
with open('results/albert_base_entity_set_july2.pickle', 'rb') as f:
    entity_set = pickle.load(f)
with open('results/albert_base_entity_dictionary.json', 'r') as f:
    entities = json.load(f)

In [None]:
# Score the ALBERT predictions held in `results`: an entry counts as correct
# when the predicted text (entry[0]) is exactly equal to the true answer
# (entry[2]).
count_correct, count_wrong = 0, 0
correct_preds, wrong_preds = [], []

for row in results:
    for entry in row:
        if entry[0] == entry[2]:
            count_correct += 1
            correct_preds.append(entry[0])
        else:
            count_wrong += 1
            # keep (prediction, truth) so mistakes can be inspected later
            wrong_preds.append((entry[0], entry[2]))

total = count_correct + count_wrong
# Report 0.0 when there are no examples, which would otherwise divide by zero.
percent_correct = np.round((count_correct / total) * 100, 3) if total else 0.0

print(f'Total Examples: {total}')
print(f'Correct: {count_correct}, Incorrect: {count_wrong}')
print(f'Percentage Correct: {percent_correct}%')

In [None]:
# Label the output, then show the first 50 correctly predicted answers.
print('Albert Base Correct Predictions Snippet')
correct_preds[:50]

In [None]:
# Label the output, then show the first 50 wrong (prediction, truth) pairs.
print('Albert Base Incorrect Predictions Snippet')
wrong_preds[:50]

# Multi-Token Language Model

In [None]:
# Load the saved multi-token RoBERTa results, entity set and entity dictionary.
# Entries loaded from JSON come back as lists, not tuples.
# NOTE(review): pickle.load can execute arbitrary code - only open a pickle
# file that comes from a trusted source.
with open('results/RobertaForMaskedLM_20210714_192239_results.json', 'r') as f:
    results = json.load(f)
with open('results/RobertaForMaskedLM_20210714_192239_entity_set.pickle', 'rb') as f:
    entity_set = pickle.load(f)
with open('results/RobertaForMaskedLM_20210714_192239_entity_dictionary.json', 'r') as f:
    entities = json.load(f)


# NOTE(review): RobertaForMaskedLM is imported but not instantiated here, and
# `tokenizer` is not used by the other cells visible in this section - confirm
# whether both are still needed.
from transformers import RobertaForMaskedLM, RobertaTokenizerFast
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

In [None]:
# Score the multi-token predictions held in `results`: an entry counts as
# correct when the predicted text (entry[0]) is exactly equal to the true
# answer (entry[2]).
count_correct, count_wrong = 0, 0
correct_preds, wrong_preds = [], []

for row in results:
    for entry in row:
        # Stored predictions can begin with a space (presumably because the
        # RoBERTa tokenizer keeps the preceding space as part of a token -
        # TODO confirm), so one leading space is removed before comparing.
        # The stripped text is written back into `results`.
        # startswith() also copes with an empty prediction, where reading the
        # first character by index would raise IndexError.
        if entry[0].startswith(' '):
            entry[0] = entry[0][1:]
        if entry[0] == entry[2]:
            count_correct += 1
            correct_preds.append(entry[0])
        else:
            count_wrong += 1
            # keep (prediction, truth) so mistakes can be inspected later
            wrong_preds.append((entry[0], entry[2]))

total = count_correct + count_wrong
# Report 0.0 when there are no examples, which would otherwise divide by zero.
percent_correct = np.round((count_correct / total) * 100, 3) if total else 0.0

print(f'Total Examples: {total}')
print(f'Correct: {count_correct}, Incorrect: {count_wrong}')
print(f'Percentage Correct: {percent_correct}%')