### Cleaning data associated with bills: utterances, summaries; so they are ready for input to pointer-gen model - this is the new cleaning method implementation

There are 6541 BIDs which overlap between the utterances and summaries datasets (using all the summary data). There are 359 instances in which the summaries are longer than 100 tokens, and 41 instances in which the summaries are longer than 201 tokens. In these instances, the summaries with fewer than 201 tokens were cut to their first 100 tokens (anything over 201 tokens is cut entirely). There are 374 instances in which the utterances are fewer than 70 tokens in length. In the final dataset (old) of 6000 examples, there are 865 examples of resolutions.

There are 374+127=501 instances in which the utterances are less than 100 tokens in length.

In [1]:
import json
import numpy as np
import ast
import re
import spacy
from collections import Counter,defaultdict

import warnings
warnings.filterwarnings('ignore')

In [2]:
# spaCy's small English pipeline; it is called on summaries and utterances below to split them into tokens
nlp = spacy.load("en_core_web_sm")

In [3]:
# loading in the data; the encoding is given explicitly so that reading does not depend on the
# platform's default text encoding
with open("../data/bill_summaries.json",encoding="utf-8") as summaries_file:
    bill_summaries = json.load(summaries_file) # iterated below as bid -> summary dict with a 'text' entry

with open("../data/bill_utterances.json",encoding="utf-8") as utterances_file:
    bill_utterances = json.load(utterances_file)

ca_bill_utterances = bill_utterances['CA'] # only the 'CA' entry is used from here on

### Cleaning data before the processing to format which is accepted by pointer-gen model

In [4]:
def clean_bill_summaries(bill_summaries,max_summary_length=201,ignore_resolutions=False,tokenizer=None):
    """ filters and normalises raw bill summaries before they are paired with utterances

        an entry is dropped when any of the following holds:
          1) "page 1" occurs in the text (indicates improper encoding)
          2) the summary does not start with "This" (probable encoding error)
          3) ignore_resolutions is set and the bid contains "R"
          4) the text is over max_summary_length tokens in length (very long summaries indicate probable encoding error)
        -for kept summaries which contain an ordering marker (" 1)"," 2)","(1)","(2)"," a)"," b)","(a)","(b)"),
         all markers of the forms " (d)", " d)", " (x)", " x)" (d a digit, x in a-j) are removed and the text is re-tokenized
        -the input dicts are not modified; each kept summary is shallow-copied before its text is updated

    args:
        bill_summaries: dict mapping bid -> summary dict; every summary dict needs a 'text' entry
        max_summary_length: max length (in tokens) of summaries in which to keep
        ignore_resolutions (bool): whether to ignore resolutions and only output bills
        tokenizer: callable which takes a string and returns an iterable of tokens (each is converted with str());
                   when None, the module-level spaCy pipeline `nlp` is used

    returns:
        (bill_summary_info, num_cutoff_counter):
          bill_summary_info: defaultdict mapping bid -> {'summary': copied summary dict with cleaned 'text',
                                                        'summary_tokens': list of token strings}
          num_cutoff_counter: number of summaries ignored due to being too long
    """
    if tokenizer is None:
        tokenizer = nlp

    # raw strings: "\(" inside a normal string literal is an invalid escape sequence
    ordering_patterns = (r" \([0-9]\)", r" [0-9]\)", r" \([a-j]\)", r" [a-j]\)")
    ordering_markers = (" 1)", " 2)", "(1)", "(2)", " a)", " b)", "(a)", "(b)")

    num_cutoff_counter = 0 # counts the number of summaries ignored due to being too long
    bill_summary_info = defaultdict(dict)
    for bid,summary in bill_summaries.items():
        text = summary['text']

        if "page 1" in text: # ignore this instance, indicator of encoding error
            continue
        if text[0:4] != "This": # relatively strong indicator that there was error in encoding
            continue
        if ignore_resolutions and "R" in bid: # ignore this instance if wanting to ignore resolutions
            continue

        tokens = [str(token) for token in tokenizer(text)]
        if len(tokens) > max_summary_length: # length is judged on the text before ordering markers are removed
            num_cutoff_counter += 1
            continue

        # removing the implicit ordering; the patterns are applied one after another, in this order
        if any(marker in text for marker in ordering_markers):
            for pattern in ordering_patterns:
                text = re.sub(pattern,"",text)
            tokens = [str(token) for token in tokenizer(text)]

        cleaned_summary = dict(summary) # copy, so that the caller's data keeps its original text
        cleaned_summary['text'] = text
        bill_summary_info[bid]['summary'] = cleaned_summary
        bill_summary_info[bid]['summary_tokens'] = tokens

    return bill_summary_info,num_cutoff_counter

In [5]:
# cleaning every summary, keeping those of up to 650 tokens (instead of the default 201) and keeping resolutions;
# the second return value (number of summaries dropped for being too long) is discarded
bill_summary_info,_ = clean_bill_summaries(bill_summaries,max_summary_length=650,ignore_resolutions=False)
len(bill_summary_info)

6897

In [6]:
def clean_bill_utterances(bill_summary_info,ca_bill_utterances,minimum_utterance_tokens=99,token_cutoff=1000,max_utterances=50,tokenizer=None):
    """ cleans and combines the summary and utterance data
        -also captures info about the hierarchical structure of utterances
        -only bids present in both bill_summary_info and ca_bill_utterances are considered
        -the hierarchical lists always hold exactly max_utterances entries: missing utterances are padded,
         and utterances beyond max_utterances (which still start before token_cutoff) are folded into the last entry
        -bill_summary_info is not modified; each kept entry is shallow-copied before keys are added

    args:
        bill_summary_info: holds cleaned information about bill summaries (bid -> dict with 'summary_tokens')
        ca_bill_utterances: bid -> dict whose 'utterances' entry is a list of discussions, each a list of utterance strings
        minimum_utterance_tokens: minimum number of utterance tokens allowable
                                  (the count compared is total tokens minus the number of utterances)
        token_cutoff: max number of tokens to consider for utterances
        max_utterances: maximum number of of individual utterances
        tokenizer: callable which takes a string and returns an iterable of tokens (each is converted with str());
                   when None, the module-level spaCy pipeline `nlp` is used

    returns:
        (all_bill_info, all_tokens_dict, num_utterance_counter):
          all_bill_info: bid -> summary info plus 'utterance_att_mask', 'utterance_indices', 'utterance_lengths',
                         'utterances', 'utterance_tokens' (truncated to token_cutoff) and 'resolution'
          all_tokens_dict: bid -> all lower-cased utterance tokens (ignoring token_cutoff) followed by the summary tokens
          num_utterance_counter: number of bids ignored due to utterances being too short
    """
    if tokenizer is None:
        tokenizer = nlp

    num_utterance_counter = 0 # counts num. examples ignored due to utterances being too short
    all_bill_info = {}
    all_tokens_dict = {} # stores all tokens for a given bid (ignoring token_cutoff)

    for bid,bill_data in ca_bill_utterances.items():
        if bid not in bill_summary_info: # no summary assigned to this bill
            continue

        # combining all discussions (did) for this bid together
        all_utterances = [utterance for utterance_list in bill_data['utterances'] for utterance in utterance_list]
        all_token_lists = [[str(token) for token in tokenizer(utterance)] for utterance in all_utterances]

        utterance_lengths = [] # tracks number of tokens (within token_cutoff) in each utterance
        utterance_indices = [] # tracks index of last word (within token_cutoff) in each utterance
        all_tokens = [] # getting a single stream of tokens
        cumulative_token_count = 0 # tokens seen so far, used to locate the last word of each utterance
        for token_list in all_token_lists:
            if cumulative_token_count < token_cutoff: # ensures only utterances up to token_cutoff are considered
                kept_length = min(len(token_list),token_cutoff-cumulative_token_count)
                last_index = min(cumulative_token_count+len(token_list)-1,token_cutoff-1)
                if len(utterance_lengths) < max_utterances:
                    utterance_lengths.append(kept_length)
                    utterance_indices.append(last_index)
                elif utterance_lengths:
                    # no free slot left: merge into the last slot so the lists never exceed max_utterances
                    utterance_lengths[-1] += kept_length
                    utterance_indices[-1] = last_index
                cumulative_token_count += len(token_list)
            all_tokens += token_list

        utterance_padding_num = max_utterances-len(utterance_lengths)
        utterance_att_mask = [0]*len(utterance_lengths)+[-np.inf]*utterance_padding_num
        utterance_lengths += [0]*utterance_padding_num
        utterance_indices += [0]*utterance_padding_num
        if cumulative_token_count < token_cutoff and utterance_lengths:
            # the token padding is attributed to the last slot; "+=" equals the plain assignment whenever
            # that slot is a padding slot (length 0) and keeps a real utterance's length when it is not
            utterance_lengths[-1] += token_cutoff-cumulative_token_count

        if len(all_tokens)-len(all_token_lists) < minimum_utterance_tokens: # not enough utterance tokens
            num_utterance_counter += 1
            continue

        all_tokens_dict[bid] = [token.lower() for token in all_tokens] # adding all utterance tokens
        all_tokens_dict[bid] += [token.lower() for token in bill_summary_info[bid]['summary_tokens']] # adding all summary tokens

        bill_info = dict(bill_summary_info[bid]) # copy, so that the input entry does not gain the keys below
        bill_info['utterance_att_mask'] = utterance_att_mask # hierarchical input information
        bill_info['utterance_indices'] = utterance_indices
        bill_info['utterance_lengths'] = utterance_lengths
        bill_info['utterances'] = all_utterances # standard input information
        bill_info['utterance_tokens'] = all_tokens[:token_cutoff] # taking up to max number of tokens
        bill_info['resolution'] = "R" in bid
        all_bill_info[bid] = bill_info

    return all_bill_info,all_tokens_dict,num_utterance_counter

In [7]:
# pairing each cleaned summary with its CA utterances, keeping at most the first 500 utterance tokens per bill;
# the third return value (number of bills dropped for having too few utterance tokens) is discarded
all_bill_info,all_tokens_dict,_ = clean_bill_utterances(bill_summary_info,ca_bill_utterances,token_cutoff=500)
len(all_bill_info)

5900

### Processing data to get to format which is accepted by pointer-gen model

In [8]:
### using pretrained Glove vectors
# word_to_embedding maps the first field of every line to a float32 array built from the remaining fields
word_to_embedding = {}
with open("../data/glove.6B/glove.6B.100d.txt",encoding="utf-8") as glove_file: # explicit encoding: no dependence on the platform default
    for line in glove_file: # streaming the lines, rather than readlines() which holds the whole file in memory
        values = line.split()
        if not values: # a blank line carries no word/vector
            continue
        word = values[0]
        coefs = np.asarray(values[1:],dtype='float32')
        word_to_embedding[word] = coefs
print(len(word_to_embedding))

400000


In [9]:
# getting all unique tokens used to get words which will be part of the fixed vocabulary
## specifically specifying that I want a vocabulary size of 30k (adding less common words up to that threshold)
vocab_size = 30000
num_special_symbols = 3 # slots reserved for <PAD>,<SENT>,<UNK>
vocab_budget = vocab_size-num_special_symbols # number of real words the vocabulary may hold

all_tokens = [token for bid_tokens in all_tokens_dict.values() for token in bid_tokens]

word_freq = Counter(all_tokens)
words_by_freq = word_freq.most_common() # (word,count) pairs sorted by occurrence freq., ties keep first-seen order

# words seen at least 3 times come first, capped at the budget
most_freq_words = [word for word,freq in words_by_freq if freq >= 3][:vocab_budget]
# the rest of the budget is filled with words seen twice which have a pretrained embedding;
# max(0,...) matters because a negative slice bound would drop items from the END instead of taking none
remaining_slots = max(0,vocab_budget-len(most_freq_words))
most_freq_words += [word for word,freq in words_by_freq if freq == 2 and word in word_to_embedding][:remaining_slots]
less_freq_words = [word for word,freq in words_by_freq if freq < 2] # words seen once, only used for the printout
print(most_freq_words[0:10])
print(less_freq_words[0:10])
print(len(most_freq_words),len(less_freq_words))

['.', ',', 'the', 'to', 'and', 'of', 'that', 'a', 'in', 'i']
['ab1', '00', 'explication', 'fpcc', 'abrasively', 'mariella', 'shantz', 'hemmings', 'segerblom', 'utla']
29997 17417


In [10]:
## index maps for the fixed vocabulary, plus one embedding row per vocabulary entry
# <PAD>,<SENT>,<UNK> take indices 0-2; the words of most_freq_words follow in order
special_symbols = ["<PAD>","<SENT>","<UNK>"]
fixed_vocab_word_to_index = {symbol:position for position,symbol in enumerate(special_symbols)} # for words assigned to the fixed_vocabulary
fixed_vocab_index_to_word = dict(enumerate(special_symbols))

def _random_embedding():
    # trainable embedding initialised uniformly in [-0.05,0.05)
    return np.random.uniform(low=-0.05,high=0.05,size=100).astype("float32")

word_embeddings = [_random_embedding() for _ in special_symbols]

for vocab_index,word in enumerate(most_freq_words,start=len(special_symbols)):
    fixed_vocab_word_to_index[word] = vocab_index
    fixed_vocab_index_to_word[vocab_index] = word
    pretrained = word_to_embedding.get(word)
    # pre-trained vector when one exists, otherwise a fresh random one
    word_embeddings.append(_random_embedding() if pretrained is None else pretrained)

word_embeddings = np.stack(word_embeddings)
print(len(fixed_vocab_word_to_index),word_embeddings.shape)

30000 (30000, 100)


In [11]:
## writing the embedding matrix and both vocabulary maps to disk
np.save("../data/len_500_data/word_embeddings.npy",word_embeddings)

vocab_outputs = (
    ("../data/len_500_data/word_to_index.json",fixed_vocab_word_to_index),
    ("../data/len_500_data/index_to_word.json",fixed_vocab_index_to_word),
)
for vocab_path,vocab_mapping in vocab_outputs:
    with open(vocab_path,"w+") as out_file:
        json.dump(vocab_mapping,out_file)

In [12]:
# size of the fixed vocabulary (special symbols included); OOV input words are given
# joint-probability indices starting from this value
num_fixed_words = len(fixed_vocab_word_to_index)

In [13]:
# should match the token_cutoff passed to clean_bill_utterances above, since the stored
# utterance_tokens were already truncated to that length
token_cutoff=500 # this is the amount to pad up to for the input representation

In [14]:
# building the model inputs: each bill's utterance tokens become rows padded out to token_cutoff entries
x = [] # fixed-vocabulary index of every input token (<UNK> for words outside the vocabulary)
x_indices = [] # joint probability vector index of every input token (OOV words get per-bill indices)
x_indices_dicts = [] # per bill: OOV word -> index it was given in the joint probability vector
att_mask = [] # 0 for valid words, -np.inf for padding
utterance_att_masks = [] # hierarchical information, copied over from all_bill_info
utterance_indices = []
utterance_lengths = []

## kept for debugging/error analysis
bill_information_dict = {} # bid -> summary text, list of utterances, resolution flag
bids = [] # the BIDs, in the same order as the rows built below

unk_index = fixed_vocab_word_to_index['<UNK>']

for bid,bill_info in all_bill_info.items():
    bill_information_dict[bid] = {"summary":bill_info["summary"]["text"],"utterances":bill_info["utterances"],"resolution":bill_info["resolution"]}
    bids.append(bid)

    utterance_att_masks.append(bill_info['utterance_att_mask'])
    utterance_indices.append(bill_info['utterance_indices'])
    utterance_lengths.append(bill_info['utterance_lengths'])

    utterance_tokens = [token.lower() for token in bill_info["utterance_tokens"]]
    amount_to_pad = token_cutoff-len(utterance_tokens)
    padding = [0]*amount_to_pad

    x_rep = [] # fixed-vocabulary indices
    this_x_indices = [] # joint prob vector indices for this bid
    non_vocab_dict = {} # stores all OOV words for this bid
    for token in utterance_tokens:
        vocab_index = fixed_vocab_word_to_index.get(token)
        if vocab_index is not None:
            x_rep.append(vocab_index)
            this_x_indices.append(vocab_index)
        else:
            x_rep.append(unk_index)
            # a new OOV word takes the next free index after the fixed vocabulary; a repeated one reuses its index
            this_x_indices.append(non_vocab_dict.setdefault(token,num_fixed_words+len(non_vocab_dict)))

    x.append(x_rep+padding)
    att_mask.append([0]*len(x_rep)+[-np.inf]*amount_to_pad)
    x_indices_dicts.append(non_vocab_dict)
    x_indices.append(this_x_indices+padding) # padding is masked out in the att calculation, so index 0 is harmless here

In [15]:
# largest number of distinct OOV words found in the utterances of any single bid
max(map(len,x_indices_dicts))

18

In [16]:
# building the decoder targets - every row is padded up to a length of 101
## a summary of at most 100 tokens gets <SENT> appended, marking the end of decoding
## a longer summary is simply cut to its first 101 tokens, and no <SENT> is added after them
## summary words which are neither in the fixed vocabulary nor in the bill's utterances map to <UNK>
y = [] # fixed-vocabulary indices of the summary tokens (built only to derive decoder_x)
loss_mask = [] # 1 for valid words, 0 for padding
decoder_x = [] # teacher-forcing input: 1 (<SENT>) followed by the y row without its last entry
y_indices = [] # index of the correct decoder prediction, in the joint-probability vector

total_oov_words = 0 # summary tokens (counted before any cutoff) which had to be mapped to <UNK> in y_indices
resolution_bools = [] # bool, whether a given example is a resolution (False=bill); used for train_test_split

sent_index = fixed_vocab_word_to_index['<SENT>']
unk_index = fixed_vocab_word_to_index['<UNK>']

for bid_i,(bid,bill_info) in enumerate(all_bill_info.items()):
    summary_tokens = [token.lower() for token in bill_info["summary_tokens"]]
    resolution_bools.append(bill_info['resolution'])
    non_vocab_dict = x_indices_dicts[bid_i] # OOV indices assigned while encoding this bill's utterances

    y_rep = [] # indices using only the fixed vocabulary
    y_indices_rep = [] # indices in the joint-probability vector
    for token in summary_tokens:
        vocab_index = fixed_vocab_word_to_index.get(token)
        if vocab_index is not None: # word is in fixed_vocabulary
            y_rep.append(vocab_index)
            y_indices_rep.append(vocab_index)
            continue
        y_rep.append(unk_index)
        if token in non_vocab_dict: # OOV but present in the input utterances
            y_indices_rep.append(non_vocab_dict[token])
        else: # OOV and not in the input utterances
            y_indices_rep.append(unk_index)
            total_oov_words += 1

    if len(summary_tokens) > 100: # keep the first 101 tokens only
        y_rep = y_rep[:101]
        y_indices_rep = y_indices_rep[:101]
    else: # last prediction should be <SENT>
        y_rep.append(sent_index)
        y_indices_rep.append(sent_index)

    amount_to_pad = 101-len(y_rep) # 100+1 represents final <SENT> prediction
    padding = [0]*amount_to_pad

    y.append(y_rep+padding)
    loss_mask.append([1]*len(y_rep)+padding) # zeros cancel out the loss contribution from padding
    decoder_x.append([1]+y_rep[:-1]+padding) # words outside fixed_vocab are fed to the decoder as <UNK>
    y_indices.append(y_indices_rep+padding) # padding will be ignored by loss_mask

In [17]:
# converting the python lists into arrays: int32 for index data, float32 for the masks
x = np.asarray(x,dtype="int32")
x_indices = np.asarray(x_indices,dtype="int32")
decoder_x = np.asarray(decoder_x,dtype="int32")
y_indices = np.asarray(y_indices,dtype="int32")
utterance_indices = np.asarray(utterance_indices,dtype="int32")
utterance_lengths = np.asarray(utterance_lengths,dtype="int32")

att_mask = np.asarray(att_mask,dtype="float32")
loss_mask = np.asarray(loss_mask,dtype="float32")
utterance_att_masks = np.asarray(utterance_att_masks,dtype="float32")

print(x.shape,x_indices.shape,att_mask.shape)
print(loss_mask.shape,decoder_x.shape,y_indices.shape)
print(utterance_att_masks.shape,utterance_indices.shape,utterance_lengths.shape)

bids = np.asarray(bids)
print(bids.shape,len(bill_information_dict))

(5900, 500) (5900, 500) (5900, 500)
(5900, 101) (5900, 101) (5900, 101)
(5900, 50) (5900, 50) (5900, 50)
(5900,) 5900


#### Shuffling the data so that only bills are in the validation and test sets

In [18]:
from sklearn.utils import shuffle

In [19]:
# separating the rows into resolutions and bills with boolean masks over the examples
resolution_mask = np.asarray(resolution_bools,dtype=bool)
bill_bools = np.logical_not(resolution_mask) # reversal

# every array below has one row per example, in the same order as resolution_bools
example_arrays = (x,x_indices,att_mask,loss_mask,decoder_x,y_indices,bids,utterance_att_masks,utterance_indices,utterance_lengths)

(x_resolution,x_indices_resolution,att_mask_resolution,loss_mask_resolution,decoder_x_resolution,
 y_indices_resolution,bids_resolution,utterance_att_masks_resolution,utterance_indices_resolution,
 utterance_lengths_resolution) = [arr[resolution_mask] for arr in example_arrays]

(x_bill,x_indices_bill,att_mask_bill,loss_mask_bill,decoder_x_bill,
 y_indices_bill,bids_bill,utterance_att_masks_bill,utterance_indices_bill,
 utterance_lengths_bill) = [arr[bill_bools] for arr in example_arrays]

print(x_resolution.shape,loss_mask_resolution.shape,bids_resolution.shape,utterance_att_masks_resolution.shape)
print(x_bill.shape,loss_mask_bill.shape,bids_bill.shape,utterance_att_masks_bill.shape)

(856, 500) (856, 101) (856,) (856, 50)
(5044, 500) (5044, 101) (5044,) (5044, 50)


In [20]:
# shuffling only the bill data, then holding out the first 400 shuffled rows as the validation/test data
num_heldout = 400
shuffled_bill_arrays = shuffle(x_bill,x_indices_bill,att_mask_bill,loss_mask_bill,decoder_x_bill,y_indices_bill,bids_bill,utterance_att_masks_bill,utterance_indices_bill,utterance_lengths_bill,random_state=1)

(x_bill,x_indices_bill,att_mask_bill,loss_mask_bill,decoder_x_bill,
 y_indices_bill,bids_bill,utterance_att_masks_bill,utterance_indices_bill,
 utterance_lengths_bill) = shuffled_bill_arrays

# held-out rows
(x_bill_val,x_indices_bill_val,att_mask_bill_val,loss_mask_bill_val,decoder_x_bill_val,
 y_indices_bill_val,bids_bill_val,utterance_att_masks_bill_val,utterance_indices_bill_val,
 utterance_lengths_bill_val) = [arr[:num_heldout] for arr in shuffled_bill_arrays]

# remaining bill rows, which go into the training set
(x_bill_train,x_indices_bill_train,att_mask_bill_train,loss_mask_bill_train,decoder_x_bill_train,
 y_indices_bill_train,bids_bill_train,utterance_att_masks_bill_train,utterance_indices_bill_train,
 utterance_lengths_bill_train) = [arr[num_heldout:] for arr in shuffled_bill_arrays]

print(x_bill_val.shape,loss_mask_bill_val.shape,bids_bill_val.shape,utterance_att_masks_bill_val.shape)
print(x_bill_train.shape,loss_mask_bill_train.shape,bids_bill_train.shape,utterance_att_masks_bill_train.shape)

(400, 500) (400, 101) (400,) (400, 50)
(4644, 500) (4644, 101) (4644,) (4644, 50)


In [21]:
## to remove resolutions, simply don't include them here
# building the training set: the remaining bill rows followed by all resolution rows
x_train = np.vstack([x_bill_train,x_resolution])
x_indices_train = np.vstack([x_indices_bill_train,x_indices_resolution])
att_mask_train = np.vstack([att_mask_bill_train,att_mask_resolution])
loss_mask_train = np.vstack([loss_mask_bill_train,loss_mask_resolution])
decoder_x_train = np.vstack([decoder_x_bill_train,decoder_x_resolution])
y_indices_train = np.vstack([y_indices_bill_train,y_indices_resolution])
bids_train = np.concatenate([bids_bill_train,bids_resolution])

utterance_att_masks_train = np.vstack([utterance_att_masks_bill_train,utterance_att_masks_resolution])
utterance_indices_train = np.vstack([utterance_indices_bill_train,utterance_indices_resolution])
utterance_lengths_train = np.vstack([utterance_lengths_bill_train,utterance_lengths_resolution])

# shuffling the training set; bids_train has to go through the same shuffle call as the data arrays,
# otherwise the saved bids no longer identify the rows they are stored next to
x_train,x_indices_train,att_mask_train,loss_mask_train,decoder_x_train,y_indices_train,utterance_att_masks_train,utterance_indices_train,utterance_lengths_train,bids_train = shuffle(x_train,x_indices_train,att_mask_train,loss_mask_train,decoder_x_train,y_indices_train,utterance_att_masks_train,utterance_indices_train,utterance_lengths_train,bids_train,random_state=2)
print(x_train.shape,loss_mask_train.shape,bids_train.shape,utterance_att_masks_train.shape)

(5500, 500) (5500, 101) (5500,) (5500, 50)


In [22]:
# joining everything into the final arrays: training rows first, then the 400 held-out bill rows
# (which serve as the val and test sets)
def _stack_rows(train_part,heldout_part):
    # appends the held-out rows underneath the training rows
    return np.concatenate([train_part,heldout_part],axis=0)

x_final = _stack_rows(x_train,x_bill_val)
x_indices_final = _stack_rows(x_indices_train,x_indices_bill_val)
att_mask_final = _stack_rows(att_mask_train,att_mask_bill_val)
loss_mask_final = _stack_rows(loss_mask_train,loss_mask_bill_val)
decoder_x_final = _stack_rows(decoder_x_train,decoder_x_bill_val)
y_indices_final = _stack_rows(y_indices_train,y_indices_bill_val)
bids_final = _stack_rows(bids_train,bids_bill_val)

utterance_att_masks_final = _stack_rows(utterance_att_masks_train,utterance_att_masks_bill_val)
utterance_indices_final = _stack_rows(utterance_indices_train,utterance_indices_bill_val)
utterance_lengths_final = _stack_rows(utterance_lengths_train,utterance_lengths_bill_val)

print(x_final.shape,loss_mask_final.shape,bids_final.shape,utterance_att_masks_final.shape)

(5900, 500) (5900, 101) (5900,) (5900, 50)


In [23]:
## the arrays are written as they are (no further shuffling), since the last 400 rows are the validation/test sets
subdir = "len_500_data"
array_outputs = (
    ("../data/{}/x_500.npy",x_final),
    ("../data/{}/x_indices_500.npy",x_indices_final),
    ("../data/{}/att_mask_500.npy",att_mask_final),
    ("../data/{}/loss_mask_500.npy",loss_mask_final),
    ("../data/{}/decoder_x_500.npy",decoder_x_final),
    ("../data/{}/y_indices_500.npy",y_indices_final),
    ("../data/{}/bids_500.npy",bids_final),
    ("../data/{}/utterance_att_mask_500.npy",utterance_att_masks_final),
    ("../data/{}/utterance_indices_500.npy",utterance_indices_final),
    ("../data/{}/utterance_lengths_500.npy",utterance_lengths_final),
)
for path_template,final_array in array_outputs:
    np.save(path_template.format(subdir),final_array)

# per-bid summary text, utterances and resolution flag, for debugging/error analysis
with open("../data/len_500_data/bill_information.json","w+") as out_file:
    json.dump(bill_information_dict,out_file)