In [None]:
# Reload local modules if they've changed.
# Mode 2 re-imports all modules before each cell is executed, so edits to the
# local packages imported by this notebook are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [6]:
import numpy as np
import pandas as pd

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import sys
import os
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from src.data.read_parallel import read_parallel_local

In [8]:
from src.models.deeplegis import legislationDataset

In [16]:
REDUCE_BY_FACTOR = 1000 # Make the dataset smaller for development purposes
SAMPLE_SEED = 1         # Fixed seed so the development subsample is identical on every run
train_test_ratio = 0.91
train_valid_ratio = 0.90

# Root of the data volume: $DATA_VOL when it is set, otherwise a manually chosen default.
# NOTE(review): the fallback is a machine-specific absolute path.
DATA_VOL = os.environ.get('DATA_VOL', '/home/luke/tmp_vol/')

# Pre-wrangled metadata
df = pd.read_csv("../references/derived/ml_data.csv", encoding="latin1", parse_dates=True)
df['id'] = df['id'].astype(int)
print(f"Original number of examples: {len(df)}")

# Subsample for development. Keep at least one row (when any exist) so a frame
# smaller than REDUCE_BY_FACTOR does not collapse to an empty sample, and seed
# the draw so downstream cells see the same rows after a kernel restart.
n_keep = min(len(df), max(1, len(df) // REDUCE_BY_FACTOR))
# Reset the index *before* attaching the text so the frame has a clean 0..n-1
# index when the new column is assigned.
# NOTE(review): whether read_parallel_local returns a list or an indexed Series is
# not visible here; resetting first is intended to be safe in either case - confirm.
df = df.sample(n=n_keep, random_state=SAMPLE_SEED).reset_index(drop=True)
print(f"Reduced number of examples:  {len(df)}")

# Read the cleaned bill text for every sampled id.
tmp = read_parallel_local(df['id'], DATA_VOL + "/clean/")
df['text'] = tmp

Original number of examples: 199451
Reduced number of examples:  199
Took 0.047682785987854005 min to open 199 files with 20 processes.


In [26]:
# Settings handed to the project's legislationDataset builder.
config = {
    'max_length': 128,
    'train_batch_size': 32,
    'testing': False,
    'train_test_ratio': 0.91,
    'train_valid_ratio': 0.90,
}
print(config)

legis_builder = legislationDataset(config)
# NOTE(review): the saved output of this cell ends with
# "ValueError: too many values to unpack (expected 2)". The assignment below
# unpacks three values, so the error presumably originates inside
# src.models.deeplegis - confirm against the full traceback.
train_stream, val_stream, test_stream = legis_builder.create_batch_stream(df)

# Show the first training batch.
for elem in train_stream.take(1):
    print(elem)

{'max_length': 128, 'train_batch_size': 32, 'testing': False, 'train_test_ratio': 0.91, 'train_valid_ratio': 0.9}
Training size: (162, 7)
Validation size: (19, 7)
Test size: (18, 7)


ValueError: too many values to unpack (expected 2)

In [85]:
# Two-stage stratified split on the `signed` label:
#   1) hold out the test set, 2) carve the validation set out of what remains.
# NOTE(review): train_test_split is not imported in any cell visible here;
# presumably it arrives via another import - confirm on a fresh kernel.
df_train_full, df_test = train_test_split(
    df,
    train_size=train_test_ratio,
    random_state=1,
    stratify=df["signed"].values,
)
df_train, df_valid = train_test_split(
    df_train_full,
    train_size=train_valid_ratio,
    random_state=1,
    stratify=df_train_full["signed"].values,
)
print(
    f"Training size: {df_train.shape}",
    f"Validation size: {df_valid.shape}",
    f"Test size: {df_test.shape}",
    sep="\n",
)

Training size: (32669, 7)
Validation size: (3630, 7)
Test size: (3591, 7)


In [113]:
def _frame_to_dataset(frame):
    """Slice a split DataFrame into a tf.data.Dataset of (text, signed, partisan_lean) triples."""
    columns = ('text', 'signed', 'partisan_lean')
    return tf.data.Dataset.from_tensor_slices(tuple(frame[name].values for name in columns))


train_data1 = _frame_to_dataset(df_train)
val_data1 = _frame_to_dataset(df_valid)
test_data1 = _frame_to_dataset(df_test)

# Peek at one raw (untokenized) training example.
for text, label, pl in train_data1.take(1):
    print(text, label, pl, sep="\n")

tf.Tensor(b' new york state assembly speaker carl e. heastie]    javascript must be enabled to properly view this page.  bill no.: actions committee&nbspvotes floor&nbspvotes memo lfin chamber&nbspvideo/transcript a02591 summary: bill no a02591 &nbsp same as no same as &nbsp sponsor crouch &nbsp cospnsr barclay, finch, giglio, hawley, raia, manktelow &nbsp mltspnsr &nbsp add 1285-v, amd pub auth l &nbsp creates the concentrated animal feeding operation environmental compliance revolving loan program.-a02591 text: state of new york 2019-2020 regular sessions in assembly january 2019 introduced by m. of a. crouch, barclay, finch, giglio, hawley, raia read once and referred to the committee on agriculture an act to amend the public authorities law, in relation to creating the concentrated animal feeding operation environmental compliance revolv ing loan program the people of the state of new york, represented in senate and assem bly, do enact as follows: . the public authorities law is am

In [None]:
# NOTE(review): leftover scratch cell - it has no execution count and the call
# below passes no mapping function. Presumably an abandoned experiment;
# consider removing it so Restart & Run All does not stop here.
df_train.map()

In [119]:
# Longformer tokenizer; it is bound as the default `tokenizer` argument of
# to_feature2, so it must exist before that function is defined.
# NOTE(review): LongformerTokenizer is imported in a later cell of this notebook,
# so on a fresh kernel that import cell has to run first.
lf_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

In [152]:
label_list = [0,1]
max_length = 128
def to_feature2(text, label, partisan_lean, label_list=label_list, max_length=max_length, tokenizer=lf_tokenizer):
    """Tokenize a single example eagerly (intended to be wrapped in tf.py_function).

    Args:
        text: scalar string tensor; its bytes are decoded and tokenized.
        label: scalar tensor, cast to int32.
        partisan_lean: scalar tensor, cast to float32.
        label_list: unused; kept so existing callers keep working.
        max_length: length every token sequence is padded / truncated to.
        tokenizer: callable tokenizer returning a mapping with an 'input_ids' entry.

    Returns:
        Tuple (input_ids, label, partisan_lean) where input_ids has the leading
        batch axis of size 1 squeezed away.
    """
    # Decode as UTF-8 and substitute undecodable bytes. A strict 'ascii' decode
    # raises UnicodeDecodeError as soon as a document contains a single
    # non-ASCII character; for pure-ASCII text the result is unchanged.
    decoded_text = text.numpy().decode('utf-8', errors='replace')

    output = tokenizer(decoded_text, return_tensors="tf", truncation=True, padding='max_length', max_length=max_length)

    return (tf.squeeze(output['input_ids'],0), tf.cast(label, 'int32'), tf.cast(partisan_lean, 'float32'))

def to_feature_map2(text, label, partisan_lean):
    """Graph-mode wrapper around to_feature2 for use with Dataset.map.

    Runs to_feature2 through tf.py_function, pins the static shapes of the
    three results, and packs them as (features, label) where features is a dict
    with the keys 'input_ids' and 'partisan_lean'.
    """
    token_ids, target, lean = tf.py_function(
        to_feature2,
        [text, label, partisan_lean],
        Tout=[tf.int32, tf.int32, tf.float32],
    )

    # Declare the static shapes explicitly: one row of max_length ids, scalar label, scalar lean.
    token_ids.set_shape([max_length])
    target.set_shape([])
    lean.set_shape([])

    features = {'input_ids': token_ids, 'partisan_lean': lean}
    return (features, target)

In [153]:
train_batch_size = 32
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Training stream: tokenize, shuffle, then emit fixed-size batches.
train_data = (
    train_data1
    .map(to_feature_map2, num_parallel_calls=AUTOTUNE)
    .shuffle(1000)
    .batch(train_batch_size, drop_remainder=True)
    .prefetch(AUTOTUNE)
)

# Validation stream.
# - Uses to_feature_map2, the same mapping as training, so validation batches carry
#   the ({'input_ids', 'partisan_lean'}, label) structure the model is fed during
#   training. The previous `to_feature_map` is not defined anywhere in this notebook.
# - Keeps the final partial batch: dropping the remainder silently discards
#   validation examples and leaves the stream empty whenever the validation
#   split is smaller than one batch (e.g. the reduced development sample).
val_data = (
    val_data1
    .map(to_feature_map2, num_parallel_calls=AUTOTUNE)
    .batch(train_batch_size, drop_remainder=False)
    .prefetch(AUTOTUNE)
)

In [154]:
# Pull a single batch from the training pipeline and print it to inspect the
# (features dict, label) structure and the tensor shapes.
for elem in train_data.take(1):
    print(elem)

({'input_ids': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[    0, 19707,  5375, ...,     9,    10,     2],
       [    0, 37959,   288, ...,    74, 26845,     2],
       [    0,    92,  1423, ...,  9657,   194,     2],
       ...,
       [    0,  2362,   455, ...,     1,     1,     1],
       [    0,   200,  1675, ...,   910,  9426,     2],
       [    0, 24837,  1852, ...,  2342,   368,     2]], dtype=int32)>, 'partisan_lean': <tf.Tensor: shape=(32,), dtype=float32, numpy=
array([0.3443472 , 0.6289722 , 0.7085218 , 0.2651231 , 0.595405  ,
       0.48706314, 0.36427715, 0.479158  , 0.6       , 0.58278143,
       0.40122998, 0.42693365, 0.479158  , 0.42966998, 0.23762377,
       0.7381234 , 0.42693365, 0.7085218 , 0.23611817, 0.23611817,
       0.5386392 , 0.29411766, 0.23762377, 0.42942113, 0.48706314,
       0.7085218 , 0.23076923, 0.23611817, 0.3826962 , 0.7085218 ,
       0.29447854, 0.55      ], dtype=float32)>}, <tf.Tensor: shape=(32,), dtype=int32, numpy=
array([0, 0

In [90]:
import tensorflow as tf
from transformers import TFLongformerModel, TFLongformerForSequenceClassification
from transformers import LongformerTokenizer

In [55]:
# Both the classifier and its tokenizer come from the same pretrained checkpoint.
LONGFORMER_CHECKPOINT = "allenai/longformer-base-4096"

model = TFLongformerForSequenceClassification.from_pretrained(LONGFORMER_CHECKPOINT)
tokenizer = LongformerTokenizer.from_pretrained(LONGFORMER_CHECKPOINT)

Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing TFLongformerForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFLongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFLongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFLongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [169]:
# Build a one-example input dict for smoke-testing the model.
# Tokenizer call reference:
# https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.PreTrainedTokenizer
temp = tokenizer(
    "Hello, my dog is cute",
    return_tensors="tf",
    padding='max_length',
    max_length=max_length,
)

# Scalar lean value lifted to a 1-element vector (batch size 1).
lean_scalar = tf.convert_to_tensor(0.60, dtype='float32', name='partisan_lean')

# No "labels" entry is added here; only the two feature tensors are passed on.
inputs = {
    'input_ids': temp['input_ids'],
    "partisan_lean": tf.expand_dims(lean_scalar, axis=0),
}
print(inputs.keys())

dict_keys(['input_ids', 'partisan_lean'])


In [170]:
# Display the demo partisan_lean tensor to check its shape and dtype.
inputs['partisan_lean']

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.6], dtype=float32)>

In [171]:
# Two-input classifier: Longformer encoding of the text combined with the
# scalar partisan_lean feature, followed by a small dense head.
ids = tf.keras.Input((max_length,), dtype=tf.int32, name='input_ids')  # explicit 1-tuple shape
pl = tf.keras.Input((1, ), dtype=tf.float32, name='partisan_lean')

# NOTE(review): only input_ids are passed to the encoder (no attention_mask), so
# padded positions are presumably not masked out - confirm this is intended.
x = model.longformer(ids)
x = x['last_hidden_state'][:,0,:]  # hidden state at sequence position 0
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.concat([x, pl], axis=-1)    # append partisan_lean to the text representation
x = tf.keras.layers.Dense(700, activation='relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
dl_model = tf.keras.Model(inputs={"input_ids":ids,"partisan_lean":pl}, outputs=[x])

# summary() prints the table itself and returns None; wrapping it in print()
# only appends a stray "None" line to the output.
dl_model.summary()

# Smoke test: run the demo inputs through the assembled model.
a  = dl_model(inputs)


Model: "model_14"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
longformer (TFLongformerMainLay multiple             148068864   input_ids[0][0]                  
__________________________________________________________________________________________________
tf.__operators__.getitem_11 (Sl (None, 768)          0           longformer[16][0]                
__________________________________________________________________________________________________
dropout_709 (Dropout)           (None, 768)          0           tf.__operators__.getitem_11[0][0]
___________________________________________________________________________________________

In [172]:
# Binary classification setup: sigmoid output trained with binary cross-entropy,
# tracked with binary accuracy, optimized by Adam at a small learning rate.
dl_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

In [None]:
# Train for a single epoch, evaluating on the validation stream at the end of it.
history = dl_model.fit(
    train_data,
    validation_data=val_data,
    epochs=1,
    verbose=1,
)

TensorShape([1, 32, 768])

In [28]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from src.models.deeplegis import *    


Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing DeepLegis: ['lm_head']
- This IS expected if you are initializing DeepLegis from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DeepLegis from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of DeepLegis were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# Load the project's DeepLegis model from the pretrained Longformer checkpoint
# and run the demo inputs through it once.
dl = DeepLegis.from_pretrained("allenai/longformer-base-4096")
outputs = dl(inputs)

# Show which keys went in and which came out.
print(inputs.keys(), outputs.keys(), sep="\n")

Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing DeepLegis: ['lm_head']
- This IS expected if you are initializing DeepLegis from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DeepLegis from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of DeepLegis were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


dict_keys(['input_ids', 'attention_mask', 'labels', 'partisan_lean'])
odict_keys(['loss', 'logits'])
