In [None]:
# Reload local modules if they've changed.
# Mode 2 of IPython's autoreload extension re-imports modules before each cell
# runs, so edits to the local `src` package are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [53]:
import numpy as np
import pandas as pd

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import sys
import os
from sklearn.preprocessing import LabelEncoder

# Put the parent of the current working directory on the import path (once)
# so that the `src` package imported below can be resolved.
module_path = os.path.abspath('..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)
from src.data.read_parallel import read_parallel_local

In [8]:
from src.models.deeplegis import legislationDataset

In [54]:
REDUCE_BY_FACTOR = 1000 # Make the dataset smaller for development purposes
RANDOM_SEED = 42        # Fixed seed so the development subsample is identical on every run
train_test_ratio = 0.91
train_valid_ratio = 0.90

# Root of the data volume: read from the DATA_VOL environment variable when it
# is set, otherwise fall back to the manually chosen local path.
DATA_VOL = os.environ.get('DATA_VOL', '/home/luke/tmp_vol/')

# Pre-wrangled metadata
df = pd.read_csv("../references/derived/ml_data.csv", encoding="latin1", parse_dates=True)
df['id'] = df['id'].astype(int)
print(f"Original number of examples: {len(df)}")

# Keep 1/REDUCE_BY_FACTOR of the rows. Seeding the sampler makes the subset
# reproducible; resetting the index right away means the frame has a clean
# 0..n-1 index before the text column is attached, so the assignment below
# lines up row-for-row whether the helper returns a list or a Series.
df = (
    df.sample(n=len(df) // REDUCE_BY_FACTOR, random_state=RANDOM_SEED)
      .reset_index(drop=True)
)
print(f"Reduced number of examples:  {len(df)}")

# os.path.join(..., "") keeps the trailing separator the original path had,
# without producing a doubled "//" when DATA_VOL already ends in a slash.
clean_text_dir = os.path.join(DATA_VOL, "clean", "")
df['text'] = read_parallel_local(df['id'], clean_text_dir)

# Integer-encode the `sc_id` strings; the fitted encoder is kept so later cells
# can read the number of classes from `sc_id_encoder.classes_`.
sc_id_encoder = LabelEncoder()
df['sc_id_cat'] = sc_id_encoder.fit_transform(df['sc_id'])

Original number of examples: 199451
Reduced number of examples:  199
Took 0.01049038569132487 min to open 199 files with 20 processes.


In [55]:
# Preview the first rows of the sampled frame, including the `text` and
# `sc_id_cat` columns added in the loading cell.
df.head()

Unnamed: 0,bill_id,version_number,id,partisan_lean,sc_id,signed,text,sc_id_cat
0,1296772,1,2533454,0.691329,641-2,0,"assembly, no. state of new jersey 219th legisl...",68
1,1244982,1,2434261,0.398339,622-1,0,florida senate sb by senator baxley 2020308 a ...,52
2,1204463,2,2320565,0.538639,587-2,0,i 116th congress 1st session to require a re...,25
3,1274032,1,2490834,0.691329,641-2,0,"assembly, no. state of new jersey 219th legisl...",68
4,1151119,1,2199556,0.416977,609-1,0,enter search terms introduced version | se...,44


In [39]:
# Settings handed to the dataset builder. The split ratios repeat the values
# used in the data-loading cell.
config = {
    'max_length': 128,
    'train_batch_size': 32,
    'testing': False,
    'train_test_ratio': 0.91,
    'train_valid_ratio': 0.90,
}
print(config)

# Build the train / validation / test streams from the sampled frame.
legis_builder = legislationDataset(config)
train_stream, val_stream, test_stream = legis_builder.create_batch_stream(df)

# Print a single training batch to inspect its structure.
for elem in train_stream.take(1):
    print(elem)

{'max_length': 128, 'train_batch_size': 32, 'testing': False, 'train_test_ratio': 0.91, 'train_valid_ratio': 0.9}
Training size: (162, 7)
Validation size: (19, 7)
Test size: (18, 7)
({'input_ids': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[    0,    30,    35, ..., 47252,   740,     2],
       [    0,  9251,     9, ...,     6,    11,     2],
       [    0, 24079,    18, ..., 44347, 28791,     2],
       ...,
       [    0, 44979,     6, ...,    13,  1680,     2],
       [    0,  3138,  1087, ...,    10,  4874,     2],
       [    0,   200,  1675, ...,  2052,     7,     2]], dtype=int32)>, 'partisan_lean': <tf.Tensor: shape=(32,), dtype=float32, numpy=
array([0.39683497, 0.20165333, 0.42942113, 0.6913286 , 0.32524636,
       0.36973906, 0.41637418, 0.2714404 , 0.29621407, 0.38731214,
       0.7381234 , 0.32524636, 0.38731214, 0.3826962 , 0.59868157,
       0.65328044, 0.32524636, 0.6       , 0.5386392 , 0.39683497,
       0.7381234 , 0.62637186, 0.5386392 , 0.6289722 , 0.6

In [43]:
# Same pipeline as the previous cell but with the text-only dataset builder;
# reuses the `config` dict defined in that cell.
from src.models.deeplegis import legislationDatasetText
legis_builder = legislationDatasetText(config)
train_stream, val_stream, test_stream = legis_builder.create_batch_stream(df)
# Print a single training batch to inspect its structure.
for elem in train_stream.take(1):
    print(elem)

Training size: (162, 7)
Validation size: (19, 7)
Test size: (18, 7)
({'input_ids': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[    0, 13011,  1594, ...,  4361,     8,     2],
       [    0, 29886,  2546, ...,    13,     8,     2],
       [    0,   134, 42957, ...,     6,    37,     2],
       ...,
       [    0,  2810,    10, ..., 14542,    16,     2],
       [    0,    11,     5, ...,  1683,    23,     2],
       [    0,    92,  1423, ..., 10435,  1623,     2]], dtype=int32)>}, <tf.Tensor: shape=(32,), dtype=int32, numpy=
array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)>)


In [61]:
# Number of distinct `sc_id` labels seen by the encoder (used below as
# config['n_sc_id_classes']).
len(sc_id_encoder.classes_)

70

In [65]:
from src.models.deeplegis import legislationDatasetAll

# Settings for the builder; `n_sc_id_classes` is taken from the label encoder
# fitted in the data-loading cell.
config = {
    'max_length': 128,
    'train_batch_size': 32,
    'testing': False,
    'train_test_ratio': 0.91,
    'train_valid_ratio': 0.90,
    'n_sc_id_classes': len(sc_id_encoder.classes_),
}
print(config)

# Build the train / validation / test streams from the sampled frame.
legis_builder = legislationDatasetAll(config)
train_stream, val_stream, test_stream = legis_builder.create_batch_stream(df)

# Print a single training batch to inspect its structure.
for elem in train_stream.take(1):
    print(elem)

{'max_length': 128, 'train_batch_size': 32, 'testing': False, 'train_test_ratio': 0.91, 'train_valid_ratio': 0.9, 'n_sc_id_classes': 70}
Training size: (162, 8)
Validation size: (19, 8)
Test size: (18, 8)
({'input_ids': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[    0, 28696, 24916, ...,    42,  2810,     2],
       [    0,  2362,   455, ...,     1,     1,     1],
       [    0, 44979,     6, ...,  3072, 33799,     2],
       ...,
       [    0,  1368,   438, ...,   401, 33279,     2],
       [    0, 45927,  1270, ...,    76, 25029,     2],
       [    0,  2942,  1732, ...,  2945,  1218,     2]], dtype=int32)>, 'partisan_lean': <tf.Tensor: shape=(32,), dtype=float32, numpy=
array([0.24377888, 0.7381234 , 0.63756734, 0.7085218 , 0.525     ,
       0.6913286 , 0.6913286 , 0.42966998, 0.62551236, 0.7085218 ,
       0.8953301 , 0.62637186, 0.24377888, 0.7381234 , 0.32524636,
       0.51692516, 0.2651231 , 0.7542074 , 0.65328044, 0.21333334,
       0.68410516, 0.42966998, 0.51

In [68]:
from src.models.deeplegis import legislationDatasetRevCat

# Settings for the builder; `n_sc_id_classes` is taken from the label encoder
# fitted in the data-loading cell.
config = dict(
    max_length=128,
    train_batch_size=32,
    testing=False,
    train_test_ratio=0.91,
    train_valid_ratio=0.90,
    n_sc_id_classes=len(sc_id_encoder.classes_),
)
print(config)

# Build the train / validation / test streams from the sampled frame.
legis_builder = legislationDatasetRevCat(config)
train_stream, val_stream, test_stream = legis_builder.create_batch_stream(df)

# Print a single training batch to inspect its structure.
for elem in train_stream.take(1):
    print(elem)

{'max_length': 128, 'train_batch_size': 32, 'testing': False, 'train_test_ratio': 0.91, 'train_valid_ratio': 0.9, 'n_sc_id_classes': 70}
Training size: (162, 8)
Validation size: (19, 8)
Test size: (18, 8)
({'input_ids': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[    0,  1437,  1437, ..., 38187,   102,     2],
       [    0,  5615,   937, ...,  4484,     9,     2],
       [    0,   627, 22437, ...,   899,     7,     2],
       ...,
       [    0,  7305,   877, ...,  4458,     6,     2],
       [    0,   194,     9, ...,     6,   109,     2],
       [    0,  2914,  1707, ...,    12, 10224,     2]], dtype=int32)>, 'version_number': <tf.Tensor: shape=(32,), dtype=float32, numpy=
array([1., 2., 1., 2., 2., 1., 1., 2., 1., 1., 5., 1., 2., 1., 3., 2., 2.,
       1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 4., 1., 2., 1.],
      dtype=float32)>, 'sc_ids': <tf.Tensor: shape=(32, 1, 70), dtype=int32, numpy=
array([[[0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0]],

     

In [72]:
import tensorflow as tf
from transformers import TFLongformerModel, TFLongformerForSequenceClassification

In [78]:
# Instantiate the `deep_legis_pl` model from the current `config` and print its
# layer summary.
from src.models.deeplegis import deep_legis_pl 
dl_pl = deep_legis_pl(config)
dl_pl.summary()

Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing TFLongformerForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFLongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFLongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFLongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
longformer (TFLongformerMainLay TFLongformerBaseMode 148068864   input_ids[0][0]                  
__________________________________________________________________________________________________
tf.__operators__.getitem_3 (Sli (None, 768)          0           longformer[0][0]                 
__________________________________________________________________________________________________
dropout_205 (Dropout)           (None, 768)          0           tf.__operators__.getitem_3[0][0] 
____________________________________________________________________________________________

In [81]:
# Instantiate the `deep_legis_all` model from the current `config` and print its
# layer summary.
from src.models.deeplegis import deep_legis_all
dl_all = deep_legis_all(config)
dl_all.summary()

Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing TFLongformerForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFLongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFLongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFLongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
longformer (TFLongformerMainLay TFLongformerBaseMode 148068864   input_ids[0][0]                  
__________________________________________________________________________________________________
tf.__operators__.getitem_6 (Sli (None, 768)          0           longformer[0][0]                 
__________________________________________________________________________________________________
sc_id (InputLayer)              [(None, 70)]         0                                            
____________________________________________________________________________________________

In [83]:
# Instantiate the `deep_legis_text` model from the current `config` and print
# its layer summary.
from src.models.deeplegis import deep_legis_text
dl_text = deep_legis_text(config)
dl_text.summary()

Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing TFLongformerForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFLongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFLongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFLongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 128)]             0         
_________________________________________________________________
longformer (TFLongformerMain TFLongformerBaseModelOutp 148068864 
_________________________________________________________________
tf.__operators__.getitem_7 ( (None, 768)               0         
_________________________________________________________________
dropout_413 (Dropout)        (None, 768)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 700)               538300    
_________________________________________________________________
dropout_414 (Dropout)        (None, 700)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 701 

In [169]:
# Docs for tokenizer call:
# https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.PreTrainedTokenizer
from transformers import AutoTokenizer

# `tokenizer` and `max_length` were not defined by any earlier cell, so this
# cell failed on a fresh kernel. Define both here.
# NOTE(review): the checkpoint is assumed to be the same one the models in this
# notebook load ("allenai/longformer-base-4096") — confirm.
max_length = config['max_length']
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")

# Tokenize one example sentence, padded out to `max_length` tokens.
temp = tokenizer("Hello, my dog is cute", return_tensors="tf", padding='max_length', max_length=max_length)

# Assemble the model input dict: token ids plus a single partisan-lean value.
inputs = {}
inputs['input_ids'] = temp['input_ids']
#inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
# expand_dims turns the scalar into a tensor of shape (1,), i.e. a batch of one.
inputs["partisan_lean"] = tf.expand_dims(
    tf.convert_to_tensor(0.60, dtype='float32', name='partisan_lean'), axis=0
)
print(inputs.keys())

dict_keys(['input_ids', 'partisan_lean'])


In [170]:
# Display the partisan-lean tensor built in the previous cell.
inputs['partisan_lean']

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.6], dtype=float32)>

In [171]:
# Sequence length comes from the shared config rather than a variable left
# over from an earlier kernel session.
max_length = config['max_length']

# Two named inputs: the token ids and one partisan-lean scalar per example.
# Shapes are written as real tuples; `(max_length)` is just an int.
ids = tf.keras.Input((max_length,), dtype=tf.int32, name='input_ids')
pl = tf.keras.Input((1,), dtype=tf.float32, name='partisan_lean')

# NOTE(review): `model` is not defined in the visible part of this notebook;
# it must be an object exposing a `.longformer` layer — confirm where it comes from.
x = model.longformer(ids)
# Keep only the hidden state at sequence position 0 for each example.
x = x['last_hidden_state'][:,0,:]
x = tf.keras.layers.Dropout(0.2)(x)
# Append the partisan-lean value to the text representation.
x = tf.concat([x, pl], axis=-1)
x = tf.keras.layers.Dense(700, activation='relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
dl_model = tf.keras.Model(inputs={"input_ids":ids,"partisan_lean":pl}, outputs=[x])

# summary() prints the table itself and returns None, so it is not wrapped in print().
dl_model.summary()

# Smoke test: run the single-example `inputs` dict through the model.
a = dl_model(inputs)


Model: "model_14"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
longformer (TFLongformerMainLay multiple             148068864   input_ids[0][0]                  
__________________________________________________________________________________________________
tf.__operators__.getitem_11 (Sl (None, 768)          0           longformer[16][0]                
__________________________________________________________________________________________________
dropout_709 (Dropout)           (None, 768)          0           tf.__operators__.getitem_11[0][0]
___________________________________________________________________________________________

In [172]:
# Binary classification setup: Adam with a small learning rate, binary
# cross-entropy loss, and binary accuracy as the reported metric.
adam = tf.keras.optimizers.Adam(learning_rate=1e-5)
bce_loss = tf.keras.losses.BinaryCrossentropy()
accuracy_metric = tf.keras.metrics.BinaryAccuracy()

dl_model.compile(optimizer=adam, loss=bce_loss, metrics=[accuracy_metric])

In [None]:
# Train the compiled model for one epoch, evaluating on the validation data.
# NOTE(review): `train_data` and `val_data` are not defined anywhere in the
# visible notebook (earlier cells create `train_stream` / `val_stream`) —
# confirm where they come from before running this cell.
history = dl_model.fit(train_data,
                      validation_data=val_data,
                      epochs=1,
                      verbose=1)

TensorShape([1, 32, 768])

In [28]:
# Repeat of the path setup from the top of the notebook: put the parent
# directory on sys.path so the `src` package can be imported.
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
# NOTE(review): the star import hides which names later cells rely on
# (`DeepLegis` is used in the next cell) — consider importing them explicitly.
from src.models.deeplegis import *    


Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing DeepLegis: ['lm_head']
- This IS expected if you are initializing DeepLegis from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DeepLegis from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of DeepLegis were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# Load DeepLegis from the Longformer base checkpoint and run one forward pass.
# NOTE(review): the recorded output lists 'attention_mask' and 'labels' among
# the `inputs` keys, which the `inputs` dict built earlier in this notebook does
# not contain — this cell appears to depend on an `inputs` from another session.
dl = DeepLegis.from_pretrained("allenai/longformer-base-4096")
outputs = dl(inputs)
# Show which tensors went in and which came back.
print(inputs.keys())
print(outputs.keys())

Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing DeepLegis: ['lm_head']
- This IS expected if you are initializing DeepLegis from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DeepLegis from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of DeepLegis were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


dict_keys(['input_ids', 'attention_mask', 'labels', 'partisan_lean'])
odict_keys(['loss', 'logits'])
