In [1]:
# Enable IPython's autoreload extension so that edits to the project modules
# imported below are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell is executed.
%autoreload 2

In [64]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sys
import os
from sklearn.preprocessing import LabelEncoder
import datetime
from src.data.read_parallel import read_parallel_local
import matplotlib.pyplot as plt
from src.models.deeplegis import *
from src.models.data_loader import *
from transformers import LongformerTokenizer

In [12]:
# --- Tunable settings --------------------------------------------------------
REDUCE_BY_FACTOR = 1  # Make the dataset smaller for development purposes
RANDOM_SEED = 42      # Seeds the shuffle/subsample below so re-runs are reproducible
train_test_ratio = 0.91
train_valid_ratio = 0.90

# A factor below 1 would request more rows than exist and make `sample` fail
# with an unrelated-looking error, so reject it up front.
if REDUCE_BY_FACTOR < 1:
    raise ValueError("REDUCE_BY_FACTOR must be >= 1")

# Root of the data volume: taken from the environment when set, otherwise the
# manual default below.
DATA_VOL = os.environ.get('DATA_VOL', '/datavol/')

# --- Pre-wrangled metadata ---------------------------------------------------
df = pd.read_csv("../references/derived/ml_data.csv", encoding="latin1", parse_dates=True)
df['id'] = df['id'].astype(int)
print(f"Original number of examples: {len(df)}")

# Shuffle (and optionally shrink) the frame. The index is reset immediately so
# that whatever container the loader below returns (list, array or Series) lines
# up row-for-row with `df` instead of with the pre-shuffle index labels.
df = (
    df.sample(n=int(len(df) / REDUCE_BY_FACTOR), random_state=RANDOM_SEED)
      .reset_index(drop=True)
)
print(f"Reduced number of examples:  {len(df)}")

# --- Document text -----------------------------------------------------------
# Strip any trailing slash from DATA_VOL before appending so the default
# '/datavol/' does not produce '/datavol//clean/'; the trailing slash on the
# directory is kept exactly as the original passed it.
# NOTE(review): assumes read_parallel_local returns one text per id, in the
# same order as the ids it is given -- confirm in src.data.read_parallel.
df['text'] = read_parallel_local(df['id'], DATA_VOL.rstrip("/") + "/clean/")

# --- Label encoding ----------------------------------------------------------
# Map each distinct `sc_id` value to an integer class id; the fitted encoder is
# kept because the config cell reads `sc_id_encoder.classes_`.
sc_id_encoder = LabelEncoder()
df['sc_id_cat'] = sc_id_encoder.fit_transform(df['sc_id'])

Original number of examples: 334
Reduced number of examples:  334
Took 0.006455898284912109 min to open 334 files with 20 processes.


In [76]:
# Settings handed to the model wrapper. The split ratios reference the values
# defined in the data-loading cell rather than repeating the literals, so the
# two places cannot drift apart.
config = {
    # The recorded training batch has shape (4, 128), matching these two values.
    'max_length': 128,
    'train_batch_size': 4,
    'testing': False,
    'train_test_ratio': train_test_ratio,
    'train_valid_ratio': train_valid_ratio,
    # Pretrained Longformer tokenizer (downloaded/cached by `transformers`).
    'tokenizer': LongformerTokenizer.from_pretrained('allenai/longformer-base-4096'),
    # Number of distinct `sc_id` labels seen by the fitted LabelEncoder.
    'n_sc_id_classes': len(sc_id_encoder.classes_),
    # NOTE(review): absolute, machine-specific paths -- consider deriving them
    # from DATA_VOL or another configurable root.
    'checkpoint_path': "/data/models/no_text.ckpt",
    'log_dir': "/data/logs/",
    'epochs': 2,
}

In [83]:
# Instantiate the model wrapper with the settings from the config cell.
# `deepLegisAll` is not defined in this notebook; presumably it arrives through
# one of the wildcard imports from src.models -- TODO confirm which module.
b = deepLegisAll(config)

In [84]:
# Hand the prepared DataFrame (metadata, text and encoded `sc_id_cat`) to the
# wrapper. The recorded output shows it reporting training / validation / test
# sizes, so the splitting appears to happen inside this call.
b.load_data(df)

Training size: (272, 8)
Validation size: (31, 8)
Test size: (31, 8)


In [85]:
# Sanity check: pull a single batch from the training dataset and print it.
# In the recorded output the batch holds an `input_ids` int32 tensor of shape
# (4, 128).
for elem in b.train_batches.take(1):
    print(elem)

({'input_ids': <tf.Tensor: shape=(4, 128), dtype=int32, numpy=
array([[    0,    11,     5,    76,     9,    84, 30722,    80,  7673,
        40126,    41,  1760,    28,    24, 14673,    30,     5, 22437,
            8,   790,     9,  4844,    11,   937,   461, 24228,    35,
        35651,    35,   134, 12418, 10765,   131, 26613,     4, 19338,
          910, 11146, 25758,    35,  2518,     7,  1166,    25,  3905,
           35, 25758,    35,  2518, 26613,     4,    11,    42, 28764,
           35,   939,     4,    22,  4929, 17809,   839,     5, 12418,
        10765,   792,  2885,    11,   910, 11146, 25758,    35,  2518,
           12,   102,     4, 42661,     4,    22, 12623, 23862, 30170,
        23538, 17809,   839,   143,   621,  4009,    11,     5,  8809,
            6, 21529,     6,     8,  5989,     9, 23321,  4550,  2550,
         8462,  1728,    50,  1632,  1123,  1897, 14636,  1887,  4010,
           13,  4860,   304,     6,  1804,     7,  4860,  5418,  3841,
          268,

In [86]:
# Construct the network. The recorded log shows weights being loaded from the
# 'allenai/longformer-base-4096' checkpoint, with the 'lm_head' layer unused and
# a newly initialised 'classifier' layer -- the library's standard warning when
# starting fine-tuning from a pretrained checkpoint.
b.build()

Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing TFLongformerForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFLongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFLongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFLongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [87]:
# Print the layer-by-layer summary (output shapes, parameter counts and
# connections) of the model created by `b.build()`.
b.deep_legis_model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
longformer (TFLongformerMainLay TFLongformerBaseMode 148068864   input_ids[0][0]                  
__________________________________________________________________________________________________
tf.__operators__.getitem_2 (Sli (None, 768)          0           longformer[0][0]                 
__________________________________________________________________________________________________
dropout_308 (Dropout)           (None, 768)          0           tf.__operators__.getitem_2[0][0] 
____________________________________________________________________________________________

In [82]:
# Run training. The recorded run covers 2 epochs and saves a checkpoint to
# /data/models/no_text.ckpt after each one, matching config['epochs'] and
# config['checkpoint_path'].
# NOTE(review): this cell's execution count (82) is lower than the cells above
# it (83-87), so the stored output came from an earlier, out-of-order run --
# re-execute with Restart & Run All before trusting it.
b.train()

Epoch 1/2

Epoch 00001: saving model to /data/models/no_text.ckpt
Epoch 2/2

Epoch 00002: saving model to /data/models/no_text.ckpt


<tensorflow.python.keras.callbacks.History at 0x7f170af0d2b0>