In [1]:
# Load IPython's autoreload extension and enable mode 2 so that edits to the
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [11]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sys
import os
from sklearn.preprocessing import LabelEncoder
import datetime
from src.data.read_parallel import read_parallel_local
import matplotlib.pyplot as plt

In [12]:
REDUCE_BY_FACTOR = 1  # Make the dataset smaller for development purposes (1 = keep every row)
RANDOM_SEED = 42      # Fixed seed so the shuffle/subsample below is reproducible across kernel restarts
train_test_ratio = 0.91
train_valid_ratio = 0.90

# Root of the data volume: taken from the DATA_VOL environment variable when it
# is set, otherwise falls back to the manually chosen default.
DATA_VOL = os.environ.get('DATA_VOL', '/datavol/')

# Pre-wrangled metadata
df = pd.read_csv("../references/derived/ml_data.csv", encoding="latin1", parse_dates=True)
df['id'] = df['id'].astype(int)
print(f"Original number of examples: {len(df)}")

# Shuffle (and optionally shrink) the dataset. The index is reset immediately so
# that everything attached to `df` below lines up with a clean 0..n-1 index.
# NOTE(review): the original assigned `text` while `df` still carried the
# shuffled index; that is only safe if read_parallel_local returns a plain
# list/array rather than an index-aligned Series -- confirm in src.data.read_parallel.
df = (
    df.sample(n=int(len(df) / REDUCE_BY_FACTOR), random_state=RANDOM_SEED)
      .reset_index(drop=True)
)
print(f"Reduced number of examples:  {len(df)}")

# Read the cleaned text for every id from <DATA_VOL>/clean/.
tmp = read_parallel_local(df['id'], DATA_VOL + "/clean/")
df['text'] = tmp

# Encode the raw `sc_id` labels as consecutive integer class ids.
sc_id_encoder = LabelEncoder()
df['sc_id_cat'] = sc_id_encoder.fit_transform(df['sc_id'])

Original number of examples: 334
Reduced number of examples:  334
Took 0.006455898284912109 min to open 334 files with 20 processes.


In [2]:
from src.models.data_loader import *
from transformers import LongformerTokenizer

In [40]:
# Settings passed to the model wrapper, built in one literal so every key is
# visible at a glance. Key order and values match the step-by-step assignment
# this replaces.
config = {
    'max_length': 128,
    'train_batch_size': 4,
    'testing': False,
    'train_test_ratio': 0.91,
    'train_valid_ratio': 0.90,
    'tokenizer': LongformerTokenizer.from_pretrained('allenai/longformer-base-4096'),
    # One class per distinct label seen by the encoder fitted on `sc_id`.
    'n_sc_id_classes': len(sc_id_encoder.classes_),
    # Placeholders -- real paths still to be filled in.
    'checkpoint_path': "todo",
    'log_dir': "todo",
}

#a = legislationDatasetPartisanLean(config)

In [41]:
import tensorflow as tf

In [42]:
from src.models.deeplegis import *

In [43]:
# Instantiate the deepLegisAll wrapper with the `config` dict
# (the class presumably comes from the `src.models.deeplegis` star import -- verify).
b = deepLegisAll(config)

In [44]:
# Hand the metadata + text DataFrame to the wrapper. The recorded output shows
# this call reporting the train / validation / test split sizes.
b.load_data(df)

Training size: (272, 8)
Validation size: (31, 8)
Test size: (31, 8)


In [46]:
# Sanity check: take a single batch from the training batches and print it.
# The recorded output shows an `input_ids` int32 tensor of shape (4, 128).
for elem in b.train_batches.take(1):
    print(elem)

({'input_ids': <tf.Tensor: shape=(4, 128), dtype=int32, numpy=
array([[    0,    11,     5,    76,     9,    84, 30722,    80,  7673,
        40126,    41,  1760,    28,    24, 14673,    30,     5, 22437,
            8,   790,     9,  4844,    11,   937,   461, 24228,    35,
          564,    35,   134, 14423,     8,  2270,  8555,  1676,   131,
         6332,     4,   910, 11146, 18461,    12,   119,    35,   246,
            6,   939,    16, 29643,     8,   769,   225, 29179,     7,
         1166,    25,  3905,    35,   411,   453,  3873,    30,     5,
         2318,     6,     9,  2661,  5658,    28,  4844,     9,   284,
         5799,  5228,    31, 34383, 11693,   911,     6,    65,     9,
         2661,  5658,    28,    10,  1131,  3696,     6,     8,    65,
            9,  2661,  5658,    28,    10,  4095,    50,  2267,     4,
           65,   919,     9,     5,   790,     9,  4844,     6,  3873,
           30,     5,  5385,     9,     5,   790,     9,  4844,     4,
           65,

In [34]:
# Build the model. The following cell reads `b.deep_legis_model`, so this call
# presumably creates that attribute -- TODO confirm in src.models.deeplegis.
b.build()

In [35]:
# Print the layer-by-layer summary (layer type, output shape, parameter count,
# connections) of the model held on the wrapper.
b.deep_legis_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
longformer (TFLongformerMainLay TFLongformerBaseMode 148068864   input_ids[0][0]                  
__________________________________________________________________________________________________
tf.__operators__.getitem_1 (Sli (None, 768)          0           longformer[3][0]                 
__________________________________________________________________________________________________
dropout_202 (Dropout)           (None, 768)          0           tf.__operators__.getitem_1[0][0] 
____________________________________________________________________________________________