In [3]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sys
import os
from sklearn.preprocessing import LabelEncoder
import datetime
import matplotlib.pyplot as plt

from transformers import LongformerTokenizer

# Put the notebook's parent directory on the import path (once) so that the
# `src.*` imports below can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from src.data.read_parallel import read_parallel_local
from src.models.deeplegis import *
from src.models.data_loader import *

In [6]:
# --- Load the bill metadata, attach the cleaned text, and encode `sc_id` ---

REDUCE_BY_FACTOR = 1  # Make the dataset smaller for development purposes
RANDOM_SEED = 42      # Pins the shuffle/subsample below so reruns see the same rows in the same order
train_test_ratio = 0.91
train_valid_ratio = 0.90

# Root of the data volume. Falls back to a developer-specific default when the
# DATA_VOL environment variable is not set; export DATA_VOL to override it.
DATA_VOL = os.environ.get('DATA_VOL', '/home/luke/tmp_vol')

# Pre-wrangled metadata
df = pd.read_csv("../references/derived/ml_data.csv", encoding="latin1", parse_dates=True)
df.id = df.id.astype(int)
print(f"Original number of examples: {len(df)}")

# Shuffle (and, when REDUCE_BY_FACTOR > 1, subsample) the rows. The index is
# reset immediately so the frame has a clean 0..n-1 index *before* the texts
# are attached; otherwise an index-carrying return value from the reader could
# be aligned against the shuffled index and end up on the wrong rows.
df = (
    df.sample(n=int(len(df) / REDUCE_BY_FACTOR), random_state=RANDOM_SEED)
      .reset_index(drop=True)
)
print(f"Reduced number of examples:  {len(df)}")

# Read the cleaned documents for the selected ids from <DATA_VOL>/clean/.
# NOTE(review): assumes read_parallel_local returns one text per id, in the
# order of the ids passed in -- confirm against src.data.read_parallel.
# The trailing "" keeps the trailing path separator the original string had.
df['text'] = read_parallel_local(df['id'], os.path.join(DATA_VOL, "clean", ""))

# Integer-encode the `sc_id` column; the fitted encoder is kept so the number
# of classes can be read from it later.
sc_id_encoder = LabelEncoder()
df['sc_id_cat'] = sc_id_encoder.fit_transform(df['sc_id'])

Original number of examples: 334
Reduced number of examples:  334
Took 0.020017449061075845 min to open 334 files with 20 processes.


In [None]:
# --- Configure, build and train the no-text model ---

# Settings handed to deepLegisNoText. Paths are built with os.path.join so they
# are correct whether or not DATA_VOL ends with a path separator (plain string
# concatenation produced e.g. "/home/luke/tmp_volmodels/..." for the default).
config = {
    'max_length': 128,
    'train_batch_size': 128,
    'testing': False,
    # Reuse the split ratios defined in the data-loading cell rather than
    # repeating the literals, so the two cannot drift apart.
    'train_test_ratio': train_test_ratio,
    'train_valid_ratio': train_valid_ratio,
    'tokenizer': LongformerTokenizer.from_pretrained('allenai/longformer-base-4096'),
    # One class per distinct sc_id seen by the encoder.
    'n_sc_id_classes': len(sc_id_encoder.classes_),
    'checkpoint_path': os.path.join(DATA_VOL, "models", "no_text", "no_text2.ckpt"),
    # Timestamped log directory so every run gets its own folder.
    'log_dir': os.path.join(
        DATA_VOL, "logs", "fit",
        "no_text2_" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
    ),
    'epochs': 20,
    'learning_rate': 1e-3,
}

b = deepLegisNoText(config)
b.load_data(df)  # prints the train/validation/test sizes
b.build()
b.deep_legis_model.summary()
b.train()

Training size: (272, 8)
Validation size: (31, 8)
Test size: (31, 8)
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
partisan_lean (InputLayer)      [(None, 1)]          0                                            
__________________________________________________________________________________________________
version_number (InputLayer)     [(None, 1)]          0                                            
__________________________________________________________________________________________________
sc_id (InputLayer)              [(None, 16)]         0                                            
__________________________________________________________________________________________________
tf.concat_3 (TFOpLambda)        (None, 18)           0           partisan_lean[0][0]              
                        

In [None]:
# Timing notes for the no-text model:
#   batch=4:  time = 390 s, or 9 ms/step
#   batch=64: time = 277 s, or 107 ms/step


In [8]:
# Persist the trained Keras model as a single HDF5 file under DATA_VOL.
# os.path.join supplies the separator that plain concatenation omitted when
# DATA_VOL has no trailing slash.
model_location = os.path.join(DATA_VOL, "models", "no_text2", "full_model.h5")
# Make sure the target directory exists so the save does not fail on a fresh volume.
os.makedirs(os.path.dirname(model_location), exist_ok=True)
b.deep_legis_model.save(model_location)