In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sys
import os
from sklearn.preprocessing import LabelEncoder
import datetime
import matplotlib.pyplot as plt

from transformers import LongformerTokenizer

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.data.read_parallel import read_parallel_local
from src.models.deeplegis import *
from src.models.data_loader import *

In [3]:
REDUCE_BY_FACTOR = 1 # Make the dataset smaller for development purposes
train_test_ratio = 0.91
train_valid_ratio = 0.90

if 'DATA_VOL' not in os.environ:
    # Manually set:
    DATA_VOL = '/home/luke/tmp_vol'
else:
    DATA_VOL = os.environ['DATA_VOL']
    
# Pre-wrangled metadata
df = pd.read_csv("../references/derived/ml_data.csv", encoding="latin1", parse_dates=True)
df.id = df.id.astype(int)    
print(f"Original number of examples: {len(df)}")
df = df.sample(n=int(len(df)/REDUCE_BY_FACTOR)) #
print(f"Reduced number of examples:  {len(df)}")

tmp = read_parallel_local(df['id'], DATA_VOL + "/clean/")
df['text'] = tmp

df = df.reset_index(drop=True)
sc_id_encoder = LabelEncoder()
df['sc_id_cat'] = sc_id_encoder.fit_transform(df['sc_id'])

Original number of examples: 199646
Reduced number of examples:  199646
Took 2.1606207966804503 min to open 199646 files with 20 processes.


In [4]:
config = {}
config['model_name'] = 'no_text'
config['build_from_scratch'] = True
config['max_length'] = 128
config['train_batch_size'] = 128
config['testing'] = False
config['train_test_ratio'] = 0.91
config['train_valid_ratio'] = 0.90 
config['tokenizer'] = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
config['n_sc_id_classes'] = len(sc_id_encoder.classes_)
config['checkpoint_path'] = DATA_VOL + "models/" + config['model_name'] +"/" + config['model_name'] +".ckpt"
config['log_dir'] = DATA_VOL + "logs/fit/"+config['model_name']+"_"+datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
config['epochs'] = 20
config['learning_rate'] = 1e-3
config['model_location'] = model_location = DATA_VOL + "models/"+config['model_name']+"/full_model.h5"

b = deepLegisNoText(config)
b.load_data(df)         
b.build()
b.deep_legis_model.summary()
b.train()
#a = legislationDatasetPartisanLean(config)

Training size: (163509, 8)
Validation size: (18168, 8)
Test size: (17969, 8)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
partisan_lean (InputLayer)      [(None, 1)]          0                                            
__________________________________________________________________________________________________
version_number (InputLayer)     [(None, 1)]          0                                            
__________________________________________________________________________________________________
sc_id (InputLayer)              [(None, 134)]        0                                            
__________________________________________________________________________________________________
tf.concat (TFOpLambda)          (None, 136)          0           partisan_lean[0][0]              
                 


Epoch 00018: saving model to /home/luke/tmp_volmodels/no_text/no_text.ckpt
Epoch 19/20

Epoch 00019: saving model to /home/luke/tmp_volmodels/no_text/no_text.ckpt
Epoch 20/20

Epoch 00020: saving model to /home/luke/tmp_volmodels/no_text/no_text.ckpt


<tensorflow.python.keras.callbacks.History at 0x7f9f38884460>

In [5]:
# no-text, batch=4, time = 390s, or 9ms/step
# no-text, batch=64 time = 277s, or 107ms/step


In [8]:
b.deep_legis_model.save(config['model_location'])