In [3]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import tensorflow as tf
import sys
import os
from sklearn.preprocessing import LabelEncoder
import datetime
import matplotlib.pyplot as plt

from transformers import LongformerTokenizer

# Put the parent of the working directory on the import path so the
# `src.*` imports below resolve; skipped if the entry is already present,
# which keeps re-running this cell from adding duplicates.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from src.data.read_parallel import read_parallel_local
from src.models.deeplegis import *
from src.models.data_loader import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# --- Run configuration -----------------------------------------------------
REDUCE_BY_FACTOR = 1  # keep int(len(df) / REDUCE_BY_FACTOR) rows; >1 shrinks the dataset for development
RANDOM_SEED = 42      # seeds the shuffle/subsample below so re-runs see the same rows in the same order
train_test_ratio = 0.91
train_valid_ratio = 0.90

# Root of the data volume: taken from the DATA_VOL environment variable,
# falling back to a manually set developer-local path when it is absent.
DATA_VOL = os.environ.get('DATA_VOL', '/home/luke/tmp_vol')

# --- Pre-wrangled metadata -------------------------------------------------
df = pd.read_csv("../references/derived/ml_data.csv", encoding="latin1", parse_dates=True)
df['id'] = df['id'].astype(int)
print(f"Original number of examples: {len(df)}")

# Shuffle (and optionally subsample) the rows. The index is reset *before*
# the text column is attached so that the assignment below cannot be
# mis-aligned by the shuffled index, whatever container
# read_parallel_local returns (list or index-carrying Series).
df = (
    df.sample(n=int(len(df) / REDUCE_BY_FACTOR), random_state=RANDOM_SEED)
      .reset_index(drop=True)
)
print(f"Reduced number of examples:  {len(df)}")

# --- Bill text -------------------------------------------------------------
# The trailing "" keeps the trailing separator that the previous
# DATA_VOL + "/clean/" form passed to read_parallel_local.
clean_dir = os.path.join(DATA_VOL, "clean", "")
df['text'] = read_parallel_local(df['id'], clean_dir)

# --- Categorical target encoding -------------------------------------------
# Integer-encode sc_id; the fitted encoder is kept so the number of classes
# (and the inverse mapping) is available to later cells.
sc_id_encoder = LabelEncoder()
df['sc_id_cat'] = sc_id_encoder.fit_transform(df['sc_id'])

Original number of examples: 334
Reduced number of examples:  334
Took 0.00030483404795328775 min to open 334 files with 20 processes.


In [5]:
# Peek at the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,id,version_number,bill_id,signed,partisan_lean,sc_id,text,sc_id_cat
0,2122561,1,1094864,1,0.608263,582-2,in the year of our lord two thousand nineteen...,14
1,2123646,1,1095397,0,0.195652,566-1,introduced version senate bill no. digest of ...,8
2,2124321,2,1095760,0,0.738123,592-1,s t a t e o f n e w y o r k regular sessions ...,15
3,2123441,1,1095276,0,0.608263,582-2,in the year of our lord two thousand nineteen...,14
4,2122605,1,1094881,1,0.608263,582-2,in the year of our lord two thousand nineteen...,14


In [6]:
# Show the working directory: the relative paths used in this notebook
# ("..", "../references/...") are resolved against it. Pure Python instead
# of the `!pwd` shell escape so the cell also works where no POSIX shell
# is available.
print(os.getcwd())

/home/luke/repos/govhawk_ml/notebooks


In [8]:
# Timestamp that makes the log directory unique per run.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Settings handed to deepLegisAll. All filesystem locations are built with
# os.path.join: plain string concatenation dropped the separator after
# DATA_VOL (e.g. "/home/luke/tmp_volmodels/...") whenever DATA_VOL has no
# trailing slash, which is the case for the fallback value.
config = {
    'max_length': 128,
    'train_batch_size': 1,
    'testing': False,
    # Reuse the split ratios declared with the data-loading settings
    # instead of repeating the literals here.
    'train_test_ratio': train_test_ratio,
    'train_valid_ratio': train_valid_ratio,
    'tokenizer': LongformerTokenizer.from_pretrained('allenai/longformer-base-4096'),
    'n_sc_id_classes': len(sc_id_encoder.classes_),
    'checkpoint_path': os.path.join(DATA_VOL, "models", "all_data2", "all_data.ckpt"),
    'log_dir': os.path.join(DATA_VOL, "logs", "fit", "all_data2_" + run_stamp),
    'epochs': 10,
    'learning_rate': 1e-4,
    # NOTE(review): "full_modol.h5" looks like a typo for "full_model.h5";
    # kept as-is because it may match the file name actually on disk — confirm.
    'no_text_dense_layer_initialization_path': os.path.join(
        DATA_VOL, "models", "no_text2", "full_modol.h5"
    ),
}

# NOTE(review): the recorded run of this cell stopped with a
# ResourceExhaustedError (GPU OOM) while allocating a [50265, 768] float
# tensor. Make sure GPU memory is free (e.g. restart the kernel) before
# re-running.
dl_all = deepLegisAll(config)
dl_all.load_data(df)
dl_all.build()
dl_all.deep_legis_model.summary()
dl_all.train()

Training size: (272, 8)
Validation size: (31, 8)
Test size: (31, 8)


ResourceExhaustedError: OOM when allocating tensor with shape[50265,768] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:TruncatedNormal]

In [None]:
# Save the full trained model. The path is built with os.path.join because
# DATA_VOL + "models/..." lost the separator when DATA_VOL has no trailing
# slash.
model_location = os.path.join(DATA_VOL, "models", "all_data2", "full_model.h5")
# Make sure the target directory exists before writing the file.
os.makedirs(os.path.dirname(model_location), exist_ok=True)
dl_all.deep_legis_model.save(model_location)