In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import tensorflow as tf
import sys
import os
from sklearn.preprocessing import LabelEncoder
import datetime
import matplotlib.pyplot as plt

from transformers import DistilBertTokenizer

# Put the parent of the notebook's working directory on sys.path (once) so the
# project-local `src` package imported below can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from src.data.read_parallel import read_parallel_local
from src.models.deeplegis import *
from src.models.data_loader import *

In [2]:
# --- Run-level settings ------------------------------------------------------
REDUCE_BY_FACTOR = 1 # Make the dataset smaller for development purposes
RANDOM_SEED = 42     # Seeds the development subsample so reduced runs are repeatable
train_test_ratio = 0.91
train_valid_ratio = 0.90

# Root of the data volume: taken from the DATA_VOL environment variable when it
# is set (and non-empty), otherwise the manually chosen local default.
# Joining with '' guarantees exactly one trailing separator, so later cells that
# build paths as `DATA_VOL + "models/..."` work whether or not the environment
# value was written with a trailing slash.
DATA_VOL = os.path.join(os.environ.get('DATA_VOL') or '/home/luke/tmp_vol/', '')

# --- Pre-wrangled metadata ---------------------------------------------------
# NOTE(review): per the pandas docs `parse_dates=True` only attempts to parse the
# index; if date columns were meant to be parsed, pass their names instead.
df = pd.read_csv("../references/derived/ml_data.csv", encoding="latin1", parse_dates=True)
df['id'] = df['id'].astype(int)

# Integer-encode `sc_id` on the full table (before any subsampling) so the class
# set does not depend on REDUCE_BY_FACTOR. The fitted encoder is kept because a
# later cell reads `sc_id_encoder.classes_`.
sc_id_encoder = LabelEncoder()
df['sc_id_cat'] = sc_id_encoder.fit_transform(df['sc_id'])

# --- Optional down-sampling for development -----------------------------------
print(f"Original number of examples: {len(df)}")
if REDUCE_BY_FACTOR != 1:
    df = df.sample(n=int(len(df) / REDUCE_BY_FACTOR), random_state=RANDOM_SEED)
print(f"Reduced number of examples:  {len(df)}")

# --- Attach the document text --------------------------------------------------
# `read_parallel_local` is a project helper; presumably it reads one cleaned text
# file per id from the given directory -- TODO confirm. The trailing separator on
# the directory is preserved from the original call in case the helper builds
# file names by plain concatenation.
df['text'] = read_parallel_local(df['id'], os.path.join(DATA_VOL, "clean", ""))

# Drop rows whose text is missing and renumber the rows 0..n-1.
df = df.dropna(subset=['text']).reset_index(drop=True)



Original number of examples: 199646
Reduced number of examples:  199646
Took 0.13571716944376627 min to open 199646 files with 20 processes.


In [3]:
# Sanity check: number of rows whose text is still missing (expected to be 0).
df['text'].isna().sum()

0

In [7]:
# Configure, build and train the DistilBERT model.
# `deepLegisDistillBertAll` comes from the `src.models.deeplegis` star-import in
# the first cell, so it is not re-imported here.
model_name = 'distil_bert_512'

# All artefacts of this model live under <DATA_VOL>/models/<model_name>/.
# os.path.join is used so the paths are correct whether or not DATA_VOL ends
# with a path separator.
model_dir = os.path.join(DATA_VOL, "models", model_name)

# Timestamp suffix gives every run its own log directory.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Kept as a notebook-level name as well: the save cell below passes it to `.save()`.
model_location = os.path.join(model_dir, "full_model.h5")

config = {
    'build_from_scratch': True,
    'model_name': model_name,
    'max_length': 512,
    'train_batch_size': 8,
    'testing': False,
    # Reuse the split ratios defined in the setup cell instead of repeating the literals.
    'train_test_ratio': train_test_ratio,
    'train_valid_ratio': train_valid_ratio,
    'tokenizer': DistilBertTokenizer.from_pretrained('distilbert-base-uncased'),
    'n_sc_id_classes': len(sc_id_encoder.classes_),
    'checkpoint_path': os.path.join(model_dir, model_name + ".ckpt"),
    'log_dir': os.path.join(DATA_VOL, "logs", "fit", model_name + "_" + run_stamp),
    'epochs': 10,
    'learning_rate': 1e-4,
    # NOTE(review): this file name was previously spelled "full_modol.h5"; treated
    # as a typo for "full_model.h5" -- confirm no file with the old name is expected.
    'no_text_dense_layer_initialization_path': os.path.join(model_dir, "full_model.h5"),
    'model_location': model_location,
}

dl_all = deepLegisDistillBertAll(config)
dl_all.load_data(df)               # reports the train / validation / test sizes
dl_all.build()
dl_all.deep_legis_model.summary()
dl_all.train()                     # long-running; writes a checkpoint after every epoch

Training size: (161112, 9)
Validation size: (17902, 9)
Test size: (17705, 9)
Building from scratch.


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'activation_13', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_41']
You should probably TRAIN this model on a down-stream task to be able to use i

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
distilbert (TFDistilBertMainLay TFBaseModelOutput(la 66362880    input_ids[0][0]                  
__________________________________________________________________________________________________
tf.__operators__.getitem_1 (Sli (None, 768)          0           distilbert[0][0]                 
__________________________________________________________________________________________________
dropout_42 (Dropout)            (None, 768)          0           tf.__operators__.getitem_1[0][0] 
____________________________________________________________________________________________




Epoch 00001: saving model to /home/luke/tmp_vol/models/distil_bert_512/distil_bert_512.ckpt
Epoch 2/10

Epoch 00002: saving model to /home/luke/tmp_vol/models/distil_bert_512/distil_bert_512.ckpt
Epoch 3/10
 2158/20139 [==>...........................] - ETA: 5:48:49 - loss: 0.2921 - binary_accuracy: 0.8810 - precision_1: 0.6995 - recall_1: 0.4057 - auc_1: 0.8681

KeyboardInterrupt: 

In [None]:

# Persist the model in its current state to the `model_location` path set in the
# training cell (training above was stopped manually, so this runs as a separate step).
current_model = dl_all.deep_legis_model
current_model.save(model_location)

In [5]:
#config['model_location'] = model_location = DATA_VOL + "models/"+config['model_name']+"/full_model.h5"

#dl_all2 = deepLegisDistillBertAll(config)
#dl_all.load_data(df)
#dl_all2.build()
#dl_all.deep_legis_model.summary()
#dl_all.train(