In [None]:
# Re-import edited modules automatically before each cell runs (mode 2 = all modules).
%reload_ext autoreload
%autoreload 2
import os
# Must be set before CUDA is initialised: makes kernel launches synchronous so that
# device-side errors are reported at the call that caused them.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
%%capture
# Installs the latest Hugging Face Transformers release (no version pin); %%capture hides pip's output.
!pip install transformers

In [None]:
%%capture
# Installs fastai from the current tip of its GitHub repository (no version pin); output hidden.
!pip install git+https://github.com/fastai/fastai


In [None]:
from fastai.basics import *
from fastai.text.all import *
from fastai.callback.all import *
from fastai.interpret import *
import sys
import IPython.utils.io

from functools import partial

from dsnlplib import *


In [None]:
class BertPPClfier(TransformerClassifier):
    """Classifier that returns the logits of a pretrained sequence-classification transformer.

    The pretrained weights are loaded in ``__init__`` through ``loadPretrained()``,
    which is expected to be provided by ``TransformerClassifier`` (not shown here).
    """
    transformer_cls = AutoModelForSequenceClassification
    variation = 'bert'

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base class, then load the pretrained transformer."""
        super().__init__(*args, **kwargs)

        self.loadPretrained()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None,):
        """Run the transformer on ``input_ids`` and return its first output (the logits).

        Args:
            input_ids: tensor of token ids (assumed ``(batch, seq_len)`` -- TODO confirm).
            attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels:
                accepted for signature compatibility only; none of them is used.
                In particular the attention mask is always rebuilt from ``input_ids``.

        Returns:
            The first element of the transformer's output.
        """
        # Mask out padding positions. The pad id comes from the loaded model's config
        # (0 for BERT checkpoints) instead of the RoBERTa-specific constant 1, which left
        # BERT padding unmasked. Falls back to 1 when the config does not expose a pad id.
        pad_token_id = getattr(getattr(self.transformer, 'config', None), 'pad_token_id', None)
        if pad_token_id is None:
            pad_token_id = 1
        attention_mask = (input_ids != pad_token_id).type(input_ids.type())
        logits = self.transformer(input_ids, attention_mask=attention_mask)[0]

        return logits

In [None]:
class BertRepFromPPClfier(TransformerClassifier):
    """Classifier that stacks a linear layer on top of a fine-tuned transformer's logits.

    The transformer's 13 output logits are projected to 23 outputs by ``classifier2``.
    Pretrained weights are loaded in ``__init__`` through ``loadPretrained()``, which is
    expected to be provided by ``TransformerClassifier`` (not shown here).
    """
    transformer_cls = AutoModelForSequenceClassification
    variation = 'bertfinetuned'

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base class, add the extra head, load pretrained weights."""
        super().__init__(*args, **kwargs)
        # 13 inputs must match the transformer's num_labels; 23 is the size of the final output.
        self.classifier2 = nn.Linear(13, 23)
        # NOTE(review): this activation is created but never applied in forward() -- confirm
        # whether it was meant to sit between the transformer logits and classifier2.
        self.activation = nn.ReLU()

        self.loadPretrained()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None,):
        """Run the transformer on ``input_ids`` and map its logits through ``classifier2``.

        Args:
            input_ids: tensor of token ids (assumed ``(batch, seq_len)`` -- TODO confirm).
            attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels:
                accepted for signature compatibility only; none of them is used.
                In particular the attention mask is always rebuilt from ``input_ids``.

        Returns:
            Output of ``classifier2`` applied to the first element of the transformer's output.
        """
        # Mask out padding positions. The pad id comes from the loaded model's config
        # (0 for BERT checkpoints) instead of the RoBERTa-specific constant 1, which left
        # BERT padding unmasked. Falls back to 1 when the config does not expose a pad id.
        pad_token_id = getattr(getattr(self.transformer, 'config', None), 'pad_token_id', None)
        if pad_token_id is None:
            pad_token_id = 1
        attention_mask = (input_ids != pad_token_id).type(input_ids.type())
        logits = self.transformer(input_ids, attention_mask=attention_mask)[0]

        logits = self.classifier2(logits)

        return logits

In [None]:
c = DSConfig()

In [None]:
c.bs = 32
c.max_seq_len = 512  


c.eps=0.0001

c.lr=1e-6
c.epochs=2000
c.patience=20
c.use_activ=True

# Imponiamo il numero di 
c.num_labels = 13

In [None]:
c.pretraineds = ['dbmdz/bert-base-italian-xxl-uncased', 
                 './process_properties'
               #'m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0'
               ]
c.models = [
            (BertClfier,empty_config),
            (BertLast4ClsTokenClfier,empty_config),
            (BertLast4ClsTokenDenseClfier,empty_config),
            (BertLast4CnnClfier, cnn_config),
            (BertLast4PadCnnClfier, cnn_config),
            (BertPPClfier, cnn_config),
            (BertRepFromPPClfier, cnn_config),
]

In [None]:
c.results = {}

In [None]:
# Build the tokenizer from the first configured checkpoint.
c.pretrain_id = c.pretraineds[0]

c.tokenizer = AutoTokenizer.from_pretrained(c.pretrain_id)
tokenizer_vocab=c.tokenizer.get_vocab() 
# Vocabulary tokens sorted by their id, used as the Numericalize vocab.
c.tokenizer_vocab_ls = [k for k, v in sorted(tokenizer_vocab.items(), key=lambda item: item[1])]

# Cap the sequence length at what the tokenizer/model supports. `model_max_length` is the
# supported attribute; the older `max_len` alias is gone from current transformers releases.
c.max_seq_len = min(c.max_seq_len, c.tokenizer.model_max_length)

# NOTE(review): c.sentence_pair is not set in the visible cells; presumably a DSConfig default -- confirm.
tok_func = FastHugsTokenizer(transformer_tokenizer=c.tokenizer, model_name=c.pretrain_id, max_seq_len=c.max_seq_len, sentence_pair=c.sentence_pair)

# fastai tokenizer reading and writing the `text` column, with no extra text rules.
c.fai_tokenizer = Tokenizer.from_df(text_cols='text', res_col_name='text', tok=tok_func, rules=[])

In [None]:
c.df = pd.read_csv('data/texts2.1.max_512.train.balanced_rep.csv')
c.test_df = pd.read_csv('data/texts2.1.max_512.test.balanced_rep.csv')

In [None]:
c.df['label'] = c.df['Rep']
c.test_df['label'] = c.test_df['Rep']

In [None]:
c.df['labels'] = c.df['Rep'].apply(assignPP)
c.df['labels'] = c.df['labels'].apply(str.split)

In [None]:
c.test_df['labels'] = c.test_df['Rep'].apply(assignPP)
c.test_df['labels'] = c.test_df['labels'].apply(str.split)

In [None]:
# fit on process properties first
c.model_idx = 5

In [None]:
model = dsc.models[dsc.model_idx]

In [None]:
(model_cls, config) = model
model_name = model_cls.__name__

In [None]:
config_dict = AutoConfig.from_pretrained(dsc.pretrain_id)
print("\n\nWeights: %s - Model: %s" % (dsc.pretrain_id, model_name), flush= True)


In [None]:
x_tfms = [attrgetter("text"), dsc.fai_tokenizer, Numericalize(vocab=dsc.tokenizer_vocab_ls)]


In [None]:
y_tfms = [attrgetter("labels"), MultiCategorize(),OneHotEncode()]

In [None]:
splits = ColSplitter()(dsc.df)

In [None]:
dsets = Datasets(dsc.df, splits=splits, tfms=[x_tfms, y_tfms], dl_type=SortedDL)


In [None]:
padding=transformer_padding(dsc.tokenizer)
dls = dsets.dataloaders(bs=dsc.bs, before_batch=[padding])

In [None]:
config_dict.num_labels = dsc.num_labels if dsc.num_labels is not None else dls.c


In [None]:
exp = DSExperiment(c)

In [None]:
opt_func = partial(Adam, decouple_wd=True, eps=dsc.eps)

In [None]:
useRocAuc = (dls.c <= 3)


In [None]:
fai_model = model_cls(config_dict = config_dict, model_name = dsc.pretrain_id, pretrained = True, use_activ=dsc.use_activ)


In [None]:
metrics = [accuracy_multi,F1ScoreMulti(average='macro')]

In [None]:
if (useRocAuc):
  metrics.append(RocAuc(multi_class='ovo'))

# current date and time
now = datetime.now()

timestamp = now.isoformat(sep='_', timespec='seconds')

fname_id = dsc.pretrain_id + '/' + timestamp + ' ' + model_name + ' - lr: ' + str(dsc.lr)

fname = '/content/drive/My Drive/dnlp_models/' + fname_id

In [None]:
learn = Learner(dls, fai_model, loss_func=BCEWithLogitsLossFlat(), metrics=metrics, cbs=[SaveModelCallback(fname=fname),EarlyStoppingCallback(patience=dsc.patience),ReduceLROnPlateau(patience=dsc.plateau_patience)], splitter=fai_model.transformer_spltr).to_fp16()


In [None]:
learn.freeze_to(0)

In [None]:
learn.fit(dsc.epochs, lr=dsc.lr, wds=1e-4, use_wd_sched=True)

In [None]:
process_p_pretraining = '/content/drive/My Drive/dnlp_pretrainings/pp_uncased_ans'

In [None]:
!mkdir -p process_p_pretraining

In [None]:
learn.model.transformer.save_pretrained(process_p_pretraining)

In [None]:
# Second stage: reconfigure the experiment before running it.
c.freeze_to = 3

In [None]:
# Start from the transformer saved by the process-properties stage.
c.pretrain_id = process_p_pretraining

In [None]:
# Index 6 in c.models is the (BertRepFromPPClfier, cnn_config) entry.
c.model_idx = 6

In [None]:
# fit on repertoires
exp.run()

In [None]:
exp.benchmark()