In [1]:
import torch
from torch.utils import data
import random
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from collections import Counter
import pandas as pd
import numpy as np
import scipy
from tqdm import trange
from tqdm import tqdm
from datetime import datetime
import sys
import os
import seaborn as sns
from matplotlib import pyplot as plt
from joblib import Parallel, delayed, dump, load
from matplotlib import pyplot as plt
from sparse_vector.sparse_vector import SparseVector
from scipy.signal import convolve2d, convolve
import time
from torch import nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, f1_score
from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")

  import pandas.util.testing as tm


In [2]:
ASSEMBLY_d = {}
chroms_d = {}
all_features_d = {}
groups_d = {}
feature_names_d = {}
ZDNA_d = {}
black_list_d = {}
DNA_d = {}
DNA_features_d = {}

# MM9

In [3]:
ASSEMBLY = "curax_14h_UNI_mm9"
chroms = [f'chr{i}' for i in list(range(1, 20)) + ['X', 'Y']]
all_features = sorted([i[:-4] for i in os.listdir('../data/mm9_features/sparse/') if i.endswith('.pkl')])
groups = ['DNase-seq', 'Histone', 'RNA polymerase', 'TFs and others']
feature_names = [i for i in all_features if (i.split('_')[0] in groups)]
ZDNA = load(f'../data/mm9_zdna/sparse/{ASSEMBLY}.pkl')
black_list = load(f'../data/mm9_zdna/sparse/blacklist_mm9.pkl')


In [4]:
DNA = {chrom:load(f'../data/mm9_dna/sparse/{chrom}.pkl') for chrom in tqdm(chroms)}

DNA_features = {feture: load(f'../data/mm9_features/sparse/{feture}.pkl')
                for feture in tqdm(feature_names)}

for feature in tqdm(DNA_features):
    if set(DNA_features[feature].keys()) != set(chroms):
        for chrom in chroms:
            if chrom not in DNA_features[feature]:
                DNA_features[feature][chrom] = SparseVector(len(DNA[chrom]))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:43<00:00,  2.09s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 873/873 [01:35<00:00,  9.15it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 873/873 [00:00<00:00, 177395.83it/s]


In [5]:
mode = 'mm9'
ASSEMBLY_d[mode] = ASSEMBLY
chroms_d[mode] = chroms
all_features_d[mode] = all_features
groups_d[mode] = groups
feature_names_d[mode] = feature_names
ZDNA_d[mode] = ZDNA
black_list_d[mode] = black_list
DNA_d[mode] = DNA
DNA_features_d[mode] = DNA_features

# HG19

In [6]:
ASSEMBLY = "ZDNA_2016"
chroms = [f'chr{i}' for i in list(range(1, 23)) + ['X', 'Y']]
all_features = sorted([i[:-4] for i in os.listdir('../data/hg19_features/sparse/') if i.endswith('.pkl')])
groups = ['DNase-seq', 'Histone', 'RNA polymerase', 'TFs and others']
feature_names = [i for i in all_features if (i.split('_')[0] in groups)]
ZDNA = load(f'../data/hg19_zdna/sparse/{ASSEMBLY}.pkl')
black_list = load(f'../data/hg19_zdna/sparse/blacklist_hg19.pkl')


In [7]:
def chrom_reader(chrom):
    files = sorted([i for i in os.listdir(f'../data/hg19_dna/') if f"{chrom}_" in i])
    return ''.join([load(f"../data/hg19_dna/{file}") for file in files])


DNA = {chrom:chrom_reader(chrom) for chrom in tqdm(chroms)}

DNA_features = {feture: load(f'../data/hg19_features/sparse/{feture}.pkl')
                for feture in tqdm(feature_names)}

for feature in tqdm(DNA_features):
    if set(DNA_features[feature].keys()) != set(chroms):
        for chrom in chroms:
            if chrom not in DNA_features[feature]:
                DNA_features[feature][chrom] = SparseVector(len(DNA[chrom]))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [02:26<00:00,  6.12s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1054/1054 [00:58<00:00, 17.95it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1054/1054 [00:00<00:00, 133389.55it/s]


In [8]:
mode = 'hg19'
ASSEMBLY_d[mode] = ASSEMBLY
chroms_d[mode] = chroms
all_features_d[mode] = all_features
groups_d[mode] = groups
feature_names_d[mode] = feature_names
ZDNA_d[mode] = ZDNA
black_list_d[mode] = black_list
DNA_d[mode] = DNA
DNA_features_d[mode] = DNA_features

In [9]:
intersect = {i.upper() for i in DNA_features_d['mm9']} & {i.upper() for i in DNA_features_d['hg19']}

In [10]:
def to_bed(source, name):
    buf = []
    for chrm in source[name]:
        beds = source[name][chrm].indices[source[name][chrm].data != 0]
        data = source[name][chrm].data[source[name][chrm].data != 0].astype(float).astype(int)
        ends = np.append(source[name][chrm].indices[1:], 
              [source[name][chrm].shape])[source[name][chrm].data != 0]

        buf.extend([[chrm, beds[i], ends[i], data[i]] for i in range(len(beds))])
    buf = np.array(buf)
    df = pd.DataFrame([buf[:, 0], buf[:, 1], buf[:, 2], '', buf[:, 3], "", buf[:, 1], buf[:, 2], ""]).T
    df[3] = ''
    df[5] = "+"
    df[8] = ''
    return df

In [11]:
#gen = 'hg19'
#
#for name in tqdm([i for i in DNA_features_d[gen] if i.upper() in intersect and 'TFs and others' in i]):
#    df = to_bed(DNA_features_d[gen], name)
#    df.to_csv(f'tmp/{gen}_{name}.bed', sep = '\t', index=False, header=None)
##     break

In [12]:
def func(DNA_features_d, name, gen):
    df = to_bed(DNA_features_d, name)
    df.to_csv(f'tmp/{gen}_{name}.bed', sep = '\t', index=False, header=None)

In [13]:
#from joblib import Parallel, delayed
#gen = 'hg19'
#Parallel(n_jobs = -1)(delayed(func)(DNA_features_d[gen], name, gen)
#                     for name in tqdm([i for i in DNA_features_d[gen] 
#                                       if i.upper() in intersect and 'TFs and others' in i]))

# Data part

In [14]:
mode = 'mm9'
ASSEMBLY = ASSEMBLY_d[mode]
chroms = chroms_d[mode]
all_features = all_features_d[mode]
groups = groups_d[mode]
feature_names = feature_names_d[mode]
ZDNA = ZDNA_d[mode]
black_list = black_list_d[mode]
DNA = DNA_d[mode]
DNA_features = DNA_features_d[mode]

In [None]:
width = 512

np.random.seed(10)

ints_in = []
ints_out = []


for chrm in chroms:
    for st in trange(0, ZDNA[chrm].shape - width, width):
        interval = [st, min(st + width, ZDNA[chrm].shape)]
        N_count = sum([bp == "N" for bp in DNA[chrm][interval[0]:interval[1]]])
        bl_count = black_list[chrm][interval[0]:interval[1]].sum()
        if N_count > width / 2 or bl_count > 0:
            continue
        else:
            if ZDNA[chrm][interval[0]: interval[1]].any():
                ints_in.append([chrm, int(interval[0]), int(interval[1]), 1])
            else:
                ints_out.append([chrm, int(interval[0]), int(interval[1]), 0])


                
                
print(len(ints_in))
print(len(ints_out))

ints_in_full = ints_in
ints_out_full = ints_out
#29745
#5443119

#17017
#4843301

In [19]:
ints_in = ints_in_full
ints_out = [ints_out_full[i] for i in np.random.choice(range(len(ints_out_full)), 
                                                    size=len(ints_in) * 4, replace=False)]
# ints_out = ints_out_full

print(len(ints_in))
print(len(ints_out))
#484 for len 1000 
#9680 for len 1000

#629 for len 512
#12580 for len 5121

#17017 - MM kouzine
#34034

2790
11160


In [20]:
equalized = ints_in + ints_out

In [21]:
divisions = list(StratifiedKFold(5, shuffle=True, 
                                 random_state=42).split(equalized, [f"{elem[3]}_{elem[0]}"
                                         for i, elem 
                                         in enumerate(equalized)]))

In [22]:
dump([equalized, divisions], 'mm_divisions_chipseq.pkl', 3)

['mm_divisions_chipseq.pkl']

In [15]:
class Dataset(data.Dataset):
    def __init__(self, chroms, features, 
                 dna_source, features_source, 
                 labels_source, intervals, tokenizer):

        self.chroms = chroms
        self.features = features
        self.dna_source = dna_source
        self.features_source = features_source
        self.labels_source = labels_source
        self.intervals = intervals
        self.le = LabelBinarizer().fit(np.array([["A"], ["C"], ["T"], ["G"]]))
        self.configs = {
                        'ZHUNT_AS': {
                                'CG': 0, 'GC': 1, 'CA': 0, 'AC': 1, 
                                'TG': 0, 'GT': 1, 'TA': 1, 'AT': 1, 
                                'CC': 0, 'GG': 0, 'CT': 1, 'TC': 1, 
                                'GA': 1, 'AG': 1, 'AA': 1, 'TT': 1},
                       }
        seqs = (["A", "C", "T", "G"] + 
                ['AC', 'AT', 'AG', 'CT', 'CG', 'GT'] +
                ['AAC', 'ACC', 'AAT', 'ATT', 'AAG', 'AGG', 
                 'CCA', 'CAA', 'CCT', 'CTT', 'CCG', 'CGG', 
                 'TTA', 'TAA', 'TTC', 'TCC', 'TTG', 'TGG', 
                 'GGA', 'GAA', 'GGC', 'GCC', 'GGT', 'GTT'] +
                ['AAAC', 'AAAT', 'AAAG', 'CCCA', 'CCCT', 'CCCG',
                 'TTTA', 'TTTC', 'TTTG', 'GGGA', 'GGGC', 'GGGT'])
        self.tars = np.array([self.le.transform(list(i * 11)[:11]) for i in seqs])[:, ::-1, ::-1]
        # purine-pyrimidine
        self.tars = np.concatenate((self.tars, np.array([self.tars[4] + self.tars[9]])))
        self.tokenizer = tokenizer
        
        
    def __len__(self):
        return len(self.intervals)
    
    def __getitem__(self, index):
        interval = self.intervals[index]
        chrom = interval[0]
        begin = int(interval[1])
        end = int(interval[2])
        ll = list(self.dna_source[chrom][begin:end].upper())
        y = self.labels_source[interval[0]][interval[1]: interval[2]]        
        
        
#         DNA PART
        
        dna_OHE = self.le.transform(ll)[None]
        
        res = pd.DataFrame(convolve(dna_OHE, self.tars)[:, 5:-5, 3].T / 11)
        res = (res.rolling(5, min_periods=1).max().values == 1).astype(int)
        
        
#         ZHUNT PART
        zhunts = []
        for key in self.configs:
            vec = np.array(ll)
            vec = np.vectorize(lambda x:self.configs[key].get(x, 0))(
                                    np.char.add(vec[1:], vec[:-1]))
            zhunts.append(np.concatenate([vec, [0]]))
        
        
        # FEATURES PART
        feature_matr = []
        for feature in self.features:
            source = self.features_source[feature]
            feature_matr.append(source[chrom][begin:end])
        
        # UNION
        if len(feature_matr) > 0:
            X = np.hstack((
                           res,
                           np.array(zhunts).T, 
                           np.array(feature_matr).T/1000)).astype(np.float32)
#             X = (np.array(feature_matr).T/1000).astype(np.float32)
        else:
            X = dna_OHE.astype(np.float32)
        
        #K-mer part
        
        k_mers = seq2kmer(self.dna_source[chrom][begin:end+5].upper(),6)
        encoded_k_mers = self.tokenizer.encode_plus(k_mers, add_special_tokens=False, max_length=512)["input_ids"]

        return torch.Tensor(X), torch.Tensor(y).long(), ll, torch.LongTensor(encoded_k_mers), (chrom, begin, end)


In [16]:
import torch
from transformers import BertModel, BertConfig, PreTrainedTokenizer, BasicTokenizer, BertForTokenClassification
import collections
from torch.utils.data import DataLoader
import sklearn
from sklearn.metrics import accuracy_score
from torch.nn import CrossEntropyLoss

In [17]:
from dna_tokenizer import DNATokenizer, seq2kmer

In [18]:
#%load_ext autoreload
#%autoreload 2

In [27]:
for MODEL_NUMBER in range(5):

    train_inds, test_inds = divisions[MODEL_NUMBER]
    train_intervals, test_intervals = [equalized[i] for i in train_inds], [equalized[i] for i in test_inds]

    random.shuffle(train_intervals)
    random.shuffle(test_intervals)
    
    train_dataset = Dataset(chroms, 
                        [i for i in feature_names if i.upper() in intersect], 
                       DNA, DNA_features, ZDNA, train_intervals, 
                        DNATokenizer.from_pretrained('6-new-12w-0/', add_special_tokens=False))

    test_dataset = Dataset(chroms, 
                       [i for i in feature_names if i.upper() in intersect], 
                       DNA, DNA_features, ZDNA, test_intervals,
                          DNATokenizer.from_pretrained('6-new-12w-0/', add_special_tokens=False))
    
    dump((train_dataset, test_dataset), f'ds_w_seq_mm_fold{MODEL_NUMBER}_chipseq.pkl')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DNATokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DNATokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DNATokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected 

In [19]:
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(training_loader):
        features, labels, sequences, input_ids, intervals = batch            
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        #print(model.device, input_ids.device, labels.device)
        outputs = model(input_ids = input_ids, labels = labels)        
        #print(outputs)
        loss, tr_logits = outputs['loss'], outputs['logits']
        #print(model(input_ids=ids, attention_mask=mask, labels=labels))
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 1000==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 1000 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=0.1
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [20]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels, eval_scores = [], [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            features, labels, sequences, input_ids, intervals = batch            
            input_ids = input_ids.to(device)
            labels = labels.to(device)
            outputs = model(input_ids = input_ids, labels = labels)
            loss, eval_logits = outputs['loss'], outputs['logits']            
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            flattened_scores = active_logits[:,1] - active_logits[:,0]
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            eval_scores.extend(flattened_scores)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [id.item() for id in eval_labels]
    predictions = [id.item() for id in eval_preds]
    scores = [id.item() for id in eval_scores]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions, scores

In [21]:
device = 2
lr = 1e-5
EPOCHS = 3

dir_to_pretrained_model = "6-new-12w-0/"
config = BertConfig.from_pretrained('https://raw.githubusercontent.com/jerryji1993/DNABERT/master/src/transformers/dnabert-config/bert-config-6/config.json')
tokenizer = DNATokenizer.from_pretrained('6-new-12w-0/')

for MODEL_NUMBER in range(5):
    train_dataset, test_dataset = load(f'ds_w_seq_mm_fold{MODEL_NUMBER}_chipseq.pkl')
    training_loader = DataLoader(train_dataset, batch_size=24, num_workers = 2)
    testing_loader = DataLoader(test_dataset, batch_size=16, num_workers = 2)


    model = BertForTokenClassification.from_pretrained(dir_to_pretrained_model, config=config)
    model.to(device)
    
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=lr, steps_per_epoch=len(training_loader), epochs=EPOCHS)

    for epoch in range(EPOCHS):
        print(f"Training fold {MODEL_NUMBER} epoch: {epoch + 1}")
        train(epoch)
        labels, predictions, scores = valid(model, testing_loader)
        print(f'Fold {MODEL_NUMBER} validation ROC-AUC: ', roc_auc_score(labels, scores))

    print(sklearn.metrics.classification_report(labels, predictions))
    model.save_pretrained(f'dnabert_mm_fold_{MODEL_NUMBER}_chipseq')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DNATokenizer'.
Some weights of the model checkpoint at 6-new-12w-0/ were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint

Training fold 0 epoch: 1


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training loss per 1000 training steps: 0.9019289612770081
Training loss epoch: 0.390990713214682
Training accuracy epoch: 0.7786216817876346


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Validation loss per 100 evaluation steps: 0.2893085777759552
Validation loss per 100 evaluation steps: 0.19142763610781716
Validation Loss: 0.19388019256825959
Validation Accuracy: 0.9129459635416666
Fold 0 validation ROC-AUC:  0.9152589970573095
Training fold 0 epoch: 2


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training loss per 1000 training steps: 0.27432429790496826
Training loss epoch: 0.17078884827593963
Training accuracy epoch: 0.9187281235999102


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Validation loss per 100 evaluation steps: 0.35805413126945496
Validation loss per 100 evaluation steps: 0.19680613191281954
Validation Loss: 0.19955283463267343
Validation Accuracy: 0.9143617466517857
Fold 0 validation ROC-AUC:  0.9203813335745563
Training fold 0 epoch: 3


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training loss per 1000 training steps: 0.23713773488998413
Training loss epoch: 0.15734239765593122
Training accuracy epoch: 0.9228419368839603


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Validation loss per 100 evaluation steps: 0.261796236038208
Validation loss per 100 evaluation steps: 0.18414040824333852
Validation Loss: 0.18562142755171018
Validation Accuracy: 0.9171788969494048
Fold 0 validation ROC-AUC:  0.9289988531594595
              precision    recall  f1-score   support

           0       0.94      0.97      0.96   1293701
           1       0.59      0.43      0.50    134779

    accuracy                           0.92   1428480
   macro avg       0.76      0.70      0.73   1428480
weighted avg       0.91      0.92      0.91   1428480



Some weights of the model checkpoint at 6-new-12w-0/ were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 6-new-12w-0/ and are newly initialized: ['cl

Training fold 1 epoch: 1


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training loss per 1000 training steps: 0.6302614808082581
Training loss epoch: 0.30637917340763154
Training accuracy epoch: 0.8611674647177423


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Validation loss per 100 evaluation steps: 0.14372193813323975
Validation loss per 100 evaluation steps: 0.161706479321612
Validation Loss: 0.16975399463304452
Validation Accuracy: 0.9195498511904763
Fold 1 validation ROC-AUC:  0.933106083589137
Training fold 1 epoch: 2


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training loss per 1000 training steps: 0.2172079235315323
Training loss epoch: 0.1706182946160596
Training accuracy epoch: 0.919665308579749


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Validation loss per 100 evaluation steps: 0.14538171887397766
Validation loss per 100 evaluation steps: 0.15554694553818738
Validation Loss: 0.16339435535350016
Validation Accuracy: 0.9280110677083334
Fold 1 validation ROC-AUC:  0.9405204600959461
Training fold 1 epoch: 3


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training loss per 1000 training steps: 0.1824580878019333
Training loss epoch: 0.15795552009536373
Training accuracy epoch: 0.9246776293682797


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Validation loss per 100 evaluation steps: 0.1413867473602295
Validation loss per 100 evaluation steps: 0.15600424470370727
Validation Loss: 0.16356893891867783
Validation Accuracy: 0.9284147135416666
Fold 1 validation ROC-AUC:  0.9416896526810581
              precision    recall  f1-score   support

           0       0.95      0.97      0.96   1303706
           1       0.62      0.48      0.54    124774

    accuracy                           0.93   1428480
   macro avg       0.78      0.73      0.75   1428480
weighted avg       0.92      0.93      0.92   1428480



Some weights of the model checkpoint at 6-new-12w-0/ were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 6-new-12w-0/ and are newly initialized: ['cl

Training fold 2 epoch: 1


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training loss per 1000 training steps: 0.725899875164032
Training loss epoch: 0.3347647682533309
Training accuracy epoch: 0.8314769895273296


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Validation loss per 100 evaluation steps: 0.20398662984371185
Validation loss per 100 evaluation steps: 0.18108568849539994
Validation Loss: 0.17435172477737068
Validation Accuracy: 0.9192792038690477
Fold 2 validation ROC-AUC:  0.9274415488056525
Training fold 2 epoch: 2


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training loss per 1000 training steps: 0.08359429240226746
Training loss epoch: 0.17188839246268556
Training accuracy epoch: 0.9170203643033153


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Validation loss per 100 evaluation steps: 0.21153324842453003
Validation loss per 100 evaluation steps: 0.1744884059195047
Validation Loss: 0.16894876425620167
Validation Accuracy: 0.9231382533482143
Fold 2 validation ROC-AUC:  0.9339321975910378
Training fold 2 epoch: 3


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training loss per 1000 training steps: 0.07044116407632828
Training loss epoch: 0.1594217058791909
Training accuracy epoch: 0.9208574848790332


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Validation loss per 100 evaluation steps: 0.21745924651622772
Validation loss per 100 evaluation steps: 0.1768932773363961
Validation Loss: 0.170953735573816
Validation Accuracy: 0.9244577752976191
Fold 2 validation ROC-AUC:  0.9351622292340511
              precision    recall  f1-score   support

           0       0.96      0.96      0.96   1299707
           1       0.58      0.57      0.57    128773

    accuracy                           0.92   1428480
   macro avg       0.77      0.76      0.77   1428480
weighted avg       0.92      0.92      0.92   1428480



Some weights of the model checkpoint at 6-new-12w-0/ were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 6-new-12w-0/ and are newly initialized: ['cl

Training fold 3 epoch: 1


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training loss per 1000 training steps: 0.5655170679092407
Training loss epoch: 0.2926735123799693
Training accuracy epoch: 0.8674540420586919


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Validation loss per 100 evaluation steps: 0.05221162736415863
Validation loss per 100 evaluation steps: 0.16747685139168902
Validation Loss: 0.17474155860819987
Validation Accuracy: 0.917274925595238
Fold 3 validation ROC-AUC:  0.9273685191207058
Training fold 3 epoch: 2


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training loss per 1000 training steps: 0.05828924849629402
Training loss epoch: 0.1699431810447926
Training accuracy epoch: 0.9205190132168466


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Validation loss per 100 evaluation steps: 0.05175241082906723
Validation loss per 100 evaluation steps: 0.15489764381568916
Validation Loss: 0.1632653524566974
Validation Accuracy: 0.9234477306547619
Fold 3 validation ROC-AUC:  0.9383045856399669
Training fold 3 epoch: 3


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training loss per 1000 training steps: 0.05467894300818443
Training loss epoch: 0.15622173561664518
Training accuracy epoch: 0.9262352290546598


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Validation loss per 100 evaluation steps: 0.04368581995368004
Validation loss per 100 evaluation steps: 0.15531754974128292
Validation Loss: 0.16460440543613264
Validation Accuracy: 0.9242254929315477
Fold 3 validation ROC-AUC:  0.9394133950652016
              precision    recall  f1-score   support

           0       0.95      0.97      0.96   1304495
           1       0.58      0.44      0.50    123985

    accuracy                           0.92   1428480
   macro avg       0.77      0.71      0.73   1428480
weighted avg       0.92      0.92      0.92   1428480



Some weights of the model checkpoint at 6-new-12w-0/ were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 6-new-12w-0/ and are newly initialized: ['cl

Training fold 4 epoch: 1


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training loss per 1000 training steps: 0.5906921029090881
Training loss epoch: 0.29405208105041136
Training accuracy epoch: 0.8746774543570781


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Validation loss per 100 evaluation steps: 0.4613910913467407
Validation loss per 100 evaluation steps: 0.1875938862967904
Validation Loss: 0.1863775824116809
Validation Accuracy: 0.9132958984375
Fold 4 validation ROC-AUC:  0.9186021946416052
Training fold 4 epoch: 2


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training loss per 1000 training steps: 0.2078123688697815
Training loss epoch: 0.16978204527529336
Training accuracy epoch: 0.9191957535282266


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Validation loss per 100 evaluation steps: 0.35741469264030457
Validation loss per 100 evaluation steps: 0.1787099362037486
Validation Loss: 0.17595555954745837
Validation Accuracy: 0.9188739304315476
Fold 4 validation ROC-AUC:  0.9283722877582139
Training fold 4 epoch: 3


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training loss per 1000 training steps: 0.21094347536563873
Training loss epoch: 0.15776515444039657
Training accuracy epoch: 0.9234343497983878


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Validation loss per 100 evaluation steps: 0.3797488212585449
Validation loss per 100 evaluation steps: 0.18024767841452197
Validation Loss: 0.17766747937670777
Validation Accuracy: 0.9206794084821428
Fold 4 validation ROC-AUC:  0.9301610849947218
              precision    recall  f1-score   support

           0       0.95      0.96      0.96   1298979
           1       0.57      0.49      0.53    129501

    accuracy                           0.92   1428480
   macro avg       0.76      0.73      0.74   1428480
weighted avg       0.92      0.92      0.92   1428480

