# NLP DL Pipeline (Training)

# Import Libraries

In [2]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
plt.rcParams["figure.figsize"] = (12, 8)
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold, StratifiedGroupKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# os.system('pip install -q bitsandbytes')

import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.optim import Optimizer
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd.function import InplaceFunction
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# PyTorch Lightning imports:
import pytorch_lightning as pl
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.tuner.tuning import Tuner
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping 
from pytorch_lightning.loggers import CSVLogger
from torchmetrics.classification import MulticlassAccuracy

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=False

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.12.1

transformers.__version__: 4.21.3

env: TOKENIZERS_PARALLELISM=False


# Pipeline

## Directory Settings

In [3]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
    
TRAIN_PATH = 'arabic-poem-classification/poems.csv'
TEST_PATH = 'arabic-poem-classification/test.csv'

## Data Loading

In [23]:
# ====================================================
# Data Loading
# ====================================================
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

print(f"train.shape: {train_df.shape}")
display(train_df.head())

train.shape: (25000, 4)


Unnamed: 0,Title,Author,Category,Poem
0,هُنّئتَ بالعيدِ بل هُنّي بكَ العيدُ،,صفي الدين الحلي,العصر الأندلسي,هنءت بالعيد بل هني بك العيد فانت لجود بل ارث ...
1,يا بَديعَ الحُسنِ,المعتمد بن عباد,العصر الأندلسي,يا بديع الحسن والاح سان يا بدر الدياجي يا غزا...
2,وأوانس تدنو إذا اجتديت,الأبيوردي,العصر الأندلسي,وَأَوانِسٍ تَدنو إِذا اِجتَدِيَتْ بِحَديثها وَ...
3,حسبي رضاك من الدهر الذي عتبا,ابن دارج القسطلي,العصر الأندلسي,حسبي رضاك من الدهر الذي عتبا وجود كفيك لحظ ال...
4,ماذا يقول المادحو,ظافر الحداد,العصر الأندلسي,ماذا يقول المادحو نَ وأنت مُخترع الغَرائبْ أع...


In [30]:
train_df['Poem'] = "العنوان " + train_df['Title'] + " والكاتب هو " + train_df['Author'] + " :والنص هو " + train_df['Poem']

In [31]:
# Drop Duplicates
print('Samples Before Dropping: ', train_df.shape[0])
train_df = train_df.drop_duplicates(subset=['Poem']).reset_index(drop=True)
print('Samples After Dropping: ', train_df.shape[0])

Samples Before Dropping:  25000

Samples After Dropping:  20412


In [32]:
feats = ['Category']
for feat in feats: 
    Names = [f'{feat}_{x}' for x in train_df[feat].value_counts().keys().sort_values()]
    OHE_cols = pd.DataFrame(pd.get_dummies(train_df[feat]).values,index = train_df.index, columns = Names)
    train_df = pd.concat([train_df,OHE_cols],axis=1)
train_df.columns[-5:].values

array(['Category_العصر الأندلسي', 'Category_العصر الايوبي',
       'Category_العصر العباسي', 'Category_العصر المملوكي',
       'Category_العصر حديث'], dtype=object)

In [33]:
le = LabelEncoder()
df = pd.concat([train_df, test_df])
le.fit(df["Category"])

mapping = dict(zip(le.classes_, range(len((le.classes_)))))
mapping_rev = dict(zip(range(len((le.classes_))), le.classes_))

train_df['Category'] = train_df['Category'].transform(lambda x: mapping[x])

## CFG

In [53]:
class CFG:
    competition = 'Poem'   # Competition Name
    seed = 42
########################################################################################################
    # Status
    train = True          # Train Mode
    debug = False         # Debug Mode 
########################################################################################################
    # Data
    max_len = 256        
    batch_size = 16        
    valid_batch_size = 16
    num_workers = os.cpu_count()       # Threads in Data Loader
    target_cols = ['Category_العصر الأندلسي', 'Category_العصر الايوبي', 'Category_العصر العباسي', 'Category_العصر المملوكي', 'Category_العصر حديث']
########################################################################################################
    # Training
    model = 'CAMeL-Lab/bert-base-arabic-camelbert-ca'
    pretrained = True
    epochs = 5
    
    max_grad_norm = 10  # Prevent Gradient Explosion by Clipping Gradinet when exceeding this number
    freeze_n_layers = 0              # Freeze First n_layers of the Encoder
    layer_reinitialize_n = 1         # Reinitialize the last n layers of the encoder
 
    pooling = 'mean'                  
    features_type = 'weighted_layers_cls'  
    output_hidden_states = True     
    layer_start = 9                     
########################################################################################################
    # Optimizer
    encoder_lr = 1.5e-5         # Pretrained Model lr  (Deberta Model)
    decoder_lr = 1.5e-5         # Custom Model lr  (The new head of the model)
    
    eps = 1e-6                # Adam Parameters 
    betas=(0.9, 0.999)        # Adam Parameters
    weight_decay = 0.02
    
    precision = "16-mixed"
########################################################################################################
    # Scheduler
    use_scheduler = True    # Use Scheduler
    scheduler = 'cosine'      # 'cosine' or 'linear' or 'cosine_hard'
    num_cycles = 0.25
    num_warmup_steps = 1
    sch_interval = 'step'    # 'step' or 'epoch'
########################################################################################################
    # CV
    n_fold=5
    trn_fold=list(range(n_fold))

In [54]:
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

## Utils

In [55]:
# ====================================================
# Utils
# ====================================================
def get_score(y_preds, y_trues):
    metric = MulticlassAccuracy(num_classes=5).to(device)
    score = metric(y_preds, y_trues)
    return score

seed_everything(seed=CFG.seed)

Seed set to 42


42

## CV Split

In [56]:
# ====================================================
# CV split
# ====================================================
Fold = StratifiedGroupKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train_df, train_df['Category'],groups=train_df['Author'])):
    train_df.loc[val_index, 'fold'] = int(n)
train_df['fold'] = train_df['fold'].astype(int)
display(train_df.groupby('fold').size())

fold
0    3371
1    4219
2    4746
3    4807
4    3269
dtype: int64

In [57]:
if CFG.debug:
    display(train_df.groupby('fold').size())
    train_df = train_df.sample(n=100, random_state=CFG.seed).reset_index(drop=True)
    display(train_df.groupby('fold').size())

## Tokenizer

In [58]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model, use_fast=True)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

## Dataset

In [23]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train_df['Poem'].fillna("").values, total=len(train_df))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 2 # cls & sep 
print(f"max_len: {CFG.max_len}")

  0%|          | 0/20409 [00:00<?, ?it/s]

max_len: 20017


In [59]:
# ====================================================
# Dataset
# ====================================================
# Split the sentences into tokens & Make the sentences have fixed length (padding for short - truncating for long)
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=256,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['Poem'].values
        self.labels = df[cfg.target_cols].values
        self.tabular_df = df[feats].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        tabular = torch.tensor(self.tabular_df[item], dtype=torch.float)
        return {'input_ids' : inputs['input_ids'], 
                'attention_mask' : inputs['attention_mask'], 
                'labels' : label}
    
def collate(inputs):
    # Dynamic padding (pad the inputs of the batch to the maximum input length of the batch.)
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

## Model

#### Last Hidden State Poolings

In [63]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class WeightedLayerPooling(nn.Module):
    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        self.layer_weights = layer_weights if layer_weights is not None \
            else nn.Parameter(
                torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)
            )

    def forward(self, all_hidden_states):
        all_layer_embedding = all_hidden_states[self.layer_start:, :, :, :]
        weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size())
        weight_factor = weight_factor.to(device)
        weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum()
        return weighted_average

#### Get optimizer and scheduler

In [65]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "fc" in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
              
        return optimizer_parameters
    
    
def get_scheduler(cfg, optimizer, len_train_folds):
    num_train_steps = int(len_train_folds / cfg.batch_size * cfg.epochs)
    if cfg.scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    elif cfg.scheduler == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    elif cfg.scheduler == 'cosine_hard':
        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
        
    return scheduler

#### The Model

In [66]:
# ====================================================
# Model
# ====================================================
class CustomModel(LightningModule):
    def __init__(self, cfg, criterion, train_folds, valid_folds, fold, pretrained=False, config_path=None):
        super().__init__()
        self.cfg = cfg
        self.criterion = criterion
        self.train_folds = train_folds
        self.valid_folds = valid_folds
        self.fold = fold
        self.pretrained = pretrained
        self.val_step_outputs = []
        self.val_step_labels = []

        
        # Configurations
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states = True)
        else:
            self.config = torch.load(config_path)
        
        # Model
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
            
        torch.save(self.model.config, OUTPUT_DIR+'config.pth')

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, len(cfg.target_cols))
        
        # initalize the header
        self._init_weights(self.fc)
                
        # Reinitialize the last n layers      
        self._re_init_layers(self.cfg.layer_reinitialize_n)
            
    def _init_weights(self, module: nn.Module):
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
                if module.bias is not None:
                    module.bias.data.zero_()
                return "nn.Linear"
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
                return "nn.Embedding"
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
                return "nn.LayerNorm"
            return None
    
    def _re_init_layers(self, n_layers: int):
            """Reinitialize the last n layers """
            if n_layers >= 1:
                for layer in self.model.encoder.layer[-n_layers:]:
                    # Confirmed that it works with deberta v3. Other models may be different.
                    if hasattr(layer, "modules"):
                        for module in layer.modules():
                            for name, child in module.named_children():
                                init_type_name = self._init_weights(child)
                                if init_type_name is not None:
                                    print(f"{name} is re-initialized, type: {init_type_name}, {module.__class__}")
            
    def train_dataloader(self):
        train_dataset = TrainDataset(CFG, self.train_folds)
        train_loader = DataLoader(train_dataset,
                                  batch_size=CFG.batch_size,
                                  shuffle=True,
                                  num_workers=CFG.num_workers,
                                  drop_last=True)
        return train_loader
    
    
    def val_dataloader(self):
        valid_dataset = TrainDataset(CFG, self.valid_folds)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=CFG.valid_batch_size,
                                  shuffle=False,
                                  num_workers=CFG.num_workers,
                                  drop_last=False)
        return valid_loader
    
    def training_step(self, batch, batch_idx):
        inputs, labels = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}, batch['labels']
        inputs = collate(inputs)
        batch_size = labels.size(0)

        y_preds = self(inputs)
        loss = self.criterion(y_preds, labels)

        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        for param_group in self.trainer.optimizers[0].param_groups:
            lr = param_group["lr"]
        self.log("lr", lr, on_step=True, on_epoch=False, prog_bar=True)

        return loss

    
    def validation_step(self, batch, batch_idx):
        inputs, labels = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}, batch['labels']
        inputs = collate(inputs)
        batch_size = labels.size(0)

        y_preds = self(inputs)
        loss = self.criterion(y_preds, labels)

        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)

        self.val_step_outputs.append(y_preds)
        self.val_step_labels.append(labels)

        return loss

        
    def configure_optimizers(self):
        optimizer_parameters = get_optimizer_params(self.model,
                                                    encoder_lr=CFG.encoder_lr,
                                                    decoder_lr=CFG.decoder_lr,
                                                    weight_decay=CFG.weight_decay)
        
        optimizer = AdamW(optimizer_parameters,
                              lr=CFG.encoder_lr,
                              eps=CFG.eps,
                              betas=CFG.betas)
        
        scheduler = get_scheduler(CFG, optimizer, len(self.train_folds))
        lr_scheduler_dict = {"scheduler": scheduler, "interval": CFG.sch_interval}
        if self.cfg.use_scheduler:
            return {'optimizer': optimizer, 'lr_scheduler': lr_scheduler_dict}
        else:
            return {'optimizer': optimizer}
    
    def on_validation_epoch_end(self):
        all_preds = torch.cat(self.val_step_outputs)
        all_labels = torch.cat(self.val_step_labels)
        self.val_step_outputs.clear()
        self.val_step_labels.clear()
        
        all_preds = nn.Softmax(dim=1)(all_preds)
        score = get_score(all_preds, all_labels)
        self.log("accuracy_score", score, on_step=False, on_epoch=True, prog_bar=True)
        if self.trainer.global_rank == 0:
            print(f"\nEpoch: {self.current_epoch}, accuracy_score: {score}", flush=True)
           
    
    # Return hidden states 
    def feature(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Weighted Layers CLS
        all_hidden_states = torch.stack(outputs.hidden_states)
        pooler = WeightedLayerPooling(
            self.config.num_hidden_layers, 
            layer_start=self.cfg.layer_start, layer_weights=None
        )
        weighted_pooling_embeddings = pooler(all_hidden_states)
        feature = weighted_pooling_embeddings[:, 0]
       
        return feature
    
    # The Model Architicture
    def forward(self, batch):
        feature = self.feature(batch['input_ids'], batch['attention_mask'])
        output = self.fc(feature)
        return output

## Training Loop

In [None]:
criterion = nn.BCEWithLogitsLoss()
for fold in CFG.trn_fold:
    print(f"Fold: {fold}")
    train_folds = train_df.loc[train_df['fold'] != fold].reset_index(drop=True)
    valid_folds = train_df.loc[train_df['fold'] == fold].reset_index(drop=True)
    
    model = CustomModel(cfg=CFG,
                        criterion=criterion,
                        train_folds=train_folds,
                        valid_folds=valid_folds,
                        fold=fold,
                        pretrained=True)

    checkpoint_callback = ModelCheckpoint(
        save_weights_only=True,
        monitor="accuracy_score",
        dirpath=OUTPUT_DIR,
        mode="max",
        filename=f"model-f{fold}-{{accuracy_score:.4f}}",
        save_top_k=1,
        verbose=1,
    )


    trainer = Trainer(max_epochs=CFG.epochs,devices=1,
                      accelerator='gpu', deterministic=False, 
                      precision=CFG.precision, strategy='auto',
                      callbacks=[checkpoint_callback],
                      logger=CSVLogger(save_dir=f'logs_f{fold}/'),)
    trainer.fit(model)

Fold: 0


Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-ca were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']

- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).

- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

Using 16bit Automatic Mixed Precision (AMP)

GPU available: True (cuda), used: True

TPU available

query is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfAttention'>

key is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfAttention'>

value is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfAttention'>

dense is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfOutput'>

LayerNorm is re-initialized, type: nn.LayerNorm, <class 'transformers.models.bert.modeling_bert.BertSelfOutput'>

dense is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertIntermediate'>

dense is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertOutput'>

LayerNorm is re-initialized, type: nn.LayerNorm, <class 'transformers.models.bert.modeling_bert.BertOutput'>




  | Name              | Type              | Params

--------------------------------------------------------

0 | criterion         | BCEWithLogitsLoss | 0     

1 | tabular_processor | Sequential        | 168   

2 | model             | BertModel         | 109 M 

3 | pool              | MeanPooling       | 0     

4 | fc                | Linear            | 3.8 K 

--------------------------------------------------------

85.6 M    Trainable params

23.4 M    Non-trainable params

109 M     Total params

436.341   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]



Epoch: 0, accuracy_score: 0.5


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 0, accuracy_score: 0.7587882280349731


Epoch 0, global step 1065: 'accuracy_score' reached 0.75879 (best 0.75879), saving model to '/notebooks/model-f0-accuracy_score=0.7588.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 1, accuracy_score: 0.7894912362098694


Epoch 1, global step 2130: 'accuracy_score' reached 0.78949 (best 0.78949), saving model to '/notebooks/model-f0-accuracy_score=0.7895.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 2, accuracy_score: 0.8309106826782227


Epoch 2, global step 3195: 'accuracy_score' reached 0.83091 (best 0.83091), saving model to '/notebooks/model-f0-accuracy_score=0.8309.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 3, accuracy_score: 0.8448160886764526


Epoch 3, global step 4260: 'accuracy_score' reached 0.84482 (best 0.84482), saving model to '/notebooks/model-f0-accuracy_score=0.8448.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 4, accuracy_score: 0.8416271209716797


Epoch 4, global step 5325: 'accuracy_score' was not in top 1

`Trainer.fit` stopped: `max_epochs=5` reached.


Fold: 1


Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-ca were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']

- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).

- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

Using 16bit Automatic Mixed Precision (AMP)

GPU available: True (cuda), used: True

TPU available

query is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfAttention'>

key is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfAttention'>

value is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfAttention'>

dense is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfOutput'>

LayerNorm is re-initialized, type: nn.LayerNorm, <class 'transformers.models.bert.modeling_bert.BertSelfOutput'>

dense is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertIntermediate'>

dense is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertOutput'>

LayerNorm is re-initialized, type: nn.LayerNorm, <class 'transformers.models.bert.modeling_bert.BertOutput'>


Sanity Checking: |          | 0/? [00:00<?, ?it/s]



Epoch: 0, accuracy_score: 0.5


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 0, accuracy_score: 0.6232519149780273


Epoch 0, global step 1012: 'accuracy_score' reached 0.62325 (best 0.62325), saving model to '/notebooks/model-f1-accuracy_score=0.6233.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 1, accuracy_score: 0.6296515464782715


Epoch 1, global step 2024: 'accuracy_score' reached 0.62965 (best 0.62965), saving model to '/notebooks/model-f1-accuracy_score=0.6297.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 2, accuracy_score: 0.672374963760376


Epoch 2, global step 3036: 'accuracy_score' reached 0.67237 (best 0.67237), saving model to '/notebooks/model-f1-accuracy_score=0.6724.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 3, accuracy_score: 0.6772635579109192


Epoch 3, global step 4048: 'accuracy_score' reached 0.67726 (best 0.67726), saving model to '/notebooks/model-f1-accuracy_score=0.6773.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 4, accuracy_score: 0.6474282741546631


Epoch 4, global step 5060: 'accuracy_score' was not in top 1

`Trainer.fit` stopped: `max_epochs=5` reached.


Fold: 2


Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-ca were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']

- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).

- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

Using 16bit Automatic Mixed Precision (AMP)

GPU available: True (cuda), used: True

TPU available

query is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfAttention'>

key is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfAttention'>

value is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfAttention'>

dense is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfOutput'>

LayerNorm is re-initialized, type: nn.LayerNorm, <class 'transformers.models.bert.modeling_bert.BertSelfOutput'>

dense is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertIntermediate'>

dense is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertOutput'>

LayerNorm is re-initialized, type: nn.LayerNorm, <class 'transformers.models.bert.modeling_bert.BertOutput'>


Sanity Checking: |          | 0/? [00:00<?, ?it/s]



Epoch: 0, accuracy_score: 0.5


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 0, accuracy_score: 0.5864412188529968


Epoch 0, global step 979: 'accuracy_score' reached 0.58644 (best 0.58644), saving model to '/notebooks/model-f2-accuracy_score=0.5864.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 1, accuracy_score: 0.6420406699180603


Epoch 1, global step 1958: 'accuracy_score' reached 0.64204 (best 0.64204), saving model to '/notebooks/model-f2-accuracy_score=0.6420.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 2, accuracy_score: 0.6030604839324951


Epoch 2, global step 2937: 'accuracy_score' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 3, accuracy_score: 0.6332437992095947


Epoch 3, global step 3916: 'accuracy_score' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 4, accuracy_score: 0.6351137757301331


Epoch 4, global step 4895: 'accuracy_score' was not in top 1

`Trainer.fit` stopped: `max_epochs=5` reached.


Fold: 3


Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-ca were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']

- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).

- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

Using 16bit Automatic Mixed Precision (AMP)

GPU available: True (cuda), used: True

TPU available

query is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfAttention'>

key is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfAttention'>

value is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfAttention'>

dense is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertSelfOutput'>

LayerNorm is re-initialized, type: nn.LayerNorm, <class 'transformers.models.bert.modeling_bert.BertSelfOutput'>

dense is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertIntermediate'>

dense is re-initialized, type: nn.Linear, <class 'transformers.models.bert.modeling_bert.BertOutput'>

LayerNorm is re-initialized, type: nn.LayerNorm, <class 'transformers.models.bert.modeling_bert.BertOutput'>


Sanity Checking: |          | 0/? [00:00<?, ?it/s]



Epoch: 0, accuracy_score: 0.5


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 0, accuracy_score: 0.646713137626648


Epoch 0, global step 975: 'accuracy_score' reached 0.64671 (best 0.64671), saving model to '/notebooks/model-f3-accuracy_score=0.6467.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 1, accuracy_score: 0.6617953181266785


Epoch 1, global step 1950: 'accuracy_score' reached 0.66180 (best 0.66180), saving model to '/notebooks/model-f3-accuracy_score=0.6618.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 2, accuracy_score: 0.6448408365249634


Epoch 2, global step 2925: 'accuracy_score' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]



Epoch: 3, accuracy_score: 0.66795814037323


Epoch 3, global step 3900: 'accuracy_score' reached 0.66796 (best 0.66796), saving model to '/notebooks/model-f3-accuracy_score=0.6680.ckpt' as top 1
