# NLP DL Pipeline (Inference)
# By Mohamed Eltayeb

# Import Libraries

In [1]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import glob
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
plt.rcParams["figure.figsize"] = (12, 8)
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.optim import Optimizer
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd.function import InplaceFunction
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# PyTorch Lightning imports:
import pytorch_lightning as pl
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.tuner.tuning import Tuner
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping 
from pytorch_lightning.loggers import CSVLogger

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=False

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

tokenizers.__version__: 0.15.2
transformers.__version__: 4.38.1


2024-03-10 22:26:50.647067: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-10 22:26:50.647176: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-10 22:26:50.795414: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


env: TOKENIZERS_PARALLELISM=False


# Pipeline

## 1- Directory Settings

In [2]:
# ====================================================
# Directory settings
# ====================================================
import os

# Working directory for files written by this notebook (e.g. the config.pth
# saved when a model is constructed). `exist_ok=True` replaces the separate
# exists()/makedirs() pair, so creation cannot fail if the folder already exists.
OUTPUT_DIR = './'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Competition data (train is used for label/metadata lookups, test for prediction).
TRAIN_PATH = '/kaggle/input/arabic-poem-classification/poems.csv'
TEST_PATH = '/kaggle/input/arabic-poem-classification/test.csv'
# Folder holding the fine-tuned *.ckpt files, the saved tokenizer and config.pth.
MODELS_PATH = '/kaggle/input/poem-dl-ckpts/3'

## 2- Data Loading

In [3]:
# ====================================================
# Data Loading
# ====================================================
# The training frame is only needed for its labels and author/title metadata;
# the model itself predicts on the test frame.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

print(f"test.shape: {test_df.shape}")
display(test_df.head())

test.shape: (4887, 4)


Unnamed: 0,ID,Title,Author,Poem
0,1,أرعى الأمانة لا أخون أمانتي,كعب بن زهير,أَرعى الأَمانَةَ لا أَخونُ أَمانَتي إِنَّ الخ...
1,2,شوق يروح مع الزمان ويغتدي,إيليا ابو ماضي,شَوقٌ يَروحُ مَعَ الزَمانِ وَيَغتَدي وَالشَوقُ...
2,3,مفتاحٌ وحيدٌ وحُجراتٌ ِعدة,محمود فرغلي,الي المفكر الفلسطيني الراحل /احمد صدقي الدجاني...
3,4,لي ابنُ عمٍ يجر الشر مجتهداً,ابن الرومي,لي ابن عم يجر الشر مجتهدا علي قدما ولا يصلي ل...
4,5,متى تَظْهَرِ النَّعماءُ تَشْجَ بها العِدَى,كشاجم,متي تظهر النعماء تشج بها العدي وليس لهم علم ب...


In [4]:
# Fold title and author into the model input so every sample is a single string:
# <title marker> Title <author marker> Author <text marker> Poem.
# NOTE: this overwrites the 'Poem' column, so re-running the cell prepends the prefix again.
test_df['Poem'] = (
    "العنوان " + test_df['Title']
    + " والكاتب هو " + test_df['Author']
    + " :والنص هو " + test_df['Poem']
)

In [5]:
# Fit the encoder on the training labels so class indices can be decoded back to names.
le = LabelEncoder()
le.fit(train_df["Category"])

# label -> index, and index -> label (the latter decodes argmax predictions).
mapping = {label: index for index, label in enumerate(le.classes_)}
mapping_rev = dict(enumerate(le.classes_))

## 3- CFG

In [6]:
class CFG:
    """Central settings object shared by the tokenizer, dataset, model and loaders."""

    # ----------------------------------------------------------------- General
    competition = 'Poem'   # Competition name
    seed = 42

    # -------------------------------------------------------------------- Data
    # Max sequence length in tokens; keep in sync with the truncation length
    # used by prepare_input.
    max_len = 256
    batch_size = 1
    valid_batch_size = 1
    num_workers = 4        # Worker processes used by the DataLoader
    target_cols = [
        'Category_العصر الأندلسي',
        'Category_العصر الايوبي',
        'Category_العصر العباسي',
        'Category_العصر المملوكي',
        'Category_العصر حديث',
    ]

    # ---------------------------------------------------------------- Training
    model = 'CAMeL-Lab/bert-base-arabic-camelbert-ca'
    epochs = 5

    pooling = 'mean'
    features_type = 'weighted_layers_cls'
    output_hidden_states = True
    layer_start = 12       # First hidden-state index fed to the weighted layer pooling

    # --------------------------------------------------------------- Optimizer
    encoder_lr = 1.5e-5    # Learning rate of the pretrained backbone
    decoder_lr = 1.5e-5    # Learning rate of the new classification head

    eps = 1e-6             # Adam epsilon
    betas = (0.9, 0.999)   # Adam betas
    weight_decay = 0.02

    precision = "16-mixed"

    # --------------------------------------------------------------- Scheduler
    use_scheduler = True
    scheduler = 'cosine'   # 'cosine' or 'linear' or 'cosine_hard'
    num_cycles = 0.25
    num_warmup_steps = 1
    sch_interval = 'step'  # 'step' or 'epoch'

## 4- Utils

In [7]:
# ====================================================
# Utils
# ====================================================
def get_score(y_preds, y_trues, num_classes=5):
    """Macro-averaged multiclass accuracy computed with plain torch.

    The previous body instantiated ``MulticlassAccuracy``, which this notebook
    never imports, so any call raised ``NameError``. This version has no extra
    dependency and does not rely on the global ``device``.
    NOTE(review): intended to mirror the default (macro) averaging of
    torchmetrics' ``MulticlassAccuracy`` — confirm against the training notebook.

    Args:
        y_preds: Either per-class scores with one more dimension than
            ``y_trues`` (the class axis is dim 1) or already-decoded class ids.
        y_trues: Integer class ids.
        num_classes: Number of classes; ids are expected in ``[0, num_classes)``.

    Returns:
        0-dim float tensor on the device of ``y_preds``: the mean per-class
        recall over every class that occurs in the targets or the predictions
        (0.0 when there are no samples).
    """
    # Decode scores to class ids when a class axis is present.
    if y_preds.ndim == y_trues.ndim + 1:
        pred_labels = y_preds.argmax(dim=1)
    else:
        pred_labels = y_preds
    pred_labels = pred_labels.long().reshape(-1)
    true_labels = y_trues.long().reshape(-1).to(pred_labels.device)

    hits = pred_labels == true_labels
    correct = torch.bincount(true_labels[hits], minlength=num_classes).float()
    support = torch.bincount(true_labels, minlength=num_classes).float()
    predicted = torch.bincount(pred_labels, minlength=num_classes).float()

    # A class takes part in the average if it shows up in targets or predictions;
    # classes that are predicted but never true contribute a score of 0.
    active = (support + predicted) > 0
    if not bool(active.any()):
        return torch.zeros((), device=pred_labels.device)

    per_class = correct / support.clamp(min=1)
    return per_class[active].mean()

# Seed the random number generators through Lightning so runs are repeatable.
seed_everything(CFG.seed)

42

## 6- Tokenizer

In [8]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer that was saved alongside the fine-tuned checkpoints.
CFG.tokenizer = AutoTokenizer.from_pretrained(f"{MODELS_PATH}/tokenizer/")

## 7- Dataset

In [9]:
# ====================================================
# Dataset
# ====================================================
# Tokenize one text and truncate it to the maximum length; padding is applied later, per batch, by the DataLoader's collate function.
def prepare_input(cfg, text):
    """Tokenize a single text into long tensors.

    Args:
        cfg: Settings object exposing ``tokenizer``. Its ``max_len`` is used as
            the truncation length; 256 (the value that used to be hard-coded
            here) is the fallback when the attribute is missing.
        text: The string to encode.

    Returns:
        The tokenizer output with every value converted to a
        ``torch.long`` tensor. No padding is applied here.
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        truncation=True,
        max_length=getattr(cfg, 'max_len', 256),
        add_special_tokens=True
    )
    # Snapshot the items first so the mapping is not iterated while being reassigned.
    for key, value in list(inputs.items()):
        inputs[key] = torch.tensor(value, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Map-style dataset over the 'Poem' column that tokenizes on access."""

    def __init__(self, cfg, df):
        # Only the text column is kept; cfg is forwarded to prepare_input.
        self.texts = df['Poem'].values
        self.cfg = cfg

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        encoded = prepare_input(self.cfg, self.texts[item])
        # Expose just the two fields the model consumes.
        return {field: encoded[field] for field in ('input_ids', 'attention_mask')}

## 8- Model

In [10]:
class MeanPooling(nn.Module):
    """Attention-mask-aware mean over the sequence axis (dim 1)."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so masked positions contribute 0.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        summed = (last_hidden_state * mask).sum(dim=1)
        # The clamp keeps the division finite when a row has no unmasked positions.
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return summed / counts

class WeightedLayerPooling(nn.Module):
    """Weighted average of the hidden states from ``layer_start`` onwards.

    Args:
        num_hidden_layers: Number of transformer layers; the stacked hidden
            states are expected to hold ``num_hidden_layers + 1`` entries.
        layer_start: Index of the first hidden state included in the average.
        layer_weights: Optional tensor with one weight per included layer. When
            omitted, a parameter of ones (a plain average) is created.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            layer_weights = nn.Parameter(
                torch.ones(num_hidden_layers + 1 - layer_start, dtype=torch.float)
            )
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        """Return the weighted layer average; the leading (layer) axis is reduced."""
        all_layer_embedding = all_hidden_states[self.layer_start:, :, :, :]
        # Follow the input's device instead of a module-level `device` global, so
        # the layer also works when it is built on the fly and never moved.
        weights = self.layer_weights.to(all_layer_embedding.device)
        weight_factor = weights[:, None, None, None]  # broadcasts over the remaining axes
        weighted_average = (weight_factor * all_layer_embedding).sum(dim=0) / weights.sum()
        return weighted_average

#### Optimizer & Scheduler Helpers

In [11]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    """Split a model's parameters into three optimizer parameter groups.

    Groups, in order:
      1. parameters that receive weight decay (``encoder_lr``),
      2. biases and LayerNorm parameters, without weight decay (``encoder_lr``),
      3. parameters whose name contains ``"fc"``, without weight decay (``decoder_lr``).

    Every parameter lands in exactly one group. Previously ``"fc"`` parameters
    were also listed in group 1 or 2, and torch optimizers reject parameters
    that appear in more than one group. For models without ``"fc"`` parameters
    the result is unchanged.

    Returns:
        A list of three dicts with the keys ``params``, ``lr`` and ``weight_decay``.
    """
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")
    decayed, undecayed, head = [], [], []
    for name, param in model.named_parameters():
        if "fc" in name:
            head.append(param)
        elif any(marker in name for marker in no_decay):
            undecayed.append(param)
        else:
            decayed.append(param)

    return [
        {'params': decayed, 'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': undecayed, 'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': head, 'lr': decoder_lr, 'weight_decay': 0.0},
    ]
    
    
def get_scheduler(cfg, optimizer, len_train_folds):
    """Build the warmup learning-rate schedule selected by ``cfg.scheduler``.

    Args:
        cfg: Settings exposing ``scheduler`` ('linear', 'cosine' or
            'cosine_hard'), ``batch_size``, ``epochs``, ``num_warmup_steps``
            and, for the cosine variants, ``num_cycles``.
        optimizer: The optimizer to schedule.
        len_train_folds: Number of training samples; used to derive the total
            number of training steps.

    Returns:
        The scheduler produced by the matching transformers factory.

    Raises:
        ValueError: If ``cfg.scheduler`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError).
    """
    num_train_steps = int(len_train_folds / cfg.batch_size * cfg.epochs)
    schedule_kwargs = dict(
        num_warmup_steps=cfg.num_warmup_steps,
        num_training_steps=num_train_steps,
    )

    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(optimizer, **schedule_kwargs)
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer, num_cycles=cfg.num_cycles, **schedule_kwargs
        )
    if cfg.scheduler == 'cosine_hard':
        # Imported here because the notebook's import cell does not bring this name in.
        from transformers import get_cosine_with_hard_restarts_schedule_with_warmup
        return get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer, num_cycles=cfg.num_cycles, **schedule_kwargs
        )

    raise ValueError(
        f"Unknown scheduler {cfg.scheduler!r}; expected 'linear', 'cosine' or 'cosine_hard'"
    )

#### The Model

In [12]:
# ====================================================
# Model
# ====================================================
class CustomModel(LightningModule):
    """Transformer encoder plus a linear classification head, as a LightningModule.

    Args:
        cfg: Settings object (defaults to ``CFG``). Read for the model name,
            target columns, pooling start layer, loader, optimizer and
            scheduler settings.
        criterion: Loss callable used as ``criterion(y_preds, labels)`` in the
            training/validation steps; not needed for inference.
        train_folds: Data handed to ``TrainDataset`` by ``train_dataloader``.
        valid_folds: Data handed to ``TrainDataset`` by ``val_dataloader``.
        fold: Stored on the instance; not read anywhere in this class.
        pretrained: If True the backbone weights are loaded with
            ``AutoModel.from_pretrained``; otherwise only the architecture is
            built from the config.
        config_path: Optional path to a config object saved with ``torch.save``.
    """

    def __init__(self, cfg=CFG, criterion=None, train_folds=None, valid_folds=None, fold=None, pretrained=False, config_path=None):
        super().__init__()
        self.cfg = cfg
        self.criterion = criterion
        self.train_folds = train_folds
        self.valid_folds = valid_folds
        self.fold = fold
        self.pretrained = pretrained
        # Per-epoch buffers filled by validation_step and drained in on_validation_epoch_end.
        self.val_step_outputs = []
        self.val_step_labels = []

        # Configuration
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states = True)
        else:
            # NOTE(review): torch.load unpickles arbitrary objects — only point
            # config_path at files from a trusted source.
            self.config = torch.load(config_path)

        # Backbone
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)

        # Persist the config so the model can later be rebuilt through config_path.
        torch.save(self.model.config, OUTPUT_DIR+'config.pth')
        # Kept for compatibility; feature()/forward() below do not use it.
        self.pool = MeanPooling()

        self.fc = nn.Linear(self.config.hidden_size, len(cfg.target_cols))

    def train_dataloader(self):
        """Shuffled training loader over ``self.train_folds``."""
        # NOTE(review): TrainDataset is not defined in the visible part of this notebook.
        train_dataset = TrainDataset(self.cfg, self.train_folds)
        return DataLoader(train_dataset,
                          batch_size=self.cfg.batch_size,
                          shuffle=True,
                          num_workers=self.cfg.num_workers,
                          drop_last=True)

    def val_dataloader(self):
        """Ordered validation loader over ``self.valid_folds``."""
        # NOTE(review): TrainDataset is not defined in the visible part of this notebook.
        valid_dataset = TrainDataset(self.cfg, self.valid_folds)
        return DataLoader(valid_dataset,
                          batch_size=self.cfg.valid_batch_size,
                          shuffle=False,
                          num_workers=self.cfg.num_workers,
                          drop_last=False)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss and the current learning rate."""
        inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}
        labels = batch['labels']
        # NOTE(review): `collate` is not defined in the visible part of this notebook.
        inputs = collate(inputs)

        y_preds = self(inputs)
        loss = self.criterion(y_preds, labels)

        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        # Log the learning rate of the last parameter group.
        lr = self.trainer.optimizers[0].param_groups[-1]["lr"]
        self.log("lr", lr, on_step=True, on_epoch=False, prog_bar=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss; buffer outputs for the epoch-level score."""
        inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}
        labels = batch['labels']
        # NOTE(review): `collate` is not defined in the visible part of this notebook.
        inputs = collate(inputs)

        y_preds = self(inputs)
        loss = self.criterion(y_preds, labels)

        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)

        self.val_step_outputs.append(y_preds)
        self.val_step_labels.append(labels)

        return loss

    def configure_optimizers(self):
        """Create the AdamW optimizer and, when enabled, the LR scheduler."""
        cfg = self.cfg
        optimizer_parameters = get_optimizer_params(self.model,
                                                    encoder_lr=cfg.encoder_lr,
                                                    decoder_lr=cfg.decoder_lr,
                                                    weight_decay=cfg.weight_decay)
        # The helper above only sees the backbone (self.model), so the
        # classification head is added explicitly; without this group self.fc
        # would never receive optimizer updates.
        optimizer_parameters.append({'params': list(self.fc.parameters()),
                                     'lr': cfg.decoder_lr, 'weight_decay': 0.0})

        optimizer = AdamW(optimizer_parameters,
                          lr=cfg.encoder_lr,
                          eps=cfg.eps,
                          betas=cfg.betas)

        if not cfg.use_scheduler:
            return {'optimizer': optimizer}

        # Built only when it is used, so train_folds is not required otherwise.
        scheduler = get_scheduler(cfg, optimizer, len(self.train_folds))
        lr_scheduler_dict = {"scheduler": scheduler, "interval": cfg.sch_interval}
        return {'optimizer': optimizer, 'lr_scheduler': lr_scheduler_dict}

    def on_validation_epoch_end(self):
        """Score the buffered validation outputs and reset the buffers."""
        all_preds = torch.cat(self.val_step_outputs)
        all_labels = torch.cat(self.val_step_labels)
        self.val_step_outputs.clear()
        self.val_step_labels.clear()

        all_preds = nn.Softmax(dim=1)(all_preds)
        score = get_score(all_preds, all_labels)
        self.log("accuracy_score", score, on_step=False, on_epoch=True, prog_bar=True)
        if self.trainer.global_rank == 0:
            print(f"\nEpoch: {self.current_epoch}, accuracy_score: {score}", flush=True)

    def feature(self, input_ids, attention_mask):
        """Return the first-token vector of the weighted layer average."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Weighted Layers CLS
        all_hidden_states = torch.stack(outputs.hidden_states)
        # NOTE(review): the pooling layer is rebuilt on every call with
        # layer_weights=None, so its weights are always the uniform default and
        # are never trained or restored from a checkpoint.
        pooler = WeightedLayerPooling(
            self.config.num_hidden_layers,
            layer_start=self.cfg.layer_start, layer_weights=None
        )
        weighted_pooling_embeddings = pooler(all_hidden_states)
        feature = weighted_pooling_embeddings[:, 0]

        return feature

    def forward(self, batch):
        """Model architecture: pooled feature followed by the linear head; returns raw logits."""
        feature = self.feature(batch['input_ids'], batch['attention_mask'])
        output = self.fc(feature)
        return output

## 11- Inference

In [13]:
def inference_fn(test_loader, model, device):
    """Run ``model`` over ``test_loader`` and return softmax probabilities.

    Args:
        test_loader: Iterable of batches; each batch is a mapping of tensors.
        model: Module called as ``model(batch)`` and returning logits with the
            class axis at dim 1.
        device: Device the model and every batch are moved to.

    Returns:
        A NumPy array with the per-batch probabilities concatenated along
        axis 0, in loader order.

    Raises:
        ValueError: From ``np.concatenate`` when the loader yields no batches.
    """
    model.eval()
    model.to(device)

    preds = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        # Build a fresh mapping on the target device; the loader's batch is left
        # untouched (the old in-place update ended with `del inputs[k]`, which
        # removed only the last key and failed on an empty batch).
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        with torch.no_grad():
            probs = torch.softmax(model(batch), dim=1)
        preds.append(probs.to('cpu').numpy())

    return np.concatenate(preds)


# Pad each batch only up to its own longest sequence.
pad_to_longest = DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest')

test_dataset = TestDataset(CFG, test_df)
# shuffle=False and drop_last=False keep one prediction per test row, in row order.
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=CFG.num_workers,
    pin_memory=True,
    collate_fn=pad_to_longest,
)

predictions = []
checkpoint_dir = MODELS_PATH
# Sorted so the checkpoints are processed in a reproducible order
# (glob itself makes no ordering guarantee).
checkpoint_paths = sorted(glob.glob(f"{checkpoint_dir}/*.ckpt"))
if not checkpoint_paths:
    # Fail here with a clear message instead of later, when averaging an empty list.
    raise FileNotFoundError(f"No .ckpt files found in {checkpoint_dir!r}")

for checkpoint_path in tqdm(checkpoint_paths):
    # strict=False tolerates key mismatches between the checkpoint and the module.
    model = CustomModel.load_from_checkpoint(
        checkpoint_path,
        map_location=device,
        strict=False,
        config_path=MODELS_PATH+"/config.pth",
    )
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)

    # Release the model before the next checkpoint is loaded.
    del model, prediction
    gc.collect()
    torch.cuda.empty_cache()


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/4887 [00:00<?, ?it/s]

  0%|          | 0/4887 [00:00<?, ?it/s]

  0%|          | 0/4887 [00:00<?, ?it/s]

  0%|          | 0/4887 [00:00<?, ?it/s]

  0%|          | 0/4887 [00:00<?, ?it/s]

In [14]:
# Average the per-checkpoint probabilities, take the most likely class per row
# and decode the class index back to its label.
# NOTE: this rebinds `predictions`, so the cell cannot be re-run on its own.
predictions = pd.Series(
    np.argmax(np.mean(predictions, axis=0), axis=1)
).map(mapping_rev)

In [15]:
# Preview the decoded label for each test row.
predictions

0        العصر العباسي
1           العصر حديث
2           العصر حديث
3        العصر العباسي
4        العصر العباسي
             ...      
4882        العصر حديث
4883    العصر المملوكي
4884     العصر الايوبي
4885        العصر حديث
4886     العصر الايوبي
Length: 4887, dtype: object

# Submission

In [16]:
# Dictionary to store authors who are in train
# Authors whose training poems span more than one era are ambiguous and must
# not be used as a lookup. They are selected by the actual number of distinct
# categories instead of a hard-coded count of 17.
eras_per_author = train_df.groupby('Author')['Category'].nunique()
drop_authors = eras_per_author[eras_per_author > 1].index.values
ambiguous_authors = set(drop_authors)  # set membership instead of scanning an array per author

# Every remaining training author mapped to the category of their first listed poem.
Authors_in_train_test = {
    author: category
    for author, category in train_df.drop_duplicates(subset=['Author'])[['Author', 'Category']].values
    if author not in ambiguous_authors
}

In [17]:
# Dictionary to store titles that are in train
# Each training title mapped to the category of its first occurrence.
unique_titles = train_df.drop_duplicates(subset=['Title'])
Titles_in_train_test = dict(zip(unique_titles['Title'].values, unique_titles['Category'].values))

In [18]:
submission = pd.DataFrame({"ID": test_df.ID ,"Target": predictions.values})

# Overwrite the model's prediction wherever the training data already knows the
# answer: first by author, then by title (so a title match wins over an author match).
for column, known_labels in (('Author', Authors_in_train_test), ('Title', Titles_in_train_test)):
    cls = test_df[column].map(known_labels).dropna()
    submission.loc[cls.index,'Target'] = cls.values

In [19]:
# Order by ID with a fresh 0..n-1 index, write the submission file, then display it.
submission = submission.sort_values('ID', ignore_index=True)
submission.to_csv('submission.csv',index=False)
submission

Unnamed: 0,ID,Target
0,1,العصر العباسي
1,2,العصر حديث
2,3,العصر حديث
3,4,العصر العباسي
4,5,العصر العباسي
...,...,...
4882,4883,العصر حديث
4883,4884,العصر المملوكي
4884,4885,العصر العباسي
4885,4886,العصر حديث
