# About This Project

# Bert Model and Tokenizers

Hugging Face provides pretrained models together with their architectures, each loadable in a single line:

* tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
* model = AutoModel.from_pretrained("asafaya/bert-base-arabic")

# Look inside the dataset files

The dataset files are already split into train and test sets.

In [5]:
import os
import sys

import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc
import joblib

from transformers import AutoTokenizer, AutoModel
import torch
import pytorch_lightning as pl

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch import nn,optim

from transformers import AutoTokenizer, AutoModel

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

In [6]:
# Install the third-party packages imported by this notebook.
# The explicit %pip magic is used instead of bare `pip install ...`: the bare
# form relies on IPython automagic, which is not applied to multi-line cells,
# and %pip installs into the environment of the running kernel.
%pip install transformers pytorch_lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Dataset Module

Feature engineering and data files

In [17]:
class ArabicDataset(Dataset):
    """Tokenizes the ``"text"`` column of a table for an Arabic BERT model.

    Args:
        data: object whose ``data["text"].values`` yields the raw texts
            (the notebook passes a pandas DataFrame).
        max_len: every sample is padded/truncated to this many tokens.
        model_type: key of ``MODEL_CHECKPOINTS`` selecting the tokenizer.

    Raises:
        KeyError: if ``model_type`` is not a key of ``MODEL_CHECKPOINTS``.
    """

    # Short model name -> Hugging Face checkpoint used to load the tokenizer.
    MODEL_CHECKPOINTS = {"Mini": "asafaya/bert-mini-arabic",
                         "Medium": "asafaya/bert-medium-arabic",
                         "Base": "asafaya/bert-base-arabic",
                         "Large": "asafaya/bert-large-arabic",
                         "Twitter":"aubmindlab/bert-base-arabertv02-twitter"}

    def __init__(self,data,max_len,model_type="Mini"):
        super().__init__()
        self.texts = data["text"].values
        self.max_len = max_len
        try:
            checkpoint = self.MODEL_CHECKPOINTS[model_type]
        except KeyError:
            # Same exception type as a plain dict lookup, but with a usable message.
            raise KeyError(
                f"Unknown model_type {model_type!r}; "
                f"expected one of {sorted(self.MODEL_CHECKPOINTS)}"
            ) from None
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self,idx):
        """Return ``{"inputs": {...}}`` with the tokenized sample at ``idx``.

        The inner dict holds ``input_ids``, ``token_type_ids`` and
        ``attention_mask``; each is row 0 of the tokenizer's batched output.
        """
        raw = self.texts[idx]
        if not isinstance(raw, str):
            # Empty CSV cells are read by pandas as NaN (a float), which has no
            # .split(); treat missing values as an empty text instead of crashing.
            raw = "" if pd.isna(raw) else str(raw)
        # Collapse every run of whitespace (including newlines) to one space.
        text = " ".join(raw.split())
        encoded = self.tokenizer(text,padding='max_length',
                                 max_length=self.max_len,truncation=True,return_tensors="pt")
        # return_tensors="pt" adds a leading batch axis of size 1; [0] drops it.
        return {
            "inputs": {key: encoded[key][0]
                       for key in ("input_ids", "token_type_ids", "attention_mask")}
        }
        


A peek at the dataset module

In [18]:
class ArabicDataModule(pl.LightningDataModule):
    """Lightning data module that serves the test CSV as an ``ArabicDataset``.

    Args:
        test_path: path of the CSV file read in ``setup``.
        batch_size: batch size of the test dataloader.
        max_len: token length forwarded to ``ArabicDataset``.
        model_type: tokenizer key forwarded to ``ArabicDataset``.
        num_workers: number of DataLoader worker processes.
    """

    def __init__(self,test_path,batch_size=12,max_len=100,model_type="Mini",num_workers=4):
        super().__init__()
        self.test_path= test_path
        self.batch_size = batch_size
        self.max_len = max_len
        self.model_type = model_type
        self.num_workers = num_workers

    def setup(self,stage=None):
        """Read the test CSV and wrap it in an ``ArabicDataset``."""
        test = pd.read_csv(self.test_path)
        self.test_dataset = ArabicDataset(data=test,max_len=self.max_len,model_type=self.model_type)

    def test_dataloader(self):
        """Return a DataLoader over the test dataset in file order."""
        # shuffle must stay False: the notebook writes predictions back onto
        # the rows of the test CSV by position, so batches have to arrive in
        # the same order as the file.
        return DataLoader(self.test_dataset,batch_size=self.batch_size,
                          shuffle=False,num_workers=self.num_workers)
    

# Bert fine tuning Module

In [38]:
class ArabicBertModel(pl.LightningModule):
    """Pretrained Arabic BERT encoder followed by one linear classification layer.

    Args:
        output: number of classes (output features of the linear layer).
        weight: per-class weights passed to the cross-entropy loss, or ``None``.
        model_type: key selecting the pretrained checkpoint and its hidden size.
    """

    def __init__(self,output,weight,model_type="Mini"):
        super().__init__()
        # Short model name -> (Hugging Face checkpoint, encoder hidden size).
        # "Twitter" is listed so the keys match those accepted by ArabicDataset.
        model = {"Mini": ("asafaya/bert-mini-arabic",256),
                "Medium": ("asafaya/bert-medium-arabic",512),
                "Base": ("asafaya/bert-base-arabic",768),
                "Large": ("asafaya/bert-large-arabic",1024),
                "Twitter": ("aubmindlab/bert-base-arabertv02-twitter",768)}
        checkpoint, hidden_size = model[model_type]
        self.bert_model = AutoModel.from_pretrained(checkpoint)
        self.fc = nn.Linear(hidden_size,output)
        if weight is not None and not torch.is_tensor(weight):
            weight = torch.as_tensor(weight, dtype=torch.float)
        # Registered as a buffer so the weights follow the module across
        # .to(device)/.cuda(); a plain attribute would stay on the CPU and
        # break the loss on GPU. persistent=False keeps it out of the
        # state_dict, so checkpoints saved before this change still load.
        self.register_buffer("weight", weight, persistent=False)

    def forward(self,inputs):
        """Return class logits for a dict of tokenizer outputs.

        ``inputs`` is unpacked as keyword arguments into the encoder.
        """
        out = self.bert_model(**inputs)
        # Element 1 of the encoder output is used as the pooled sentence
        # representation (pooler_output for a Hugging Face BertModel).
        pooler = out[1]
        return self.fc(pooler)

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters (lr=0.0001)."""
        return optim.AdamW(self.parameters(), lr=0.0001)

    def criterion(self,output,target):
        """Return the class-weighted cross-entropy loss of logits vs. targets."""
        return nn.functional.cross_entropy(output, target, weight=self.weight)

    def _shared_step(self,batch):
        """Compute the loss for one batch holding ``"inputs"`` and ``"labels"``."""
        x,y = batch["inputs"],batch["labels"]
        return self.criterion(self(x),y)

    #TODO: adding metrics
    def training_step(self,batch,batch_idx):
        """Return the training loss for the batch."""
        return self._shared_step(batch)

    def validation_step(self,batch,batch_idx):
        """Return the validation loss for the batch."""
        return self._shared_step(batch)

# Results and Discussions

In [44]:
from tqdm.auto import tqdm


MODEL_TYPE = "Mini"

# Choose the device first so checkpoints are mapped straight onto it; this
# also lets a checkpoint saved on GPU load on a CPU-only machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load_from_checkpoint is called on the class: it builds the model itself, so
# constructing an instance beforehand only downloads the encoder a second time,
# and recent Lightning releases reject calling it on an instance.
modelcategory = ArabicBertModel.load_from_checkpoint(
    checkpoint_path="/content/lightning_logs/version_4/checkpoints/epoch=2-step=165.ckpt",
    hparams_file="/content/lightning_logs/version_4/hparams.yaml",
    map_location=device,
    output=10,
    weight=torch.tensor([9.0, 2.0, 1.0, 3.0, 1.0, 3.0, 3.0, 9.0, 9.0, 3.0]),
    model_type=MODEL_TYPE,
)

modelstance = ArabicBertModel.load_from_checkpoint(
    checkpoint_path="/content/lightning_logs/version_0/checkpoints/epoch=2-step=165.ckpt",
    hparams_file="/content/lightning_logs/version_0/hparams.yaml",
    map_location=device,
    output=3,
    weight=torch.tensor([5.0,2.0,1.0]),
    model_type=MODEL_TYPE,
)

# Both models receive the same device-resident inputs, so both must be moved.
modelcategory.to(device)
modelstance.to(device)
modelstance.eval()
modelcategory.eval()

load = ArabicDataModule(test_path="/content/test.csv",batch_size=128,max_len=80,model_type=MODEL_TYPE)
load.setup()
test_dataloader = load.test_dataloader()

preds_cat = []
preds_stance = []

# Wrapping the loader lets tqdm advance once per batch without manual updates.
progress_bar = tqdm(test_dataloader)

with torch.no_grad():
    for batch in progress_bar:
        inp = {k: v.to(device) for k, v in batch["inputs"].items()}

        outputs_cat = modelcategory(inp)
        outputs_stance = modelstance(inp)

        # argmax over the last axis picks the predicted class index per sample.
        preds_cat.append(torch.argmax(outputs_cat, dim=-1).cpu())
        preds_stance.append(torch.argmax(outputs_stance, dim=-1).cpu())

# One 1-D CPU tensor of class indices per task, in dataloader order.
preds_cat = torch.cat(preds_cat)
preds_stance = torch.cat(preds_stance)

Some weights of the model checkpoint at asafaya/bert-mini-arabic were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at asafaya/bert-mini-arabic were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight

  0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
# Load the objects later used to map predicted class indices back to labels.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load files from a trusted source.
le_decode_cat, le_decode_stance = (
    joblib.load(encoder_path)
    for encoder_path in ('/content/label_encoder.pkl',
                         '/content/label_encoder_stance.pkl')
)
# Show the raw predicted class indices for both tasks.
for predicted in (preds_cat, preds_stance):
    print(predicted)

In [52]:
test = pd.read_csv("/content/test.csv")
# Predictions are attached to rows by position, so a count mismatch would
# silently mislabel rows; fail loudly instead.
assert len(test) == len(preds_cat) == len(preds_stance), (
    "number of predictions does not match number of rows in test.csv"
)
# Convert explicitly to NumPy before handing the indices to the decoders.
test.loc[:,"category"] = le_decode_cat.inverse_transform(np.asarray(preds_cat))
test.loc[:,"stance"] = le_decode_stance.inverse_transform(np.asarray(preds_stance))
# With pandas defaults the DataFrame index is written as the first column.
test.to_csv("test2.csv")