In [1]:
pip install transformers torch




In [2]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and model for AraBERT
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
araber_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [3]:
import pandas as pd

# Load the dataset
file_path = 'data.csv'  # Replace with your file path
data = pd.read_csv(file_path)
data['Metaphorical Density'] = pd.to_numeric(data['Metaphorical Density'], errors='coerce')

In [4]:
# Preprocess the text data
def preprocess_verses(verses, tokenizer, max_length):
    encoded = tokenizer(
        verses.tolist(),  # Convert series to list
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt"  # Return PyTorch tensors
    )
    return encoded['input_ids'], encoded['attention_mask']

# Example preprocessing
input_ids, attention_mask = preprocess_verses(data['البيت'], tokenizer, max_length=22)

Y data processing

In [5]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Assume data is already loaded
features_to_encode = ['العصر', 'الشاعر', 'الديوان', 'القافية', 'البحر', 'Semantic']


# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the categorical columns
one_hot_encoded = encoder.fit_transform(data[features_to_encode])

# Create a DataFrame with the encoded columns
one_hot_df = pd.DataFrame(one_hot_encoded, 
                          columns=encoder.get_feature_names_out(features_to_encode))

# Concatenate the one-hot encoded columns with the original DataFrame
y = pd.concat([one_hot_df,data['Metaphorical Density']], axis=1)

In [24]:
semantic_cols = [
    "Semantic_acceptance", "Semantic_admiration", "Semantic_ambiguity", "Semantic_anger", "Semantic_anguish", 
    "Semantic_anticipation", "Semantic_anxiety", "Semantic_awe", "Semantic_bittersweet", "Semantic_calm", 
    "Semantic_caution", "Semantic_concern", "Semantic_confidence", "Semantic_conflict", "Semantic_confusion", 
    "Semantic_contempt", "Semantic_contentment", "Semantic_curiosity", "Semantic_defiance", "Semantic_desire", 
    "Semantic_despair", "Semantic_determination", "Semantic_disgust", "Semantic_fear", "Semantic_frustration", 
    "Semantic_gratitude", "Semantic_hope", "Semantic_joy", "Semantic_longing", "Semantic_love", "Semantic_melancholy", 
    "Semantic_neutral", "Semantic_nostalgia", "Semantic_optimism", "Semantic_pessimism", "Semantic_pride", "Semantic_regret", 
    "Semantic_revenge", "Semantic_romantic", "Semantic_sadness", "Semantic_sorrow", "Semantic_surprise", "Semantic_trust", 
    "Semantic_trusted", "Semantic_urgency", "Semantic_warning", "Semantic_wonder"
]

era_cols = [
    "العصر_المخضرمين", "العصر_قبل الإسلام","العصر_الأموي", "العصر_العباسي"
]

density_col = [
    "Metaphorical Density"
]

asr = y[era_cols]
semantic = y[semantic_cols]
density = y[density_col]

num_classes_asr = len(set(data['العصر']))
num_classes_semantic = len(set(data['Semantic']))

In [28]:
from torch.utils.data import Dataset

class ArabicPoetryDataset(Dataset):
    def __init__(self, input_ids, attention_mask, y_asr, y_semantic, y_density):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.y_asr = y_asr
        self.y_semantic = y_semantic
        self.y_density = y_density

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return (
            self.input_ids[idx],
            self.attention_mask[idx],
            self.y_asr[idx],
            self.y_semantic[idx],
            self.y_density[idx]
        )


In [None]:
import torch

# Convert one-hot encoded columns and density to tensors
y_asr = torch.tensor(asr.values, dtype=torch.float32)  # One-hot encoded ASR
y_semantic = torch.tensor(semantic.values, dtype=torch.float32)  # One-hot encoded Semantic
y_density = torch.tensor(density.values, dtype=torch.float32).view(-1, 1)  # Reshape for regression


tensor([[0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]])


In [35]:
from torch.utils.data import DataLoader

# Create the dataset
dataset = ArabicPoetryDataset(input_ids, attention_mask, y_asr, y_semantic, y_density)

# Create the DataLoader
train_dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [38]:
import torch.nn as nn
import torch

class AraBERTMultitaskModel(nn.Module):
    def __init__(self, araber_model, num_asr_classes, num_semantic_classes):
        super(AraBERTMultitaskModel, self).__init__()
        self.araber = araber_model
        self.dropout = nn.Dropout(0.3)
        
        # Output layers
        self.asr_output = nn.Linear(768, num_asr_classes)  # AraBERT hidden size = 768
        self.semantic_output = nn.Linear(768, num_semantic_classes)
        self.density_output = nn.Linear(768, 1)  # Single output for numerical prediction

    def forward(self, input_ids, attention_mask):
        # Get embeddings from AraBERT
        outputs = self.araber(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # CLS token representation
        pooled_output = self.dropout(pooled_output)
        
        # Multitask outputs
        asr_logits = self.asr_output(pooled_output)
        semantic_logits = self.semantic_output(pooled_output)
        density_value = self.density_output(pooled_output)
        
        return asr_logits, semantic_logits, density_value

# Instantiate the model
model = AraBERTMultitaskModel(araber_model, num_asr_classes = num_classes_asr, num_semantic_classes=num_classes_semantic)  # Adjust classes

In [43]:
import torch.optim as optim

# Loss functions
criterion_asr = nn.CrossEntropyLoss()  # For categorical output
criterion_semantic = nn.CrossEntropyLoss()  # For categorical output
criterion_density = nn.MSELoss()  # For numerical output

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [None]:
# Example training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids, attention_mask, y_asr, y_semantic, y_density = batch
        
        optimizer.zero_grad()
        
        # Forward pass
        asr_logits, semantic_logits, density_value = model(input_ids, attention_mask)
        print(asr_logits)
        
        # Compute losses
        loss_asr = criterion_asr(asr_logits, y_asr)
        loss_semantic = criterion_semantic(semantic_logits, y_semantic)
        loss_density = criterion_density(density_value.squeeze(), y_density)
        
        # Combine losses
        loss = loss_asr + loss_semantic + loss_density
        # print(loss)
        # print(loss_asr)       
        # print(loss_density)
        # print(loss_semantic) 
        # Backpropagation
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader)}")

tensor([[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]], grad_fn=<AddmmBackward0>)


  return F.mse_loss(input, target, reduction=self.reduction)


tensor([[nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan],
        [nan, nan, nan, nan]], grad_fn=<AddmmBackward0>)


KeyboardInterrupt: 