In [16]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [17]:
# Define a custom dataset class for search matching
class SearchMatchingDataset(Dataset):
    """Dataset of (query, item) text pairs with one relevance label per pair.

    Sample ``i`` is the tokenised pair ``(queries[i], items[i])`` plus
    ``labels[i]`` converted to a float tensor.

    Args:
        queries: Sequence of query strings.
        items: Sequence of item strings aligned index-by-index with ``queries``.
        labels: Sequence of numeric labels aligned with ``queries``.
        tokenizer: Callable invoked as ``tokenizer(query, item, ...)``; it must
            return a mapping holding ``'input_ids'`` and ``'attention_mask'``
            tensors that carry a leading batch dimension of one.
        max_length: Length each encoded pair is padded / truncated to.

    Raises:
        ValueError: If ``queries``, ``items`` and ``labels`` differ in length.
    """

    def __init__(self, queries, items, labels, tokenizer, max_length):
        # __len__ only looks at the queries, so a misaligned input would
        # otherwise surface later as an IndexError in the middle of an epoch.
        if not (len(queries) == len(items) == len(labels)):
            raise ValueError(
                "queries, items and labels must have the same length, got "
                f"{len(queries)}, {len(items)} and {len(labels)}"
            )
        self.queries = queries
        self.items = items
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (query, item) pairs."""
        return len(self.queries)

    def __getitem__(self, idx):
        """Encode pair ``idx`` and return its tensors together with the label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (batch dimension
            removed) and ``'label'`` (float tensor).
        """
        # Calling the tokenizer directly is the supported replacement for the
        # legacy ``encode_plus`` entry point and takes the same arguments.
        encoding = self.tokenizer(
            self.queries[idx],
            self.items[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            # squeeze(0) drops only the batch axis; a bare squeeze() would also
            # collapse the sequence axis whenever max_length is 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # as_tensor accepts plain numbers as well as existing tensors.
            'label': torch.as_tensor(self.labels[idx], dtype=torch.float),
        }



In [18]:
# Define a search matching model using AraBERT
class SearchMatchingModel(nn.Module):
    """Encoder plus a one-unit linear head that scores a (query, item) pair.

    Args:
        bert_model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and must return an object
            exposing ``last_hidden_state``.
    """

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        # Size the head from the encoder's own configuration so encoders of
        # other widths work; 768 stays as the fallback when no
        # ``config.hidden_size`` attribute is available.
        hidden_size = getattr(getattr(bert_model, 'config', None), 'hidden_size', 768)
        self.fc = nn.Linear(hidden_size, 1)  # Output a single score for similarity
        self.dropout = nn.Dropout(0.1)  # Dropout layer for regularization

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid score per input sequence.

        Args:
            input_ids: Token-id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            Tensor with a trailing dimension of size 1 holding values in (0, 1).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The first token position ([CLS] for BERT-style inputs) summarises the pair.
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        similarity_score = torch.sigmoid(self.fc(pooled_output))
        return similarity_score


In [29]:
# queries = [
#     "كيفية تحضير كعكة الشوكولاته",
#     "أفضل مسارات المشي في كاليفورنيا",
#     "دروس برمجة بالبايثون"
# ]
# items = [
#     "وصفة كعكة الشوكولاته: تعلم كيفية تحضير كعكة شوكولاته لذيذة من الصفر.",
#     "استكشف جمال كاليفورنيا مع هذه المسارات المشي الرائعة.",
#     "احترف برمجة البايثون مع دروسنا الشاملة والأمثلة."
# ]
# labels = [1, 1, 0]  # 1 for relevant, 0 for irrelevant
################################################################################
################################################################################
# Each tuple is (query, item); keeping the two side by side makes the
# positional alignment that the dataset relies on explicit.
query_item_pairs = [
    (
        "تغميض العينين",
        "كراهة تغميض العينين في الصلاة",
    ),
    (
        "اللعب بالعينين",
        "كراهة تغميض العينين في الصلاة",
    ),
    (
        "النوم ع اليمين",
        "اضطجاعه بَعْدَ الْفَجْرِ عَلَى شِقِّهِ الْأَيْمَنِ",
    ),
    (
        "الليل",
        "قِيَامِ اللَّيْلِ",
    ),
    (
        "نلعب كورة",
        "الرَّكْعَتَانِ بَعْدَ الْوِتْرِ",
    ),
    (
        "الصبح",
        "صَلَاةِ الضُّحَى",
    ),
]
queries = [query for query, _ in query_item_pairs]
items = [item for _, item in query_item_pairs]

import re
def remove_tashkeel_from_list(texts):
    """Return a new list with Arabic diacritics stripped from every string.

    Characters in the ranges U+0617-U+061A and U+064B-U+0652 are deleted;
    all other characters are kept unchanged.
    """
    tashkeel_range = '\u0617-\u061A\u064B-\u0652'  # Range of Arabic diacritics
    diacritics = re.compile("[" + tashkeel_range + "]")
    return [diacritics.sub('', text) for text in texts]

# Strip the diacritics from every item title before tokenisation.
items = remove_tashkeel_from_list(items)

# One target per (query, item) pair, in the same order as the lists above.
labels = [1, 0, 1, 1, 0, 1]


In [30]:
# Load the AraBERT tokenizer and encoder from one shared checkpoint id so the
# two can never point at different models.
ARABERT_CHECKPOINT = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(ARABERT_CHECKPOINT)
bert_model = AutoModel.from_pretrained(ARABERT_CHECKPOINT)

In [31]:
# Run configuration: sequence length, batch size and RNG seed
max_length = 128  # every encoded (query, item) pair is padded / truncated to this length
batch_size = 3

# Seed PyTorch's global RNG so that the classifier-head initialisation, the
# dropout masks and the DataLoader shuffling are repeatable across a
# Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

In [32]:
# Wrap the aligned query / item / label lists in a dataset and serve it in
# shuffled mini-batches.
dataset = SearchMatchingDataset(
    queries=queries,
    items=items,
    labels=labels,
    tokenizer=tokenizer,
    max_length=max_length,
)
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

In [33]:
# Build the relevance scorer on top of the loaded encoder
model = SearchMatchingModel(bert_model=bert_model)

In [34]:
# Define optimizer and loss function
# The AdamW class shipped with transformers is deprecated upstream in favour
# of torch.optim.AdamW. eps and weight_decay are pinned to the defaults of the
# transformers implementation so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# BCELoss takes probabilities, which is what the model's sigmoid output provides.
criterion = nn.BCELoss()


In [35]:
# Early-stopping state, in order:
#   patience           - consecutive non-improving epochs tolerated before stopping
#   best_loss          - lowest epoch loss seen so far (nothing seen yet -> +inf)
#   early_stop_counter - non-improving epochs since the last improvement
patience, best_loss, early_stop_counter = 2, float("inf"), 0

In [36]:
# Training loop
num_epochs = 10
# Start every run of this cell from a clean early-stopping state; otherwise a
# re-run would compare against the best loss left behind by the previous run.
best_loss = float('inf')
early_stop_counter = 0
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in data_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # Deliberately not named `labels`: that would rebind the notebook-level
        # label list to the last batch tensor and corrupt any later re-run of
        # the dataset cell.
        batch_labels = batch['label']
        optimizer.zero_grad()
        similarity_score = model(input_ids, attention_mask)
        # The model emits one column per sample, so give the targets the same shape.
        loss = criterion(similarity_score, batch_labels.unsqueeze(1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Mean of the per-batch losses for this epoch
    epoch_loss = running_loss / len(data_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

    # Early stopping on the training loss: stop once `patience` consecutive
    # epochs fail to improve on the best value seen so far.
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        early_stop_counter = 0
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping triggered.")
            break

Epoch [1/10], Loss: 0.8043
Epoch [2/10], Loss: 0.5177
Epoch [3/10], Loss: 0.5367
Epoch [4/10], Loss: 0.2567
Epoch [5/10], Loss: 0.2893
Epoch [6/10], Loss: 0.1790
Epoch [7/10], Loss: 0.1874
Epoch [8/10], Loss: 0.1039
Epoch [9/10], Loss: 0.0613
Epoch [10/10], Loss: 0.0536


In [37]:
# Evaluate the model (optional)
# Note: this scores the same (query, item) pairs the model was trained on, so
# it is a sanity check rather than a held-out evaluation.
model.eval()
all_similarities = []
with torch.no_grad():
    for query, item in zip(queries, items):
        # Calling the tokenizer directly is the supported replacement for the
        # legacy ``encode_plus`` entry point and takes the same arguments.
        encoding = tokenizer(
            query,
            item,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # return_tensors='pt' already yields a leading batch dimension of one,
        # so the tensors go to the model as-is instead of being squeezed and
        # then un-squeezed again.
        similarity_score = model(encoding['input_ids'], encoding['attention_mask']).item()
        all_similarities.append(similarity_score)

In [38]:
# Report each pair next to the score the model assigned to it
for query, item, score in zip(queries, items, all_similarities):
    print(f"Query: {query}")
    print(f"Item: {item}")
    print(f"Similarity score: {score}")
    print()


Query: تغميض العينين
Item: كراهة تغميض العينين في الصلاة
Similarity score: 0.9771350622177124

Query: اللعب بالعينين
Item: كراهة تغميض العينين في الصلاة
Similarity score: 0.050732821226119995

Query: النوم ع اليمين
Item: اضطجاعه بعد الفجر على شقه الأيمن
Similarity score: 0.990502119064331

Query: الليل
Item: قيام الليل
Similarity score: 0.9936110377311707

Query: نلعب كورة
Item: الركعتان بعد الوتر
Similarity score: 0.020095685496926308

Query: الصبح
Item: صلاة الضحى
Similarity score: 0.993548572063446

