1 — Install libraries

---



In [1]:
!pip install --upgrade transformers
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install protobuf==3.20.3


Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m136.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.57.2
    Uninstalling transformers-4.57.2:
      Successfully uninstalled transformers-4.57.2
Successfully installed transformers-4.57.3
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting protobuf==3.20.3
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Downloading protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m


2 — Imports and GPU check

---



In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from tqdm import tqdm

# Check GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))


GPU available: True
Device: Tesla T4


3 — Upload and load TSV files

---



In [2]:
from google.colab import files

uploaded = files.upload()  # upload dev_cleaned.tsv and test_cleaned.tsv

# Load TSV files
dev_df = pd.read_csv("dev_cleaned.tsv", sep='\t')
test_df = pd.read_csv("test_cleaned.tsv", sep='\t')

# Strip column whitespace
dev_df.columns = dev_df.columns.str.strip()
test_df.columns = test_df.columns.str.strip()

print(dev_df.columns.tolist())
print(dev_df.head())


Saving test_cleaned.tsv to test_cleaned.tsv
Saving dev_cleaned.tsv to dev_cleaned.tsv
['#1 tweet_ID', 'norm_text', 'label_country', 'label_province']
  #1 tweet_ID                             norm_text label_country  \
0       Dev_1           ايسكو لاعب اليوم اسيست وهدف          Iraq   
1       Dev_2                    بعد صلاه الفجر بقا         Egypt   
2       Dev_3   ان شاء الله هذه المره يكون من نصيبي       Algeria   
3       Dev_4  ههههههههههههههههه خلي السوداني يزغبك         Yemen   
4       Dev_5         كل حاجه محسوبه يا جماعه والله         Egypt   

   label_province  
0       iq_Ninawa  
1      eg_Monufia  
2         dz_Oran  
3  ye_Al-Hudaydah  
4  eg_South-Sinai  


4 — Map labels with Non-Moroccan handling

---



In [4]:
# Add "NonMoroccan" class for non-Moroccan tweets
def map_label(row):
    if row.get("label_country", "") == "Morocco":
        return row["label_province"]
    else:
        return "NonMoroccan"

# Apply mapping
dev_df["final_label"] = dev_df.apply(map_label, axis=1)
test_df["final_label"] = "Unknown"  # placeholder

# Since we don't have a separate train set, use dev as train too for demonstration
train_df = dev_df.copy()
print(f"Training samples: {len(morocco_train)}, Dev samples: {len(morocco_dev)}")


NameError: name 'morocco_train' is not defined

5 — Encode labels

---



In [5]:
le = LabelEncoder()
train_df["label_enc"] = le.fit_transform(train_df["final_label"])
dev_df["label_enc"]   = le.transform(dev_df["final_label"])

# Test dataset: placeholder encoding
test_df["label_enc"] = -1

label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_mapping)


{'NonMoroccan': np.int64(0), 'ma_Marrakech-Tensift-Al-Haouz': np.int64(1), 'ma_Meknes-Tafilalet': np.int64(2), 'ma_Oriental': np.int64(3), 'ma_Souss-Massa-Draa': np.int64(4), 'ma_Tanger-Tetouan': np.int64(5)}


6 — Load tokenizer

In [6]:
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_len = 128


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

7 — PyTorch Dataset class

---



In [7]:
class MoroccoDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


8 — Prepare datasets and dataloaders

---



In [8]:
train_dataset = MoroccoDataset(
    train_df['norm_text'].tolist(),
    train_df['label_enc'].tolist(),
    tokenizer,
    max_len
)

dev_dataset = MoroccoDataset(
    dev_df['norm_text'].tolist(),
    dev_df['label_enc'].tolist(),
    tokenizer,
    max_len
)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
dev_loader   = DataLoader(dev_dataset, batch_size=32)


9 — Load model

---



In [9]:
num_labels = len(train_df['label_enc'].unique())
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)
model.to(device)


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

10 — Training loop

---



In [10]:
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = CrossEntropyLoss()
epochs = 3

for epoch in range(epochs):
    # Training
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    print(f"Epoch {epoch+1}, Train Loss: {train_loss/len(train_loader):.4f}")

    # Evaluation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    print(f"Dev Accuracy: {correct/total:.4f}")


100%|██████████| 310/310 [01:45<00:00,  2.94it/s]


Epoch 1, Train Loss: 0.3066
Dev Accuracy: 0.9510


100%|██████████| 310/310 [01:51<00:00,  2.78it/s]


Epoch 2, Train Loss: 0.2350
Dev Accuracy: 0.9550


100%|██████████| 310/310 [01:50<00:00,  2.81it/s]


Epoch 3, Train Loss: 0.1956
Dev Accuracy: 0.9595


12 — Inference example (handle Non-Moroccan input)

---



In [11]:
def predict(text):
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_len)
    tokens = {k: v.to(device) for k,v in tokens.items()}
    model.eval()
    with torch.no_grad():
        output = model(**tokens)
        pred = torch.argmax(output.logits, dim=1).item()
        label = le.inverse_transform([pred])[0]
    return label

# Examples
print(predict("هادي جملة مغربية بزاف"))  # Moroccan city
print(predict("شو الأخبار؟"))             # NonMoroccan
print(predict("ليش هيك بتحكو؟"))          # NonMoroccan


ma_Oriental
NonMoroccan
NonMoroccan


13 — Install Gradio and develop interface for predicition

---



In [12]:
!pip install gradio




In [13]:
import gradio as gr

def gradio_predict(text):
    # Tokenize input
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    tokens = {k: v.to(device) for k,v in tokens.items()}
    model.eval()
    with torch.no_grad():
        output = model(**tokens)
        pred = torch.argmax(output.logits, dim=1).item()
        label = le.inverse_transform([pred])[0]
    return label

# Create Gradio interface
iface = gr.Interface(
    fn=gradio_predict,
    inputs=gr.Textbox(lines=2, placeholder="Type Arabic text here..."),
    outputs=gr.Label(num_top_classes=1),
    title="Moroccan Dialect Identifier",
    description="Predicts the Moroccan city or 'NonMoroccan' for other Arabic dialects."
)

iface.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2b5b68b2d4e75295ca.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Data for teting

---
Sample Moroccan tweets / sentences
Text	Expected label
"أنا غادي لجامع فمراكش"	Marrakesh /
"شنو كاين اليوم فطنجة؟"	Tangier /
"هادي جملة مغربية بزاف"	Rabat /
Other Arabic dialects → NonMoroccan
Text	Expected label
"إزيك عامل إيه؟"	NonMoroccan (Egyptian)
"كيفك اليوم؟"	NonMoroccan (Levantine)
"شلونك حبيبي؟"	NonMoroccan (Iraqi)
"ليش هيك بتحكو؟"	NonMoroccan (Levantine)


In [None]:
model.save_pretrained("./arabert_morocco_model_manual")
tokenizer.save_pretrained("./arabert_morocco_model_manual")


('./arabert_morocco_model_manual/tokenizer_config.json',
 './arabert_morocco_model_manual/special_tokens_map.json',
 './arabert_morocco_model_manual/vocab.txt',
 './arabert_morocco_model_manual/added_tokens.json',
 './arabert_morocco_model_manual/tokenizer.json')