<h1>ARABIC DIALECT CLASSIFICATION</h1>
<h5>MOHAMAD EL KASSAR</h5>

<h3>Dataset Loading</h3>

In [1]:
file = open('workspace/deep-learning/AAI612_kassar/project2/data/IADD.json', 'r', encoding='utf-8')
import json
data = json.load(file)
file.close()

In [2]:
import pandas as pd
df = pd.DataFrame(data)


<h3>Dataset Analysis</h3>

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135804 entries, 0 to 135803
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Sentence    135804 non-null  object
 1   Region      135804 non-null  object
 2   DataSource  135804 non-null  object
 3   Country     135804 non-null  object
dtypes: object(4)
memory usage: 4.1+ MB


In [4]:
df.describe()

Unnamed: 0,Sentence,Region,DataSource,Country
count,135804,135804,135804,135804
unique,126389,6,5,10
top,اه,LEV,SHAMI,Syria
freq,181,87573,66247,44972


In [7]:
df['DataSource'].value_counts()

DataSource
SHAMI    66247
PADIC    36065
AOC      19713
TSAC     11998
DART      1781
Name: count, dtype: int64

In [8]:
df['Country'].value_counts()

Country
Syria        44972
Palestine    17855
NA           16441
Algeria      14426
Tunisia      11998
Lebanon      10829
Morocco       7213
Jordan        7017
Egypt         4837
Iraq           216
Name: count, dtype: int64

In [9]:
df['Region'].value_counts()

Region
LEV        87573
MGH        33996
GLF         6682
EGY         4837
general     2500
IRQ          216
Name: count, dtype: int64

<h3>Data Balancing Strategy</h3>

In [11]:
df_filtered = df[df['Region'] != 'IRQ']

min_count = df_filtered['Region'].value_counts().min()

balanced_df = (
    df_filtered
    .groupby('Region', group_keys=False)
    .apply(lambda x: x.sample(min_count, random_state=42))
)

balanced_df = balanced_df[balanced_df['Region'] != 'general']

balanced_df = balanced_df.reset_index(drop=True)

balanced_df['Region'].value_counts()

  .apply(lambda x: x.sample(min_count, random_state=42))


Region
EGY    2500
GLF    2500
LEV    2500
MGH    2500
Name: count, dtype: int64

In [12]:
df_processed = balanced_df.copy()
df_processed = df_processed[['Sentence', 'Region']]

<h3>Text Cleaning</h3>

In [14]:
import re

def preprocess_arabic_text(text):

    if pd.isna(text):
        return ""

    text = re.sub(r'\s+', ' ', text)

    text = re.sub(r'http[s]?://\S+', '', text)

    text = text.strip()

    #remove non-Arabic characters
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)

    #remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    return text

df_processed['processed_text'] = df_processed['Sentence'].apply(preprocess_arabic_text)

print("Dataset size after preprocessing:", df_processed.shape)

#remove empty texts
df_processed = df_processed[df_processed['processed_text'] != ""]

print("Dataset size after removing empty texts:", df_processed.shape)

Dataset size after preprocessing: (9918, 3)
Dataset size after removing empty texts: (9918, 3)


<h3>Encoding the Region</h3>

In [15]:
from sklearn.preprocessing import LabelEncoder

# Encoding region labels
region_encoder = LabelEncoder()
df_processed['region_encoded'] = region_encoder.fit_transform(df_processed['Region'])

print("Encoded Regions:\n", df_processed['region_encoded'].value_counts())

Encoded Regions:
 region_encoded
0    2500
1    2500
2    2500
3    2418
Name: count, dtype: int64


</h3>Preparing Features and Labels</h3>

In [17]:
# prepare features and labels
X = df_processed['processed_text'].tolist()
y = df_processed['region_encoded'].values

In [18]:
num_classes = len(region_encoder.classes_)
print(num_classes)

4


<h3>Train/Validation/Test Splitting</h3>

In [19]:
from sklearn.model_selection import train_test_split
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y  # Smaller test set
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1, random_state=42, stratify=y_temp  # Smaller validation set
)

In [21]:
max_length = max([len(x) for x in X])
MAXLEN = max_length

print(f"There are {len(X_train)} sentences for training.")
print(f"There are {len(X_val)} sentences for validation.")
print(f"There are {len(X_test)} sentences for validation.")

There are 7587 sentences for training.
There are 843 sentences for validation.
There are 1488 sentences for validation.


Categorical Label Conversion

In [20]:
from tensorflow.keras.utils import to_categorical

y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_val_cat = to_categorical(y_val, num_classes=num_classes)

2025-07-28 06:42:18.450386: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-28 06:42:18.623985: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-28 06:42:19.265203: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-28 06:42:19.776005: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753684940.229858    1485 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753684940.35

Tokenization

In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer

def fit_tokenizer(train_sentences, num_words):

    tokenizer = Tokenizer(num_words=num_words, oov_token='<OOV>')

    tokenizer.fit_on_texts(train_sentences)

    return tokenizer

Due to computational constraints, vocabulary was limited

In [24]:
NUM_WORDS = 1000

tokenizer = fit_tokenizer(X_train, NUM_WORDS)
word_index = tokenizer.word_index

print(f"Vocabulary contains {len(word_index)} words\n")

Vocabulary contains 36957 words



In [25]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

Sequence Padding

In [24]:
PADDING = 'post'

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

max_length = max(len(seq) for seq in X_train_seq + X_val_seq)

train_padded_seq = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
val_padded_seq = pad_sequences(X_val_seq, maxlen=max_length, padding='post')

print(f"Padded training sequences have shape: {train_padded_seq.shape}\n")
print(f"Padded validation sequences have shape: {val_padded_seq.shape}")

Padded training sequences have shape: (7587, 6400)

Padded validation sequences have shape: (843, 6400)


<h2>Model Architectures Implementation</h2>

LSTM Model

In [26]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from keras.optimizers import Adam
from keras import Sequential

def create_model(num_words, maxlen):

    model = Sequential([
        Embedding(input_dim=num_words, output_dim=128),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
        LSTM(64, dropout=0.2, recurrent_dropout=0.2),  # Stacked LSTM
        Dense(32, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])

    optimizer = Adam(learning_rate=0.001, clipnorm=1.0)

    model.compile(loss='categorical_crossentropy',optimizer=optimizer, metrics=['accuracy'])

    model.summary()

    return model

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

BATCH_SIZE = 64
NUM_EPOCHS = 10

model = create_model(NUM_WORDS, MAXLEN)
callback = EarlyStopping(monitor='loss', patience=2)

history = model.fit(
    train_padded_seq,
    y_train_cat,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(val_padded_seq, y_val_cat),
    callbacks=[callback],
    verbose=1,
)

Epoch 1/10


Fine-tuned BERT Model

Base Model: bert-base-multilingual-cased

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

def create_fine_tuned_bert(num_classes, model_name='bert-base-multilingual-cased'):

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = TFAutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_classes
    )
    
    model.compile(
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    model.summary()

    return model, tokenizer

In [None]:
def prepare_bert_data(texts, labels, tokenizer, max_length=512):
    encoded = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='tf'
    )
    
    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask']
    }, labels

In [None]:
def train_bert_model(model, train_data, train_labels, val_data, val_labels, 
                     batch_size=16, epochs=3):
    
    history = model.fit(
        train_data,
        train_labels,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(val_data, val_labels),
        verbose=1
    )
    
    return history

In [None]:
model, tokenizer = create_fine_tuned_bert(num_classes)

train_bert_data, y_train_bert = prepare_bert_data(
    X_train, y_train_cat, tokenizer, max_length=256
)

val_bert_data, y_val_bert = prepare_bert_data(
    X_val, y_val_cat, tokenizer, max_length=256
)

history = train_bert_model(
    model, 
    train_bert_data, y_train_bert,
    val_bert_data, y_val_bert,
    batch_size=16,
    epochs=3
)

2025-07-27 20:01:01.051149: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  177853440 
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  3076      
                                                                 
Total params: 177856516 (678.47 MB)
Trainable params: 177856516 (678.47 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/3
  2/475 [..............................] - ETA: 2:24:12 - loss: 4.6042 - accuracy: 0.2188

: 

: 

: 

In [28]:
y_test_cat = to_categorical(y_test, num_classes=num_classes)

test_bert_data, y_test_bert = prepare_bert_data(
    X_test, y_test_cat, tokenizer, max_length=128
)

predictions = model.predict(test_bert_data)



In [37]:
import numpy as np

y_pred = np.argmax(predictions[0], axis=1)
