In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
import re
import pickle
import os
from tqdm import tqdm
import ast

# For BERT embeddings
import torch
from transformers import BertModel, BertTokenizer, AutoTokenizer, AutoModel

# Set random seeds for reproducibility (NumPy and PyTorch global generators).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED);  # trailing ';' keeps the returned Generator repr out of the cell output

<torch._C.Generator at 0x240d151d790>

In [2]:
def rebuild_sentence(tokens):
    """Join a list of string tokens into one space-separated sentence.

    Args:
        tokens: Expected to be a list of strings.

    Returns:
        The tokens joined by single spaces, or an empty string when
        ``tokens`` is not a list (malformed row).
    """
    # Malformed input (anything that is not a list): fall back to an empty string
    if not isinstance(tokens, list):
        return ''
    return ' '.join(tokens)

In [3]:
# Dataset location. Set the DIALECT_DATASET_PATH environment variable to run the
# notebook on another machine; the default keeps the original local path.
DATASET_PATH = os.environ.get(
    "DIALECT_DATASET_PATH",
    r"C:\Users\hp\Desktop\pfemaster\dataset\dataset_after_preprocessing.csv",
)
df = pd.read_csv(DATASET_PATH, sep=',', encoding='utf-8')

# 'tokens_no_stopwords' is stored in the CSV as the text repr of a Python list;
# literal_eval turns each cell back into a real list of tokens.
df['tokens_list'] = df['tokens_no_stopwords'].apply(ast.literal_eval)

# Space-joined version of every token list (empty string for malformed rows)
df['text_joined'] = df['tokens_list'].apply(rebuild_sentence)



In [4]:
# Preview the first rows to check that the parsed token lists and the joined text look right
df.head()

Unnamed: 0,tokens_no_stopwords,Dialect,tokens_list,text_joined
0,"['اليكم', 'جديد', 'واخيرا', 'درت', 'لهاد', 'ال...",1,"[اليكم, جديد, واخيرا, درت, لهاد, الصفحه]",اليكم جديد واخيرا درت لهاد الصفحه
1,"['الله', 'يخلف', 'سيدي', 'محمد']",1,"[الله, يخلف, سيدي, محمد]",الله يخلف سيدي محمد
2,"['هجومك', 'اخي', 'علي', 'حسن', 'طارق', 'تبرهيش...",1,"[هجومك, اخي, علي, حسن, طارق, تبرهيش, قله, عقل,...",هجومك اخي علي حسن طارق تبرهيش قله عقل ودليل ان...
3,"['امين', 'عام', 'حزب', 'سياسي', 'كينشر', 'تدوي...",1,"[امين, عام, حزب, سياسي, كينشر, تدوينه, وكيدير,...",امين عام حزب سياسي كينشر تدوينه وكيدير ليها
4,"['اجي', 'فين', 'غبرات', 'الحمي', 'القلاعيه', '...",1,"[اجي, فين, غبرات, الحمي, القلاعيه, شفت, بقات, ...",اجي فين غبرات الحمي القلاعيه شفت بقات هضره عليها


In [5]:
 # Build the feature / target arrays
X = df['tokens_list'].values         # token lists -> Word2Vec, GloVe and BERT helpers
X1 = df['text_joined'].values        # joined text -> TF-IDF (TfidfVectorizer is fed strings)
y = df['Dialect'].values             # dialect label, shared by both views

# Split once (80% train, 20% test). Passing both views and the target to a single
# call guarantees that X, X1 and y are cut on exactly the same rows.
X_train, X_test, X1_train, X1_test, y_train, y_test = train_test_split(
    X, X1, y, test_size=0.2, random_state=42
)

 ===== TF-IDF VECTORIZATION =====

In [6]:
def create_tfidf_vectors(X_train, X_test, max_features=10000, ngram_range=(1, 2), save_path=None, min_df=5):
    """Fit a TF-IDF vectorizer on the training texts and transform both splits.

    Args:
        X_train: Training documents, passed to ``TfidfVectorizer.fit_transform``.
        X_test: Test documents, transformed with the vocabulary learned on
            ``X_train`` only.
        max_features: Upper bound on the vocabulary size; the fitted vocabulary
            can be smaller once ``min_df`` has filtered rare terms.
        ngram_range: ``(min_n, max_n)`` n-gram sizes to extract.
        save_path: Optional file path; when given, the fitted vectorizer is
            pickled there (missing parent folders are created).
        min_df: Minimum document frequency for a term to be kept. Defaults to 5,
            the value that was previously hard-coded.

    Returns:
        Tuple ``(X_train_tfidf, X_test_tfidf, tfidf_vectorizer)``.
    """
    print(f"Creating TF-IDF vectors with {max_features} features, ngram_range={ngram_range}...")

    # Initialize TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features,
                                       ngram_range=ngram_range,
                                       min_df=min_df)

    # Fit on the training split only, then reuse the learned vocabulary for the test split
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    print(f"TF-IDF vectors created. Training shape: {X_train_tfidf.shape}, Testing shape: {X_test_tfidf.shape}")

    # Save the vectorizer if path is provided
    if save_path:
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            # Avoid FileNotFoundError when the output folder does not exist yet
            os.makedirs(parent_dir, exist_ok=True)
        with open(save_path, 'wb') as f:
            pickle.dump(tfidf_vectorizer, f)
        print(f"TF-IDF vectorizer saved to {save_path}")

    return X_train_tfidf, X_test_tfidf, tfidf_vectorizer

==== WORD2VEC VECTORIZATION ====

In [7]:
def create_word2vec_model(tokenized_texts, vector_size=100, window=5, min_count=1, workers=4, save_path=None):
    """Train a gensim Word2Vec model on pre-tokenized texts.

    Args:
        tokenized_texts: Iterable of token lists used as training sentences.
        vector_size: Dimension of the learned word vectors.
        window: Context window size.
        min_count: Minimum frequency for a token to enter the vocabulary.
        workers: Number of worker threads handed to Word2Vec.
        save_path: Optional path; when given, the trained model is saved there.

    Returns:
        The trained Word2Vec model.
    """
    print(f"Training Word2Vec model with vector_size={vector_size}, window={window}...")

    # Gather the hyper-parameters in one place before building the model
    hyperparams = {
        "vector_size": vector_size,
        "window": window,
        "min_count": min_count,
        "workers": workers,
    }
    w2v = Word2Vec(sentences=tokenized_texts, **hyperparams)

    vocab_size = len(w2v.wv.key_to_index)
    print(f"Word2Vec model trained. Vocabulary size: {vocab_size}")

    # Nothing to persist: hand the model back directly
    if not save_path:
        return w2v

    w2v.save(save_path)
    print(f"Word2Vec model saved to {save_path}")
    return w2v

In [8]:
def create_document_vectors_w2v(tokenized_texts, word2vec_model, vector_size=100):
    """Represent each document by the mean of its in-vocabulary Word2Vec vectors.

    Args:
        tokenized_texts: Sequence of token lists, one per document.
        word2vec_model: Object exposing ``.wv`` with membership test and item lookup.
        vector_size: Width of the output matrix; must match the word-vector dimension.

    Returns:
        Array of shape ``(len(tokenized_texts), vector_size)``. Documents without
        any in-vocabulary token keep an all-zero row.
    """
    doc_matrix = np.zeros((len(tokenized_texts), vector_size))

    for row, doc_tokens in enumerate(tqdm(tokenized_texts)):
        # Collect the vectors of the tokens the model knows
        known_vectors = []
        for tok in doc_tokens:
            if tok in word2vec_model.wv:
                known_vectors.append(word2vec_model.wv[tok])

        # No known token: leave the zero row untouched
        if not known_vectors:
            continue
        doc_matrix[row] = np.mean(known_vectors, axis=0)

    return doc_matrix

===== GLOVE VECTORIZATION =====


In [9]:
def load_glove_model(glove_path):
    """Load GloVe embeddings from a whitespace-separated text file.

    Each non-blank line is expected to hold a word followed by its vector
    components. Blank lines are skipped instead of raising ``IndexError``.

    Args:
        glove_path: Path to the GloVe text file (read as UTF-8).

    Returns:
        Tuple ``(glove_model, vector_size)`` where ``glove_model`` maps each word
        to a NumPy array and ``vector_size`` is the dimension read from the first
        entry (``None`` if the file contains no entry).
    """
    print(f"Loading GloVe embeddings from {glove_path}...")
    glove_model = {}
    vector_size = None

    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            split_line = line.split()

            # A blank or whitespace-only line carries no embedding
            if not split_line:
                continue

            word = split_line[0]

            # Get embedding dimension from first vector
            if vector_size is None:
                vector_size = len(split_line) - 1

            glove_model[word] = np.array([float(val) for val in split_line[1:]])

    print(f"GloVe embeddings loaded. Vocabulary size: {len(glove_model)}, Vector size: {vector_size}")
    return glove_model, vector_size

In [10]:


def create_document_vectors_glove(tokenized_texts, glove_model, vector_size):
    """Represent each document by the mean of its in-vocabulary GloVe vectors.

    Args:
        tokenized_texts: Sequence of token lists, one per document.
        glove_model: Mapping from word to embedding vector.
        vector_size: Width of the output matrix; must match the embedding dimension.

    Returns:
        Array of shape ``(len(tokenized_texts), vector_size)``. Documents without
        any token present in ``glove_model`` keep an all-zero row.
    """
    doc_matrix = np.zeros((len(tokenized_texts), vector_size))

    for row, doc_tokens in enumerate(tqdm(tokenized_texts)):
        # Keep only the tokens that have an embedding
        hits = [glove_model[tok] for tok in doc_tokens if tok in glove_model]

        # No hit: leave the zero row untouched
        if len(hits) == 0:
            continue
        doc_matrix[row] = np.mean(hits, axis=0)

    return doc_matrix


===== BERT EMBEDDINGS =====

In [11]:


def get_bert_embeddings_from_tokens(tokenized_texts, model_name='UBC-NLP/MARBERTv2', batch_size=32, max_length=128):
    """Embed documents with a pre-trained transformer, one [CLS] vector per document.

    Each token list is joined with spaces, re-tokenized by the model's own
    tokenizer, and the last hidden state at position 0 ([CLS]) is used as the
    document embedding.

    Args:
        tokenized_texts: Sequence of token lists, one per document.
        model_name: Hugging Face model identifier loaded through
            ``AutoTokenizer`` / ``AutoModel``.
        batch_size: Number of documents per forward pass.
        max_length: Maximum tokenized length; longer inputs are truncated.

    Returns:
        NumPy array with one row per document. For empty input an array of
        shape ``(0, hidden_size)`` is returned instead of raising in ``np.vstack``.
    """
    print(f"Generating BERT embeddings using {model_name} from tokenized texts...")

    # Load the tokenizer and the model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    all_embeddings = []

    for i in tqdm(range(0, len(tokenized_texts), batch_size)):
        batch_tokens = tokenized_texts[i:i + batch_size]

        # Turn the token lists back into plain strings for the model tokenizer
        batch_texts = [" ".join(tokens) for tokens in batch_tokens]

        # padding=True pads only to the longest sequence of the batch; padded
        # positions are masked out by the attention mask, so short texts no
        # longer pay for max_length tokens on every forward pass.
        encoded_inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )

        encoded_inputs = {k: v.to(device) for k, v in encoded_inputs.items()}

        with torch.no_grad():
            outputs = model(**encoded_inputs)

        # Use the embedding of the [CLS] token (first position)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(batch_embeddings)

    if all_embeddings:
        embeddings = np.vstack(all_embeddings)
    else:
        # np.vstack([]) raises; return a well-formed empty matrix instead
        embeddings = np.empty((0, model.config.hidden_size), dtype=np.float32)
    print(f"BERT embeddings created with shape: {embeddings.shape}")

    return embeddings


 ===== DIALECT-SPECIFIC BERT MODELS (one pre-trained model per dialect) =====


In [12]:
def get_bert_embeddings_per_dialect(tokenized_texts, dialects, model_map, batch_size=32, max_length=128):
    """Embed each document with the transformer model assigned to its dialect.

    Every model in ``model_map`` is loaded once. Inside each batch the documents
    are grouped by dialect so that each model runs a single batched forward pass
    per group instead of one pass per document. The [CLS] vector (position 0 of
    the last hidden state) is used as the document embedding, and the output
    rows keep the input order.

    NOTE(review): the encoder is selected from ``dialects``. If that is the
    classification target, the label must be known to build the features and
    the embedding spaces differ per class — confirm this is intended.

    Args:
        tokenized_texts: Sequence of token lists, one per document.
        dialects: Sequence of dialect keys aligned with ``tokenized_texts``.
        model_map: Mapping ``dialect key -> Hugging Face model identifier``.
        batch_size: Number of documents examined per outer iteration.
        max_length: Maximum tokenized length; longer inputs are truncated.

    Returns:
        NumPy array with one embedding row per document, in input order.

    Raises:
        ValueError: If ``tokenized_texts`` and ``dialects`` differ in length.
        KeyError: If a dialect has no entry in ``model_map`` (raised before any
            model is loaded).
    """
    print("Génération des embeddings BERT selon le dialecte...")

    if len(tokenized_texts) != len(dialects):
        raise ValueError(
            "tokenized_texts and dialects must have the same length "
            f"({len(tokenized_texts)} != {len(dialects)})"
        )

    # Fail fast: check every dialect before spending time on model loading
    unknown = sorted({d for d in dialects if d not in model_map}, key=str)
    if unknown:
        raise KeyError(f"No model configured in model_map for dialect(s): {unknown}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Prepare one tokenizer / model pair per dialect
    models = {}
    tokenizers = {}

    for dialect, model_name in model_map.items():
        print(f"Chargement du modèle pour {dialect} → {model_name}")
        tokenizers[dialect] = AutoTokenizer.from_pretrained(model_name)
        models[dialect] = AutoModel.from_pretrained(model_name).to(device).eval()

    n_texts = len(tokenized_texts)
    # Pre-sized so each embedding lands at its original position
    embeddings = [None] * n_texts

    for start in tqdm(range(0, n_texts, batch_size)):
        batch_tokens = list(tokenized_texts[start:start + batch_size])
        batch_dialects = list(dialects[start:start + batch_size])

        # Group the batch positions by dialect so each model runs once per batch
        positions_by_dialect = {}
        for offset, dialect in enumerate(batch_dialects):
            positions_by_dialect.setdefault(dialect, []).append(offset)

        for dialect, offsets in positions_by_dialect.items():
            texts = [" ".join(batch_tokens[offset]) for offset in offsets]

            # padding=True pads only to the longest text of the group; padded
            # positions are masked out by the attention mask.
            encoded = tokenizers[dialect](
                texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt'
            ).to(device)

            with torch.no_grad():
                output = models[dialect](**encoded)

            cls_embeddings = output.last_hidden_state[:, 0, :].cpu().numpy()
            for offset, cls_embedding in zip(offsets, cls_embeddings):
                embeddings[start + offset] = cls_embedding

    embeddings = np.array(embeddings)
    print(f"Embeddings shape: {embeddings.shape}")
    return embeddings


===== UTILITY FUNCTIONS =====


In [13]:
def save_vectors(vectors, file_path):
    """Pickle ``vectors`` to ``file_path``, creating missing parent folders.

    Args:
        vectors: Any picklable object (arrays, sparse matrices, labels, ...).
        file_path: Destination file path.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        # Avoid FileNotFoundError when the output folder does not exist yet
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'wb') as f:
        pickle.dump(vectors, f)
    print(f"Vectors saved to {file_path}")

def load_vectors(file_path):
    """Load a pickled object from ``file_path`` and report its shape when it has one.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    with open(file_path, 'rb') as f:
        # SECURITY: pickle.load can execute arbitrary code; only load files you trust
        vectors = pickle.load(f)

    # Not every pickled object has a .shape (e.g. plain lists); do not crash on those
    shape = getattr(vectors, 'shape', None)
    if shape is not None:
        print(f"Vectors loaded from {file_path} with shape: {shape}")
    else:
        print(f"Vectors loaded from {file_path}")
    return vectors

1. TF-IDF Vectorization

In [16]:
# TF-IDF is fitted on the space-joined texts (X1_*), not on the token lists
X_train_tfidf, X_test_tfidf, tfidf_vectorizer = create_tfidf_vectors(
    X1_train, X1_test, max_features=10000, save_path="tfidf_vectorizer.pkl"
)

# Persist both sparse matrices next to the notebook
for tfidf_matrix, tfidf_path in (
    (X_train_tfidf, "X_train_tfidf.pkl"),
    (X_test_tfidf, "X_test_tfidf.pkl"),
):
    save_vectors(tfidf_matrix, tfidf_path)

Creating TF-IDF vectors with 10000 features, ngram_range=(1, 2)...
TF-IDF vectors created. Training shape: (19761, 5652), Testing shape: (4941, 5652)
TF-IDF vectorizer saved to tfidf_vectorizer.pkl
Vectors saved to X_train_tfidf.pkl
Vectors saved to X_test_tfidf.pkl


2. Word2Vec Embeddings

In [15]:
# Single source of truth for the embedding dimension used by the model and the document vectors
W2V_VECTOR_SIZE = 100

# A fresh checkout has no output folders; create them so saving cannot fail
os.makedirs("models", exist_ok=True)
os.makedirs("vectors", exist_ok=True)

# The model is trained on the training split only; test documents are embedded with it afterwards
w2v_model = create_word2vec_model(
    X_train, vector_size=W2V_VECTOR_SIZE, save_path="models/word2vec.model"
)
X_train_w2v = create_document_vectors_w2v(X_train, w2v_model, vector_size=W2V_VECTOR_SIZE)
X_test_w2v = create_document_vectors_w2v(X_test, w2v_model, vector_size=W2V_VECTOR_SIZE)
save_vectors(X_train_w2v, "vectors/X_train_w2v.pkl")
save_vectors(X_test_w2v, "vectors/X_test_w2v.pkl")

Training Word2Vec model with vector_size=100, window=5...
Word2Vec model trained. Vocabulary size: 42421
Word2Vec model saved to models/word2vec.model


100%|██████████| 19761/19761 [00:00<00:00, 20381.87it/s]
100%|██████████| 4941/4941 [00:00<00:00, 21826.42it/s]

Vectors saved to vectors/X_train_w2v.pkl
Vectors saved to vectors/X_test_w2v.pkl





3. GloVe Embeddings

In [18]:
# Load pre-trained GloVe embeddings
# NOTE(review): glove.6B is distributed as English embeddings while the tokens in this
# dataset are Arabic, so most tokens are presumably out-of-vocabulary — confirm with the
# coverage printed below and consider Arabic embeddings if it is low.
glove_model, vector_size = load_glove_model("glove.6B.200d.txt")

# A fresh checkout has no output folder; create it so saving cannot fail
os.makedirs("vectors", exist_ok=True)

# Create document vectors
X_train_glove = create_document_vectors_glove(X_train, glove_model, vector_size)
X_test_glove = create_document_vectors_glove(X_test, glove_model, vector_size)

# Coverage check: an all-zero row means no token of the document was found in GloVe
for split_name, split_vectors in (("train", X_train_glove), ("test", X_test_glove)):
    zero_share = float(np.mean(~split_vectors.any(axis=1))) if len(split_vectors) else 0.0
    print(f"GloVe {split_name}: {zero_share:.1%} of documents have an all-zero vector")

save_vectors(X_train_glove, "vectors/X_train_glove.pkl")
save_vectors(X_test_glove, "vectors/X_test_glove.pkl")

Loading GloVe embeddings from glove.6B.200d.txt...


0it [00:00, ?it/s]

400000it [00:24, 16025.19it/s]


GloVe embeddings loaded. Vocabulary size: 400000, Vector size: 200


100%|██████████| 19761/19761 [00:00<00:00, 113500.68it/s]
100%|██████████| 4941/4941 [00:00<00:00, 109350.23it/s]


Vectors saved to vectors/X_train_glove.pkl
Vectors saved to vectors/X_test_glove.pkl


 4. BERT Embeddings


In [19]:
# One shared encoder for every dialect: the UBC-NLP/MARBERTv2 checkpoint
MARBERT_MODEL = "UBC-NLP/MARBERTv2"

X_train_bert2 = get_bert_embeddings_from_tokens(X_train, model_name=MARBERT_MODEL)
X_test_bert2 = get_bert_embeddings_from_tokens(X_test, model_name=MARBERT_MODEL)

# Persist both embedding matrices
for bert_matrix, bert_path in (
    (X_train_bert2, "vectors/X_train_bert.pkl"),
    (X_test_bert2, "vectors/X_test_bert.pkl"),
):
    save_vectors(bert_matrix, bert_path)

Generating BERT embeddings using UBC-NLP/MARBERTv2 from tokenized texts...


100%|██████████| 618/618 [06:50<00:00,  1.50it/s]


BERT embeddings created with shape: (19761, 768)
Generating BERT embeddings using UBC-NLP/MARBERTv2 from tokenized texts...


100%|██████████| 155/155 [01:34<00:00,  1.63it/s]

BERT embeddings created with shape: (4941, 768)
Vectors saved to vectors/X_train_bert.pkl
Vectors saved to vectors/X_test_bert.pkl





5. Dialect-specific BERT (one pre-trained model per dialect)



In [None]:
# Example: map each dialect label to a dedicated BERT checkpoint
model_map = {
    1: "SI2M-Lab/DarijaBERT",
    0: "alger-ia/dziribert",
    2: "tunis-ai/TunBERT"
}


# Generate the embeddings (the dialect label of each row selects its model)
X_train_bert = get_bert_embeddings_per_dialect(X_train, y_train, model_map)
X_test_bert = get_bert_embeddings_per_dialect(X_test, y_test, model_map)

# Use dedicated file names: "vectors/X_*_bert.pkl" already holds the MARBERTv2
# embeddings saved earlier in this notebook and must not be overwritten.
save_vectors(X_train_bert, "vectors/X_train_bert_dialect.pkl")
save_vectors(X_test_bert, "vectors/X_test_bert_dialect.pkl")



Génération des embeddings BERT selon le dialecte...
Chargement du modèle pour 1 → SI2M-Lab/DarijaBERT


The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
Some weights of BertModel were not initialized from the model checkpoint at SI2M-Lab/DarijaBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Chargement du modèle pour 0 → alger-ia/dziribert


Some weights of BertModel were not initialized from the model checkpoint at alger-ia/dziribert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Chargement du modèle pour 2 → tunis-ai/TunBERT


In [None]:
# Persist the labels that go with the saved feature matrices
for label_array, label_path in (
    (y_train, "vectors/y_train.pkl"),
    (y_test, "vectors/y_test.pkl"),
):
    save_vectors(label_array, label_path)


Vectors saved to vectors/y_train.pkl
Vectors saved to vectors/y_test.pkl


In [None]:
# Sanity check: number of training labels
y_train.shape

In [None]:
# Sanity check: number of test labels
y_test.shape