In [15]:
!pip install numpy pandas scikit-learn sentence-transformers bertopic arabic-reshaper python-bidi

Collecting arabic-reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl (20 kB)
Collecting python-bidi
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Installing collected packages: arabic-reshaper, python-bidi
Successfully installed arabic-reshaper-3.0.0 python-bidi-0.4.2


In [16]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.metrics import adjusted_mutual_info_score
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import arabic_reshaper
from bidi.algorithm import get_display
import nltk
from sklearn.cluster import KMeans

In [17]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
# Preprocessing functions
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    arabic_stopwords = set(stopwords.words('arabic'))
    words = text.split()
    filtered_words = [word for word in words if word not in arabic_stopwords]
    return ' '.join(filtered_words)

In [19]:
def load_dataset(file_path):
    df = pd.read_csv(file_path)
    df.dropna(subset=['text', 'targe'], inplace=True)
    df['cleaned_text'] = df['text'].apply(preprocess_text)
    return df

In [20]:
def get_stratified_sample(df, sample_size=1000):
    return df.groupby('targe', group_keys=False).apply(lambda x: x.sample(min(len(x), sample_size // 5))).reset_index(drop=True)

In [21]:
file_path = '/content/drive/MyDrive/Colab Notebooks/arabic_dataset_classifiction.csv'
df = load_dataset(file_path)

In [None]:
# renaming the target class
df.rename(columns={'targe': 'target'}, inplace=True)

In [None]:
df.head()

Unnamed: 0,text,target,cleaned_text
0,بين أستوديوهات ورزازات وصحراء مرزوكة وآثار ولي...,0,أستوديوهات ورزازات وصحراء مرزوكة وآثار وليلي ا...
1,قررت النجمة الأمريكية أوبرا وينفري ألا يقتصر ع...,0,قررت النجمة الأمريكية أوبرا وينفري يقتصر عملها...
2,أخبارنا المغربية الوزاني تصوير الشملالي ألهب ا...,0,أخبارنا المغربية الوزاني تصوير الشملالي ألهب ا...
3,اخبارنا المغربية قال ابراهيم الراشدي محامي سعد...,0,اخبارنا المغربية قال ابراهيم الراشدي محامي سعد...
4,تزال صناعة الجلود في المغرب تتبع الطريقة التقل...,0,تزال صناعة الجلود المغرب تتبع الطريقة التقليدي...


In [None]:
# Example usage
sample_df = get_stratified_sample(df)
articles = sample_df['cleaned_text'].tolist()
true_labels = sample_df['target'].tolist()
print(f"articles: \n{len(articles)}")
print(f"true_labels: \n{len(true_labels)}")

articles: 
1000
true_labels: 
1000


In [None]:
# Calculate AMI score
def calculate_ami(true_labels, predicted_labels):
    if len(true_labels) != len(predicted_labels):
        min_length = min(len(true_labels), len(predicted_labels))
        true_labels = true_labels[:min_length]
        predicted_labels = predicted_labels[:min_length]
    return adjusted_mutual_info_score(true_labels, predicted_labels)

In [None]:
def lda_topic_modeling(articles, num_topics=5):
    vectorizer = CountVectorizer()
    data_vectorized = vectorizer.fit_transform(articles)
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=0)
    lda_topics = lda_model.fit_transform(data_vectorized)
    return np.argmax(lda_topics, axis=1)

In [None]:
# LDA Topic Modeling
lda_topics = lda_topic_modeling(articles, num_topics=5)
print(f"lda_topics: \n{lda_topics}")
lda_ami = calculate_ami(true_labels, lda_topics)
print(f'LDA AMI: {lda_ami}')

lda_topics: 
[3 4 0 1 1 3 4 4 1 3 0 1 2 0 0 3 3 0 0 1 4 3 4 1 2 4 0 1 3 1 0 4 0 1 3 2 2
 0 1 2 4 3 2 0 2 1 1 1 0 1 2 3 0 4 1 4 1 3 4 0 4 1 3 0 0 3 2 3 1 0 4 1 2 4
 0 0 1 0 1 3 3 2 0 1 4 1 2 2 0 0 0 1 4 0 1 2 0 4 0 4 1 4 1 1 3 4 1 4 2 1 3
 3 0 4 4 3 1 1 1 1 0 3 2 2 2 3 4 1 3 1 2 1 1 2 0 4 3 2 2 3 3 1 3 4 3 2 3 0
 1 3 4 3 1 0 2 1 4 2 3 3 3 1 2 2 1 3 3 0 1 1 4 2 3 4 2 0 2 0 3 1 2 1 3 1 1
 0 1 4 1 1 2 2 1 1 1 0 3 0 1 0 1 1 1 2 4 1 1 1 0 1 1 1 1 1 3 4 1 1 1 1 1 1
 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 4 1
 1 1 1 1 1 1 0 4 1 1 1 3 1 1 1 1 1 1 1 1 3 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 4 3 1 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 2 1 1 1 4 1 1 1 0 1 1 1 1
 4 1 1 2 4 1 4 2 1 0 1 1 2 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 4 1 1
 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 0 1 3 4 1 1 1 1 1 1 1 1 1 1 1 3 1 3 0 3 3 3
 0 0 4 3 1 0 1 0 0 1 2 1 0 4 4 0 0 2 2 0 2 3 4 1 2 2 1 0 0 3 1 4 4 4 3 2 4
 1 1 0 3 1 3 0 0 2 4 4 2 4 3 0 2 4 1 4 0 0 4 0 3 2 2 1 3 0 4 3 4 2 4 0 4 1
 0 4 1 3 4 0

In [None]:
# def bert_topic_modeling(articles, num_topics=5):
#     model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
#     embeddings = model.encode(articles, show_progress_bar=True)
#     #topic_model = BERTopic(language="arabic", nr_topics=num_topics)
#     topic_model = BERTopic(language="multilingual", nr_topics=num_topics)
#     topics, _ = topic_model.fit_transform(articles, embeddings)
#     return topics

In [None]:
def bert_topic_modeling(articles, num_topics=5):
    try:
        model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
        embeddings = model.encode(articles, show_progress_bar=True)

        # Custom KMeans clustering model
        kmeans = KMeans(n_clusters=num_topics, random_state=0)

        # Create BERTopic model with custom clustering
        topic_model = BERTopic(embedding_model=model, umap_model=None, hdbscan_model=kmeans)
        topics, probs = topic_model.fit_transform(articles, embeddings)
        return topics
    except Exception as e:
        print(f"Error occurred: {e}")
        print("Documents causing error:")
        for article in articles:
            print(article)
        return None

In [None]:
# BERT Topic Modeling
bert_topics = bert_topic_modeling(articles, num_topics=5)
print(f"bert_topics: \n{bert_topics}")
bert_ami = calculate_ami(true_labels, bert_topics)
print(f'BERT AMI: {bert_ami}')

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

bert_topics: 
[1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, 4, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 3, 3, 4, 1, 1, 3, 1, 1, 1, 3, 1, 3, 1, 1, 1, 2, 1, 3, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 3, 0, 1, 1, 3, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 3, 3, 3, 1, 1, 1, 3, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 3, 1, 1, 1, 1, 0, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 3, 1, 1, 0, 1, 1, 1, 3, 1, 1, 1, 4, 4, 4, 3, 4, 4, 4, 4, 3, 4, 3, 4, 4, 3, 4, 4, 4, 4, 3, 4, 4, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 4, 3, 4, 4, 3, 4, 0, 4, 4, 4, 4, 4, 3, 4, 3, 3, 3, 4, 3, 4, 3, 4, 3, 4, 4, 4, 4, 4, 3, 3, 3, 4, 3, 3, 3, 3, 4, 4, 1, 4, 4, 4, 4, 4, 3, 0, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 4, 3, 4, 3, 3, 4, 3, 4, 4, 4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3

In [None]:
def nmf_topic_modeling(articles, num_topics=5):
    vectorizer = TfidfVectorizer()
    data_vectorized = vectorizer.fit_transform(articles)
    nmf_model = NMF(n_components=num_topics, random_state=0)
    nmf_topics = nmf_model.fit_transform(data_vectorized)
    return np.argmax(nmf_topics, axis=1)

In [None]:
# NMF Topic Modeling
nmf_topics = nmf_topic_modeling(articles, num_topics=5)
print(f"nmf_topics: \n{nmf_topics}")
nmf_ami = calculate_ami(true_labels, nmf_topics)
print(f'NMF AMI: {nmf_ami}')

nmf_topics: 
[4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 3 4 4 4 4 4 4 0 4 4 4 4 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 4 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 3 4 3 4
 4 4 4 4 4 4 3 3 3 3 4 3 4 4 4 3 3 0 0 4 3 4 3 4 4 3 4 4 4 3 3 4 4 4 4 4 4
 3 4 0 3 4 4 4 3 4 0 4 4 4 3 4 3 4 3 3 4 4 4 4 3 3 3 4 4 4 3 4 3 4 4 4 0 3
 4 4 4 4 3 3

#NPMI Measure

## installing/importing libraries

In [2]:
!pip install flair
!pip install bertopic[all]

Collecting flair
  Downloading flair-0.13.1-py3-none-any.whl (388 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.3/388.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3>=1.20.27 (from flair)
  Downloading boto3-1.34.131-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bpemb>=0.3.2 (from flair)
  Downloading bpemb-0.3.5-py3-none-any.whl (19 kB)
Collecting conllu>=4.0 (from flair)
  Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Collecting deprecated>=1.2.13 (from flair)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting ftfy>=6.1.0 (from flair)
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting janome>=0.4.2 (from flair)
  Downloading Janome-0.5.0-py2.py3-none-any.whl (19

In [22]:
import pandas as pd
from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
from gensim.models import LdaMulticore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [23]:
# Loading Data
from google.colab import drive
drive.mount('/content/drive')

# read data
file_path =  "/content/drive/MyDrive/Colab Notebooks/arabic_dataset_classifiction.csv"
data = load_dataset(file_path)
data.rename(columns={"targe":"target"})
data.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,text,targe,cleaned_text
0,بين أستوديوهات ورزازات وصحراء مرزوكة وآثار ولي...,0,أستوديوهات ورزازات وصحراء مرزوكة وآثار وليلي ا...
1,قررت النجمة الأمريكية أوبرا وينفري ألا يقتصر ع...,0,قررت النجمة الأمريكية أوبرا وينفري يقتصر عملها...
2,أخبارنا المغربية الوزاني تصوير الشملالي ألهب ا...,0,أخبارنا المغربية الوزاني تصوير الشملالي ألهب ا...
3,اخبارنا المغربية قال ابراهيم الراشدي محامي سعد...,0,اخبارنا المغربية قال ابراهيم الراشدي محامي سعد...
4,تزال صناعة الجلود في المغرب تتبع الطريقة التقل...,0,تزال صناعة الجلود المغرب تتبع الطريقة التقليدي...


## Getting data as documents

In [24]:
# view data shape
data["cleaned_text"] = data["cleaned_text"].astype(str)
data.shape

(108789, 3)

In [25]:
data.head()

Unnamed: 0,text,targe,cleaned_text
0,بين أستوديوهات ورزازات وصحراء مرزوكة وآثار ولي...,0,أستوديوهات ورزازات وصحراء مرزوكة وآثار وليلي ا...
1,قررت النجمة الأمريكية أوبرا وينفري ألا يقتصر ع...,0,قررت النجمة الأمريكية أوبرا وينفري يقتصر عملها...
2,أخبارنا المغربية الوزاني تصوير الشملالي ألهب ا...,0,أخبارنا المغربية الوزاني تصوير الشملالي ألهب ا...
3,اخبارنا المغربية قال ابراهيم الراشدي محامي سعد...,0,اخبارنا المغربية قال ابراهيم الراشدي محامي سعد...
4,تزال صناعة الجلود في المغرب تتبع الطريقة التقل...,0,تزال صناعة الجلود المغرب تتبع الطريقة التقليدي...


In [27]:
# take the text as documents
samp = data
documents = samp['cleaned_text'].values
type(documents)

numpy.ndarray

## BERTopic

In [9]:
#To experiment with other BERT models simply change the model name below
arabert = TransformerDocumentEmbeddings('aubmindlab/bert-base-arabertv02')

tokenizer_config.json:   0%|          | 0.00/381 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/825k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [10]:
# create topic model
topic_model = BERTopic(language="arabic", low_memory=True ,calculate_probabilities=False,
                     embedding_model=arabert)


In [None]:
# fit the model using the cleaned documents
topics, probs = topic_model.fit_transform(documents)

In [None]:
#extract most frequent topics
topic_model.get_topic_freq().head(5)

In [None]:
#show the top 10 words in topic 1
topic_model.get_topic(1)

### Evaluation

To evaluate the model topics coherence we use Gensim implementation of the Normalized Pointwise Mutual Information (NPMI).

In [None]:
texts = [[word for word in str(document).split()] for document in documents]
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
topics=[]
for i in topic_model.get_topics():
  row=[]
  topic= topic_model.get_topic(i)
  for word in topic:
     row.append(word[0])
  topics.append(row)

In [None]:
# compute Coherence Score

cm = CoherenceModel(topics=topics, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_npmi')
coherence = cm.get_coherence()
print('\nCoherence Score: ', coherence)

## NMF

In [28]:
# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
tfidf = tfidf_vectorizer.fit_transform(documents)
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out() # Use get_feature_names_out() instead of get_feature_names()


In [29]:
#chang the number of topics here
no_topics = 5

# run NMF
nmf = NMF(n_components=no_topics, random_state=1, l1_ratio=.5, init='nndsvd').fit(tfidf)

In [33]:
topics_NMF=[]
for index, topic in enumerate(nmf.components_):
    row=[]
    for i in topic.argsort()[-10:]:
      row.append(tfidf_vectorizer.get_feature_names_out()[i])
    topics_NMF.append(row)

In [34]:
cm = CoherenceModel(topics=topics_NMF, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_npmi')
coherence_nmf = cm.get_coherence()
print('\nCoherence Score: ', coherence_nmf)


Coherence Score:  0.1771930309362741


## LDA

In [35]:
#chang the number of topics here
no_topics = 5

# run LDA
lda = LdaMulticore(corpus, id2word=id2word, num_topics=no_topics)


  self.pid = os.fork()


In [36]:
#compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda, texts=texts, dictionary=id2word, coherence='c_npmi')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



Coherence Score:  0.05817654209700832
