In [1]:
!pip install Arabic-Stopwords

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Arabic-Stopwords
  Downloading Arabic_Stopwords-0.3-py3-none-any.whl (353 kB)
[K     |████████████████████████████████| 353 kB 11.4 MB/s 
[?25hCollecting pyarabic>=0.6.2
  Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[K     |████████████████████████████████| 126 kB 43.7 MB/s 
Installing collected packages: pyarabic, Arabic-Stopwords
Successfully installed Arabic-Stopwords-0.3 pyarabic-0.6.15


In [2]:
from sklearn.metrics import classification_report ,confusion_matrix ,accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer ,TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from nltk.tokenize import TreebankWordTokenizer
from keras.preprocessing.text import Tokenizer
import arabicstopwords.arabicstopwords as stp
from keras.models import Sequential
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from keras import layers
import seaborn as sns
import pandas as pd
import collections
import numpy as np
import matplotlib
import argparse
import codecs
import string
import pickle
import nltk
import math 
import sys
import re
import os
# Characters deleted by remove_punctuations(): the Arabic-specific marks
# first, followed by the ASCII punctuation set from the standard library.
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = "".join((arabic_punctuations, english_punctuations))

In [3]:
# Mount Google Drive at /content/drive (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Project folders on the mounted Drive. Every sub-folder is derived from
# root_path so relocating the project only requires editing one line.
# Plain '/' concatenation (not os.path.join) keeps the resulting strings
# identical on every platform.
root_path = '/content/drive/MyDrive/Colab Notebooks/Tasks/questions classification by topic'
data_path = root_path + '/DataSets'
results_path = root_path + '/results'
results_path_deep = root_path + '/result_with_deep_learning'

# Dataset Preparation (pre-processing)

In [5]:
# Arabic diacritics plus the tatweel (kashida) elongation character.
# The marks are written as explicit code-point escapes: combining characters
# are effectively invisible in most editors and are easily lost or altered
# when the source is copied or normalised.
_ARABIC_DIACRITIC_CHARS = (
    "\u0651"  # Tashdid
    "\u064E"  # Fatha
    "\u064B"  # Tanwin Fath
    "\u064F"  # Damma
    "\u064C"  # Tanwin Damm
    "\u0650"  # Kasra
    "\u064D"  # Tanwin Kasr
    "\u0652"  # Sukun
    "\u0640"  # Tatwil/Kashida
)
# A character class matching any single one of the marks above.
arabic_diacritics = re.compile("[" + _ARABIC_DIACRITIC_CHARS + "]")

In [6]:
def normalize_arabic(text):
    """Fold several Arabic letter variants onto one canonical letter each.

    The substitutions are applied in the listed order: alef variants to bare
    alef, alef maqsura to ya, hamza on waw / hamza on ya to standalone hamza,
    ta marbuta to ha, and gaf to kaf.

    Returns the rewritten string; the input is not modified.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [7]:
def remove_diacritics(text):
    """Return `text` with every match of the module-level `arabic_diacritics` pattern deleted."""
    return arabic_diacritics.sub('', text)

In [8]:
def remove_punctuations(text):
    """Return `text` with every character found in `punctuations_list` removed."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, punctuations_list))
    return text.translate(deletion_table)

In [9]:
def remove_repeating_char(text):
    """Collapse each run of one repeated character into a single occurrence.

    Newlines are left alone because `.` does not match them.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1', text)

In [10]:
def clean_text(text):
    """Run the full cleaning pipeline on one string.

    Steps, in order: strip diacritics, strip punctuation, collapse repeated
    characters, blank out every match of the regex below (Latin letters,
    digits, '@', '_', tatweel, remaining non-word symbols, '#', URLs), then
    keep only the words that are not stop words and are longer than two
    characters. The surviving words are joined with single spaces.
    """
    for step in (remove_diacritics, remove_punctuations, remove_repeating_char):
        text = step(text)
    text = re.sub(r"([@A-Za-z0-9_ـــــــــــــ]+)|[^\w\s]|#|http\S+", " ", text)
    kept_words = [
        word
        for word in text.split()
        if not stp.is_stop(u"{}".format(word)) and len(word) > 2
    ]
    return " ".join(kept_words)

In [11]:
def tfidf_vectorizer(data):
    """Fit a new TfidfVectorizer on `data`.

    Returns a `(features, vectorizer)` pair: the result of `fit_transform`
    and the fitted vectorizer that produced it.
    """
    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(data)
    return features, vectorizer

In [12]:
def count_vectorizer(data):
    """Fit a new CountVectorizer (bag of words) on `data`.

    The vectorizer is created with default arguments (`binary=True` is not set).

    Returns a `(features, vectorizer)` pair: the result of `fit_transform`
    and the fitted vectorizer that produced it.
    """
    vectorizer = CountVectorizer()
    features = vectorizer.fit_transform(data)
    return features, vectorizer

In [13]:
def pad_sequences_vectorizer(X_train, num_words=5000, maxlen=10000):
    """Tokenize texts and pad them into a fixed-width integer matrix.

    Parameters
    ----------
    X_train : iterable of str
        Texts used both to fit the tokenizer and to build the sequences.
    num_words : int, default 5000
        Passed to `Tokenizer(num_words=...)`.
    maxlen : int, default 10000
        Passed to `pad_sequences(maxlen=...)`; padding is added at the end
        of each sequence (`padding='post'`).

    Returns
    -------
    tuple
        `(padded, vocab_size, input_Dim, tokenizer)` where `padded` is the
        padded matrix, `vocab_size` is `len(tokenizer.word_index) + 1`,
        `input_Dim` is the shape of a single padded row, and `tokenizer`
        is the fitted Tokenizer.
    """
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(X_train)
    sequences = tokenizer.texts_to_sequences(X_train)
    # Adding 1 because of reserved 0 index.
    # NOTE(review): this counts every entry in word_index and does not look
    # at num_words — confirm that is the size downstream layers expect.
    vocab_size = len(tokenizer.word_index) + 1
    padded = pad_sequences(sequences, padding='post', maxlen=maxlen)
    # shape[1:] equals padded[0].shape for non-empty input, and unlike
    # padded[0] it does not raise IndexError when there are no rows.
    input_Dim = padded.shape[1:]
    return padded, vocab_size, input_Dim, tokenizer

# Save / Load Results

In [14]:
def save_object(obj, filename, directory=None):
    """Pickle `obj` to `<directory>/<filename>.pkl`.

    Parameters
    ----------
    obj : object
        Any picklable object.
    filename : str
        Base file name; the ".pkl" extension is appended here.
    directory : str or None, default None
        Target folder. When None, the module-level `results_path` is used.
    """
    if directory is None:
        directory = results_path
    target = os.path.join(directory, filename) + ".pkl"
    # The with-block closes the file; no explicit close() is needed.
    with open(target, 'wb') as outp:
        pickle.dump(obj, outp, pickle.HIGHEST_PROTOCOL)
def load_object(filename, directory=None):
    """Load and return the object pickled at `<directory>/<filename>.pkl`.

    Parameters
    ----------
    filename : str
        Base file name; the ".pkl" extension is appended here.
    directory : str or None, default None
        Source folder. When None, the module-level `results_path` is used.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.
    """
    if directory is None:
        directory = results_path
    source = os.path.join(directory, filename) + ".pkl"
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # were produced by a trusted source.
    # The with-block closes the file; no explicit close() is needed.
    with open(source, 'rb') as inp:
        return pickle.load(inp)

In [15]:
# Restore two previously pickled objects via load_object().
# NOTE(review): neither name is referenced by the other cells shown here — confirm they are still needed.
seq_Container_Result = load_object('seq_Container_Result_v2')
save_model = load_object('save_model_v2')

In [16]:
# Restore the pickled vectorizers; Test() looks up vectorizers[m] for the m-th model.
vectorizers = load_object('vectorizers_v2')

# Select the best model

In [22]:
# Restore the pickled collection of models that Test() iterates over.
best_models = load_object('best_models_v2')

In [17]:
# Restore the pickled topic labels; Test() indexes them by model position (labels[m]).
labels = load_object('labels')
labels

['التراجم والأخبار',
 'فقه',
 'التاريخ و السيرة النبوية',
 'العلم و الدعوة',
 'الحديث الشريف',
 'القرآن و علومه',
 'مشكلات اجتماعية وفكرية',
 'التربية و الآداب',
 'الفقه وفروعه',
 'السياسة الشرعية',
 'العقيدة الإسلامية',
 'الأذكار والأدعية',
 'الأطعمة والأشربة والصيد']

In [23]:
def Test(x):
  """Score one question with every model in `best_models` and print the results.

  The text is cleaned with `clean_text`, then for the m-th model it is
  transformed with `vectorizers[m]` and scored. One line is printed per
  model (label and score), followed by the label with the highest score.

  The score of a model is the largest predicted probability when the model
  predicts class 1, otherwise the smallest one.
  NOTE(review): presumably each model is a binary classifier for one topic,
  which would make this score the probability of the topic — confirm.

  Parameters
  ----------
  x : str
      Raw question text.

  Returns
  -------
  None
      Results are printed only.
  """
  best_pre = []
  x = clean_text(x)
  for m,model in enumerate(best_models):
    tx = vectorizers[m].transform([x])
    is_positive = model.predict(tx)[0]==1
    # Run predict_proba once per model and reuse the result for both the
    # printed line and the stored score.
    probabilities = model.predict_proba(tx)[0]
    score = max(probabilities) if is_positive else min(probabilities)
    print("Result Model :\t"+str(labels[m])+"\t<=\t",score)
    best_pre.append(score)
  print()
  best_score = max(best_pre)
  # list.index returns the first position of the maximum, so ties go to the
  # earliest model.
  print("***** predict :\t",labels[best_pre.index(best_score)],"\t<=\t",best_score)

In [31]:
# Try the classifier on a single Arabic question.
Test('دعاء عند الخروج من المنزل  ؟')

Result Model :	التراجم والأخبار	<=	 0.03960417475762865
Result Model :	فقه	<=	 0.04488719406571747
Result Model :	التاريخ و السيرة النبوية	<=	 0.0554428957944821
Result Model :	العلم و الدعوة	<=	 0.08975639700913961
Result Model :	الحديث الشريف	<=	 0.1022298366652542
Result Model :	القرآن و علومه	<=	 0.018894402531317956
Result Model :	مشكلات اجتماعية وفكرية	<=	 0.7926679283970166
Result Model :	التربية و الآداب	<=	 0.6178892839158834
Result Model :	الفقه وفروعه	<=	 0.05598814657026193
Result Model :	السياسة الشرعية	<=	 0.475435766621294
Result Model :	العقيدة الإسلامية	<=	 0.10096678930232582
Result Model :	الأذكار والأدعية	<=	 0.987479313516036
Result Model :	الأطعمة والأشربة والصيد	<=	 0.9504697666436251

***** predict :	 الأذكار والأدعية 	<=	 0.987479313516036
