In [63]:
# %pip install arabert
# %pip install tashaphyne
import numpy as np
import nltk
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from arabert.preprocess import ArabertPreprocessor

In [64]:
#reading data
ar_harass=pd.read_csv('ar_dataset.csv')

In [65]:
ar_harass.head(5)

Unnamed: 0,HITId,tweet,sentiment,directness,annotator_sentiment,target,group
0,1,صلاة الفجر خير لك من ترديد بول البعير وسبي الن...,hateful_normal,indirect,shock,gender,individual
1,2,صراحة نفسي اشوف ولاد الوسخة اللي قالوا مدرب اج...,offensive,indirect,anger_confusion_sadness_indifference_disgust,other,other
2,3,طيب! هي متبرجة وعبايتها ملونه وطالعة من بيتهم ...,offensive,indirect,indifference,other,individual
3,4,@user @user انا اوافقك بخصوص السوريين و العراق...,normal,direct,indifference,origin,other
4,5,هذه السعودية التي شعبها شعب الخيم و بول البعير...,normal,indirect,indifference,origin,other


In [66]:
ar_harass=ar_harass.drop(['directness','annotator_sentiment','target','group','HITId'],axis=1) #drop unnecessary cols

In [67]:
ar_harass = ar_harass[ar_harass.sentiment != 'fearful']

In [68]:
ar_harass.head()

Unnamed: 0,tweet,sentiment
0,صلاة الفجر خير لك من ترديد بول البعير وسبي الن...,hateful_normal
1,صراحة نفسي اشوف ولاد الوسخة اللي قالوا مدرب اج...,offensive
2,طيب! هي متبرجة وعبايتها ملونه وطالعة من بيتهم ...,offensive
3,@user @user انا اوافقك بخصوص السوريين و العراق...,normal
4,هذه السعودية التي شعبها شعب الخيم و بول البعير...,normal


In [69]:
z=list(ar_harass.sentiment.drop_duplicates())

In [70]:
z

['hateful_normal',
 'offensive',
 'normal',
 'offensive_disrespectful',
 'offensive_normal',
 'hateful',
 'abusive_disrespectful',
 'abusive_hateful',
 'disrespectful',
 'abusive',
 'disrespectful_normal',
 'abusive_offensive',
 'abusive_normal',
 'offensive_hateful',
 'abusive_offensive_hateful_disrespectful_normal',
 'fearful_disrespectful_hateful_normal',
 'abusive_offensive_disrespectful_hateful_normal',
 'fearful_abusive_offensive_hateful_disrespectful',
 'fearful_abusive_disrespectful_hateful_normal',
 'fearful_abusive_offensive_disrespectful_normal',
 'hateful_disrespectful',
 'abusive_offensive_hateful_normal',
 'fearful_offensive_disrespectful_hateful_normal',
 'abusive_offensive_hateful_disrespectful',
 'fearful_abusive_hateful_disrespectful_normal',
 'fearful_normal',
 'fearful_offensive_hateful_normal',
 'abusive_disrespectful_hateful_normal',
 'abusive_offensive_disrespectful_normal',
 'hateful_disrespectful_normal',
 'fearful_disrespectful',
 'fearful_abusive_offensive_ha

In [71]:
z.remove('normal')

In [72]:
ar_harass['sentiment']=ar_harass.sentiment.replace(z, 'offensive')

In [74]:
ar_harass.head(14)

Unnamed: 0,tweet,sentiment
0,صلاة الفجر خير لك من ترديد بول البعير وسبي الن...,offensive
1,صراحة نفسي اشوف ولاد الوسخة اللي قالوا مدرب اج...,offensive
2,طيب! هي متبرجة وعبايتها ملونه وطالعة من بيتهم ...,offensive
3,@user @user انا اوافقك بخصوص السوريين و العراق...,normal
4,هذه السعودية التي شعبها شعب الخيم و بول البعير...,normal
5,ما عارف كنت شايفها منحرفة شديد وانا صغير ليه @url,offensive
6,@user @user تخيل يرد عليك يقولك حسب معلوماتنا ...,offensive
7,صباح التوكل على الله والسعي على رزقه والثقة في...,normal
8,وانتي مال امك يا مطلقة يا بايرة ياوش قنوات @url,offensive
9,@user @user الكردي كلب من يوم يومه بحاجة دائما...,offensive


In [75]:
ar_harass['sentiment'].value_counts()

sentiment
offensive    2426
normal        915
Name: count, dtype: int64

In [76]:
#check missing values
ar_harass.isna().sum()

tweet        0
sentiment    0
dtype: int64

In [77]:
#data shape
ar_harass.shape

(3341, 2)

In [78]:
ar_harass.head()

Unnamed: 0,tweet,sentiment
0,صلاة الفجر خير لك من ترديد بول البعير وسبي الن...,offensive
1,صراحة نفسي اشوف ولاد الوسخة اللي قالوا مدرب اج...,offensive
2,طيب! هي متبرجة وعبايتها ملونه وطالعة من بيتهم ...,offensive
3,@user @user انا اوافقك بخصوص السوريين و العراق...,normal
4,هذه السعودية التي شعبها شعب الخيم و بول البعير...,normal


In [79]:
# #text preprocessing
# import nltk
# nltk.download('all')

In [80]:
#%pip install camel-tools
# %pip install textblob

In [81]:
#Text standarisation
from nltk.corpus import stopwords
from textblob import TextBlob
import re
from tashaphyne.stemming import ArabicLightStemmer
from nltk.stem.isri import ISRIStemmer

stops = set(stopwords.words("arabic"))
stop_word_comp = {"،","آض","آمينَ","آه","آهاً","آي","أ","أب","أجل","أجمع","أخ","أخذ","أصبح","أضحى","أقبل","أقل","أكثر","ألا","أم","أما","أمامك","أمامكَ","أمسى","أمّا","أن","أنا","أنت","أنتم","أنتما","أنتن","أنتِ","أنشأ","أنّى","أو","أوشك","أولئك","أولئكم","أولاء","أولالك","أوّهْ","أي","أيا","أين","أينما","أيّ","أَنَّ","أََيُّ","أُفٍّ","إذ","إذا","إذاً","إذما","إذن","إلى","إليكم","إليكما","إليكنّ","إليكَ","إلَيْكَ","إلّا","إمّا","إن","إنّما","إي","إياك","إياكم","إياكما","إياكن","إيانا","إياه","إياها","إياهم","إياهما","إياهن","إياي","إيهٍ","إِنَّ","ا","ابتدأ","اثر","اجل","احد","اخرى","اخلولق","اذا","اربعة","ارتدّ","استحال","اطار","اعادة","اعلنت","اف","اكثر","اكد","الألاء","الألى","الا","الاخيرة","الان","الاول","الاولى","التى","التي","الثاني","الثانية","الذاتي","الذى","الذي","الذين","السابق","الف","اللائي","اللاتي","اللتان","اللتيا","اللتين","اللذان","اللذين","اللواتي","الماضي","المقبل","الوقت","الى","اليوم","اما","امام","امس","ان","انبرى","انقلب","انه","انها","او","اول","اي","ايار","ايام","ايضا","ب","بات","باسم","بان","بخٍ","برس","بسبب","بسّ","بشكل","بضع","بطآن","بعد","بعض","بك","بكم","بكما","بكن","بل","بلى","بما","بماذا","بمن","بن","بنا","به","بها","بي","بيد","بين","بَسْ","بَلْهَ","بِئْسَ","تانِ","تانِك","تبدّل","تجاه","تحوّل","تلقاء","تلك","تلكم","تلكما","تم","تينك","تَيْنِ","تِه","تِي","ثلاثة","ثم","ثمّ","ثمّة","ثُمَّ","جعل","جلل","جميع","جير","حار","حاشا","حاليا","حاي","حتى","حرى","حسب","حم","حوالى","حول","حيث","حيثما","حين","حيَّ","حَبَّذَا","حَتَّى","حَذارِ","خلا","خلال","دون","دونك","ذا","ذات","ذاك","ذانك","ذانِ","ذلك","ذلكم","ذلكما","ذلكن","ذو","ذوا","ذواتا","ذواتي","ذيت","ذينك","ذَيْنِ","ذِه","ذِي","راح","رجع","رويدك","ريث","رُبَّ","زيارة","سبحان","سرعان","سنة","سنوات","سوف","سوى","سَاءَ","سَاءَمَا","شبه","شخصا","شرع","شَتَّانَ","صار","صباح","صفر","صهٍ","صهْ","ضد","ضمن","طاق","طالما","طفق","طَق","ظلّ","عاد","عام","عاما","عامة","عدا","عدة","عدد","عدم","عسى","عشر","عشرة","علق","على","عليك","عليه","عليها","علًّ","عن","عند","عندما","عوض","عين","عَدَسْ","عَمَّا","غدا","غير","ـ","ف","فان","فلان","فو","فى","في","فيم","فيما","فيه","فيها","قال","قام","قبل","قد","قطّ","قلما","قوة","كأنّما","كأين","كأيّ","كأيّن","كاد","كان","كانت","كذا","كذلك","كرب","كل","كلا","كلاهما","كلتا","كلم","كليكما","كليهما","كلّما","كلَّا","كم","كما","كي","كيت","كيف","كيفما","كَأَنَّ","كِخ","لئن","لا","لات","لاسيما","لدن","لدى","لعمر","لقاء","لك","لكم","لكما","لكن","لكنَّما","لكي","لكيلا","للامم","لم","لما","لمّا","لن","لنا","له","لها","لو","لوكالة","لولا","لوما","لي","لَسْتَ","لَسْتُ","لَسْتُم","لَسْتُمَا","لَسْتُنَّ","لَسْتِ","لَسْنَ","لَعَلَّ","لَكِنَّ","لَيْتَ","لَيْسَ","لَيْسَا","لَيْسَتَا","لَيْسَتْ","لَيْسُوا","لَِسْنَا","ما","ماانفك","مابرح","مادام","ماذا","مازال","مافتئ","مايو","متى","مثل","مذ","مساء","مع","معاذ","مقابل","مكانكم","مكانكما","مكانكنّ","مكانَك","مليار","مليون","مما","ممن","من","منذ","منها","مه","مهما","مَنْ","مِن","نحن","نحو","نعم","نفس","نفسه","نهاية","نَخْ","نِعِمّا","نِعْمَ","ها","هاؤم","هاكَ","هاهنا","هبّ","هذا","هذه","هكذا","هل","هلمَّ","هلّا","هم","هما","هن","هنا","هناك","هنالك","هو","هي","هيا","هيت","هيّا","هَؤلاء","هَاتانِ","هَاتَيْنِ","هَاتِه","هَاتِي","هَجْ","هَذا","هَذانِ","هَذَيْنِ","هَذِه","هَذِي","هَيْهَاتَ","و","و6","وا","واحد","واضاف","واضافت","واكد","وان","واهاً","واوضح","وراءَك","وفي","وقال","وقالت","وقد","وقف","وكان","وكانت","ولا","ولم","ومن","مَن","وهو","وهي","ويكأنّ","وَيْ","وُشْكَانََ","يكون","يمكن","يوم","ّأيّان"}
ArListem = ArabicLightStemmer()


In [82]:
processor=ArabertPreprocessor(model_name='aubmindlab/araelectra-base-discriminator')
processed_text=processor.preprocess(ar_harass['tweet'])

In [83]:
processed_text

'0 صلاة الفجر خير لك من ترديد بول البعير وسبي الن . . 1 صراحة نفسي اشوف ولاد الوسخة اللي قالوا مدرب اج . . 2 طيب ! هي متبرجة وعبايتها ملونه وطالعة من بيتهم . . 3 [مستخدم] [مستخدم] انا اوافقك بخصوص السوريين و العراق . . 4 هذه السعودية التي شعبها شعب الخيم و بول البعير . . . . 3348 [مستخدم] يمشي بخطا ادارتها قيد من الأمريكان ونهب . . 3349 [مستخدم] مهما حصل هندوس عليهم شويه الرويبضه بس نخ . . 3350 الكلب لا يعض اذن اخوه كذابين خنازير [مستخدم] 3351 [مستخدم] لأنه صغير ويكتب في قناة خنازير فلن نقرأ . . 3352 الحريم أجمع افلوس وايفون قديم وواتساب 24 ساعة Name : tweet , Length : 3341 , dtype : object'

In [84]:
def stem(text):
    zen = TextBlob(text)
    words = zen.words
    cleaned = list()
    for w in words:
        ArListem.light_stem(w)
        cleaned.append(ArListem.get_root())
    return " ".join(cleaned)

In [85]:
import pyarabic.araby as araby
def normalizeArabic(text):
    text = text.strip()
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    noise = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    text = re.sub(noise, '', text)
    text = re.sub(r'(.)\1+', r"\1\1", text) # Remove longation
    return araby.strip_tashkeel(text)

In [86]:
def remove_stop_words(text):
    zen = TextBlob(text)
    words = zen.words
    return " ".join([w for w in words if not w in stops and not w in stop_word_comp and len(w) >= 2])

In [87]:
#Deal with Hashtags in a string
def split_hashtag_to_words(tag):
    tag = tag.replace('#','')
    tags = tag.split('_')
    if len(tags) > 1 :

        return tags
    pattern = re.compile(r"[A-Z][a-z]+|\d+|[A-Z]+(?![a-z])")
    return pattern.findall(tag)

def clean_hashtag(text):
    words = text.split()
    text = list()
    for word in words:
        if is_hashtag(word):
            text.extend(extract_hashtag(word))
        else:
            text.append(word)
    return " ".join(text)
def is_hashtag(word):
    if word.startswith("#"):
        return True
    else:
        return False
def extract_hashtag(text):

    hash_list = ([re.sub(r"(\W+)$", "", i) for i in text.split() if i.startswith("#")])
    word_list = []
    for word in hash_list :
        word_list.extend(split_hashtag_to_words(word))
    return word_list

In [88]:
#Dealing with emojis in a string
with open('emojis.csv','r',encoding='utf-8') as f:
    lines = f.readlines()
    emojis_ar = {}
    for line in lines:
        line = line.strip('\n').split(';')
        emojis_ar.update({line[0].strip():line[1].strip()})

In [89]:
from __future__ import unicode_literals

def remove_emoji(text):
    emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    return text

In [90]:
import emoji
def is_emoji(word):
    if word in emojis_ar:
        return True
    else:
        return False

In [91]:
def add_space(text):
    return ''.join(' ' + char if is_emoji(char) else char for char in text).strip()

In [92]:
def clean_tweet(text):
    text = re.sub('#\d+K\d+', ' ', text)  # years like 2K19
    text = re.sub('http\S+\s*', ' ', text)  # remove URLs
    text = re.sub('RT|cc', ' ', text)  # remove RT and cc
    text = re.sub('@[^\s]+',' ',text)
    text = clean_hashtag(text)
    return text

In [93]:
def clean_text(text):
    ## Clean for tweets
    text = clean_tweet(text)
    ## Remove punctuations
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)  # remove punctuation
    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)
    ## Remove Emojis
    text = remove_emoji(text)
    ## Convert text to lowercases
    text = text.lower()
    ## Arabisy the text
    #text = to_arabic(text)
    ## Remove stop words
    text = remove_stop_words(text)
    ## Remove numbers
    text = re.sub("\d+", " ", text)
    ## Remove Tashkeel
    text = normalizeArabic(text)
    #text = re.sub('\W+', ' ', text)
    text = re.sub('[A-Za-z]+',' ',text)
    text = re.sub(r'\\u[A-Za-z0-9\\]+',' ',text)
    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)
    #Stemming
    #text = stem(text)
    return text

In [94]:
ar_harass['tweet'] = ar_harass['tweet'].apply(lambda x:clean_text(x))

In [95]:
ar_harass.head()

Unnamed: 0,tweet,sentiment
0,صلاه الفجر خير ترديد بول البعير وسبي النساء وا...,offensive
1,نفسي اشوف ولاد الوسخه اللي قالوا مدرب اجنبي من...,offensive
2,طيب متبرجه وعبايتها ملونه وطالعه بيتهم بدون,offensive
3,انا اوافقك بخصوص السوريين العراقيين بخصوص السع...,normal
4,السعوديه شعبها شعب الخيم بول البعير يستهزا الناس,normal


In [96]:
#train-test-split
#create feature and label sets
x=ar_harass['tweet']
y=ar_harass['sentiment']
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.20,random_state=123)

In [97]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
ar_harass['sentiment']=encoder.fit_transform(ar_harass['sentiment'])

In [98]:
ar_harass['sentiment']

0       1
1       1
2       1
3       0
4       0
       ..
3348    1
3349    1
3350    1
3351    1
3352    0
Name: sentiment, Length: 3341, dtype: int32

In [99]:
#feature extraction
#train using bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
x_train_cv=cv.fit_transform(x_train)

In [100]:
# #training logistic regression model
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(max_iter=10000,multi_class='auto',random_state=123)
model.fit(x_train_cv,y_train)

# #model 2
# from sklearn.tree import DecisionTreeClassifier

# model = DecisionTreeClassifier(random_state=123)
# model.fit(x_train_cv, y_train)

# # model 3
# from sklearn.ensemble import RandomForestClassifier

# model = RandomForestClassifier(n_estimators=100, random_state=123)
# model.fit(x_train_cv, y_train)


In [101]:
# transform x_test using CV
x_test_cv=cv.transform(x_test)

In [102]:
#generate predictions
predictions=model.predict(x_test_cv)

In [103]:
predictions

array(['normal', 'offensive', 'offensive', 'normal', 'normal',
       'offensive', 'offensive', 'offensive', 'offensive', 'offensive',
       'offensive', 'offensive', 'offensive', 'offensive', 'offensive',
       'offensive', 'offensive', 'normal', 'offensive', 'offensive',
       'offensive', 'offensive', 'offensive', 'offensive', 'offensive',
       'offensive', 'normal', 'offensive', 'offensive', 'normal',
       'offensive', 'offensive', 'offensive', 'offensive', 'offensive',
       'offensive', 'normal', 'offensive', 'offensive', 'offensive',
       'normal', 'offensive', 'offensive', 'offensive', 'offensive',
       'offensive', 'offensive', 'offensive', 'offensive', 'offensive',
       'offensive', 'offensive', 'offensive', 'offensive', 'normal',
       'offensive', 'offensive', 'offensive', 'offensive', 'offensive',
       'offensive', 'offensive', 'offensive', 'offensive', 'normal',
       'normal', 'offensive', 'offensive', 'offensive', 'offensive',
       'normal', 'normal'

In [104]:
#confusion matrix
import pandas as pd
from sklearn import metrics
cm=metrics.confusion_matrix(y_test,predictions)

In [105]:
cm

array([[ 77, 110],
       [ 41, 441]], dtype=int64)

In [106]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, predictions)
print("Accuracy: "+str(acc)+" %")

Accuracy: 0.7742899850523169 %


In [107]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

      normal       0.65      0.41      0.50       187
   offensive       0.80      0.91      0.85       482

    accuracy                           0.77       669
   macro avg       0.73      0.66      0.68       669
weighted avg       0.76      0.77      0.76       669



In [114]:
documents = [ "زبالة" , "عسل"   ]
test=cv.transform(documents)

In [115]:
model.predict(test)

array(['offensive', 'normal'], dtype=object)