# **Import Libraries**

In [1]:
import numpy as np
import pandas as pd
import re
import emoji

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.arlstem import ARLSTem

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

<hr><hr>

# **Read Data from the Files**

In [48]:
positive_tweets__file = "Data/PositiveTweets.tsv"
negative_tweets__file = "Data/NegativeTweets.tsv"
bad_words__file = "Data/bad_words.txt"

In [3]:
neg_df = pd.read_csv(negative_tweets__file, sep='\t', header=None, names=["sentiment", "content"], encoding='utf-8')

In [4]:
pos_df = pd.read_csv(positive_tweets__file, sep='\t', header=None, names=["sentiment", "content"], encoding='utf-8')

In [5]:
df = pd.concat([neg_df, pos_df])

In [6]:
df

Unnamed: 0,sentiment,content
0,neg,اعترف ان بتس كانو شوي شوي يجيبو راسي لكن اليوم...
1,neg,توقعت اذا جات داريا بشوفهم كاملين بس لي للحين ...
2,neg,#الاهلي_الهلال اكتب توقعك لنتيجة لقاء الهلال و...
3,neg,نعمة المضادات الحيوية . تضع قطرة💧مضاد بنسلين ع...
4,neg,الدودو جايه تكمل علي 💔
...,...,...
22756,pos,السحب الليلة على الايفون .. رتويت للمرفقة وطبق...
22757,pos,😂 لابسة احمر ليه يا ست انتي ايه المناسبة 😂
22758,pos,كلاام جمييل تستاهل(من احبه الله جعل محبته ف قل...
22759,pos,- ألطف صورة ممكن تعبر عن رمضان 💙


In [7]:
bad_words = set()
with open(bad_words__file, "r", encoding="utf-8") as file:
    for line in file:
        bad_words.add(line.strip())

<hr><hr>

# **Adding Features**

## 1. Before Pre-processing

In [8]:
df["content_length_before"] = df["content"].apply(len)

In [9]:
df["tokens_count_before"] = df["content"].apply(lambda x: len(word_tokenize(x)))

In [10]:
df["sentences_count_before"] = df["content"].apply(lambda x: len(sent_tokenize(x)))

In [11]:
df["hashtags_count"] = df["content"].apply(lambda x: len(re.findall(r'#', x)))

In [12]:
df["bad_words_count"] = df["content"].apply(lambda x: len([word for word in word_tokenize(x) if word in bad_words]))

---

In [13]:
df["emojis_count"] = df["content"].apply(lambda x: emoji.emoji_count(x))

In [14]:
love_emojis = ['❣', '💍', '🤎', '💌', '🧡', '💙', '💛', '🤍', '💗', '💓', '💋', '💝', '💚', '🖤', '💟', '🔥',
                '\u200d', '🩹', '💘', '💜', '❤', '💟', '💞', '💕', '💖', '😍', '😘', '😗', '😙', '♥️', '😚', '😽',
                '😇', '🫶', '😊', '☺️', '🤗', '🤩', '🥰', '😻', '🙈', '🙊', '🫦', '👄', '🫀', '💏', '💑', '💋']

df['love_emojis'] = df['content'].apply(
    lambda x: sum(emoji in x for emoji in love_emojis)
)

In [15]:
broken_heart_emoji = ['💔']

df['broken_heart'] = df['content'].apply(
    lambda x: sum(emoji in x for emoji in broken_heart_emoji)
)

In [16]:
happy_emojis = ['😄', '😹', '😃', '😆', '😅', '😀', '🤣', '😂', '😁', '😝', '😸', '😇', '🤪', '😎', '🕺', '💃']

df['happy_emojis'] = df['content'].apply(
    lambda x: sum(emoji in x for emoji in happy_emojis)
)

In [17]:
sad_emojis = ['\U0001fae4', '🥲', '😟', '😩', '😢', '😓', '😥', '🙃', '💅', '😞', '😔', '🙁', '☹️', '😿', '😖',
                '️💔', '😰', '😭', '😣', '☹', '😕', '🥵', '🥴', '🤕', '🤒']

df['sad_emojis'] = df['content'].apply(
    lambda x: sum(emoji in x for emoji in sad_emojis)
)

In [18]:
smile_emojis = ['🙂', '🙃']

df['smile_emojis'] = df['content'].apply(
    lambda x: sum(emoji in x for emoji in smile_emojis)
)

In [19]:
thinking_emojis = ['🤔', '🤨', '🧐', '🤓']

df['thinking_emojis'] = df['content'].apply(
    lambda x: sum(emoji in x for emoji in thinking_emojis)
)

In [20]:
flowers_emojis = ['💐', '🌸', '🏵️', '🌹', '🌺', '🌻', '🌼', '🌷', '🥀', '☘️', '🍁']

df['flowers_emojis'] = df['content'].apply(
    lambda x: sum(emoji in x for emoji in flowers_emojis)
)

In [21]:
moon_and_sun_emojis = ['🌝', '🌑', '🌒', '🌓', '🌔', '🌕', '🌖', '🌗', '🌘', '🌙', '🌚', '🌛', '🌜', '☀️', '🌞',
                        '⭐', '🌟', '🌠', '✨']

df['moon_and_sun_emojis'] = df['content'].apply(
    lambda x: sum(emoji in x for emoji in moon_and_sun_emojis)
)

In [22]:
hand_emojis = ['👉', '✊', '👌', '\U0001faf5', '🤘', '🤞', '🤝', '\U0001faf1', '👐', '👎', '\U0001faf0', '🤟', '🤜',
                '🤲', '👋', '👈', '🤚', '🙌', '🙏', '🤌', '🤏', '\U0001faf2', '👆', '🖐', '💪', '✌', '🤙', '✋',
                '👊', '\U0001faf4', '🖖', '👍', '☝', '\U0001faf3', '👏', '🤛', '👇']

df['hand_emojis'] = df['content'].apply(
    lambda x: sum(emoji in x for emoji in hand_emojis)
)

In [23]:
surprising_emojis = ['🤨', '😐', '😑', '😶', '😮', '😯', '😲', '😧', '😦', '😨', '😱', '🤯', '😵', '😵‍💫', '🧐']

df['surprising_emojis'] = df['content'].apply(
    lambda x: sum(emoji in x for emoji in surprising_emojis)
)

In [24]:
angry_emojis = ['😑', '😐', '😤', '😮‍💨', '🤬', '😡', '😠', '🤢', '🤮', '👿', '😈']

df['angry_emojis'] = df['content'].apply(
    lambda x: sum(emoji in x for emoji in angry_emojis)
)

In [25]:
prohibited_emojis = ['🚫', '🔇', '🔕', '🛑', '🆘', '⛔', '🛑', '📛', '❌', '⭕', '🔞', '☢️']

df['prohibited_emojis'] = df['content'].apply(
    lambda x: sum(emoji in x for emoji in prohibited_emojis)
)

---

In [26]:
def count_substrings(row, substrings):
    count = 0
    for substring in substrings:
        count += row.count(substring)
    return count

In [27]:
df['يارب'] = df['content'].apply(lambda x: count_substrings(x, ['يارب', 'يا رب']))

In [28]:
df['الحمد'] = df['content'].str.count('الحمد')

In [29]:
df['لله'] = df['content'].str.count('لله')

In [30]:
good_words = ['جميل', 'جمال', 'حب', 'خير', 'صباح']
df['good_words'] = df['content'].apply(lambda x: count_substrings(x, good_words))

In [31]:
ad_words = ['عيب', 'غلط', 'تعب', 'كئيب', 'قرف', 'مرض', 'موت', 'سيء', 'مشكل', 'خرا', 'زفت', 'ظلم', 'كذب']
df['bad_words'] = df['content'].apply(lambda x: count_substrings(x, bad_words))

<hr> 

## 2. Define Pre-processing Functions

In [32]:
def remove_stop_words(text):
    stop_words = set(stopwords.words("arabic"))
    words = word_tokenize(text)
    words = [word for word in words if word.lower() not in stop_words]
    return " ".join(words)

In [33]:
def remove_non_arabic(text):
    return re.sub(r'[^\u0621-\u064A\s]', '', text)

In [34]:
def remove_consecutive_redundant_characters(text, number_of_consecutive_characters):
    text += "\0"
    result = ""

    count = 1
    prev_char = text[0]

    for i in range(1, len(text)):
        current_char = text[i]
        if current_char == prev_char:
            count += 1
        else:
            if count > number_of_consecutive_characters:
                result += prev_char
            else:
                result += prev_char * count

            count = 1
            prev_char = current_char

    return result

In [35]:
def stem_words(text):
    stemmer = ARLSTem()
    words = word_tokenize(text)
    words = [stemmer.stem(word) for word in words]
    return " ".join(words)

In [36]:
def normalize_text(text):
    text = remove_stop_words(text)
    text = remove_non_arabic(text)
    text = remove_consecutive_redundant_characters(text, 3)
    text = stem_words(text)
    text = remove_stop_words(text)

    return text

<hr> 

## 3. After Pre-processing

In [37]:
df["normalized_content"] = df["content"].apply(normalize_text)

In [38]:
df["content_length_after"] = df["normalized_content"].apply(len)

In [39]:
df["tokens_count_after"] = df["normalized_content"].apply(lambda x: len(word_tokenize(x)))

<hr><hr>

# **Building ML Models**

In [40]:
X = df.drop(columns=['sentiment', 'content', 'normalized_content'])
y = df['sentiment']

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [42]:
def print_measures(y_test, y_pred):
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("Precision:", metrics.precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", metrics.recall_score(y_test, y_pred, average='weighted'))
    print("F-measure:", metrics.f1_score(y_test, y_pred, average='weighted'))
    print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred))

<hr>

**1. Decision Tree (Random Forest) Model**

In [43]:
clf = RandomForestClassifier(random_state=123)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print_measures(y_test, y_pred)

Accuracy: 0.8856659059118015
Precision: 0.8857288463426933
Recall: 0.8856659059118015
F-measure: 0.8856338009882809
Confusion Matrix:
 [[5790  834]
 [ 719 6240]]


<hr>

**2. XGBoost Model**

In [44]:
le = LabelEncoder()
y_train_xgb = le.fit_transform(y_train)
y_test_xgb = le.transform(y_test)

clf = xgb.XGBClassifier()
clf = clf.fit(X_train, y_train_xgb)
y_pred = clf.predict(X_test)

print_measures(y_test_xgb, y_pred)

Accuracy: 0.8635058529043658
Precision: 0.8635123373424456
Recall: 0.8635058529043658
F-measure: 0.8634834413694568
Confusion Matrix:
 [[5660  964]
 [ 890 6069]]


<hr>

**3. Neural Networks Model**

In [45]:
clf = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=1000)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print_measures(y_test, y_pred)

Accuracy: 0.8614444526246043
Precision: 0.8614892148026398
Recall: 0.8614444526246043
F-measure: 0.8614551589051935
Confusion Matrix:
 [[5708  916]
 [ 966 5993]]


<hr>

**4. KNN Model**

In [46]:
clf = KNeighborsClassifier(n_neighbors=3)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print_measures(y_test, y_pred)

Accuracy: 0.7963631009349923
Precision: 0.7969107407291351
Recall: 0.7963631009349923
F-measure: 0.7963882673390669
Confusion Matrix:
 [[5361 1263]
 [1503 5456]]


<hr>

**5. Naive Base Model**

In [47]:
clf = GaussianNB()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print_measures(y_test, y_pred)

Accuracy: 0.783626592063609
Precision: 0.8278724586210388
Recall: 0.783626592063609
F-measure: 0.7747507496508554
Confusion Matrix:
 [[3891 2733]
 [ 206 6753]]
