<a href="https://colab.research.google.com/github/mazintaha49/open-source-arabic-sentiment-analysis/blob/main/research_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Load dependencies

In [2]:
# loading dependencies (i.e. importing)


# nltk, (multi NLP tasks related  model)
import nltk
nltk.download('punkt')
# pandas, data frame (rows, colomns) processing model
import pandas as pd
# numpy, numeric arrays and matrices (rows, colomns) processing model
import numpy as np
# train_test_split from sklearn model
from sklearn.model_selection import train_test_split
# re, provides regular expression matching operations
import re
# string, Common string operations
import string
# TweetTokenizer, a tokenizer form NLTK model
from nltk.tokenize import TweetTokenizer
# tokenizer, defining the specifications of the tokenizer, and assigning a name to this specifications
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
# Arabic-Stopwords, (the list of stopwords to be removed)
import arabicstopwords.arabicstopwords as stp
# stemmer, a arabic stemmer from snowballstemmer model
from snowballstemmer import stemmer
stemmer_arb = stemmer("arabic")
# pyarabic (arabic text processing library)
from pyarabic import araby

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load Stopwords

In [3]:
# defining a list of the stopwords
stp_wd_lis = list(stp.stopwords_list())
# printing the length of the list, i.e. stopwords count
print("stopwords count \n", len(stp_wd_lis))
# printing the first 5 elements
print("first 5 elements \n", stp_wd_lis[:5])

stopwords count 
 13629
first 5 elements 
 ['وعداهم', 'جنبنا', 'فباللتين', 'وبماذا', 'وأخونا']


## Load Dataset

In [4]:
# load dataset
path= 'https://raw.githubusercontent.com/mazintaha49/open-source-arabic-sentiment-analysis/main/dataset.csv'
df_text_label= pd.read_csv(path, encoding= 'utf-8-sig')

# printing dataset dataframe sample
print(df_text_label.head())

                                          tweet_text  label
0                               قرفت انتحر عن الدرج    -1.0
1                      صدقت يا رب إجعلنا راضين بقدرك    1.0
2                            قرف شوهاد شي بخزي عنجد    -1.0
3  صحح الخبر . مش تحرير المشتقات النفطية و إنما ت...   -1.0
4  قصة جميلة جدا تعكس معنى الايمان و التمسك بالعق...    1.0


## Process Dataset

In [5]:
# view dataset info
df_text_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1991 entries, 0 to 1990
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   tweet_text  1986 non-null   object 
 1   label       1980 non-null   float64
dtypes: float64(1), object(1)
memory usage: 31.2+ KB


In [6]:
# view dataset describtive info
df_text_label.describe(include= 'all')

Unnamed: 0,tweet_text,label
count,1986,1980.0
unique,1966,
top,للأسف البعض يعتقد ان المفاعل النووي سيحل مشاكل...,
freq,4,
mean,,0.0
std,,1.000253
min,,-1.0
25%,,-1.0
50%,,0.0
75%,,1.0


In [9]:
# view labels distribution
print("\n '\033[4m' initial class distribution '\033[0m' \n\n", df_text_label.groupby('label')['tweet_text'].nunique())


 '[4m' initial class distribution '[0m' 

 label
-1.0    968
 1.0    987
Name: tweet_text, dtype: int64


In [10]:
# search for null values within tweet_text column
print(df_text_label.tweet_text.isnull().value_counts())

# search for null values within label column
print(df_text_label.label.isnull().value_counts())

False    1986
True        5
Name: tweet_text, dtype: int64
False    1980
True       11
Name: label, dtype: int64


In [11]:
# Drop the null values 
df_text_label.dropna(inplace=True)
# Reset index after drop
df_text_label.reset_index(drop=True, inplace=True)

In [12]:
# confirm drop of null values

# search for null values within tweet_text column
print(df_text_label.tweet_text.isnull().value_counts())

# search for null values within label column
print(df_text_label.label.isnull().value_counts())


False    1975
Name: tweet_text, dtype: int64
False    1975
Name: label, dtype: int64


In [13]:
# check for duplicate values within tweet_text column
df_text_label.tweet_text.duplicated().value_counts()

False    1955
True       20
Name: tweet_text, dtype: int64

In [14]:
# drop duplicate values fro tweet_text column
df_text_label.drop_duplicates(subset=['tweet_text'], inplace= True)
# Reset index after drop
df_text_label.reset_index(drop=True, inplace=True)

In [15]:
# confirm drop of duplicate values

# check for duplicate values within tweet_text column
df_text_label.tweet_text.duplicated().value_counts()

False    1955
Name: tweet_text, dtype: int64

In [16]:
# view dataset describtive info
df_text_label.describe(include= 'all')

Unnamed: 0,tweet_text,label
count,1955,1955.0
unique,1955,
top,وش التخلف هذا,
freq,1,
mean,,0.009719
std,,1.000209
min,,-1.0
25%,,-1.0
50%,,1.0
75%,,1.0


In [19]:
# view labels distribution
print("\n '\033[4m' updated class distribution '\033[0m' \n\n", df_text_label.groupby('label')['tweet_text'].nunique())


 '[4m' updated class distribution '[0m' 

 label
-1.0    968
 1.0    987
Name: tweet_text, dtype: int64


from class distribution, dataset is semi un-biased



## Tweets Processing

## Count word association

In [20]:
# tweet processing function

def tweet_preprocess(tweet):
    
    # lower letter
    tweet = tweet.lower()
    # remove old style retweet text "RT"
    tweet = " ".join(re.sub(r"^RT[\s]+", "", tweet).split())
    # remove urls
    tweet = " ".join(re.sub("(http\S*)|(www\S*)", "", tweet).split())
    # remove only hash # sign from the word
    tweet = " ".join(re.sub(r"#", "", tweet).split())
    # remove mention
    tweet = " ".join(re.sub("(@[A-Za-z0-9]+)|(منشن\S*)", "", tweet).split())
    # remove all digit form from word
    tweet = str("".join(word for word in tweet for char in word if char not in string.digits).split())
    # tokenize tweet
    tweet_token = tokenizer.tokenize(tweet)
    # remove duplicated letters
    tweet_dedup = []
    for word in tweet_token:
        word_dedup = ""
        word_str = str(word)
        word_dedup = "".join(word_str[0])
        for i in range (len(word_str)):
            if i < (len(word_str) - 1):
                init_char = word_str[i]
                next_char = word_str[(i+1)]
                if next_char != init_char:
                    word_dedup += ''.join(next_char)
        tweet_dedup.append(word_dedup)
    # remove "tashkeel", "tatweel", and normal "hamza"
    word_strip = []
    for word in tweet_dedup:
      word = araby.strip_tashkeel(word)
      word = araby.strip_tatweel(word)
      word = araby.normalize_hamza(word, method="tasheel")
      word_strip.append(word)
    # remove punctuation, stopword, and word stemming
    tweet_processed = []
    for word in word_strip:
      # remove punctuation, stopword
      if word not in stp_wd_lis and word not in string.punctuation:
          # word stemming
          tweet_processed.append(stemmer_arb.stemWord(word))

    return tweet_processed

In [21]:
# word association function

def word_association(tweets, labels):
    # naming the dictionary to hold association values (i.e. frequencies)
    association_dictionary = {}
    # defining a set to hold all unique words
    uniq_wd_set= set()
    for tweet, label in zip(tweets, labels):
        for word in tweet_preprocess(tweet):
          # adding only words with two or more letters, and less than seven letters
          if len(word) <= 7 and len(word) > 2:
            # adding word to set (if unique, sets do not accept duplicated values)
            uniq_wd_set.add(word)
            # defining pair (e.g. ("جيد": 1))
            pair = (word, label)
            if pair in association_dictionary:
                # increase count by 1
                association_dictionary[pair] += 1
            else:
                # assign 1 as the association count 
                association_dictionary[pair] = 1

    return association_dictionary  #, #uniq_wd_set

## (train / test) split

In [62]:
# spliting dataset as 60% for tarin 40% for testing
X_train, X_test, y_train, y_test = train_test_split(
    df_text_label.tweet_text, df_text_label.label, test_size=0.4, shuffle= False)

In [63]:
# building the word association dictionary for train
wd_assoc_dict_train = word_association(X_train, y_train)

In [64]:
# view sample of the dictionary keys and values
print("train \n", list(wd_assoc_dict_train.items())[:5])

train 
 [(('قرف', -1.0), 13), (('انتحر', -1.0), 2), (('درج', -1.0), 2), (('صدق', 1.0), 8), (('اجعل', 1.0), 17)]


## Perform count refinement (ratio + log)


In [65]:
# count refinement function

def count_refinement(tweets, wd_assoc_dict):
  # define the new dectionary with refined values
  refin_dict = {}
  # get word count
  for tweet in tweets:
    for word in tweet_preprocess(tweet):
      # positive count
      PN = wd_assoc_dict.get((word, 1.0), 1)
      # negative count
      NN = wd_assoc_dict.get((word, -1.0), 1)
      # total count
      TN = PN + NN
      # Bayes Probability of word (PW)
      PW_p = PN / TN
      PW_n = 1 - PW_p     # Probability must add to 1
      # Word Ratio (WR), first refinement
      WR = PW_p / PW_n
      # Log Word Ratio (LWR), second refinement
      LWR = np.log(WR)
      # update pair value with refined value
      refin_dict[word] = LWR

  return refin_dict

In [66]:
# building the refined dictionary
refin_dict_train = count_refinement(X_train, wd_assoc_dict_train)

In [67]:
# view sample of the refined dictionary
print("train \n", list(refin_dict_train.items())[:5])

train 
 [('قرف', -2.5649493574615367), ('انتحر', -0.6931471805599454), ('درج', -0.6931471805599454), ('صدق', 0.9808292530117263), ('يا', 0.0)]


## Make predictions

In [68]:
# making predictions function

def make_prediction(tweets, refin_dict):

  # define the prediction list
  pred_lis = []
  # define the LWR sum value holder list
  LWR_sum_lis = []
  for tweet in tweets:
    #define the LWR sum value holder
    LWR_sum = 0
    for word in tweet_preprocess(tweet):
      # get word LWR
      wd_LWR = refin_dict.get(word, 0)
      # add word LWR to LWR_sum (for tweet LWR)
      LWR_sum += wd_LWR
    # add LWR_sum to LWR_sum_lis
    LWR_sum_lis.append(LWR_sum)
  
  # make prediction
  for LWR_sum in LWR_sum_lis:
    if LWR_sum > 0.0:
      pred_lis.append(1.0)
    else:
      pred_lis.append(-1.0)

  return pred_lis



In [69]:
pred_lis = make_prediction(X_test, refin_dict_train)

In [70]:
# view sample of the prediction list
print(pred_lis[:5])
# compare label list length prediction list lenght
print(len(y_test))
print(len(pred_lis))

[-1.0, 1.0, -1.0, -1.0, -1.0]
782
782


## Calculate accuracy

In [71]:
# calculate accuracy function

def calc_acc(labels, pred_lis):

  # define error counter
  error = 0
  for label, pred_label in zip(labels, pred_lis):
    # check for un-correct prediction
    if label != pred_label:
      error += 1
  
  # calculate accuracy
  acc = 1 - (error / (len(labels)))

  return acc

In [72]:
acc = calc_acc(y_test, pred_lis)

## Display results

In [73]:
# predictions accuracy result desplay
print("\n '\033[4m' Naive Bayes accuracy '\033[0m' \n\n", np.round((acc*100),2), "%")


 '[4m' Naive Bayes accuracy '[0m' 

 83.25 %


In [74]:
# classification classes distribution
from collections import Counter

label_pred = Counter(pred_lis).items()
print("\n '\033[4m' classification classes distribution '\033[0m' \n\n", label_pred)
# compared to original classes
label_orig = Counter(y_test).items()
print("\n '\033[4m' original classes '\033[0m' \n\n", label_orig)


 '[4m' classification classes distribution '[0m' 

 dict_items([(-1.0, 416), (1.0, 366)])

 '[4m' original classes '[0m' 

 dict_items([(1.0, 401), (-1.0, 381)])
