In [5]:
# install necessary libraries

!pip install transformers 
!pip install torch==6.0
!pip install datasets
!pip install Arabic-Stopwords

from IPython.display import clear_output
clear_output()

In [6]:
# https://github.com/motazsaad/process-arabic-text/blob/master/clean_arabic_text.py

import re
import string
import sys
import argparse
import arabicstopwords.arabicstopwords as stp

import nltk
from nltk.corpus import stopwords

# Make sure NLTK's stop-word corpus is available locally, then read its Arabic list.
nltk.download('stopwords')
stopwords_list = stopwords.words('arabic')

# Hand-picked additions on top of the two library lists.
extra_stopwords = ['ال']

# Combined list, in order: Arabic-Stopwords entries, NLTK entries, manual extras.
# Duplicates between the sources are kept.
all_arabic_stopwords = [*stp.stopwords_list(), *stopwords_list, *extra_stopwords]


# Arabic punctuation marks plus assorted typographic symbols (curly quotes,
# ellipsis, dash, tatweel, ...). Keep this literal verbatim: it mixes
# right-to-left and left-to-right characters.
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
# ASCII punctuation from the standard library.
english_punctuations = string.punctuation
# Every character of this combined string is deleted by remove_punctuations().
punctuations_list = arabic_punctuations + english_punctuations

# Pattern matching one Arabic diacritic or the tatweel (kashida) character.
# The code points are written as escapes so the pattern does not depend on
# invisible combining marks surviving an editor, a diff tool or a copy/paste.
# It matches exactly the nine characters the previous VERBOSE literal listed.
arabic_diacritics = re.compile(
    '['
    '\u0651'  # Tashdid (shadda)
    '\u064E'  # Fatha
    '\u064B'  # Tanwin Fath
    '\u064F'  # Damma
    '\u064C'  # Tanwin Damm
    '\u0650'  # Kasra
    '\u064D'  # Tanwin Kasr
    '\u0652'  # Sukun
    '\u0640'  # Tatwil/Kashida
    ']'
)

def remove_diacritics(text):
  """Return `text` with every match of the module-level `arabic_diacritics` pattern deleted."""
  return arabic_diacritics.sub('', text)

def remove_repeating_char(text):
  """Collapse each run of one repeated character in `text` to a single occurrence.

  Applies to any character that `.` matches (so not to newlines), including
  digits and spaces: '2222' becomes '2'.
  """
  run_of_same_char = re.compile(r'(.)\1+')
  return run_of_same_char.sub(r'\1', text)

def remove_newlines(text):
  """Replace each newline in `text` with " . " (space, period, space).

  Despite the name, the newline is substituted rather than simply dropped.
  """
  newline = re.compile('\n')
  return newline.sub(" . ", text)

def remove_stopwords(text):
  """Delete Arabic stop words from `text` and return the result.

  `text` is split on single spaces. Every token that appears in the
  module-level `all_arabic_stopwords` list is removed from the string by three
  substitutions: as a whole word anywhere, at the very start followed by
  whitespace, and at the very end preceded by whitespace. The whole-word
  substitution removes only the word itself, so the result may contain runs of
  spaces.
  """
  for word in text.split(' '):
    # Substring test against the ASCII punctuation string: this skips single
    # punctuation tokens and the empty tokens produced by consecutive spaces.
    if word in string.punctuation:
      continue
    if word in all_arabic_stopwords:
      # Escape the token before embedding it in a pattern so that a stop word
      # containing a regex metacharacter is matched literally instead of
      # raising re.error or matching unrelated text.
      literal = re.escape(word)
      text = re.sub(r'\b' + literal + r'\b', "", text)
      text = re.sub(r'^' + literal + r'\s', "", text)
      text = re.sub(r'\s' + literal + r'$', "", text)

  return text

def remove_punctuations(text):
    """Return `text` with every character listed in `punctuations_list` deleted."""
    # Translation table mapping each listed code point to None, i.e. "delete".
    deletion_table = dict.fromkeys(map(ord, punctuations_list))
    return text.translate(deletion_table)

def replace_emoji(text): 
    """Replace emoji and ASCII emoticons in `text` with Arabic emotion words.

    Recognised symbols become a space-padded Arabic word: heart, laughter,
    happiness, sadness, surprise, embarrassment, anger or boredom. Every other
    character in the range U+1F300-U+1FFFF is replaced by a single space, and
    leftover emoji variation selectors (U+FE0F) are deleted.

    Returns the rewritten string.
    """
    # An emoji can be followed by the invisible variation selector U+FE0F.
    # It is matched as an optional suffix of the symbol. Previously U+FE0F sat
    # inside the happy/sad character classes, so every selector in the text
    # was itself turned into a "happiness" word (a frowning face + selector
    # produced sadness AND happiness).
    vs16 = r'\uFE0F?'

    # (pattern, replacement) pairs, applied in this order.
    emotion_rules = (
        # heart
        (r'<3|< 3|[\u2764💖😍💕😘🥰💝💗💜💙🖤💚💛🤍]' + vs16, ' قلب '),
        # laughter
        (r':P|:-P|😂|🤣', ' ضحك '),
        # happiness (\u263A is the smiling face)
        (r'[\u263A😌😁😃😄😆😊😸😺😀😋🙂💃]' + vs16, ' سعادة '),
        # sadness (\u2639 is the frowning face)
        (r'[😥😣😓😔😕\u2639🙁😖😞😟😢😭😩😿😫💔]' + vs16, ' حزن  '),
        # sad emoticons :( :-( )-:  -- the earlier pattern matched a bare '::'
        # here, which is not an emoticon, and missed ':(' and ':-(' entirely.
        (r':\(|:-\(|\)-:', '  حزن  '),
        # crying emoticons :,( :'( :"(
        (r'(:,\(|:\'\(|:"\()', ' حزن '),
        # surprise
        (r'[😨😱😵]', ' مفاجأة '),
        # embarrassment
        (r'[😳😅🙈]', ' محرج '),
        # anger
        (r'[😤😠😡🤬👿]', ' غضب '),
        # boredom
        (r'[😑😒🙄😐😶]', ' ملل '),
    )
    for pattern, emotion_word in emotion_rules:
        text = re.sub(pattern, emotion_word, text)

    # Blank out every remaining symbol in U+1F300-U+1FFFF. This single range is
    # the union of the three overlapping ranges that were removed separately.
    text = re.sub('[\U0001F300-\U0001FFFF]', " ", text)

    # Drop variation selectors left behind by symbols that were not replaced.
    text = text.replace('\uFE0F', '')

    return text




[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


| Label | Emotion | Count |
|---|---|---|
| 0 | none | 1550 |
| 1 | anger | 1444 |
| 2 | joy | 1281 |
| 3 | sadness | 1256 |
| 4 | love | 1220 |
| 5 | sympathy | 1062 |
| 6 | surprise | 1045 |
| 7 | fear | 1207 |

In [34]:
import datasets
from datasets import load_dataset
import pandas as pd

# Load the 'emotone_ar' dataset by name through the `datasets` library.
dataset = load_dataset('emotone_ar')  
# Convert its 'train' split to a pandas DataFrame; the later cells work on `df`.
df = dataset['train'].to_pandas()
# Bare last expression: the notebook renders the frame as the cell output.
df

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,tweet,label
0,الاوليمبياد الجايه هكون لسه ف الكليه ..,0
1,عجز الموازنه وصل ل93.7 % من الناتج المحلي يعني...,1
2,كتنا نيله ف حظنا الهباب xD,3
3,جميعنا نريد تحقيق اهدافنا لكن تونس تالقت في حر...,2
4,الاوليمبياد نظامها مختلف .. ومواعيد المونديال ...,0
...,...,...
10060,2222: يلا يا جماعه حفله عمرو دياب خلصت نريح شو...,3
10061,Mohamed5: اييييه دااا 😲😲 اوزيييل❤,6
10062,عملتلها ريتويت بمناسبه ساره بتاعه الاوليمبياد 😃,0
10063,وعليك قبلنا يانجم النجوم ياعندليب الحب والاحساس,2


In [8]:
import emoji
def extract_emojis(s):
  """Return, in order of appearance, the characters of `s` that are keys of `emoji.EMOJI_DATA`.

  `s` is scanned one character at a time.
  """
  found = []
  for ch in s:
    if ch in emoji.EMOJI_DATA:
      found.append(ch)
  return found

def split_count(text):
    """Return a list of the characters of `text` that are keys of `emoji.EMOJI_DATA`.

    Characters are checked one at a time and kept in their original order,
    duplicates included.
    """
    return [symbol for symbol in text if symbol in emoji.EMOJI_DATA]


# Gather every emoji character found in the tweets and print how often each occurs.
text = df['tweet']
emoji_list = []
for t in text:
  # extend() appends in place. The previous `emoji_list = emoji_list + ...`
  # copied the whole accumulated list for every tweet (quadratic time).
  emoji_list.extend(split_count(t))

from collections import Counter

print(Counter(emoji_list))

Counter({'😂': 730, '❤': 311, '💔': 282, '😍': 132, '😭': 122, '✋': 95, '♥': 87, '😢': 86, '👏': 79, '😔': 74, '😅': 56, '💜': 49, '🏻': 45, '💙': 39, '😞': 38, '💕': 38, '✌': 38, '🏼': 35, '👌': 33, '😄': 30, '😊': 30, '😌': 29, '😀': 25, '😒': 25, '💪': 25, '🌹': 24, '🙈': 24, '🙏': 23, '🤔': 23, '✨': 23, '😏': 22, '😴': 22, '😐': 22, '😓': 21, '😕': 20, '🙂': 20, '☺': 19, '😑': 18, '😳': 18, '👍': 18, '🙄': 17, '💗': 17, '🌸': 16, '😎': 16, '😃': 16, '😩': 15, '💭': 15, '💞': 15, '💚': 14, '💛': 14, '👊': 13, '🚶': 13, '🌚': 11, '💃': 11, '🎶': 11, '🏽': 10, '💘': 10, '🏃': 10, '😶': 10, '😨': 9, '❣': 9, '😱': 9, '😣': 9, '😫': 9, '🍃': 8, '😜': 8, '🎵': 8, '🙃': 8, '💟': 8, '🐸': 8, '🔸': 8, '😉': 7, '🎼': 7, '🔥': 7, '🌷': 7, '🍁': 7, '😁': 6, '‼': 6, '😪': 6, '😷': 6, '😖': 6, '🍂': 6, '😡': 6, '🙊': 5, '👐': 5, '👇': 5, '🎻': 5, '🤕': 5, '☝': 5, '😤': 5, '✊': 5, '💖': 5, '😥': 5, '🕊': 5, '🎉': 5, '✒': 5, '😟': 5, '👎': 5, '😬': 5, '🚬': 5, '😆': 5, '🏿': 5, '❗': 5, '🎬': 5, '💋': 4, '💵': 4, '💎': 4, '😻': 4, '☹': 4, '☕': 4, '💐': 4, '🔕': 4, '😰': 4, '🌿': 4, '🌼': 4, '😿': 4,

# Analysis

Initial pre-processing before analysis

In [9]:
# Column of raw tweet text.
tweets = df['tweet']
# Clean every tweet. The calls are nested, so they run innermost first:
#   remove_diacritics -> remove_newlines -> remove_repeating_char
#   -> replace_emoji -> remove_punctuations -> remove_stopwords
# replace_emoji runs before remove_punctuations so ASCII emoticons such as
# ':P' are still intact when they are looked for.
tweets_processed = [remove_stopwords(remove_punctuations(replace_emoji(remove_repeating_char(remove_newlines(remove_diacritics(item)))))) for item in tweets]

In [12]:
# Store the cleaned text as a new column next to the raw tweets (mutates `df` in place).
df['tweets_processed']=tweets_processed

In [13]:
# Render the frame as the cell output to compare raw and cleaned tweets.
df

Unnamed: 0,tweet,label,tweets_processed
0,الاوليمبياد الجايه هكون لسه ف الكليه ..,0,الاوليمبياد الجايه هكون لسه الكليه
1,عجز الموازنه وصل ل93.7 % من الناتج المحلي يعني...,1,عجز الموازنه وصل ل937 الناتج المحلي يعني لسه...
2,كتنا نيله ف حظنا الهباب xD,3,كتنا نيله حظنا الهباب xD
3,جميعنا نريد تحقيق اهدافنا لكن تونس تالقت في حر...,2,نريد تحقيق اهدافنا تونس تالقت حراسه المرمي
4,الاوليمبياد نظامها مختلف .. ومواعيد المونديال ...,0,الاوليمبياد نظامها مختلف ومواعيد المونديال مك...
...,...,...,...
10060,2222: يلا يا جماعه حفله عمرو دياب خلصت نريح شو...,3,2 يلا جماعه حفله عمرو دياب خلصت نريح شويه ونب...
10061,Mohamed5: اييييه دااا 😲😲 اوزيييل❤,6,Mohamed5 ايه دا اوزيل قلب
10062,عملتلها ريتويت بمناسبه ساره بتاعه الاوليمبياد 😃,0,عملتلها ريتويت بمناسبه ساره بتاعه الاوليمبياد ...
10063,وعليك قبلنا يانجم النجوم ياعندليب الحب والاحساس,2,قبلنا يانجم النجوم ياعندليب الحب والاحساس


In [None]:
# Render the first 40 rows as the cell output.
df.head(40)

In [None]:
# !pip install arabic_reshaper
# !pip install python-bidi
# !pip install emoji

In [None]:
# from wordcloud import WordCloud
# import arabic_reshaper
# from bidi.algorithm import get_display
# import pandas as pd
# import emoji
# import matplotlib.pyplot as plt
# from wordcloud import WordCloud

In [None]:
# grouped = df.groupby('label')
# for label, group in grouped:
#     text = ' '.join(group['tweets_processed'])
#     text = emoji.replace_emoji(text, replace='', version=-1)
#     reshaped_text = arabic_reshaper.reshape(text)
#     bidi_text = get_display(reshaped_text)
#     wordcloud = WordCloud(font_path='/kaggle/input/arabic-character-format/NotoNaskhArabic-Regular.ttf').generate(bidi_text)
#     wordcloud.to_file("worCloud.png")
#     plt.figure(figsize = (8, 8), facecolor = None)
#     plt.imshow(wordcloud)
#     plt.axis("off")
#     plt.tight_layout(pad = 0)
#     plt.title(label)
#     plt.show()

In [None]:
import pandas as pd
from collections import Counter

# For every emotion label, print the ten most frequent tokens of its cleaned tweets.
grouped = df.groupby('label')

for label, group in grouped:
    # Pool the label's cleaned tweets into one string and split it on whitespace.
    text = ' '.join(group['tweets_processed'])
    words = text.split()
    word_count = Counter(words)
    print(f'{label} words:')
    for word, count in word_count.most_common(10):
        print(word, count, sep=': ')
    print('...........................')

| Label | Emotion | Count |
|---|---|---|
| 0 | none | 1550 |
| 1 | anger | 1444 |
| 2 | joy | 1281 |
| 3 | sadness | 1256 |
| 4 | love | 1220 |
| 5 | sympathy | 1062 |
| 6 | surprise | 1045 |
| 7 | fear | 1207 |

## Data Preprocessing (2)

In [42]:
# def text_preprocessing(text):
#     text = replace_emoji(remove_repeating_char(remove_newlines(text)))
#     return text

## Model Finetuning 

In [3]:
# import os
# import re
# from tqdm import tqdm
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# import csv

# %matplotlib inline

In [4]:
# import torch

# if torch.cuda.is_available():       
#     device = torch.device("cuda")
#     print(f'There are {torch.cuda.device_count()} GPU(s) available.')
#     print('Device name:', torch.cuda.get_device_name(0))

# else:
#     print('No GPU available, using the CPU instead.')
#     device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


In [15]:
# # import a model from huggingface
# from transformers import AutoTokenizer, AutoModel
# from transformers import BertTokenizer

# tokenizer = AutoTokenizer.from_pretrained('UBC-NLP/MARBERT')
# # tokenizer = AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabertv2')

Downloading (…)okenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [44]:
# all_tweets = temp_df.tweet.values

# encoded_tweets = [tokenizer.encode(text_preprocessing(sent), add_special_tokens=True) for sent in all_tweets]

# # Find the maximum length
# max_len = max([len(sent) for sent in encoded_tweets])
# print('Max length: ', max_len)

Max length:  58


In [45]:

# # Specify max length to truncate/pad to
# MAX_LEN = 70

# def preprocessing_for_bert(data, text_preprocessing_fn = text_preprocessing ):
#     """Perform required preprocessing steps for pretrained BERT.
#     @param    data (np.array): Array of texts to be processed.
#     @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
#     @return   attention_masks (torch.Tensor): Tensor of indices specifying which
#                   tokens should be attended to by the model.
#     """
#     # Create empty lists to store outputs
#     input_ids = []
#     attention_masks = []

#     # For every sentence...
#     for i,sent in enumerate(data):
#         # `encode_plus` will:
#         #    (1) Tokenize the sentence
#         #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
#         #    (3) Truncate/Pad sentence to max length
#         #    (4) Map tokens to their IDs
#         #    (5) Create attention mask
#         #    (6) Return a dictionary of outputs
#         encoded_sent = tokenizer.encode_plus(
#             text=text_preprocessing_fn(r""+sent),  # Preprocess sentence
#             add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
#             max_length=MAX_LEN,                  # Max length to truncate/pad
#             padding='max_length',        # Pad sentence to max length
#             return_attention_mask=True,     # Return attention mask
#             truncation = True 
#             )
        
#         # Add the outputs to the lists
#         input_ids.append(encoded_sent.get('input_ids'))
#         attention_masks.append(encoded_sent.get('attention_mask'))
#     # Convert lists to tensors
#     input_ids = torch.tensor(input_ids)
#     attention_masks = torch.tensor(attention_masks)

#     return input_ids, attention_masks