In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset and model directories
# configured in the next cell are reachable; force_remount=True remounts even
# when a mount already exists.
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [2]:
# Directory on Drive used as the pip cache, so packages are not downloaded
# again on every new runtime.
# NOTE(review): this path is spelled 'My Drive' (with a space) while every other
# path in this cell uses 'MyDrive' — confirm both resolve on the target runtime.
PACKAGES_DIR = '/content/drive/My Drive/Bachelor/pip_cache'

# Directory holding the merged, preprocessed and cleaned dataset CSV files.
PREPROCESSED_MERGED_DATASET_DIR = '/content/drive/MyDrive/Bachelor/Sentiment/PREPROCESSED_DATASET/'

# Topic-modeling model directory: ARA2VEC
ARA2VEC_MODEL_DIR = '/content/drive/MyDrive/Bachelor/Topic_Modeling/ARA2VEC/'

# Topic-modeling model directory: DL_MODELS
DL_MODEL_DIR = '/content/drive/MyDrive/Bachelor/Topic_Modeling/DL_MODELS/'


# Topic-modeling model directory: BERT
BERT_MODEL_DIR = '/content/drive/MyDrive/Bachelor/Topic_Modeling/BERT/'

# Topic-modeling model directory: TOP2VEC
TOP2VEC_MODEL_DIR = '/content/drive/MyDrive/Bachelor/Topic_Modeling/TOP2VEC/'

# Topic-modeling model directory: LDA
LDA_MODEL_DIR = '/content/drive/MyDrive/Bachelor/Topic_Modeling/LDA/'

# Topic-modeling model directory: NMF
NMF_MODEL_DIR = '/content/drive/MyDrive/Bachelor/Topic_Modeling/NMF/'


# Directory where figures are saved.
FIGURES_DIR = '/content/drive/MyDrive/Bachelor/Sentiment/FIGURES/'


# ------------------------------ TensorFlow ------------------------------

# Directory where the TensorFlow recurrent topic-modeling models are saved
# (a sub-directory of DL_MODEL_DIR).
TF_RECURRENT_MODEL_DIR = '/content/drive/MyDrive/Bachelor/Topic_Modeling/DL_MODELS/recurrent/tf/model/'



# Figure counter, initialised to 1.
# NOTE(review): not referenced by any visible cell — presumably used to number saved figures; confirm.
FIGURE_COUNTS = 1



## Packages

In [3]:
# Force a UTF-8 locale for this runtime.
# The environment variables are set first, then the locale of the running
# interpreter is switched explicitly.
import os
os.environ['LC_ALL'] = 'en_US.UTF-8'
os.environ['LANG'] = 'en_US.UTF-8'
import locale
# As the last expression of the cell, the name of the locale that was set is
# displayed as the cell output.
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

'en_US.UTF-8'

In [4]:
# Install wordcloud-fa and transformers, pointing pip's cache at a Drive folder
# so downloads can be reused across runtimes.
# NOTE(review): the cache path is hard-coded on both lines and duplicates the
# PACKAGES_DIR constant; the packages are also installed without version pins.

!sudo pip install --cache-dir '/content/drive/My Drive/Bachelor/pip_cache' wordcloud-fa
!sudo pip install --cache-dir '/content/drive/My Drive/Bachelor/pip_cache' transformers



from IPython.display import clear_output
clear_output() # discard the installation log from the cell output

In [5]:
%tensorflow_version 2.x
# Packages 
# types in python
from typing import List, Tuple ,Dict ,Any , Union ,Optional
import time
import string
import os
import random

# Data Manipulating  & Preprocessing packages
import numpy as np
from sklearn.preprocessing import LabelEncoder 
import unicodedata # normlization of arabic letters encoding to be unicoded 
import pandas as pd
pd.set_option('display.max_colwidth', None) # Setting the display option to show the full width of columns in pandas dataframe.




# Model
import torch
import numpy as np
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from tensorflow import keras

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

from transformers import pipeline
# Metrics
from sklearn.metrics import roc_auc_score,f1_score,confusion_matrix,roc_curve

 
import warnings
warnings.simplefilter(action='ignore')
warnings.filterwarnings("ignore")

# visualization library
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud
from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

# Random Seed
# Random seed shared by the notebook; kept in sync by set_seed().
RANDOM_SEED=42
RANDOM_STATE=42



def set_seed(seed=42):
  """
  Seed every random number generator the notebook relies on.

  Updates the module-level RANDOM_SEED / RANDOM_STATE constants and seeds
  Python's `random`, NumPy, PyTorch (CPU and all CUDA devices) and, when it
  has already been imported, TensorFlow. cuDNN is switched to deterministic
  mode with benchmarking disabled.

  seed = integer seed applied to all generators (default 42)
  """
  # Without this declaration the two assignments below would only create
  # function locals and the module-level constants would never change.
  global RANDOM_SEED, RANDOM_STATE
  import sys

  RANDOM_SEED = seed
  RANDOM_STATE = seed

  random.seed(seed)
  np.random.seed(seed)

  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

  # Seed TensorFlow as well, but only if it is already loaded, so that this
  # function never triggers (or requires) a TensorFlow import by itself.
  tf_module = sys.modules.get("tensorflow")
  if tf_module is not None:
    tf_module.random.set_seed(seed)



# Set Matplotlib defaults
plt.style.use('ggplot')
#plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)



Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [6]:
def wordcloud_visualize(tweet_series:pd.Series,save_dir=None,title=None)->None:
  """
  Draw a word cloud built from the word frequencies of a collection of tweets.

  tweet_series = Series (or any sized iterable) of tweets; each element is
                 converted with str() and all of them are joined with spaces
  save_dir     = optional prefix for the output image; when given, the cloud
                 is written to f"{save_dir}{title}.png"
  title        = optional title drawn above the plot (also used in the file name)

  Returns None. Nothing is drawn when tweet_series is None or empty.
  """
  # `not tweet_series` raises ValueError for a pandas Series (its truth value
  # is ambiguous), so emptiness is checked through None / len() only.
  if tweet_series is None or len(tweet_series) == 0:
    return

  # Imported lazily so the empty-input path does not require the package.
  from wordcloud_fa import WordCloudFa
  text = ' '.join(str(elem) for elem in tweet_series)

  wordcloud = WordCloudFa(persian_normalize = True,no_reshape=True,collocations=False,include_numbers=False)
  frequencies = wordcloud.process_text(text)
  wc = wordcloud.generate_from_frequencies(frequencies)

  # Plot the word cloud without axes.
  plt.figure(figsize=(10, 6))
  plt.imshow(wc, interpolation='bilinear')
  plt.axis('off')
  if title is not None:
    # Only draw a title when one was supplied; otherwise the literal text
    # "None" would be rendered above the plot.
    plt.title(f"{title}", fontsize=16)

  if save_dir is not None:
    wordcloud.to_file(f"{save_dir}{title}.png")

  plt.show()


In [7]:
def scale_list_to_sum_to_100(lst):
    """
    Rescale a list of numbers to integers that sum to exactly 100.

    Each value is multiplied by 100 / sum(lst) and rounded to the nearest
    integer; whatever rounding error remains (positive or negative) is then
    folded into the first element so that the total is exactly 100.

    lst = list of numbers to rescale

    Returns a new list of ints with the same length as lst; an empty input
    yields an empty list.
    Raises ZeroDivisionError when the values of a non-empty list sum to zero.
    """
    if len(lst) == 0:
        return []

    total = sum(lst)
    if total == 0:
        raise ZeroDivisionError("cannot scale a list whose values sum to zero")

    scaling_factor = 100 / total
    scaled_lst = [int(round(x * scaling_factor)) for x in lst]

    # diff is positive when the rounded values fall short of 100 and negative
    # when they overshoot; adding it in both cases corrects the total.
    # (Subtracting a negative diff would move the sum further away from 100.)
    diff = 100 - sum(scaled_lst)
    scaled_lst[0] += diff

    return scaled_lst

def wordcloud_visualize_proba(tweet_series:List[str],proba:Optional[List[float]]=None,save_dir:Optional[str]=None,title:Optional[str]=None)->None:
  """
  Draw a word cloud from a collection of texts, optionally weighted.

  tweet_series = sized iterable of texts; each element is converted with str()
  proba        = optional weights, indexed by position alongside tweet_series;
                 element i is repeated int(proba[i] * 10) times in the text
                 handed to the word cloud, so larger weights give larger
                 frequencies (a weight below 0.1 drops the element)
  save_dir     = optional prefix for the output image; when given, the cloud
                 is written to f"{save_dir}{title}.png"
  title        = optional title drawn above the plot (also used in the file name)

  Returns None. Nothing is drawn when tweet_series is None or empty.
  """
  # `not tweet_series` raises ValueError for a pandas Series (its truth value
  # is ambiguous), so emptiness is checked through None / len() only.
  if tweet_series is None or len(tweet_series) == 0:
    return

  # Imported lazily so the empty-input path does not require the package.
  from wordcloud_fa import WordCloudFa

  if proba is None:
    text = ' '.join(str(elem) for elem in tweet_series)
  else:
    repeated = []
    for i, elem in enumerate(tweet_series):
      # Repetitions are kept as separate list items so they end up separated
      # by spaces; `str(elem) * k` would glue them into one bogus token
      # ("wordwordword") that is counted only once.
      repeated.extend([str(elem)] * int(proba[i] * 10))
    text = ' '.join(repeated)

  wordcloud = WordCloudFa(persian_normalize = True,no_reshape=True,collocations=False,include_numbers=False)
  frequencies = wordcloud.process_text(text)
  wc = wordcloud.generate_from_frequencies(frequencies)

  # Plot the word cloud without axes.
  plt.figure(figsize=(10, 6))
  plt.imshow(wc, interpolation='bilinear')
  plt.axis('off')
  if title is not None:
    # Only draw a title when one was supplied; otherwise the literal text
    # "None" would be rendered above the plot.
    plt.title(f"{title}", fontsize=16)

  if save_dir is not None:
    wordcloud.to_file(f"{save_dir}{title}.png")

  plt.show()


In [8]:
import logging

# Configure the root logger at DEBUG level, the most verbose standard level.
# NOTE(review): basicConfig does nothing if the root logger already has
# handlers — confirm it takes effect in this runtime.
logging.basicConfig(level=logging.DEBUG)

### My Preprocessed Labeled Dataset
> Consists of tweets in Egyptian Arabic and Modern Standard Arabic (MSA).

> 
```
{'LABEL': Value(dtype='int64', id=None),
 'TWEET': Value(dtype='string', id=None)}
```
> Label explanation:

```
SENTIMENT_TO_ID = {
   "positive": 0 ,
   "negative": 1,
   "neutral": 2,
}

ID_TO_SENTIMENT = {
   0:"positive" ,
   1:"negative" ,
   2:"neutral",
   } 
```

In [9]:
# Sentiment name -> integer label id used in the LABEL column.
SENTIMENT_TO_ID = dict(positive=0, negative=1, neutral=2)

# Integer label id -> sentiment name (the inverse of SENTIMENT_TO_ID).
ID_TO_SENTIMENT = {label_id: sentiment for sentiment, label_id in SENTIMENT_TO_ID.items()}

In [10]:
# Base names (without extension) of the dataset CSV files.
TRAIN_DATASET_NAME = 'TRAIN_DATASET'
VALIDDATION_DATASET_NAME = 'VALIDATION_DATASET'
TEST_DATASET_NAME = 'TEST_DATASET'
DATASET_NAME = 'DATASET'

# Split name -> CSV path of the stemmed data, located directly inside
# PREPROCESSED_MERGED_DATASET_DIR.
stemmed_data_files = {
    split: os.path.join(PREPROCESSED_MERGED_DATASET_DIR, base_name + '.csv')
    for split, base_name in (
        ("train", TRAIN_DATASET_NAME),
        ("validation", VALIDDATION_DATASET_NAME),
        ("test", TEST_DATASET_NAME),
    )
}

# Split name -> CSV path of the unstemmed data, located in the 'unstemmed/'
# sub-directory of PREPROCESSED_MERGED_DATASET_DIR.
unstemmed_data_files = {
    split: os.path.join(PREPROCESSED_MERGED_DATASET_DIR, 'unstemmed/' + base_name + '.csv')
    for split, base_name in (
        ("train", TRAIN_DATASET_NAME),
        ('validation', VALIDDATION_DATASET_NAME),
    )
}
# NOTE(review): unlike the two entries above, "dataset" resolves to the root of
# PREPROCESSED_MERGED_DATASET_DIR rather than to 'unstemmed/' — confirm intended.
unstemmed_data_files["dataset"] = PREPROCESSED_MERGED_DATASET_DIR + DATASET_NAME + '.csv'

In [11]:
# Load the full dataset CSV into a DataFrame and preview its first rows.
# NOTE(review): the preview below contains an 'Unnamed: 0' column, which suggests
# the CSV was written together with its index; consider index_col=0 — confirm.
data=  pd.read_csv(unstemmed_data_files['dataset'])
data.head()

Unnamed: 0,LABEL,TWEET
0,2,اوليمبياد جاي هكون لسه كلي
1,1,عجز مواز وصل ناتج محل عن لسه رقم وحش لسه تابع اوليمبياد
2,1,تنا وحش حظ هباب
3,0,جميع نريد تحقيق اهداف تونس حلو وحش مرم
4,2,اوليمبياد نظام حلو مواعيد مونديال مكانتش وحش مش حاج معقول


In [12]:
# Values of the TWEET column; these are the texts fed to the zero-shot classifier.
documents = data['TWEET'].values

## Zero-shot Classification

In [16]:
# Hugging Face model id of an XNLI checkpoint.
# NOTE(review): this variable is not referenced by any visible cell — the
# classifier is created from a different, hard-coded model id.
checkpoint = 'joeddav/xlm-roberta-large-xnli'

In [22]:
from transformers import pipeline

# Zero-shot classification pipeline backed by a multilingual mDeBERTa-v3 NLI
# model; the weights are downloaded on first use.
classifier = pipeline(
    task="zero-shot-classification",
    model="MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7",
)


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [29]:
# Classify the first tweet of the dataset.
sequence_to_classify = documents[0]
# Candidate topic labels, written in Arabic like the tweets themselves.
# NOTE(review): the last label ends with a trailing space, which is carried
# through to the 'labels' list of the result — confirm this is intended.
candidate_labels = ["اوليمبياد", "وزارة الصحة", "وزارة"  	,'الدفاع','الدولة الإنتاج الحربي','الداخلية','الخارجية','التعاون الدولي','البترول والثروة المعدنية','الكهرباء والطاقة','الأوقاف','النقل','الطيران المدني','التنمية المحلية','التخطيط والتنمية الاقتصادية','الزراعة واستصلاح الأراضي','الموارد المائية والري','التضامن الاجتماعي','التموين والتجارة الداخلية','الثقافة','الاتصالات','المالية','قطاع الأعمال','التجارة والصناعة','السياحة والآثار','التعليم العالي والبحث العلمي','التربية والتعليم','القوى العاملة','الدولة للهجرة وشئون المصريين بالخارج','العدل','المجالس النيابية','الصحة والسكان','الإسكان والمرافق والمجتمعات العمرانية','البيئة','الشباب والرياضة','الدولة للإعلام ']
# The cell output is a dict with the classified 'sequence', the candidate
# 'labels', and their 'scores'.
classifier(sequence_to_classify, candidate_labels)

{'sequence': 'اوليمبياد جاي هكون لسه كلي',
 'labels': ['اوليمبياد',
  'الشباب والرياضة',
  'النقل',
  'الدفاع',
  'الأوقاف',
  'الخارجية',
  'الاتصالات',
  'الثقافة',
  'الداخلية',
  'التربية والتعليم',
  'القوى العاملة',
  'التعليم العالي والبحث العلمي',
  'المالية',
  'وزارة',
  'الصحة والسكان',
  'التعاون الدولي',
  'العدل',
  'الدولة للهجرة وشئون المصريين بالخارج',
  'الدولة للإعلام ',
  'الإسكان والمرافق والمجتمعات العمرانية',
  'التنمية المحلية',
  'الطيران المدني',
  'الموارد المائية والري',
  'الكهرباء والطاقة',
  'الدولة الإنتاج الحربي',
  'السياحة والآثار',
  'المجالس النيابية',
  'البترول والثروة المعدنية',
  'البيئة',
  'التضامن الاجتماعي',
  'قطاع الأعمال',
  'التموين والتجارة الداخلية',
  'التجارة والصناعة',
  'الزراعة واستصلاح الأراضي',
  'وزارة الصحة',
  'التخطيط والتنمية الاقتصادية'],
 'scores': [0.18275608122348785,
  0.09384148567914963,
  0.09322676807641983,
  0.054320354014635086,
  0.0460316427052021,
  0.04583355039358139,
  0.034184779971838,
  0.02597711235284

In [25]:
sequence_to_classify

'اوليمبياد جاي هكون لسه كلي'