# Installing the packages, loading the data and preprocessing it

In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [2]:
!pip install pyarabic
!pip install emoji
!pip install pystemmer
!pip install optuna==2.3.0
!pip install transformers -U 

In [3]:
!pip install gdown

In [4]:
import numpy as np
import pandas as pd
import pyarabic.araby as ar

import re , emoji, Stemmer, functools, operator, string
import torch , gc, random, os

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset


import logging

# Configure the root logger to emit WARNING and above, and create this
# notebook's own named logger.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

In [38]:
# Remove Arabic stop words.
# NOTE(review): this needs the NLTK "stopwords" corpus and the tokenizer data used
# by `word_tokenize`; no visible cell downloads them (`nltk.download`) - confirm
# they are present in the target environment.
from nltk.corpus import stopwords
from nltk import word_tokenize

# Kept as a list under its original name for backward compatibility. Note that
# this rebinding shadows the `stopwords` corpus reader imported just above.
stopwords = stopwords.words("arabic")
# Set copy of the same words so each membership test is O(1) instead of a list scan.
_ARABIC_STOPWORD_SET = frozenset(stopwords)

def remove_stopwords(text):
  """Return `text` with every Arabic stop word removed.

  The text is split with `word_tokenize`, tokens found in the NLTK Arabic
  stop-word list are dropped, and the remaining tokens are joined back with
  single spaces.
  """
  text_tokens = word_tokenize(text)

  tokens_without_sw = [word for word in text_tokens if word not in _ARABIC_STOPWORD_SET]

  return " ".join(tokens_without_sw)

def no_english(text):
	"""Return `text` with every run of ASCII letters (a-z, A-Z) deleted."""
	return re.sub("[a-zA-Z]+", "", text)

In [39]:
# Our preprocessing of the data
st =  Stemmer.Stemmer('arabic')

# Built once instead of on every call: deletes all ASCII punctuation characters.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Hamza-carrying letters folded onto their bare forms.
_LETTER_NORMALISATION = (
  ("آ", "ا"),
  ("إ", "ا"),
  ("أ", "ا"),
  ("ؤ", "و"),
  ("ئ", "ي"),
)

def data_cleaning (text):
  """Normalise one tweet and return the cleaned string.

  Steps, in order:
    1. remove URLs, collapse whitespace, remove numbers;
    2. strip tashkeel (diacritics) and tatweel, turn '#', '@' and '_' into
       spaces, delete ASCII punctuation;
    3. separate emojis from the surrounding text with spaces;
    4. collapse any run of a repeated character to a single character;
    5. append the stemmed version of the text to the text itself;
    6. fold hamza letter variants, drop Latin letters, drop stop words.

  Args:
    text: the raw tweet (a `str`).

  Returns:
    The cleaned text followed by its stemmed copy, space separated.
  """
  # Only the URL token itself is removed. The previous `^https?://.*` patterns
  # (with MULTILINE) deleted the whole rest of a line that started with a URL,
  # silently discarding the tweet text after it. `http\S+` also covers https.
  text = re.sub(r"http\S+", "", text)
  text = re.sub(r'\s+', ' ', text)

  # Numbers. Raw strings avoid the invalid-escape warnings of "\s" / "\d".
  text = re.sub(r"(\s\d+)", "", text)
  # The former leading alternative `$\d+\W+` could never match (digits cannot
  # follow the end-of-string anchor), so it was dropped without changing results.
  text = re.sub(r"\b\d+\b|\W+\d+$", "", text)
  text = re.sub(r"\d+", " ", text)

  text = ar.strip_tashkeel(text)
  text = ar.strip_tatweel(text)
  # These three become spaces (not deleted) so hashtag/mention words stay separated.
  text = text.replace("#", " ")
  text = text.replace("@", " ")
  text = text.replace("_", " ")
  text = text.translate(_PUNCTUATION_TABLE)

  # Split on emojis (the split keeps them as separate pieces), then split each
  # piece on whitespace and re-join everything with single spaces.
  # NOTE(review): `get_emoji_regexp()` only exists in emoji < 2.0 - confirm the
  # installed version.
  em_split_emoji = emoji.get_emoji_regexp().split(text)
  text = " ".join(token for piece in em_split_emoji for token in piece.split())

  # Collapse elongations: any character repeated in a row is kept once.
  text = re.sub(r'(.)\1+', r'\1', text)

  # The stemmed tokens are appended to (not substituted for) the surface forms.
  text_stem = " ".join(st.stemWord(word) for word in text.split())
  text = text +" "+ text_stem

  for variant, base in _LETTER_NORMALISATION:
    text = text.replace(variant, base)

  text = no_english(text) # removing remaining English words
  # NOTE(review): stop words are removed after the hamza folding above, so any
  # stop-word entry spelled with آ/إ/أ/ؤ/ئ can no longer match - confirm intent.
  text = remove_stopwords(text) # removing stop words

  return text

In [40]:
#Download the data set
!gdown --id 1ewF8bx1KBLZTTeKWg4v5hzpo-cfHdy1O -O ./CY_data.txt

In [34]:
# Load the downloaded comma-separated file. The first line of the file is read
# as the header row (pandas default); the two columns are then given fixed names.
Test_Data_File = "./CY_data.txt"
df_cy = pd.read_csv(Test_Data_File, sep=",")
df_cy.columns = ["tweet", "class"]

In [35]:
# Split the data into train (80%) and test (20%), keeping class proportions.
# - `random_state` is fixed so the partition (and every number reported below)
#   is reproducible across kernel restarts.
# - Each part is copied so the later in-place column assignments work on
#   independent frames and do not trigger pandas' SettingWithCopyWarning.
train, test = (
    part.copy()
    for part in train_test_split(
        df_cy, test_size=0.2, stratify=df_cy['class'], random_state=42
    )
)

In [41]:
# Clean every training tweet (overwrites the "tweet" column in place), then
# preview the first 50 cleaned texts.
train["tweet"] = train["tweet"].apply(data_cleaning)

train.columns = ['tweet','class']

train['tweet'].head(50)

In [43]:
# Count how many training rows fall into each class.
train['class'].value_counts()

In [47]:
# Maximum token length per example; used later when tokenising for BERT.
Max_Len = 512

# Hold out a small evaluation set: a low fraction keeps the training data as
# large as possible.
Test_Size = 0.05
Rand_Seed = 42

train_set, evaluation_set = train_test_split(
    train,
    test_size=Test_Size,
    random_state=Rand_Seed,
)

# Class distribution of both parts.
print("Train set: ", train_set["class"].value_counts(), sep="\n")
print("---------------------------")
print("Evaluation set: ", evaluation_set["class"].value_counts(), sep="\n")

In [49]:
# Name of the text column in the test frame.
Tweets_Text_Col_Test = "tweet"

# `test_data` is an alias of `test` (same object), so the cleaning below also
# modifies `test`.
test_data = test
test_data.columns = ['tweet','class']

# Apply the same cleaning as for the training tweets and preview the result.
test_data[Tweets_Text_Col_Test] = test_data[Tweets_Text_Col_Test].apply(data_cleaning)
test_data[Tweets_Text_Col_Test].head()

# Getting our MARBERT model ready

In [50]:
# Hugging Face checkpoint that gets fine-tuned, and a label for the task.
Model_Used = "UBC-NLP/MARBERT"
Task_Name = "classification"

# Our container class for the data.
# NOTE: this name shadows `Dataset` imported from `torch.utils.data` at the top
# of the notebook; from here on `Dataset` refers to this class.
class Dataset:
    """Plain record bundling a dataset name, its two splits and its labels."""

    def __init__(self, name, train, test, label_list):
        """Store the four values unchanged as attributes of the same names."""
        self.name, self.train = name, train
        self.test, self.label_list = test, label_list
        
# Class that gets the data ready for BERT.
# It derives explicitly from `torch.utils.data.Dataset`: the bare name `Dataset`
# is rebound in this notebook to a local container class, which is not the
# intended base class.
class BERTModelDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises one text per `__getitem__` call.

    Args:
      text: sequence of raw texts, indexable by integer position.
      target: sequence of labels, aligned with `text`.
      model_name: checkpoint name passed to `AutoTokenizer.from_pretrained`.
      max_len: every example is padded / truncated to this many tokens.
      label_map: mapping from a label value in `target` to its integer id.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
      super().__init__()
      self.text = text
      self.target = target
      self.tokenizer_name = model_name
      self.tokenizer = AutoTokenizer.from_pretrained(model_name)
      self.max_len = max_len
      self.label_map = label_map

    def __len__(self):
      """Number of examples."""
      return len(self.text)

    def __getitem__(self, item):
      """Return the `InputFeatures` (ids, attention mask, label id) of example `item`."""
      # Normalise all whitespace runs to single spaces before tokenising.
      text = " ".join(str(self.text[item]).split())

      encoded_review = self.tokenizer(
          text,
          max_length=self.max_len,
          add_special_tokens=True,
          return_token_type_ids=False,
          # `padding='max_length'` is the supported spelling of the deprecated
          # `pad_to_max_length=True`.
          padding='max_length',
          truncation='longest_first',
          return_attention_mask=True,
          return_tensors='pt',
      )

      # Tensors are deliberately left on the CPU: the Trainer moves each batch to
      # its device, and device tensors created here are incompatible with
      # DataLoader worker processes and pinned memory.
      return InputFeatures(
          input_ids=encoded_review['input_ids'].flatten(),
          attention_mask=encoded_review['attention_mask'].flatten(),
          label=self.label_map[self.target[item]],
      )

In [51]:
# Model factory
def model_init():
  """Load the `Model_Used` checkpoint with one output per entry of `label_map`."""
  num_classes = len(label_map)
  return AutoModelForSequenceClassification.from_pretrained(
      Model_Used,
      return_dict=True,
      num_labels=num_classes,
  )

# Our metrics
def compute_metrics(p):
  """Compute evaluation metrics from a prediction object.

  Args:
    p: object exposing `predictions` (scores, one row per example and one
       column per class) and `label_ids` (the true class ids).

  Returns:
    dict with 'macro_f1', 'weighted_f1' and 'accuracy'. Previously the value
    stored under 'macro_f1' was actually the weighted F1; the key now holds the
    macro average, and the weighted score is reported under its own name.

  Side effect: prints the per-class classification report.
  """
  labels = p.label_ids
  preds = np.argmax(p.predictions, axis=1)
  assert len(preds) == len(labels)
  print(classification_report(labels, preds))
  return {
      'macro_f1': f1_score(labels, preds, average='macro'),
      'weighted_f1': f1_score(labels, preds, average='weighted'),
      'accuracy': accuracy_score(labels, preds),
  }

# Seed everything for reproducibility
def set_seed(seed):
    """Seed Python, NumPy and PyTorch (CPU and all GPUs) with `seed`.

    Also exports PYTHONHASHSEED and switches cuDNN to deterministic,
    non-benchmarking mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [52]:
# Column names of the training frame, whose columns are ['tweet', 'class'].
# They are defined here because no other visible cell defines them, so this cell
# used to fail with NameError on a fresh kernel.
Tweets_Text_Col_Train = "tweet"
Tweets_Sentiment_Col_Train = "class"

# Distinct class labels, in order of first appearance in the training split.
label_list = list(train_set[Tweets_Sentiment_Col_Train].unique())

print(label_list)
print(train_set[Tweets_Sentiment_Col_Train].value_counts())

data_set = Dataset( "KAUST", train_set, evaluation_set, label_list )

# Map each label to an int id; the id equals the label's position in
# `label_list`, so `label_list[id]` converts a predicted id back to its label.
label_map = { v:index for index, v in enumerate(label_list) }
print(label_map)


# Wrap the train and evaluation splits for the model.
train_dataset = BERTModelDataset(train_set[Tweets_Text_Col_Train].to_list(),
                                 train_set[Tweets_Sentiment_Col_Train].to_list(),Model_Used,Max_Len,label_map)

evaluation_dataset = BERTModelDataset(evaluation_set[Tweets_Text_Col_Train].to_list(),
                                      evaluation_set[Tweets_Sentiment_Col_Train].to_list(),Model_Used,Max_Len,label_map)

In [54]:
# Training configuration. Defaults are created first (output directory
# "./train") and individual fields are then overridden by attribute assignment.
training_args = TrainingArguments("./train")

# Optimiser and learning-rate schedule.
training_args.learning_rate = 5e-05
training_args.adam_epsilon = 1e-8
training_args.lr_scheduler_type = 'cosine'
training_args.warmup_steps = 500

# Batching and precision.
training_args.per_device_train_batch_size = 16 #64
training_args.per_device_eval_batch_size = 16 # 64
training_args.gradient_accumulation_steps = 2
training_args.fp16 = True

# Run length, evaluation, logging and checkpointing.
training_args.num_train_epochs = 10
training_args.evaluation_strategy = EvaluationStrategy.EPOCH
# NOTE(review): presumably a legacy flag from older transformers releases -
# confirm the installed version still reads it.
training_args.evaluate_during_training = True
training_args.logging_steps = 200
training_args.save_steps = 100000
training_args.disable_tqdm = False

# Seed stored on the arguments object.
training_args.seed = 50

In [55]:
# Build the Trainer from the model, arguments, datasets and metric function.
# Cached memory is released first and the RNGs are seeded.
gc.collect()
torch.cuda.empty_cache()
training_args.dataloader_pin_memory = False
# NOTE(review): the RNGs are seeded with Rand_Seed, while the value printed
# below is training_args.seed - confirm which seed is meant to govern training.
set_seed(Rand_Seed)

trainer = Trainer(
    model=model_init(),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=evaluation_dataset,
    compute_metrics=compute_metrics,
)

print(training_args.seed)

In [57]:
# Fine-tune the model; the value returned by `train()` is shown as the cell output.
trainer.train()

In [58]:
# Prediction function
def predict(text, tokenizer):
  """Classify one text with the fine-tuned model held by `trainer`.

  Args:
    text: the (already cleaned) text to classify.
    tokenizer: tokenizer matching the fine-tuned checkpoint.

  Returns:
    A zero-dimensional tensor holding the index of the highest-scoring class;
    it can be used directly as an index into `label_list`.
  """
  encoded_review = tokenizer(
    text,
    max_length=Max_Len,
    add_special_tokens=True,
    return_token_type_ids=False,
    # Supported spelling of the deprecated `pad_to_max_length=True`.
    padding='max_length',
    truncation='longest_first',
    return_attention_mask=True,
    return_tensors='pt'
  )

  input_ids = encoded_review['input_ids'].to(device)
  attention_mask = encoded_review['attention_mask'].to(device)

  model = trainer.model
  # Inference must not depend on the mode training happened to leave the model
  # in: eval mode disables dropout, and no_grad skips building the autograd graph.
  model.eval()
  with torch.no_grad():
    output = model(input_ids=input_ids, attention_mask=attention_mask)

  # output[0] holds the logits, one row per input text.
  _, prediction = torch.max(output[0], dim=1)
  return prediction[0]

# Run the fine-tuned model over every cleaned test tweet and collect the
# predicted labels, in test-set order.
tokenizer = AutoTokenizer.from_pretrained(Model_Used)

prediction_list = []
i = 0  # ends up holding the number of tweets processed
for i, tweet in enumerate(test_data[Tweets_Text_Col_Test], start=1):
    pre = predict(tweet, tokenizer)
    # Convert the predicted class index back to its original label.
    pre_txt = label_list[pre]
    prediction_list.append(pre_txt)

In [61]:
from sklearn.metrics import accuracy_score

In [62]:
# Accuracy on the held-out test set. Arguments are passed by keyword so the true
# labels and the predictions cannot be confused.
accuracy_score(y_true=test_data['class'], y_pred=prediction_list)

# Here we got 73% accuracy with ten epochs; we may get a higher accuracy with a longer training period.