In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset CSVs from
# Drive and write the fine-tuned model back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip -q install optuna==2.3.0
!pip -q install transformers==4.2.1
!pip -q install farasapy
!pip -q install pyarabic
!git clone https://github.com/aub-mind/arabert

[?25l[K     |█▎                              | 10 kB 34.5 MB/s eta 0:00:01[K     |██▌                             | 20 kB 31.0 MB/s eta 0:00:01[K     |███▉                            | 30 kB 19.1 MB/s eta 0:00:01[K     |█████                           | 40 kB 15.7 MB/s eta 0:00:01[K     |██████▍                         | 51 kB 10.7 MB/s eta 0:00:01[K     |███████▋                        | 61 kB 10.2 MB/s eta 0:00:01[K     |████████▉                       | 71 kB 9.3 MB/s eta 0:00:01[K     |██████████▏                     | 81 kB 10.3 MB/s eta 0:00:01[K     |███████████▍                    | 92 kB 10.7 MB/s eta 0:00:01[K     |████████████▊                   | 102 kB 9.9 MB/s eta 0:00:01[K     |██████████████                  | 112 kB 9.9 MB/s eta 0:00:01[K     |███████████████▏                | 122 kB 9.9 MB/s eta 0:00:01[K     |████████████████▌               | 133 kB 9.9 MB/s eta 0:00:01[K     |█████████████████▊              | 143 kB 9.9 MB/s eta 0:00:01

In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Class names, and the mapping from the label values stored in the CSVs to them.
label_list_HARD = ['fake', 'real']
hard_map = {0: 'fake', 1: 'real'}


def _read_labelled_split(csv_path):
    """Read one split CSV and replace each `label` value with its class name.

    Every label is looked up in `hard_map`, so a value that is not one of its
    keys raises KeyError.
    """
    frame = pd.read_csv(csv_path, header=0, encoding='utf-8')
    frame.label = frame.label.apply(lambda label_id: hard_map[label_id])
    return frame


df_train = _read_labelled_split("drive/My Drive/Arabic_FakeNews/Model/Balance/DataDescr/train_news.csv")
df_val = _read_labelled_split("drive/My Drive/Arabic_FakeNews/Model/Balance/DataDescr/val_news.csv")


In [None]:
# Sanity check: (rows, columns) of the training and validation frames.
df_train.shape, df_val.shape

((5128, 2), (1282, 2))

# Trainer

In [None]:
from arabert.preprocess import ArabertPreprocessor
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures


In [None]:
# Run configuration.
# Model card: https://huggingface.co/aubmindlab/bert-base-arabertv02
dataset_name = 'NEWS'  # NOTE(review): not referenced by any cell visible in this notebook
model_name = 'aubmindlab/bert-base-arabertv02'  # checkpoint id used for the preprocessor, tokenizer and model
task_name = 'classification'  # NOTE(review): not referenced by any cell visible in this notebook
max_len = 414  # token length every example is truncated/padded to; NOTE(review): why 414 is not shown here - confirm

In [None]:
# The preprocessor is keyed by the bare checkpoint name (the part after the "/").
checkpoint_basename = model_name.split("/")[-1]
arabert_prep = ArabertPreprocessor(checkpoint_basename)

# Run every training and validation text through the same preprocessor.
df_train.text = df_train.text.apply(arabert_prep.preprocess)
df_val.text = df_val.text.apply(arabert_prep.preprocess)

In [None]:
# Spot-check a few preprocessed validation texts instead of dumping the whole column.
df_val.text.head()

0       أطاحت الروسية داريا كاساتكينا بالفرنسية أليز ك...
1       قالت مصادر أمنية بشمال سيناء إن شرطيا مصريا قت...
2       عاجل قبل قليل ترامب يحس بالإغماء والشك بإصابته...
3       انا _ لله _ وإنا _ إليه _ راجعون توفيت صباح ال...
4       الفيتنامي فام دينه نوين ، وهو تاجر من مدينة هو...
                              ...                        
1277    الأوبئة تنتشر كل مئة عام ، في السنة ذات الرقم ...
1278    وزير الأوقاف : ندرس تقليل زمن الخطبة لصلاة الج...
1279    قالت صحيفة لوفيغارو إن وزارة الداخلية الفرنسية...
1280    بعد ما فاجئت العالم بإسلامها ، مريم بيترونين ت...
1281    نبه أطباء وأكاديميون بريطانيون إلى أن المشروبا...
Name: text, Length: 1282, dtype: object

In [None]:
class BERTDataset():
    """Indexable dataset that tokenizes one text per access for sequence classification.

    Each item is an `InputFeatures` whose `input_ids` and `attention_mask` both
    have exactly `max_len` entries.

    Args:
        text: indexable, sized collection of raw texts (each is passed through `str`).
        target: indexable collection of label names aligned with `text`, or
            None for unlabeled data (e.g. prediction); items then carry `label=None`.
        model_name: identifier handed to `AutoTokenizer.from_pretrained`.
        max_len: length every example is truncated or padded to.
        label_map: dict from label name to integer id; only consulted when
            `target` is given.

    Raises:
        ValueError: if `target` is given but its length differs from `text`.
        KeyError: on item access, if a target is not a key of `label_map`.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
        if target is not None and len(target) != len(text):
            raise ValueError(
                "text and target must have the same length, got %d and %d"
                % (len(text), len(target))
            )
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Collapse every run of whitespace (newlines, tabs, ...) into one space.
        text = " ".join(str(self.text[item]).split())

        input_ids = self.tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation='longest_first'
        )

        # Real tokens get mask 1; the padding appended below gets mask 0.
        attention_mask = [1] * len(input_ids)

        # Pad with the tokenizer's pad token id up to the fixed sequence length.
        padding_length = self.max_len - len(input_ids)
        input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)

        # Unlabeled datasets yield label=None instead of failing on the lookup.
        label = None if self.target is None else self.label_map[self.target[item]]

        return InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=label)

In [None]:
# Label name -> integer id, numbered in the order of `label_list_HARD`.
label_map = dict(zip(label_list_HARD, range(len(label_list_HARD))))
print(label_map)

# Wrap the text/label columns of each split (as plain lists) in a dataset.
train_dataset, val_dataset = (
    BERTDataset(frame.text.to_list(), frame.label.to_list(), model_name, max_len, label_map)
    for frame in (df_train, df_val)
)

{'fake': 0, 'real': 1}


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=384.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=824793.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2642362.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=381.0, style=ProgressStyle(description_…




In [None]:
def model_init():
    """Load the `model_name` checkpoint with a classification head of `len(label_map)` labels."""
    num_classes = len(label_map)
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_classes,
        return_dict=True,
    )

In [None]:
def compute_metrics(p): #p should be of type EvalPrediction
  """Print a per-class report and confusion matrix, and return summary metrics.

  The predicted class is the argmax over axis 1 of `p.predictions`; it is
  compared against `p.label_ids`.

  Returns:
      dict with keys 'macro_f1', 'macro_precision', 'macro_recall' and 'accuracy'.
  """
  y_true = p.label_ids
  y_pred = np.argmax(p.predictions, axis=1)
  assert len(y_pred) == len(y_true)

  print(classification_report(y_true, y_pred))
  print(confusion_matrix(y_true, y_pred))

  # Every macro-averaged score shares the same call signature.
  macro_scorers = {
      'macro_f1': f1_score,
      'macro_precision': precision_score,
      'macro_recall': recall_score,
  }
  metrics = {
      name: scorer(y_true, y_pred, average='macro')
      for name, scorer in macro_scorers.items()
  }
  metrics['accuracy'] = accuracy_score(y_true, y_pred)
  return metrics

# HyperParameter Search

In [None]:
# Training arguments for the hyperparameter-search section: start from the
# library defaults and override the settings below.
training_args = TrainingArguments("./train")

search_overrides = {
    'evaluate_during_training': True,
    'adam_epsilon': 1e-8,
    'lr_scheduler_type': 'cosine',
    'fp16': True,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 16,
    'gradient_accumulation_steps': 2,
    'num_train_epochs': 8,
    'evaluation_strategy': EvaluationStrategy.EPOCH,
    'save_steps': 100000,
    'disable_tqdm': True,
}
for option_name, option_value in search_overrides.items():
    setattr(training_args, option_name, option_value)

In [None]:
# Examples consumed per optimizer update: batch size times accumulation steps
# (this arithmetic assumes a single device).
examples_per_update = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)
steps_per_epoch = len(df_train) // examples_per_update
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)

160
1280


In [None]:
# Trainer built from the `model_init` factory rather than from a model instance.
# NOTE(review): no hyperparameter-search call is visible in this notebook, and
# `trainer` is reassigned in the "Regular Training" section below.
trainer = Trainer(
    args=training_args,
    train_dataset=train_dataset, 
    eval_dataset=val_dataset, 
    model_init=model_init,
    compute_metrics=compute_metrics,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=543490667.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

# Regular Training

In [None]:
# Training arguments for the single fine-tuning run.
# The numbers that also drive the step arithmetic are named once so the
# arguments and the warmup computation cannot drift apart.
per_device_batch_size = 16
gradient_accumulation_steps = 2
num_train_epochs = 8
warmup_ratio = 0.1

# Optimizer updates per epoch / in total (assumes a single device).
steps_per_epoch = len(df_train) // (per_device_batch_size * gradient_accumulation_steps)
total_steps = steps_per_epoch * num_train_epochs
print(steps_per_epoch)
print(total_steps)

training_args = TrainingArguments("./train")
# `evaluate_during_training` is intentionally not set: evaluation is driven by
# `evaluation_strategy` below.
training_args.adam_epsilon = 1e-8
training_args.learning_rate = 5e-5
training_args.fp16 = True

training_args.per_device_train_batch_size = per_device_batch_size
training_args.per_device_eval_batch_size = per_device_batch_size
training_args.gradient_accumulation_steps = gradient_accumulation_steps
training_args.num_train_epochs = num_train_epochs

# warmup_steps is a step count, so truncate the ratio product to an int
# (or set the warmup steps directly).
training_args.warmup_steps = int(total_steps * warmup_ratio)
training_args.lr_scheduler_type = 'cosine'

# Evaluate once per epoch.
training_args.evaluation_strategy = EvaluationStrategy.EPOCH
# Larger than total_steps, so no intermediate checkpoint is written during training.
training_args.save_steps = 100000
training_args.seed = 42
training_args.disable_tqdm = False

160
1280


In [None]:
# Trainer for the fine-tuning run: a freshly initialised model, trained on the
# training split and evaluated on the validation split with `compute_metrics`.
trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
) 

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [None]:
# Fine-tune; `compute_metrics` prints a report and confusion matrix at each evaluation.
trainer.train()

Epoch,Training Loss,Validation Loss,Macro F1,Macro Precision,Macro Recall,Accuracy,Runtime,Samples Per Second
0,No log,0.191047,0.936157,0.963156,0.915705,0.952418,17.7811,72.099
1,No log,0.08887,0.963489,0.973018,0.954924,0.971919,19.4588,65.883
2,No log,0.094837,0.977859,0.983509,0.972556,0.982839,19.2093,66.739
3,0.101300,0.108049,0.977006,0.979271,0.9748,0.982059,19.3228,66.347
4,0.101300,0.120422,0.973631,0.983364,0.964875,0.979719,19.3743,66.17
5,0.101300,0.102867,0.979986,0.982736,0.977322,0.984399,19.3598,66.22
6,0.006600,0.106157,0.97991,0.984589,0.975472,0.984399,19.3687,66.189
7,0.006600,0.107025,0.978886,0.984049,0.974014,0.983619,19.2989,66.429


  self.args.max_grad_norm,


              precision    recall  f1-score   support

           0       0.98      0.84      0.90       343
           1       0.94      0.99      0.97       939

    accuracy                           0.95      1282
   macro avg       0.96      0.92      0.94      1282
weighted avg       0.95      0.95      0.95      1282

[[287  56]
 [  5 934]]
              precision    recall  f1-score   support

           0       0.98      0.92      0.95       343
           1       0.97      0.99      0.98       939

    accuracy                           0.97      1282
   macro avg       0.97      0.95      0.96      1282
weighted avg       0.97      0.97      0.97      1282

[[315  28]
 [  8 931]]
              precision    recall  f1-score   support

           0       0.98      0.95      0.97       343
           1       0.98      0.99      0.99       939

    accuracy                           0.98      1282
   macro avg       0.98      0.97      0.98      1282
weighted avg       0.98     

TrainOutput(global_step=1280, training_loss=0.042300854623317716, metrics={'train_runtime': 1675.6895, 'train_samples_per_second': 0.764, 'total_flos': 13774160747798208, 'epoch': 8.0})

In [None]:
# Final metrics of the trained model on the validation split.
trainer.evaluate(val_dataset)

              precision    recall  f1-score   support

           0       0.98      0.95      0.97       343
           1       0.98      0.99      0.99       939

    accuracy                           0.98      1282
   macro avg       0.98      0.97      0.98      1282
weighted avg       0.98      0.98      0.98      1282

[[327  16]
 [  5 934]]


{'epoch': 8.0,
 'eval_accuracy': 0.983619344773791,
 'eval_loss': 0.10702450573444366,
 'eval_macro_f1': 0.9788859478854185,
 'eval_macro_precision': 0.9840488268864933,
 'eval_macro_recall': 0.9740139780238887,
 'eval_runtime': 19.2898,
 'eval_samples_per_second': 66.46}

In [None]:
# Save the fine-tuned model and its tokenizer into the same Drive directory.
saved_model_dir = "/content/drive/MyDrive/Arabic_FakeNews/Model/Balance/SavedModels/AraBERTV02"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the public save_model API instead of the private Trainer._save.
trainer.save_model(saved_model_dir)
tokenizer.save_pretrained(saved_model_dir)

('/content/drive/MyDrive/Arabic_FakeNews/Model/Balance/SavedModels/AraBERTV02/tokenizer_config.json',
 '/content/drive/MyDrive/Arabic_FakeNews/Model/Balance/SavedModels/AraBERTV02/special_tokens_map.json',
 '/content/drive/MyDrive/Arabic_FakeNews/Model/Balance/SavedModels/AraBERTV02/vocab.txt',
 '/content/drive/MyDrive/Arabic_FakeNews/Model/Balance/SavedModels/AraBERTV02/added_tokens.json')

# Test

In [None]:
# Load the test split; its labels are mapped to class names in a later cell.
df_test = pd.read_csv("drive/My Drive/Arabic_FakeNews/Model/Balance/DataDescr/test_news.csv", header=0,encoding='utf-8')

In [None]:
# Run the test texts through the same preprocessor as the training data.
df_test.text = df_test.text.apply(arabert_prep.preprocess)

In [None]:
# Replace the stored label values with class names, then wrap the split in a dataset.
df_test.label = df_test.label.apply(lambda label_id: hard_map[label_id])

test_texts = df_test.text.to_list()
test_labels = df_test.label.to_list()
test_dataset = BERTDataset(test_texts, test_labels, model_name, max_len, label_map)


In [None]:
# Metrics of the trained model on the test split.
trainer.evaluate(test_dataset)

              precision    recall  f1-score   support

           0       0.97      0.94      0.96       400
           1       0.98      0.99      0.99      1203

    accuracy                           0.98      1603
   macro avg       0.98      0.97      0.97      1603
weighted avg       0.98      0.98      0.98      1603

[[ 376   24]
 [  10 1193]]


{'epoch': 8.0,
 'eval_accuracy': 0.9787897691827823,
 'eval_loss': 0.1300640106201172,
 'eval_macro_f1': 0.9713467078838349,
 'eval_macro_precision': 0.9771863198811312,
 'eval_macro_recall': 0.9658437240232751,
 'eval_runtime': 23.3512,
 'eval_samples_per_second': 68.647}

# Predict

In [None]:
class Dataset():
    """Indexable dataset of unlabeled texts, tokenized one item per access.

    Each item is an `InputFeatures` carrying only `input_ids` and
    `attention_mask`, both exactly `max_len` entries long; no label is attached.

    Args:
        text: indexable, sized collection of raw texts (each is passed through `str`).
        target: stored on the instance but not used when building items.
        model_name: identifier handed to `AutoTokenizer.from_pretrained`.
        max_len: length every example is truncated or padded to.
        label_map: stored on the instance but not used when building items.
    """

    def __init__(self, text=None, target= None, model_name = None, max_len= None, label_map= None):
        # No base class to initialise; the previous super() call referenced
        # BERTDataset and made this class depend on that definition.
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Collapse every run of whitespace (newlines, tabs, ...) into one space.
        text = " ".join(str(self.text[item]).split())

        input_ids = self.tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation='longest_first'
        )

        # Real tokens get mask 1; the padding appended below gets mask 0.
        attention_mask = [1] * len(input_ids)

        # Pad with the tokenizer's pad token id up to the fixed sequence length.
        padding_length = self.max_len - len(input_ids)
        input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)

        return InputFeatures(input_ids=input_ids, attention_mask=attention_mask)

In [None]:

title_example = pd.read_csv("drive/My Drive/Arabic_FakeNews/Model/Balance/Data/title_example.csv")
print(title_example.shape)

# Apply the same AraBERT preprocessing that the train/val/test texts received,
# so the model sees inputs in the form it was trained on.
title_texts = [arabert_prep.preprocess(str(title)) for title in title_example.text]

# One dataset and one predict call for all titles: the tokenizer is loaded once
# instead of once per row. The name is distinct from `test_dataset` so the real
# test split is not overwritten.
title_dataset = Dataset(title_texts, None, model_name, max_len, label_map)
title_logits = trainer.predict(title_dataset).predictions
title_preds = np.argmax(title_logits, axis=1)

for pred in title_preds:
    print(f"[{pred}]")

# Count the titles predicted as the 'real' class.
count = int((title_preds == label_map['real']).sum())
print("Real : " + str(count))

[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[1]
[0]
[0]
[0]
[0]
[0]
[0]
Real : 2


In [None]:
text_example = pd.read_csv("drive/My Drive/Arabic_FakeNews/Model/Balance/Data/text_example.csv")
print(text_example.shape)

# Apply the same AraBERT preprocessing that the train/val/test texts received,
# so the model sees inputs in the form it was trained on.
example_texts = [arabert_prep.preprocess(str(text)) for text in text_example.text]

# One dataset and one predict call for all texts: the tokenizer is loaded once
# instead of once per row.
text_dataset = Dataset(example_texts, None, model_name, max_len, label_map)
text_logits = trainer.predict(text_dataset).predictions
text_preds = np.argmax(text_logits, axis=1)

for pred in text_preds:
    print(f"[{pred}]")

# Count the texts predicted as the 'real' class.
count = int((text_preds == label_map['real']).sum())
print("Real : " + str(count))

[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
Real : 0


In [None]:
#  ----- 3. Predict -----#
# # Load test data
# test_data = pd.read_csv("drive/My Drive/Arabic_FakeNews/Model/Data/test_news.csv", header=0,encoding='utf-8')
# X_test = list(test_data["text"])
# X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

# # Create torch dataset
# test_dataset = Dataset(X_test_tokenized)

# # Load trained model
# model_path = "output/checkpoint-50000"
# model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

# # Define test trainer
# test_trainer = Trainer(model)

# # Make prediction
# raw_pred, _, _ = test_trainer.predict(test_dataset)

# # Preprocess raw predictions
# y_pred = np.argmax(raw_pred, axis=1)