# Imports

In [None]:
# install PyArabic library for Arabic preprocessing
!pip install PyArabic



In [None]:
# install tensorflow_addons for AdamW optimizer
!pip install tensorflow_addons



In [None]:
# Standard-library and third-party imports
import io
import re
import ast
import math
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from lxml import etree
import pyarabic.araby as araby

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import layers

# Configs

In [None]:
# notebook-wide configuration values, grouped as class attributes
class config:
  '''Namespace holding the settings used by this notebook.'''

  # maximum length of sequence
  MAXLEN = 17

# Dataset

In [None]:
# the Arabic characters that make up the primary vocabulary
chars = list('اٱبتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئىی')
print('Number of Arabic Characters:', len(chars))

# ids 0-2 are reserved for the special tokens, so characters are numbered from 3
char_to_index = {char: index for index, char in enumerate(chars, start=3)}
# the inverse mapping, from id back to character
index_to_char = {index: char for char, index in char_to_index.items()}

# special tokens: 'p' = pad, 's' = separator, '_' = mask
special_tokens = {'p': 0, 's': 1, '_': 2}
char_to_index.update(special_tokens)
index_to_char.update({index: token for token, index in special_tokens.items()})

# Extra character -> index pairs. They were collected by hand (see the
# commented-out cells further down) and stored in a CSV file with the
# columns 'char' and 'value'.
extra_chars_df = pd.read_csv('https://drive.google.com/uc?export=download&id=1lQFU7XFy82-1dE5kPK7yDLpJ6tleycCC')
chars_2 = extra_chars_df['char'].tolist()
indexes_2 = extra_chars_df['value'].tolist()
del extra_chars_df

# additional character to index dictionary
char_to_index_2 = {char: index for char, index in zip(chars_2, indexes_2)}
# additional index to character dictionary
# (when several characters share an index, the last one listed wins)
index_to_char_2 = {index: char for char, index in zip(chars_2, indexes_2)}

Number of Arabic Characters: 38


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# download the two text files used for pretraining into the working directory
downloads = [
    # text_1
    ('https://drive.google.com/uc?export=download&id=1-A7OVKYutL6ZIUfy_yewUN68jK0VmhHd', 'Quran_listf.txt'),
    # text_2
    ('https://drive.google.com/uc?export=download&id=1-1IEr0bMwyAZHOKGpjsbMM6ttjnwQmDZ', 'corpus_filtred.txt'),
]

for url, filename in downloads:
  r = requests.get(url, allow_redirects=True)
  # stop here on an HTTP error instead of saving the error page as the corpus
  r.raise_for_status()
  # the context manager closes the file so the content is flushed to disk
  with open(filename, 'wb') as f:
    f.write(r.content)

In [None]:
# read the two corpora from the working directory, which is where the
# download cell saved them; the encoding is given explicitly because the text
# is Arabic and the platform default encoding is not guaranteed to be UTF-8
# NOTE(review): assumes both files are UTF-8 encoded — confirm
with open('Quran_listf.txt', encoding='utf-8') as f:
  text_1 = f.read()

with open('corpus_filtred.txt', encoding='utf-8') as f:
  text_2 = f.read()

# concatenate text files
text = text_1 + ' ' + text_2
# remove the new line token
text = text.replace('\n', ' ')
# convert text to a list of words
text_as_list = text.split()
print('Number of tokens:', len(text_as_list))

Number of tokens: 2910526


In [None]:
# limit the maximum length of a token in the dataset to 15
# remove all outliers that lie beyond this region
# figure out the maximum and minimum length in the dataset after outlier removal
# (lengths are measured on the raw token, i.e. before diacritics are stripped)
max_len = 0 # instantiate max length
min_len = 100 # instantiate min length
clean_text_list = [] # instantiate a list to collect cleaned text

# iterate over each token in the dataset
for token in tqdm(text_as_list):
  if len(token) <= 15:
    # strip diacritics, then collapse every run of non-word characters into one space;
    # the pattern is a raw string because '\W' is an invalid escape in a plain literal
    stripped = re.sub(r'\W+', ' ', araby.strip_diacritics(token))
    # tatweel and spaces are both turned into the separator token 's'
    # NOTE(review): mapping tatweel to the separator splits the word at that point — confirm intended
    clean_text_list.append(stripped.replace('ـ', 's').replace(' ', 's'))

    if len(token) > max_len:
      max_len = len(token)

    if len(token) < min_len:
      min_len = len(token)

# print max and min lengths
max_len, min_len

100%|██████████| 2910526/2910526 [00:28<00:00, 101072.18it/s]


(15, 1)

In [None]:
# construct a pure text from the clean list of tokens: every token is followed
# by the separator 's'; str.join builds the string in a single pass instead of
# repeatedly concatenating onto a growing string
pure_text = ''.join(token + 's' for token in tqdm(clean_text_list))

100%|██████████| 2899236/2899236 [00:01<00:00, 1627766.76it/s]


### Additional characters (manual entry)

In [None]:
# pure_text_as_list = pure_text.split('s')
# all_chars = []
# for text in tqdm(pure_text_as_list):
#   temp_list = list(text)
#   all_chars.extend(temp_list)

100%|██████████| 2899422/2899422 [00:02<00:00, 1018281.96it/s]


In [None]:
# unique_chars = list(set(all_chars))
# len(unique_chars)

223

In [None]:
# #relate each character to the reference one
# char_to_index_2 = dict()
# for char in unique_chars:
#   print(char)
#   continue_ = input('0 or 1')
#   if continue_ == '0':
#     continue
#   similar_char = input('similar char')
#   similar_index = char_to_index[similar_char]
#   char_to_index_2[char] = similar_index
#   print('*'*50)

2
0 or 10
l
0 or 10
ﳑ
0 or 10
ﺌ
0 or 1ئ
similar charئ
**************************************************
ﺰ
0 or 11
similar charز
**************************************************
ﹰ
0 or 10
ﻆ
0 or 11
similar charظ
**************************************************
ﱘ
0 or 10
ﺔ
0 or 11
similar charة
**************************************************
ﻔ
0 or 11
similar charف
**************************************************
ب
0 or 11
similar charب
**************************************************
ف
0 or 11
similar charف
**************************************************
٤
0 or 10
ت
0 or 11
similar charت
**************************************************
ق
0 or 11
similar charق
**************************************************
6
0 or 10
ﺭ
0 or 11
similar charر
**************************************************
ﺦ
0 or 11
similar charخ
**************************************************
ﻚ
0 or 11
similar charك
**************************************************
ﺼ
0 or 11
similar charص
******

In [None]:
# #construct a dataframe with the new tokens
# pd.DataFrame({'char': list(char_to_index_2.keys()),
#               'value': list(char_to_index_2.values())}).to_csv('char_to_index_additional.csv', index=False)

# !mv /content/char_to_index_additional.csv /content/drive/MyDrive/NLP_Course

In [None]:
# convert the pure text into a list of sequences, each config.MAXLEN (17) characters long
# use a step of 5, so consecutive sequences overlap by 12 characters
# each sequence is wrapped in a one-element list, which is the shape text_to_index expects
# NOTE(review): the range bound is exclusive, so the window that ends exactly at
# the end of pure_text is not emitted — confirm this is intended
window_len = config.MAXLEN
window_step = 5
text_as_list = [
    [pure_text[start:start + window_len]]
    for start in tqdm(range(0, len(pure_text) - window_len, window_step))
]

100%|██████████| 2529792/2529792 [00:08<00:00, 300594.16it/s]


In [None]:
def text_to_index(lists):
    '''
    Convert character sequences to lists of vocabulary indexes.

    Args: list of one-element lists, each holding one string (the sequence)
    Returns: list of index lists, one per sequence that could be fully
      converted; a sequence with a character found in neither
      char_to_index nor char_to_index_2 is dropped as a whole

    The lookups are explicit (no try/except), so unrelated errors and
    KeyboardInterrupt are no longer silently swallowed.
    '''
    bert_indexed_inputs = []

    # iterate over the lists
    for text in tqdm(lists):
        indexes = []
        for char in text[0]:
            # first dictionary, then the manually collected fallback dictionary;
            # compare against None because index 0 (pad) is a valid value
            index = char_to_index.get(char)
            if index is None:
                index = char_to_index_2.get(char)
            if index is None:
                # unknown character: give up on this sequence
                break
            indexes.append(index)
        else:
            # reached only when no character was unknown
            bert_indexed_inputs.append(indexes)
    return bert_indexed_inputs

# tokenize the dataset by relating each token to its index
indexed_text = text_to_index(text_as_list)

100%|██████████| 2529792/2529792 [00:06<00:00, 415584.76it/s]


In [None]:
# train-validation split (no shuffling):
# the first 90% of the sequences are for training, the remaining 10% for validation
train_len = int(0.9 * len(indexed_text))
train_data, valid_data = indexed_text[:train_len], indexed_text[train_len:]
# the unsplit lists are released once the split exists
del indexed_text
del text_as_list

In [None]:
def prepare_data(indexes, vocab_size=None):
  '''
  Build one masked-language-model example from a sequence of indexes.

  Four distinct positions are selected at random. Two of them are replaced by
  the mask token (2), one is replaced by a random character index that differs
  from the original token at that position, and the fourth is left unchanged
  (presumably as the keep-as-is case of BERT-style masking). All four are
  flagged in the returned mask.

  Args:
    indexes: list of characters in the form of indexes; it needs at least
      4 elements (np.random.choice raises ValueError otherwise)
    vocab_size: exclusive upper bound for the random replacement index;
      defaults to len(chars) + 3, i.e. every character of the primary
      vocabulary. It must allow at least two different values from 3 upward.
  Returns: gold labels (the input list itself, not a copy), masked labels
    (a new list), mask (list of 0/1 ints, 1 at the four selected positions)
  '''
  if vocab_size is None:
    vocab_size = len(chars) + 3
  seq_len = len(indexes)

  y_true = indexes # first output
  indices_to_mask = np.random.choice(np.arange(seq_len), size=(4,), replace=False)
  selected = set(indices_to_mask.tolist())
  one_hot_mask = [1 if position in selected else 0 for position in range(seq_len)] # second output

  # the fourth selected position keeps its original token
  index_0, index_1, index_2 = indices_to_mask[:3]

  y_masked = y_true.copy()

  random_index = np.random.randint(3, vocab_size)
  # ensure the random token differs from the token it replaces
  # (compare with the token at that position, not with the position itself)
  while y_true[index_2] == random_index:
    random_index = np.random.randint(3, vocab_size)

  y_masked[index_0] = 2 # third output (assign mask)
  y_masked[index_1] = 2 # third output (assign mask)
  y_masked[index_2] = random_index # third output (assign random token)

  return y_true, y_masked, one_hot_mask

In [None]:
# construct the train and valid
# true labels, masked labels, and masks
# NOTE: prepare_data draws from NumPy's global random state and no seed is set
# in this cell, so the masks differ between runs unless a seed is set earlier

def _mask_examples(indexed_examples):
  '''
  Run prepare_data on every example.

  Args: list of indexed examples
  Returns: three parallel lists: true labels, masked labels, masks
  '''
  true_labels = [] # true labels list
  masked_labels = [] # masked labels list
  masks = [] # masks
  for indexed_example in tqdm(indexed_examples):
    y_true, y_masked, one_hot_mask = prepare_data(indexed_example)
    true_labels.append(y_true)
    masked_labels.append(y_masked)
    masks.append(one_hot_mask)
  return true_labels, masked_labels, masks

# the training dataset is processed first, then the validation dataset
train_true_labels, train_masked_labels, train_masks = _mask_examples(train_data)
valid_true_labels, valid_masked_labels, valid_masks = _mask_examples(valid_data)

100%|██████████| 11176/11176 [00:07<00:00, 1566.36it/s]
100%|██████████| 1242/1242 [00:00<00:00, 2674.38it/s]


In [None]:
# construct a the training dataframe initialized with zeros
train_zeros = np.zeros((len(train_true_labels), 3))
train_df = pd.DataFrame(columns=['x', 'y', 'mask'], data=train_zeros)
del train_zeros
train_df = train_df.astype('str')

# populate the dataframe with the true labels, masked labels, and masks
for i in tqdm(range(len(train_masked_labels))):
  train_df.iloc[i, 0] = train_masked_labels[i]
  train_df.iloc[i, 1] = train_true_labels[i]
  train_df.iloc[i, 2] = train_masks[i]

# write the dataframe to a CSV file
train_df.to_csv('train_df.csv', index=False)
del train_df
# move train_df.csv to drive
!mv /content/train_df.csv /content/drive/MyDrive/NLP_Course

100%|██████████| 11176/11176 [00:05<00:00, 2151.93it/s]


In [None]:
# construct a the validation dataframe initialized with zeros
valid_zeros = np.zeros((len(valid_true_labels), 3))
valid_df = pd.DataFrame(columns=['x', 'y', 'mask'], data=valid_zeros)
del valid_zeros
valid_df = valid_df.astype('str')

# populate the dataframe with the true labels, masked labels, and masks
for i in tqdm(range(len(valid_masked_labels))):
  valid_df.iloc[i, 0] = valid_masked_labels[i]
  valid_df.iloc[i, 1] = valid_true_labels[i]
  valid_df.iloc[i, 2] = valid_masks[i]

# write the dataframe to a CSV file
valid_df.to_csv('valid_df.csv', index=False)
del valid_df
# move valid_df.csv to drive
!mv /content/valid_df.csv /content/drive/MyDrive/NLP_Course

100%|██████████| 1242/1242 [00:00<00:00, 4456.17it/s]
