In [None]:
# Mount Google Drive (Colab only); the dataset files opened below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import re
import pickle
import numpy as np
import time
import random
import joblib
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Model
from tensorflow.keras.layers import Input
from keras.layers import Embedding, Dense, Dropout, LSTM, Bidirectional, TimeDistributed, InputLayer
from tensorflow.keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import Sequence
from keras.initializers import glorot_normal
from keras.callbacks import ModelCheckpoint

from transformers import AutoTokenizer, TFAutoModel

In [None]:
import tensorflow as tf

# tf.test.is_gpu_available() is deprecated; listing the physical GPU devices is the
# supported replacement. An empty list means no GPU is visible to TensorFlow.
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU available:", bool(gpu_devices))
print("GPU device name:", tf.test.gpu_device_name())

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


GPU available: True
GPU device name: /device:GPU:0


In [None]:
# Load the raw corpora: one list element per text line (readlines() keeps the trailing '\n').
# The encoding is passed explicitly because the files hold Arabic text and the platform
# default encoding is not guaranteed to be UTF-8.
with open('/content/drive/My Drive/NLPdata/train.txt', 'r', encoding='utf-8') as file:
    train_data = file.readlines()

val_data_raw = None  # not read by any cell visible here; kept so existing references keep working
with open('/content/drive/My Drive/NLPdata/val.txt', 'r', encoding='utf-8') as file:
    val_data = file.readlines()

print('Training data length:', len(train_data))
print('Validation data length:', len(val_data))

Training data length: 50000
Validation data length: 2500


In [None]:
# Diacritic labels: single marks, two-mark combinations, and '' standing for "no diacritic".
diacritics = ['َ', 'ً', 'ُ', 'ٌ', 'ِ', 'ٍ', 'ْ', 'ّ', 'َّ', 'ًّ', 'ُّ', 'ٌّ', 'ِّ', 'ٍّ', '']

# Bare Arabic letters, followed by supersets that also accept the space and the diacritics above.
arabic_chars = ['ئ', 'ط', 'ه', 'ك', 'ض', 'ج', 'ذ', 'ع', 'ب', 'ل', 'د', 'ت', 'ا', 'ث', 'أ', 'س', 'ق', 'م', 'ش', 'ز', 'غ', 'ى', 'إ', 'خ', 'ن', 'آ', 'ؤ', 'ي', 'ظ', 'ص', 'ح', 'ة', 'و', 'ر', 'ء', 'ف']
arabic_chars_space = list(arabic_chars) + [' ']
arabic_chars_valid = list(arabic_chars) + [' '] + diacritics

# char_mapping = {' ': 0,
#     'ا': 1, 'ب': 2, 'ت': 3, 'ث': 4, 'ج': 5, 'ح': 6, 'خ': 7, 'د': 8, 'ذ': 9, 'ر': 10, 'ز': 11, 'س': 12, 'ش': 13, 'ص': 14,
#     'ض': 15, 'ط': 16, 'ظ': 17, 'ع': 18, 'غ': 19, 'ف': 20, 'ق': 21, 'ك': 22, 'ل': 23, 'م': 24, 'ن': 25, 'ه': 26, 'و': 27,
#     'ى': 28, 'ي': 29,'ء': 30, 'آ': 31, 'أ': 32, 'ؤ': 33, 'إ': 34, 'ئ': 35,'ة': 36,
#     '٠': 37, '١': 38, '٢': 39, '٣': 40, '٤': 41, '٥': 42, '٦': 43, '٧': 44, '٨': 45, '٩': 46,
#     '0': 47, '1': 48, '2': 49, '3': 50, '4': 51, '5': 52, '6': 53,'7': 54, '8': 55, '9': 56,
#     '<pad>': 57, '<s>': 58, '</s>': 59
# }

# Integer id of every input symbol: the space, Arabic letters, Arabic-Indic and ASCII digits,
# the special tokens '<pad>', '<s>', '</s>', then punctuation and miscellaneous symbols.
# len(char_mapping) is used as the width of the one-hot character vector.
char_mapping = {' ': 0,
    'ا': 1, 'ب': 2, 'ت': 3, 'ث': 4, 'ج': 5, 'ح': 6, 'خ': 7, 'د': 8, 'ذ': 9, 'ر': 10, 'ز': 11, 'س': 12, 'ش': 13, 'ص': 14,
    'ض': 15, 'ط': 16, 'ظ': 17, 'ع': 18, 'غ': 19, 'ف': 20, 'ق': 21, 'ك': 22, 'ل': 23, 'م': 24, 'ن': 25, 'ه': 26, 'و': 27,
    'ى': 28, 'ي': 29,'ء': 30, 'آ': 31, 'أ': 32, 'ؤ': 33, 'إ': 34, 'ئ': 35,'ة': 36,
    '٠': 37, '١': 38, '٢': 39, '٣': 40, '٤': 41, '٥': 42, '٦': 43, '٧': 44, '٨': 45, '٩': 46,
    '0': 47, '1': 48, '2': 49, '3': 50, '4': 51, '5': 52, '6': 53,'7': 54, '8': 55, '9': 56,
    '<pad>': 57, '<s>': 58, '</s>': 59,
   '.':60,',':61,'،': 62,':':63,';':64,'؛':65,'(':66,')':67,'[': 68,']':69,'{': 70,'}': 71,'«': 72,'»': 73,'-': 74, '!': 75, '?': 76,'؟': 77,
    '\n': 78, '"': 79, '&': 80, "'": 81, '*': 82, '+': 83, '/': 84, '=': 85,  '_': 86, '`': 87, '~': 88,'\u200d': 89, '\u200f': 90, '–': 91,
    '’': 92, '“': 93, '…': 94, '﴾': 95, '﴿': 96
}

# Diacritic label -> class id (0-14); '' (id 14) is the "no diacritic" class.
class_mapping = {'َ': 0, 'ً': 1, 'ُ': 2, 'ٌ': 3, 'ِ': 4, 'ٍ': 5, 'ْ': 6, 'ّ': 7, 'َّ': 8, 'ًّ':
9, 'ُّ': 10, 'ٌّ': 11, 'ِّ': 12, 'ٍّ': 13, '': 14}

# Class id -> diacritic label, for turning predicted class ids back into marks.
reverse_class_mapping = {0:'َ', 1:'ً', 2:'ُ', 3:'ٌ', 4:'ِ', 5:'ٍ', 6:'ْ',7:'ّ',8: 'َّ',9: 'ًّ',10: 'ُّ',11: 'ٌّ',12: 'ِّ',13: 'ٍّ',14: ''}


# Replacement table consumed by split_using_punctuation(): each punctuation mark is rewritten
# with a '\n' attached (before opening brackets/quotes, after every other mark) so that the
# text can afterwards be cut into fragments by splitting on '\n'.
punctionations_splitting ={'.':'.\n',',':',\n','،': '،\n',':':':\n',';':';\n','؛':'؛\n','(':'\n(',')':')\n',
                           '[': '\n[',']':']\n','{': '\n{','}': '}\n','«': '\n«','»': '»\n',
                           '-': '-\n', '!': '!\n', '?': '?\n', '؟': '؟\n',}

In [None]:
def remove_diacritics(data):
    """Return `data` with every character that appears in `diacritics` deleted."""
    # Joining the entries flattens the multi-character labels into individual characters.
    chars_to_delete = ''.join(diacritics)
    deletion_table = str.maketrans('', '', chars_to_delete)
    return data.translate(deletion_table)

# Smoke test: the printed text should contain the same letters with no diacritic marks.
test_str = 'قَوْلُهُ : ( أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ ) قَالَ الزَّرْكَشِيُّ'
print(remove_diacritics(test_str))


قوله : ( أو قطع الأول يده إلخ ) قال الزركشي


In [None]:
def one_hot_matrix(data, size):
    """Return one one-hot row of width `size` per entry of `data`.

    Row k has a 1 in column data[k] and 0 elsewhere; an entry that matches no
    column in range(size) produces an all-zero row.
    """
    rows = []
    for hot_index in data:
        rows.append([1 if column == hot_index else 0 for column in range(size)])
    return rows

# Smoke test: indices 0..3 with width 4 should print the 4x4 identity matrix.
test = [0,1,2,3]
print(one_hot_matrix(test, 4))

[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]


In [None]:
def one_hot_vector(index , size):
    """Return a list of `size` ints with a 1 at position `index` and 0 elsewhere.

    If `index` matches no position in range(size) the result is all zeros.
    """
    encoded = []
    for position in range(size):
        encoded.append(1 if position == index else 0)
    return encoded

# Smoke test: width 5 with position 1 set.
one_hot_vector(1,5)

[0, 1, 0, 0, 0]

In [None]:
def split_using_punctuation(data):
  """Cut every string of `data` into fragments at punctuation marks.

  Each key of `punctionations_splitting` is replaced by its newline-carrying
  value, then the string is split on '\n'. All fragments (including empty and
  whitespace-only ones) are returned in order in a single flat list.
  """
  pieces = []

  for line in data:
    marked = line
    for mark, replacement in punctionations_splitting.items():
      marked = marked.replace(mark, replacement)
    pieces.extend(marked.split('\n'))

  return pieces

# Show the fragments produced for the first two training lines.
print(split_using_punctuation(train_data[0:2]))

['قَوْلُهُ :', ' ', '( أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ )', ' قَالَ الزَّرْكَشِيُّ', '( 14 / 123 )', '', '', 'ابْنُ عَرَفَةَ :', ' قَوْلُهُ :', ' بِلَفْظٍ يَقْتَضِيه كَإِنْكَارِ غَيْرِ حَدِيثٍ بِالْإِسْلَامِ وُجُوبَ مَا عُلِمَ وُجُوبُهُ مِنْ الدِّينِ ضَرُورَةً ', '( كَإِلْقَاءِ مُصْحَفٍ بِقَذَرٍ وَشَدِّ زُنَّارٍ )', ' ابْنُ عَرَفَةَ :', ' قَوْلُ ابْنِ شَاسٍ :', ' أَوْ بِفِعْلٍ يَتَضَمَّنُهُ هُوَ كَلُبْسِ الزُّنَّارِ وَإِلْقَاءِ الْمُصْحَفِ فِي صَرِيحِ النَّجَاسَةِ وَالسُّجُودِ لِلصَّنَمِ وَنَحْوِ ذَلِكَ ', '( وَسِحْرٍ )', ' مُحَمَّدٌ :', ' قَوْلُ مَالِكٍ وَأَصْحَابِهِ أَنَّ السَّاحِرَ كَافِرٌ بِاَللَّهِ تَعَالَى قَالَ مَالِكٌ :', ' هُوَ كَالزِّنْدِيقِ إذَا عَمِلَ السِّحْرَ بِنَفْسِهِ قُتِلَ وَلَمْ يُسْتَتَبْ .', '', '']


In [None]:
def split_on_length(data, max_len=500):
    """Drop empty fragments and cap the diacritic-free length of the others.

    Lengths are measured on the text with diacritics removed and surrounding
    whitespace stripped, but the fragments that are returned keep their
    diacritics.

    Args:
        data: iterable of strings.
        max_len: maximum diacritic-free length of a returned fragment.
            Defaults to 500, the value that was previously hard-coded.

    Returns:
        A list of stripped strings. Fragments that are empty once diacritics
        are removed are dropped; fragments longer than `max_len` are re-packed
        word by word into consecutive pieces. Words are never broken, so a
        single word longer than `max_len` is emitted as its own piece.
    """

    def clean_len(text):
        # Length that counts against the limit: no diacritics, no outer whitespace.
        return len(remove_diacritics(text).strip())

    splitted_data = list()

    for sentence in data:

        sentence_len = clean_len(sentence)

        if sentence_len == 0:
            continue

        if sentence_len <= max_len:
            splitted_data.append(sentence.strip())
            continue

        temp_sentence = ''

        for word in sentence.split():
            temp_len = clean_len(temp_sentence)

            # "+ 1" accounts for the space that would join the word to the piece.
            if temp_len + clean_len(word) + 1 > max_len:
                # Adding the word would exceed the limit: emit the piece and start a new one.
                if temp_len > 0:
                    splitted_data.append(temp_sentence.strip())
                temp_sentence = word
            else:
                temp_sentence = word if temp_sentence == '' else temp_sentence + ' ' + word

        # Emit whatever is left in the last piece.
        if clean_len(temp_sentence) > 0:
            splitted_data.append(temp_sentence.strip())

    return splitted_data

In [None]:
# Stage 1: cut every raw line into fragments at punctuation marks.
# Stage 2: drop empty fragments and cap the diacritic-free fragment length.
split_punctuation_train_data = split_using_punctuation(train_data)
split_length_train_data      = split_on_length(split_punctuation_train_data)

split_punctuation_val_data = split_using_punctuation(val_data)
split_length_val_data      = split_on_length(split_punctuation_val_data)


print('Training data length:', len(split_length_train_data))
print('Validation data length:', len(split_length_val_data))

print(split_length_train_data[0:5])

Training data length: 305772
Validation data length: 15701
['قَوْلُهُ :', '( أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ )', 'قَالَ الزَّرْكَشِيُّ', '( 14 / 123 )', 'ابْنُ عَرَفَةَ :']


In [None]:
# Sanity check on the split: longest and shortest fragment, measured without diacritics.
print('Training data max:', max(len(remove_diacritics(item).strip()) for item in split_length_train_data))
print('Validation data max:', max(len(remove_diacritics(item).strip()) for item in split_length_val_data))

print('Training data min:', min(len(remove_diacritics(item).strip()) for item in split_length_train_data))
print('Validation data min:', min(len(remove_diacritics(item).strip()) for item in split_length_val_data))

Training data max: 500
Validation data max: 500
Training data min: 1
Validation data min: 1


In [None]:
# list of short sentences -> with diarictic & without punc or numbers
clean_diac_train_data = [(''.join(char for char in text if char in arabic_chars_valid)).strip() for text in split_length_train_data]
clean_diac_val_data = [(''.join(char for char in text if char in arabic_chars_valid)).strip() for text in split_length_val_data]

clean_diac_train_data = [item for item in clean_diac_train_data if item != ""]
clean_diac_val_data   = [item for item in clean_diac_val_data if item != ""]

print('Training data length:', len(clean_diac_train_data))
print('Validation data length:', len(clean_diac_val_data))

print(clean_diac_train_data[0:5])

Training data length: 280228
Validation data length: 14385
['قَوْلُهُ', 'أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ', 'قَالَ الزَّرْكَشِيُّ', 'ابْنُ عَرَفَةَ', 'قَوْلُهُ']


In [None]:
# list of short sentences -> without diarictic & without punc or numbers

# Same sentences with the diacritics stripped; element i matches clean_diac_*_data[i].
clean_train_data = [remove_diacritics(text) for text in clean_diac_train_data]
clean_val_data = [remove_diacritics(text) for text in clean_diac_val_data]

print('Training data length:', len(clean_train_data))
print('Validation data length:', len(clean_val_data))

print(clean_train_data[0:5])

Training data length: 280228
Validation data length: 14385
['قوله', 'أو قطع الأول يده إلخ', 'قال الزركشي', 'ابن عرفة', 'قوله']


In [None]:
# Pretrained tokenizer and TensorFlow BERT encoder from the same checkpoint; both are
# used as module-level globals by arabert_embeddings().
tokenizer = AutoTokenizer.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-ca')
model1 = TFAutoModel.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-ca')

tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/468 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/305k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/437M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-ca.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
def arabert_embeddings(word):
  """Return the encoder's hidden state at the first token position for `word`.

  `word` is tokenized with the module-level `tokenizer` and passed through the
  module-level `model1`. The result is a tensor with one row per input
  sequence (a single row here), taken from `last_hidden_state`.
  """
  encoded = tokenizer(word, return_tensors='tf')
  token_ids = encoded['input_ids']

  hidden_states = model1(token_ids).last_hidden_state

  # Position 0 holds the [CLS] token; its vector is used as the embedding of the whole input.
  return hidden_states[:, 0, :]

In [None]:
# Smoke test: one embedding row is returned per input; print the vectors for a word and for '<s>'.
embs = arabert_embeddings('قوله')
print(len(embs))
print(arabert_embeddings('قوله').numpy().tolist()[0])
print(arabert_embeddings('<s>').numpy().tolist()[0])

1
[-0.055517181754112244, -0.4046120345592499, -1.138106346130371, 0.522519588470459, -1.1426318883895874, -1.1428852081298828, -0.24164246022701263, -0.3720070719718933, -0.36858105659484863, -1.048858880996704, -0.26931947469711304, -0.00537511333823204, 1.6390174627304077, -0.22852042317390442, -0.925045371055603, -0.13003510236740112, 0.3928563594818115, 0.8530811667442322, 0.9281281232833862, -0.27608954906463623, 0.6170013546943665, 0.5237986445426941, 0.2616967260837555, -0.5918275713920593, 0.38471969962120056, 1.0287684202194214, -0.3360046446323395, 0.04804738610982895, -0.9024965763092041, -0.11470675468444824, -1.2118324041366577, -0.33421581983566284, 0.9631841778755188, -0.42580825090408325, 0.9480172395706177, 0.7508987188339233, 1.2456271648406982, 0.10308174788951874, -0.38794606924057007, -0.12103541195392609, -1.7332165241241455, -0.3665558993816376, 0.4816020727157593, -0.5660207271575928, -0.5163562893867493, -0.11628599464893341, -0.34035781025886536, 0.0071126744

In [None]:
def get_sentence_classes(sentence):
  """Build per-character features and one-hot diacritic labels for one sentence.

  The sequence starts with a '<s>' step and ends with a '</s>' step, both
  labelled with the "no diacritic" class. In between there is one step per
  non-diacritic character of every whitespace-separated word; the spaces
  between words produce no step.

  Each feature vector is the one-hot encoding of the character (width
  len(char_mapping)) followed by the embedding of the word the character
  belongs to, computed on the word with its diacritics removed.

  The label of a character is read from the one or two characters that follow
  it inside the same word: a single diacritic, or a two-mark combination in
  whichever order is a key of `class_mapping`.

  Args:
      sentence: diacritized text whose characters are all keys of
          `char_mapping` or members of `diacritics`.

  Returns:
      (x, y): two lists of equal length holding the feature vectors and the
      one-hot label vectors.
  """
  num_chars = len(char_mapping)
  num_classes = len(class_mapping)

  def features(symbol, embedding):
    # One-hot of the symbol concatenated with the supplied embedding values.
    vec = one_hot_vector(char_mapping[symbol], num_chars)
    vec.extend(embedding)
    return vec

  x = [features('<s>', arabert_embeddings('<s>').numpy().tolist()[0])]
  y = [one_hot_vector(class_mapping[''], num_classes)]

  for word in sentence.split():
    emb = arabert_embeddings(remove_diacritics(word)).numpy().tolist()[0]

    for index, char in enumerate(word):

      if char in diacritics:
        continue  # diacritics are labels, not input steps

      x.append(features(char, emb))

      # `index` is a position inside `word`, so the marks must be read from `word`.
      # Reading them from `sentence` (as before) is only correct for the first word.
      following = word[index + 1:index + 3]

      char_diacritic = ''
      if following and following[0] in diacritics:
        char_diacritic = following[0]

        if len(following) > 1 and following[1] in diacritics:
          # Accept the pair in whichever order is a known class.
          if char_diacritic + following[1] in class_mapping:
            char_diacritic = char_diacritic + following[1]
          elif following[1] + char_diacritic in class_mapping:
            char_diacritic = following[1] + char_diacritic

      y.append(one_hot_vector(class_mapping[char_diacritic], num_classes))

  x.append(features('</s>', arabert_embeddings('</s>').numpy().tolist()[0]))
  y.append(one_hot_vector(class_mapping[''], num_classes))

  assert(len(x) == len(y))

  return x, y

In [None]:
def get_sentence_classes_test(sentence):
  """Build per-character features and one-hot diacritic labels, keeping word separators.

  Same layout as get_sentence_classes(), except that every word (including the
  last one) is followed by an extra step for the space character, whose
  features use the embedding of ' ' and whose label is "no diacritic".

  The sequence starts with a '<s>' step and ends with a '</s>' step. The label
  of a character is read from the one or two characters that follow it inside
  the same word: a single diacritic, or a two-mark combination in whichever
  order is a key of `class_mapping`.

  Args:
      sentence: text whose characters are all keys of `char_mapping` or
          members of `diacritics`.

  Returns:
      (x, y): two lists of equal length holding the feature vectors and the
      one-hot label vectors.
  """
  num_chars = len(char_mapping)
  num_classes = len(class_mapping)

  def features(symbol, embedding):
    # One-hot of the symbol concatenated with the supplied embedding values.
    vec = one_hot_vector(char_mapping[symbol], num_chars)
    vec.extend(embedding)
    return vec

  x = [features('<s>', arabert_embeddings('<s>').numpy().tolist()[0])]
  y = [one_hot_vector(class_mapping[''], num_classes)]

  for word in sentence.split():

    emb = arabert_embeddings(remove_diacritics(word)).numpy().tolist()[0]

    for index, char in enumerate(word):

      if char in diacritics:
        continue  # diacritics are labels, not input steps

      x.append(features(char, emb))

      # `index` is a position inside `word`, so the marks must be read from `word`.
      # Reading them from `sentence` (as before) is only correct for the first word.
      following = word[index + 1:index + 3]

      char_diacritic = ''
      if following and following[0] in diacritics:
        char_diacritic = following[0]

        if len(following) > 1 and following[1] in diacritics:
          # Accept the pair in whichever order is a known class.
          if char_diacritic + following[1] in class_mapping:
            char_diacritic = char_diacritic + following[1]
          elif following[1] + char_diacritic in class_mapping:
            char_diacritic = following[1] + char_diacritic

      y.append(one_hot_vector(class_mapping[char_diacritic], num_classes))

    # Separator step after each word.
    x.append(features(' ', arabert_embeddings(' ').numpy().tolist()[0]))
    y.append(one_hot_vector(class_mapping[''], num_classes))

  x.append(features('</s>', arabert_embeddings('</s>').numpy().tolist()[0]))
  y.append(one_hot_vector(class_mapping[''], num_classes))

  assert(len(x) == len(y))

  return x, y

In [None]:
def get_classes(data):
  """Encode every sentence of `data` with get_sentence_classes().

  Args:
      data: sequence of sentences.

  Returns:
      (X, Y): NumPy arrays with one entry per sentence. When all sentences
      produce the same number of steps the arrays are regular numeric arrays;
      otherwise they are 1-D object arrays whose elements are the per-sentence
      lists (sentences are not padded here).
  """

  def as_array(sequences):
    # np.asarray() on ragged nested lists emits VisibleDeprecationWarning on older
    # NumPy and raises on newer releases, so ragged input is packed explicitly.
    if len({len(seq) for seq in sequences}) <= 1:
      return np.asarray(sequences)
    packed = np.empty(len(sequences), dtype=object)
    for position, seq in enumerate(sequences):
      packed[position] = seq
    return packed

  X = []
  Y = []

  for sentence in data:
    x, y = get_sentence_classes(sentence)
    X.append(x)
    Y.append(y)

  return as_array(X), as_array(Y)

In [None]:
def get_classes_test(data):
  """Encode every sentence of `data` with get_sentence_classes_test().

  Args:
      data: sequence of sentences.

  Returns:
      (X, Y): NumPy arrays with one entry per sentence. When all sentences
      produce the same number of steps the arrays are regular numeric arrays;
      otherwise they are 1-D object arrays whose elements are the per-sentence
      lists (sentences are not padded here).
  """

  def as_array(sequences):
    # np.asarray() on ragged nested lists emits VisibleDeprecationWarning on older
    # NumPy and raises on newer releases, so ragged input is packed explicitly.
    if len({len(seq) for seq in sequences}) <= 1:
      return np.asarray(sequences)
    packed = np.empty(len(sequences), dtype=object)
    for position, seq in enumerate(sequences):
      packed[position] = seq
    return packed

  X = []
  Y = []

  for sentence in data:
    x, y = get_sentence_classes_test(sentence)
    X.append(x)
    Y.append(y)

  return as_array(X), as_array(Y)

In [None]:
# Smoke test on two sentences: X and Y must hold the same number of steps per sentence.
print(clean_diac_train_data[0:2])

X,Y = get_classes(clean_diac_train_data[0:2])

print(X.shape)
print(Y.shape)
print(len(X[0]))
print(len(Y[0]))

['قَوْلُهُ', 'أَوْ قَطَعَ الْأَوَّلُ يَدَهُ إلَخْ']
(2,)
(2,)
6
6


  X = np.asarray(X)
  Y = np.asarray(Y)


In [None]:
class custom_data_generator(Sequence):
    """Keras Sequence that encodes sentences batch by batch and pads them to equal length.

    Every batch is encoded with get_classes() and padded to the longest
    sentence of that batch: inputs are padded with the '<pad>' feature vector,
    labels with the one-hot "no diacritic" class.
    """

    def __init__(self, data, batch_size):
        # Let the Keras base class initialise its own state; recent Keras releases
        # warn when a Sequence subclass skips this call.
        super().__init__()
        self.data = data
        self.batch_size = batch_size
        # '<pad>' feature vector, built on first use and then reused for every batch.
        self._pad_features = None

    def __len__(self):
        # Number of batches, counting a final partial batch.
        return int(np.ceil(len(self.data) / float(self.batch_size)))

    def _padding_features(self):
        """Return the '<pad>' feature vector, computing its embedding only once."""
        if self._pad_features is None:
            vec = one_hot_vector(char_mapping['<pad>'], len(char_mapping))
            vec.extend(arabert_embeddings('<pad>').numpy().tolist()[0])
            self._pad_features = vec
        return self._pad_features

    def __getitem__(self, index):
        """Return the padded (X, Y) arrays of batch number `index`."""

        start_index = index * self.batch_size
        end_index = (index + 1) * self.batch_size

        batch = self.data[start_index : end_index]

        X_batch, Y_batch = get_classes(batch)

        max_length_X = max(len(x) for x in X_batch)
        max_length_Y = max(len(y) for y in Y_batch)

        assert(max_length_X == max_length_Y)

        pad_features = self._padding_features()
        pad_label = one_hot_vector(class_mapping[''], len(class_mapping))

        X = []
        for x in X_batch:
            x = list(x)
            x.extend([pad_features] * (max_length_X - len(x)))
            X.append(np.asarray(x))

        Y = []
        for y in Y_batch:
            y = list(y)
            y.extend([pad_label] * (max_length_Y - len(y)))
            Y.append(np.asarray(y))

        X, Y = np.asarray(X), np.asarray(Y)

        return X, Y

In [None]:
def build_model():
    """Build and compile the sequence-labelling network.

    Input steps have 768 + len(char_mapping) features. The stack is three
    bidirectional LSTMs (256 units each, dropout 0.5 after the first two),
    two time-distributed ReLU layers of 512 units and a time-distributed
    softmax over len(class_mapping) classes. It is compiled with categorical
    cross-entropy, Adam and the accuracy metric.
    """

    def seeded_init():
        # A fresh initializer object per layer, always with the same seed.
        return glorot_normal(seed=500)

    def recurrent_layer():
        return Bidirectional(LSTM(units=256, return_sequences=True, kernel_initializer=seeded_init()))

    def hidden_layer():
        return TimeDistributed(Dense(units=512, activation='relu', kernel_initializer=seeded_init()))

    feature_width = 768 + len(char_mapping)

    network = Sequential()
    network.add(InputLayer(input_shape=(None, feature_width)))

    # Dropout follows only the first two recurrent layers.
    for _ in range(2):
        network.add(recurrent_layer())
        network.add(Dropout(0.5))
    network.add(recurrent_layer())

    for _ in range(2):
        network.add(hidden_layer())

    network.add(TimeDistributed(Dense(units=len(class_mapping), activation='softmax', kernel_initializer=seeded_init())))

    network.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    return network

In [None]:
# Instantiate the network and print its layer / parameter summary.
model = build_model()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirection  (None, None, 512)         2297856   
 al)                                                             
                                                                 
 dropout_37 (Dropout)        (None, None, 512)         0         
                                                                 
 bidirectional_1 (Bidirecti  (None, None, 512)         1574912   
 onal)                                                           
                                                                 
 dropout_38 (Dropout)        (None, None, 512)         0         
                                                                 
 bidirectional_2 (Bidirecti  (None, None, 512)         1574912   
 onal)                                                           
                                                        

In [None]:
def fit_model(model, epochs, batch_size, train_data, val_data):
    """Train `model` on batches generated from the given sentences.

    Sentences are shuffled and then sorted by diacritic-free length, so each
    batch groups sentences of similar length. A checkpoint is written under
    'checkpoints/' at the end of every epoch.

    Args:
        model: compiled Keras model.
        epochs: number of training epochs.
        batch_size: number of sentences per batch.
        train_data: list of diacritized training sentences (left unmodified).
        val_data: list of diacritized validation sentences (left unmodified).

    Returns:
        The History object returned by `model.fit`.
    """

    # Work on copies: random.shuffle() reorders in place and would otherwise
    # scramble the lists owned by the caller.
    train_data = list(train_data)
    val_data = list(val_data)

    random.shuffle(train_data)
    random.shuffle(val_data)

    # sorted() is stable, so the shuffle above randomizes the order among
    # sentences of equal length only.
    train_data = sorted(train_data, key=lambda item: len(remove_diacritics(item)))
    val_data   = sorted(val_data,   key=lambda item: len(remove_diacritics(item)))

    checkpoint_path = 'checkpoints/epoch{epoch:02d}.ckpt'
    checkpoint_cb = ModelCheckpoint(checkpoint_path, verbose=0)
    training_generator = custom_data_generator(train_data, batch_size)
    val_generator = custom_data_generator(val_data, batch_size)

    history =  model.fit(training_generator,validation_data=val_generator,epochs=epochs,callbacks=[checkpoint_cb])
    return history

In [None]:
# Train for 1 epoch with batches of 256 sentences and measure the wall-clock time.
start_time = time.time()

history =fit_model(model, 1, 256, clean_diac_train_data, clean_diac_val_data)
end_time = time.time()

# history.history holds one value per epoch; [-1] is the last epoch.
training_accuracy = history.history['accuracy']
validation_accuracy = history.history['val_accuracy']

print('Final Training Accuracy:', training_accuracy[-1])
print('Final Validation Accuracy:', validation_accuracy[-1])

print('%s seconds' % round(end_time - start_time, 2))

  X = np.asarray(X)
  Y = np.asarray(Y)


  44/1095 [>.............................] - ETA: 142:44:12 - loss: 1.4351 - accuracy: 0.4308

In [None]:
# Persist the trained model twice: once with joblib and once with pickle.
# NOTE(review): whether a Keras model can be pickled depends on the installed Keras
# version; the framework's own `model.save(...)` is the usual alternative - confirm.
joblib.dump(model, 'arabert.joblib')
filename = 'arabert.sav'
# The context manager guarantees the file is flushed and closed, even if dump() fails.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)