# Imports

In [None]:
# install PyArabic library for Arabic preprocessing
!pip install PyArabic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyArabic
  Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[K     |████████████████████████████████| 126 kB 4.6 MB/s 
Installing collected packages: PyArabic
Successfully installed PyArabic-0.6.15


In [None]:
# install tensorflow_addons for AdamW optimizer
!pip install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.17.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 4.5 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.17.1


In [None]:
# Standard libraries imports
import gc
import io
import re
import ast
import math
import glob
import pickle
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from lxml import etree
import pyarabic.araby as araby

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import layers

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Gather every shard path, then bucket the paths by the keyword they contain.
all_files = glob.glob('/content/drive/MyDrive/NLP_Course/New Corpus/sharded/*')

# Any path mentioning 'train' is a training shard.
train_files = [path for path in all_files if 'train' in path]
# 'train' takes precedence: a path is a test shard only when it mentions
# 'test' and does not mention 'train'.
test_files = [path for path in all_files if 'train' not in path and 'test' in path]

In [None]:
# full_test_corpus = ''
# for file in tqdm(test_files, total=len(test_files)):
#   with open(file, 'rb') as f:
#     text = f.read().decode()
#     full_test_corpus += ' ' + text
#   del text
#   gc.collect()

# full_test_corpus = full_test_corpus.replace('\n', ' ')
# print('Number of words in the corpus:', len(full_test_corpus.split()))
# print('Number of characters in the corpus:', len(full_test_corpus))
# full_test_corpus = full_test_corpus.replace(' ', '#')

# chars_to_remove = ['،', '.', ':', '؟', '/', '؛', '=']
# for char in tqdm(chars_to_remove):
#   full_test_corpus = full_test_corpus.replace(char, '#')

In [None]:
# Collect the distinct tokens of every file in test_files, one file at a time
# so that only a single file's text is held in memory.
chars_to_remove = ['،', '.', ':', '؟', '/', '؛', '=']
# Newlines, spaces and the punctuation above are all turned into '#',
# which then serves as the single split delimiter.
separator_table = str.maketrans({symbol: '#' for symbol in ['\n', ' '] + chars_to_remove})

all_unique_words = []
for shard_path in tqdm(test_files, total=len(test_files)):
  with open(shard_path, 'rb') as shard:
    text = shard.read().decode()

  # de-duplicate within the file before accumulating
  text_as_list = list(set(text.translate(separator_table).split('#')))
  all_unique_words.extend(text_as_list)

  # release the file's text before reading the next one
  del text, text_as_list
  gc.collect()

100%|██████████| 256/256 [03:23<00:00,  1.26it/s]


In [None]:
final_unique_words = list(set(all_unique_words))

In [None]:
len(final_unique_words)

1136706

In [None]:
final_unique_words[:50]

['',
 'تسرقون',
 'تساكن',
 'أوترشت',
 'للطيبرسي',
 'توعدون',
 'يرزء',
 'وبالاتكال',
 'وبالمروي',
 'واستركبوه',
 'ونزوءا',
 'قفين',
 'النخيب',
 'وانحنائه',
 'للترف',
 'وسيضم',
 'والمعانات',
 'للطباع',
 'ينجسك',
 'كاتفاق',
 'سيتكاثر',
 'فيتحبب',
 'وبطانيات',
 'الطموية',
 'الرقع',
 'بخزازى',
 'لرواية',
 'إفينزا',
 'املاه',
 'وبمضمونه',
 'كامانجار',
 'فأفزعتها',
 'بثعلب',
 'إلدرسون',
 'لأقتلهم',
 'إجتهاد',
 'القواديس',
 'تاونزفيل',
 'سجادة',
 'إرضائها',
 'والقيقاء',
 'مألفا',
 'ومنقطعات',
 'اعجاب',
 'مرائي',
 'نفشوه',
 'أصابعى',
 'لعزكم',
 'معرت',
 'تغرينا']

In [None]:
# Find the longest and shortest entries of final_unique_words.
# max()/min() with key=len return the first entry that reaches the extreme
# length, and default='' keeps both names bound even when the list is empty
# (the previous sentinel-based scan could leave them undefined).
longest_word = max(final_unique_words, key=len, default='')
shortest_word = min(final_unique_words, key=len, default='')
max_len = len(longest_word)
min_len = len(shortest_word)

print('Max length:', max_len, 'Word is:', longest_word)
print('Min length:', min_len, 'Word is:', shortest_word)

Max length: 87 Word is: زماااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااااان
Min length: 0 Word is: 


In [None]:
# exclude outliers
# Keep only the words whose length lies within [MIN_WORD_LEN, MAX_WORD_LEN].
# The list is rebuilt in a single pass instead of calling .remove() while
# iterating over it: removing during iteration skips the element that follows
# each removal (so some outliers survive) and costs O(n) per removal.
MAX_WORD_LEN = 15
MIN_WORD_LEN = 3
final_unique_words = [
  word for word in tqdm(final_unique_words)
  if MIN_WORD_LEN <= len(word) <= MAX_WORD_LEN
]

# detect the new extremes after filtering
# default='' keeps both names bound even if nothing survived the filter
longest_word = max(final_unique_words, key=len, default='')
shortest_word = min(final_unique_words, key=len, default='')
max_len = len(longest_word)
min_len = len(shortest_word)

print('Max length:', max_len, 'Word is:', longest_word)
print('Min length:', min_len, 'Word is:', shortest_word)

100%|█████████▉| 1134950/1135892 [01:57<00:00, 9666.11it/s]


Max length: 16 Word is: والحكميةوالشرعية
Min length: 3 Word is: ذأم


In [None]:
# text_as_list = full_test_corpus.split('#')
# del full_test_corpus
# gc.collect()
# split_len = len(text_as_list)

In [None]:
# all_unique_words = []
# for word in tqdm(text_as_list, total=split_len):
#   if word not in all_unique_words and len(word) >= 5:
#     all_unique_words.append(word)

In [None]:
# all_unique_chars = []
# for char in tqdm(full_test_corpus, total=len(full_test_corpus)):
#   if char not in all_unique_chars:
#     all_unique_chars.append(char)

100%|██████████| 410118332/410118332 [04:02<00:00, 1690429.99it/s]


In [None]:
# all_unique_chars

['#',
 'أ',
 'ف',
 'ض',
 'ل',
 'ا',
 'ص',
 'ة',
 'ب',
 'ع',
 'د',
 'ر',
 'ي',
 '،',
 'و',
 'ج',
 '.',
 'ه',
 'م',
 'ث',
 'ت',
 'ذ',
 'ك',
 'ح',
 'ى',
 'ق',
 'ط',
 'س',
 ':',
 'ش',
 'ن',
 'خ',
 'غ',
 'ظ',
 'آ',
 'إ',
 'ء',
 'ز',
 '؟',
 'ئ',
 'ؤ',
 '/',
 '؛',
 '=',
 '|']

In [None]:
# final_unique_char = []
# chars_to_remove = []
# for char in all_unique_chars:
#   print(char)
#   add = input()
#   if int(add) == 1:
#      final_unique_char.append(char)
#   else:
#     chars_to_remove.append(char)
# print(len(final_unique_char))
# print(len(chars_to_remove))
# print(chars_to_remove)

#
1
أ
1
ف
1
ض
1
ل
1
ا
1
ص
1
ة
1
ب
1
ع
1
د
1
ر
1
ي
1
،
0
و
1
ج
1
.
0
ه
1
م
1
ث
1
ت
1
ذ
1
ك
1
ح
1
ى
1
ق
1
ط
1
س
1
:
0
ش
1
ن
1
خ
1
غ
1
ظ
1
آ
1
إ
1
ء
1
ز
1
؟
0
ئ
1
ؤ
1
/
0
؛
0
=
0
|
1
38
7
['،', '.', ':', '؟', '/', '؛', '=']


In [None]:
final_unique_char = ['#',
                      'أ',
                      'ف',
                      'ض',
                      'ل',
                      'ا',
                      'ص',
                      'ة',
                      'ب',
                      'ع',
                      'د',
                      'ر',
                      'ي',
                      'و',
                      'ج',
                      'ه',
                      'م',
                      'ث',
                      'ت',
                      'ذ',
                      'ك',
                      'ح',
                      'ى',
                      'ق',
                      'ط',
                      'س',
                      'ش',
                      'ن',
                      'خ',
                      'غ',
                      'ظ',
                      'آ',
                      'إ',
                      'ء',
                      'ز',
                      'ئ',
                      'ؤ',
                      '|']

In [None]:
chars_to_remove = ['،', '.', ':', '؟', '/', '؛', '=']

In [None]:
# for char in tqdm(chars_to_remove):
#   full_test_corpus = full_test_corpus.replace(char, '#')

100%|██████████| 7/7 [00:04<00:00,  1.71it/s]


In [None]:
# final_unique_char

['#',
 'أ',
 'ف',
 'ض',
 'ل',
 'ا',
 'ص',
 'ة',
 'ب',
 'ع',
 'د',
 'ر',
 'ي',
 'و',
 'ج',
 'ه',
 'م',
 'ث',
 'ت',
 'ذ',
 'ك',
 'ح',
 'ى',
 'ق',
 'ط',
 'س',
 'ش',
 'ن',
 'خ',
 'غ',
 'ظ',
 'آ',
 'إ',
 'ء',
 'ز',
 'ئ',
 'ؤ',
 '|']

# Configs

In [None]:
# # configurations class
# class config:

#   MAXLEN = 128 # maximum length of sequence

# Dataset

In [None]:
# list of all Arabic characters
print('Number of Arabic Characters:', len(final_unique_char))

# character to index dictionary
char_to_index = dict((char, index+3) for (index, char) in enumerate(final_unique_char))
# index to character dictionary
index_to_char=  dict((index+3, char) for (index, char) in enumerate(final_unique_char))

char_to_index['$'] = 0 # pad
char_to_index['#'] = 1 # separator
char_to_index['_'] = 2 # mask


index_to_char[0] = '$' # pad
index_to_char[1] = '#' # separator
index_to_char[2] = '_' # mask

Number of Arabic Characters: 38


In [None]:
# # convert the pure text into a list of sequences
# text_as_list = []
# for i in tqdm(range(0, len(full_test_corpus)-config.MAXLEN, 128)):
#   text_as_list.append([full_test_corpus[i:i+config.MAXLEN]])

100%|██████████| 3204049/3204049 [00:06<00:00, 489158.34it/s]


In [None]:
# def text_to_index(lists):
#     # construct a list that includes the character-based tokenized input words
#     # for the encoder part
#     bert_indexed_inputs = []
    
#     # iterate over the lists
#     for text in tqdm(lists):
#       indexes = []
#       for char in list(text[0]):
#         indexes.append(char_to_index[char]) # try to get the index from the first dictionary
#       bert_indexed_inputs.append(indexes)
#     return bert_indexed_inputs
# # tokenize the dataset by relating each token to its index
# indexed_text = text_to_index(text_as_list)

100%|██████████| 3204049/3204049 [01:36<00:00, 33260.82it/s]


In [None]:
def text_to_index(text_list):
    """Convert each word into a reversed list of character indexes.

    Every word is wrapped in '#' separators, each character is looked up in
    the module-level ``char_to_index`` dictionary, and the resulting index
    sequence is reversed before being stored.

    Args:
        text_list: iterable of words.
    Returns:
        A list with one list of integer indexes per word.
    Raises:
        KeyError: if a character is missing from ``char_to_index``.
    """
    bert_indexed_inputs = []
    for word in tqdm(text_list):
        # add '#' on both sides for the separator
        wrapped_chars = ['#', *word, '#']
        forward_indexes = [char_to_index[symbol] for symbol in wrapped_chars]
        bert_indexed_inputs.append(forward_indexes[::-1])
    return bert_indexed_inputs
# tokenize the dataset by relating each token to its index
indexed_text = text_to_index(final_unique_words)

100%|██████████| 1134950/1134950 [00:06<00:00, 181651.85it/s]


In [None]:
# # train-validation-test split
# # 90% for trianing, 5% for validation and 5% for test 
# train_len = int(0.9 * len(indexed_text))
# train_data = indexed_text[:train_len]

# valid_len = int(0.05 * len(indexed_text))
# valid_data = indexed_text[train_len:train_len+valid_len]

# test_data = indexed_text[train_len+valid_len:]

# del indexed_text, text_as_list
# gc.collect()

In [None]:
# train-validation-test split
# 90% for training, 5% for validation, and the remainder (about 5%) for test.
# The split is positional: examples are taken in their existing order.
train_len = int(0.9 * len(indexed_text))
valid_len = int(0.05 * len(indexed_text))
valid_end = train_len + valid_len

train_data, valid_data, test_data = (
    indexed_text[:train_len],
    indexed_text[train_len:valid_end],
    indexed_text[valid_end:],
)

# the full list is no longer needed once the three slices exist
del indexed_text
gc.collect()

88

In [None]:
len(train_data), len(valid_data), len(test_data)

(1021455, 56747, 56748)

In [None]:
# def prepare_data(indexes):
#   '''
#   Args: list of characters in the form of indexes
#   Returns: gold labels, masked labels, mask
#   '''
#   y_true = indexes # first output
#   # build a probability distribution
#   # give all chars weights of 1
#   # and give the separator token a weight of 0.005
#   # then divide by the sum to normalize and make
#   # the probabilities sum up to 1
  # prob_1 = np.float32(np.array(indexes) == 1)*0.005
  # prob_2 = np.float32(np.array(indexes) != 1)
  # prob = prob_1 + prob_2
#   prob = prob / np.sum(prob)
#   indices_to_mask = np.random.choice(np.arange(128), size=(19,), replace=False, p=prob)
#   one_hot_mask = tf.cast(tf.reduce_sum(tf.one_hot(indices_to_mask, 128), axis=0), dtype=tf.int32).numpy() # second output

#   y_masked = np.array(y_true.copy())

#   y_masked[indices_to_mask] = 2
#   y_masked = y_masked.tolist()

#   return y_true, y_masked, one_hot_mask.tolist()

In [None]:
def prepare_data(indexes, rng=None):
  '''
  Randomly mask a few characters of one indexed word.

  The number of masked positions depends on the sequence length:
  5-7 entries -> 1, 8 -> 2, 9-13 -> 3, more than 13 -> 4.
  Positions holding the separator index (1) are never selected.

  Args:
    indexes: list of character indexes for one word.
    rng: optional object exposing a NumPy-style
      ``choice(a, size=, replace=, p=)`` method, e.g.
      ``np.random.default_rng(seed)``. Defaults to the global ``np.random``
      state, which matches the previous behaviour.
  Returns:
    gold labels (the input list itself), masked labels (a copy of the input
    with the chosen positions set to 2), and the mask (a list of 0/1 ints
    with 1 at the chosen positions).
  Raises:
    ValueError: if the sequence has fewer than 5 entries, or has fewer
      non-separator positions than the number of positions to mask.
  '''
  indexes_len = len(indexes)
  if indexes_len < 5:
    # previously this fell through every branch and failed with an
    # UnboundLocalError at the return statement
    raise ValueError(
        f'prepare_data needs at least 5 indexes, got {indexes_len}')

  # how many positions to mask for this length
  if indexes_len <= 7:
    n_masks = 1
  elif indexes_len <= 8:
    n_masks = 2
  elif indexes_len <= 13:
    n_masks = 3
  else:
    n_masks = 4

  y_true = indexes # first output
  token_ids = np.array(indexes)

  # uniform probability over non-separator positions, zero on separators
  maskable = token_ids != 1
  n_maskable = int(np.count_nonzero(maskable))
  if n_maskable < n_masks:
    raise ValueError(
        f'cannot mask {n_masks} positions: only {n_maskable} non-separator entries')
  prob = maskable.astype(np.float64) / n_maskable

  chooser = np.random if rng is None else rng
  indices_to_mask = chooser.choice(
      np.arange(indexes_len), size=(n_masks,), replace=False, p=prob)

  # 0/1 mask built directly in NumPy (second output)
  one_hot_mask = np.zeros(indexes_len, dtype=np.int32)
  one_hot_mask[indices_to_mask] = 1

  # copy of the gold labels with the chosen positions replaced by the mask id
  y_masked = token_ids.copy()
  y_masked[indices_to_mask] = 2

  return y_true, y_masked.tolist(), one_hot_mask.tolist()

In [None]:
# construct the train, valid and test
# true labels, masked labels, and masks

def _mask_examples(examples):
  """Run prepare_data over every example and collect its three outputs."""
  true_labels = [] # true labels list
  masked_labels = [] # masked labels list
  masks = [] # masks
  for y_true, y_masked, one_hot_mask in map(prepare_data, tqdm(examples)):
    true_labels.append(y_true)
    masked_labels.append(y_masked)
    masks.append(one_hot_mask)
  return true_labels, masked_labels, masks

# training dataset
train_true_labels, train_masked_labels, train_masks = _mask_examples(train_data)

# validation dataset
valid_true_labels, valid_masked_labels, valid_masks = _mask_examples(valid_data)

# test dataset
test_true_labels, test_masked_labels, test_masks = _mask_examples(test_data)

100%|██████████| 1021455/1021455 [07:50<00:00, 2172.86it/s]
100%|██████████| 56747/56747 [00:25<00:00, 2212.31it/s]
100%|██████████| 56748/56748 [00:25<00:00, 2200.99it/s]


In [None]:
# Build the training dataframe directly from the label lists:
#   x    -> masked labels (model input)
#   y    -> true labels
#   mask -> 0/1 mask of the masked positions
# Constructing it in one call replaces the zero-filled placeholder frame and
# the per-cell .iloc loop, which was slow and relied on dtype coercion.
train_df = pd.DataFrame({
    'x': train_masked_labels,
    'y': train_true_labels,
    'mask': train_masks,
})

# write the dataframe to a CSV file (each cell holds a Python list and is
# written as that list's string form)
train_df.to_csv('final_train_df.csv', index=False)
del train_df
gc.collect()
# move final_train_df.csv to drive
!mv /content/final_train_df.csv /content/drive/MyDrive/NLP_Course

In [None]:
# Build the validation dataframe directly from the label lists:
#   x    -> masked labels (model input)
#   y    -> true labels
#   mask -> 0/1 mask of the masked positions
# Constructing it in one call replaces the zero-filled placeholder frame and
# the per-cell .iloc loop, which was slow and relied on dtype coercion.
valid_df = pd.DataFrame({
    'x': valid_masked_labels,
    'y': valid_true_labels,
    'mask': valid_masks,
})

# write the dataframe to a CSV file (each cell holds a Python list and is
# written as that list's string form)
valid_df.to_csv('final_valid_df.csv', index=False)
del valid_df
gc.collect()
# move valid_df.csv to drive
!mv /content/final_valid_df.csv /content/drive/MyDrive/NLP_Course

100%|██████████| 56747/56747 [00:10<00:00, 5499.43it/s]


In [None]:
# Mask every test example; prepare_data returns
# (true labels, masked labels, mask) for each one.
test_outputs = [prepare_data(indexed_example) for indexed_example in tqdm(test_data)]
test_true_labels = [output[0] for output in test_outputs] # true labels list
test_masked_labels = [output[1] for output in test_outputs] # masked labels list
test_masks = [output[2] for output in test_outputs] # masks
del test_outputs

# Build the test dataframe directly from the label lists:
#   x    -> masked labels (model input)
#   y    -> true labels
#   mask -> 0/1 mask of the masked positions
# Constructing it in one call replaces the zero-filled placeholder frame and
# the per-cell .iloc loop, which was slow and relied on dtype coercion.
test_df = pd.DataFrame({
    'x': test_masked_labels,
    'y': test_true_labels,
    'mask': test_masks,
})

# write the dataframe to a CSV file (each cell holds a Python list and is
# written as that list's string form)
test_df.to_csv('final_test_df.csv', index=False)
del test_df
gc.collect()
# move test_df.csv to drive
!mv /content/final_test_df.csv /content/drive/MyDrive/NLP_Course

100%|██████████| 56748/56748 [00:26<00:00, 2140.93it/s]
100%|██████████| 56748/56748 [00:10<00:00, 5495.15it/s]


In [None]:
# chunks = 5
# for chunk in range(chunks):
#   print(f'Processing chunk {chunk+1}')
#   # construct the train and valid
#   # true labels, masked labels, and masks
#   train_chuck = train_data[chunk*(len(train_data)//5):(len(train_data)//5)*(chunk+1)]
#   train_true_labels = [] # true labels list
#   train_masked_labels = [] # masked labels list
#   train_masks = [] # masks
#   # iterate over the training dataset
#   for indexed_example in tqdm(train_chuck):
#     y_true, y_masked, one_hot_mask = prepare_data(indexed_example)
#     train_true_labels.append(y_true)
#     train_masked_labels.append(y_masked)
#     train_masks.append(one_hot_mask)

#   # construct a the training dataframe initialized with zeros
#   train_zeros = np.zeros((len(train_true_labels), 3))
#   train_df = pd.DataFrame(columns=['x', 'y', 'mask'], data=train_zeros)
#   del train_zeros
#   gc.collect()
#   train_df = train_df.astype('str')

#   # populate the dataframe with the true labels, masked labels, and masks
#   for i in tqdm(range(len(train_masked_labels))):
#     train_df.iloc[i, 0] = train_masked_labels[i]
#     train_df.iloc[i, 1] = train_true_labels[i]
#     train_df.iloc[i, 2] = train_masks[i]

#   # write the dataframe to a CSV file
#   train_df.to_csv(f'new_train_df_chunk{chunk+1}.csv', index=False)
#   del train_df, train_true_labels, train_masked_labels, train_masks
#   gc.collect()
# # move train_df.csv to drive
# # !mv /content/new_train_df.csv /content/drive/MyDrive/NLP_Course

Processing chunk 1


100%|██████████| 576728/576728 [04:51<00:00, 1981.37it/s]
100%|██████████| 576728/576728 [01:58<00:00, 4853.86it/s]


Processing chunk 2


100%|██████████| 576728/576728 [04:48<00:00, 1999.53it/s]
100%|██████████| 576728/576728 [01:56<00:00, 4934.56it/s]


Processing chunk 3


100%|██████████| 576728/576728 [04:54<00:00, 1960.59it/s]
100%|██████████| 576728/576728 [02:02<00:00, 4715.48it/s]


Processing chunk 4


100%|██████████| 576728/576728 [04:53<00:00, 1967.58it/s]
100%|██████████| 576728/576728 [02:05<00:00, 4601.06it/s]


Processing chunk 5


100%|██████████| 576728/576728 [05:03<00:00, 1898.87it/s]
100%|██████████| 576728/576728 [01:56<00:00, 4935.55it/s]


In [None]:
# # move train_df.csv to drive
# !mv /content/new* /content/drive/MyDrive/NLP_Course

In [None]:
# valid_true_labels = [] # true labels list
# valid_masked_labels = [] # masked labels list
# valid_masks = [] # masks
# # iterate over the validation dataset
# for indexed_example in tqdm(valid_data):
#   y_true, y_masked, one_hot_mask = prepare_data(indexed_example)
#   valid_true_labels.append(y_true)
#   valid_masked_labels.append(y_masked)
#   valid_masks.append(one_hot_mask)

# # construct a the validation dataframe initialized with zeros
# valid_zeros = np.zeros((len(valid_true_labels), 3))
# valid_df = pd.DataFrame(columns=['x', 'y', 'mask'], data=valid_zeros)
# del valid_zeros
# gc.collect()
# valid_df = valid_df.astype('str')

# # populate the dataframe with the true labels, masked labels, and masks
# for i in tqdm(range(len(valid_masked_labels))):
#   valid_df.iloc[i, 0] = valid_masked_labels[i]
#   valid_df.iloc[i, 1] = valid_true_labels[i]
#   valid_df.iloc[i, 2] = valid_masks[i]

# # write the dataframe to a CSV file
# valid_df.to_csv('new_valid_df.csv', index=False)
# del valid_df
# gc.collect()
# # move valid_df.csv to drive
# !mv /content/new_valid_df.csv /content/drive/MyDrive/NLP_Course

100%|██████████| 160202/160202 [01:29<00:00, 1794.45it/s]
100%|██████████| 160202/160202 [00:32<00:00, 4972.76it/s]


In [None]:
# test_true_labels = [] # true labels list
# test_masked_labels = [] # masked labels list
# test_masks = [] # masks
# # iterate over the validation dataset
# for indexed_example in tqdm(test_data):
#   y_true, y_masked, one_hot_mask = prepare_data(indexed_example)
#   test_true_labels.append(y_true)
#   test_masked_labels.append(y_masked)
#   test_masks.append(one_hot_mask) 

# # construct a the test dataframe initialized with zeros
# test_zeros = np.zeros((len(test_true_labels), 3))
# test_df = pd.DataFrame(columns=['x', 'y', 'mask'], data=test_zeros)
# del test_zeros
# gc.collect()
# test_df = test_df.astype('str')

# # populate the dataframe with the true labels, masked labels, and masks
# for i in tqdm(range(len(test_masked_labels))):
#   test_df.iloc[i, 0] = test_masked_labels[i]
#   test_df.iloc[i, 1] = test_true_labels[i]
#   test_df.iloc[i, 2] = test_masks[i]

# # write the dataframe to a CSV file
# test_df.to_csv('new_test_df.csv', index=False)
# del test_df
# gc.collect()
# # move test_df.csv to drive
# !mv /content/new_test_df.csv /content/drive/MyDrive/NLP_Course

100%|██████████| 160203/160203 [01:20<00:00, 1992.56it/s]
100%|██████████| 160203/160203 [00:32<00:00, 4977.74it/s]
