In [11]:
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.keras import utils
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization



In [12]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])

In [13]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

raw_train_ds = preprocessing.text_dataset_from_directory(
   '/content/drive/MyDrive/MasterThesis/Datasets/Offensive2020/train',
    batch_size=BATCH_SIZE,
    )

raw_test_ds = preprocessing.text_dataset_from_directory(
    '/content/drive/MyDrive/MasterThesis/Datasets/Offensive2020/test',
     batch_size=BATCH_SIZE)

raw_val_ds = preprocessing.text_dataset_from_directory(
    '/content/drive/MyDrive/MasterThesis/Datasets/Offensive2020/dev',
     batch_size=BATCH_SIZE)

Found 7000 files belonging to 2 classes.
Found 2000 files belonging to 2 classes.
Found 1000 files belonging to 2 classes.


## print out a few examples

In [14]:
for text, label in raw_train_ds.take(1):
   for i in range(5):
        print(text.numpy()[i].decode('utf-8').strip())
        print(label.numpy()[i])
        print('--------------------------------')

اياكس يا جمالك يا روعتك يا ابداعك ادوهم البطولة ذات الاوذنين.. يستاهلو 🏆	NOT_OFF	NOT_HS
1
--------------------------------
دفعتك كلهم يا متجوزين يا مخطوبين يا مرتبطين انما انت اقول ايه جبلة. URL	OFF	NOT_HS
0
--------------------------------
@USER الف سلامه عليك يا فرجانى بيه ساسي يا حبيب كل الزمالكاويه 💓💓	NOT_OFF	NOT_HS
1
--------------------------------
طول ما انا قاعد ف الامتحان بييجى ف دماغى فيلم ولاد رزق والواد القصير اللى كان ف الكباريه ده عمال يقول رضا يا رضا ادلع يا رضا URL	NOT_OFF	NOT_HS
1
--------------------------------
RT @USER: بس يا انغام يا لي لحالي او العدم .. URL	NOT_OFF	NOT_HS
1
--------------------------------


## The Represention for each label !

In [15]:
for i, label in enumerate(raw_train_ds.class_names):
  print("Label", i, "corresponds to", label)

Label 0 corresponds to HS
Label 1 corresponds to NOT_HS


In [16]:
# BUFFER_SIZE = 10000
# BATCH_SIZE = 32

In [17]:
train_dataset = raw_train_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
test_dataset = raw_test_ds.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

In [19]:
# for example, label in train_dataset.take(1):
#   print('texts: ', example.numpy()[:3].decode('utf-8').strip())
#   print()
#   print('labels: ', label.numpy()[:3])

# Prepare the dataset for training



1.    standardization



In [22]:
VOCAB_SIZE = 1000

# binary_vectorize_layer = TextVectorization(
#     max_tokens=VOCAB_SIZE,
#     output_mode='binary')
# MAX_SEQUENCE_LENGTH = 250
import string
import re
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    # stripped_html = tf.strings.regex_replace(lowercase, "<LF>", " ")
    stripped_html = tf.strings.regex_replace(lowercase, "^[a-zA-Z]$", " ")
    


    return tf.strings.regex_replace(
        stripped_html, "[%s]" % re.escape(string.punctuation), ""
    )
int_vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    # output_sequence_length=MAX_SEQUENCE_LENGTH
    )

In [23]:
# call adapt to fit the state of the prepro cessing layer to the dataset. This will cause the model to build an index of strings to integers
# Make a text-only dataset (without labels), then call adapt
train_text = raw_train_ds.map(lambda text, labels: text)
# binary_vectorize_layer.adapt(train_text)
int_vectorize_layer.adapt(train_text)

In [24]:
#See the result of using these layers to preprocess data
# def binary_vectorize_text(text, label):
#   text = tf.expand_dims(text, -1)
#   return binary_vectorize_layer(text), label
def int_vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return int_vectorize_layer(text), label

In [25]:
# # Retrieve a batch (of 32 tweet and labels) from the dataset
# text_batch, label_batch = next(iter(raw_train_ds))
# first_tweet, first_label = text_batch[0], label_batch[0]
# print("tweet", first_tweet)
# print("Label", first_label)

In [26]:
# print("'int' vectorized tweets:",
#       int_vectorize_text(first_tweet, first_label)[0])

In [27]:
print("1289 ---> ", int_vectorize_layer.get_vocabulary()[12])
print("313 ---> ", int_vectorize_layer.get_vocabulary()[21])
print("Vocabulary size: {}".format(len(int_vectorize_layer.get_vocabulary())))

1289 --->  في
313 --->  انا
Vocabulary size: 1000


In [28]:
int_train_ds = raw_train_ds.map(int_vectorize_text)
int_val_ds = raw_val_ds.map(int_vectorize_text)
int_test_ds = raw_test_ds.map(int_vectorize_text)

In [29]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

def configure_dataset(dataset):
  return dataset.cache().prefetch(buffer_size=AUTOTUNE)

int_train_ds = configure_dataset(int_train_ds)
int_val_ds = configure_dataset(int_val_ds)
int_test_ds = configure_dataset(int_test_ds)

In [30]:
# int_train_ds = configure_dataset(int_train_ds)
# int_val_ds = configure_dataset(int_val_ds)
# int_test_ds = configure_dataset(int_test_ds)

In [31]:
for text, label in raw_train_ds.take(1):
   for i in range(5):
        print(text.numpy()[i].decode('utf-8').strip())
        print(label.numpy()[i])
        print('--------------------------------')

RT @USER: الصبح الك يا الله..🙏 <LF>يا فتاح.🌞. <LF>يا عليم🕔..<LF>يا رزاق🤲..<LF>يا كريم💞..<LF>صباح مشرق متل وجك يا شمس ...☀️<LF>@USER URL	NOT_OFF	NOT_HS
1
--------------------------------
- بكام الشيميز ده لو سمحت ؟<LF>= 420 جنية يا فندم .<LF>- يا ماماااااااااااااااااااااااااااااااااااااااا<LF>= خلاص يا فندم بنهزر معاكي ده هدية من المحل والله	NOT_OFF	NOT_HS
1
--------------------------------
يا غير عن الكل يا أغلى محبيني(;	NOT_OFF	NOT_HS
1
--------------------------------
اللهم نَوْر قلبي ، ويَسّر أمري ، واجعلني في ألطاف أسرار حفظك يا حافظ يا حفيظ يا خير الحافظين ❤️	NOT_OFF	NOT_HS
1
--------------------------------
RT @USER: يا نعم يا الاد الرويبي كلهم / شيبانهم و اللي بعد صغار URL	NOT_OFF	NOT_HS
1
--------------------------------


# Create the model

In [32]:
# model = tf.keras.Sequential([
#     int_vectorize_layer,
#     tf.keras.layers.Embedding(
#         VOCAB_SIZE,
#         64,
#         # Use masking to handle the variable sequence lengths
#         mask_zero=True),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dense(1)
# ])

model = tf.keras.Sequential([
    int_vectorize_layer,
    tf.keras.layers.Embedding(
        input_dim=len(int_vectorize_layer.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])
# def create_model(vocab_size, num_labels):
  # model = tf.keras.Sequential([
      # layers.Embedding(vocab_size, 64, mask_zero=True),
      # layers.Conv1D(64, 5, padding="valid", activation="relu", strides=2),
      # layers.GlobalMaxPooling1D(),
      # layers.Dense(num_labels)
  # ])
  # return model

In [33]:
print([layer.supports_masking for layer in model.layers])

[False, True, True, True, True]


In [34]:
# predict on a sample text without padding.

sample_text = ('أسألك يا حليماً ذا أناة يا من لا يعرف عباده منه إلا الجميل يا رب الأولين والأخريين'
               'يا رَبِّ لا أشكي سواك.. ربٍّ انت السميع العليم بما يجول بخاطري ')
predictions = model.predict(np.array([sample_text]))
print(predictions[0])

[-0.0085406]


In [35]:
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              # loss=losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])



# Train the model


In [None]:
history = model.fit(raw_train_ds, validation_data=raw_val_ds, epochs=5,validation_steps=30)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
