### Imporrint necessary libraries

In [1]:
import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib as plt

import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
import tensorflow_text
tf.keras.mixed_precision.set_global_policy('mixed_float16')
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)
from langdetect import detect


INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3060 Laptop GPU, compute capability 8.6
1 Physical GPUs, 1 Logical GPUs


### Loading The Data

In [2]:
path_to_train_data="../Dataset/train_data.csv"
path_to_test_data="../Dataset/test_data.csv"
train_data= pd.read_csv(path_to_train_data)
test_data= pd.read_csv(path_to_test_data)
train_data.head()

Unnamed: 0,campaign_id,comment_id,comment_description,sentiment
0,2212,17908351952371091,لخسارة الوزن الزائد والكرش بمدة قياسية مع عدم ...,Negative
1,2217,17935944230085744,🔥🔥🔥,Positive
2,2215S,17899518356507020,This is so good😍 would be great it If you add ...,Negative
3,2214,18014766136389857,😍,Positive
4,2203,17924318627206870,طبق رائع ومميز تبارك الرحمن تسلم ايدك يارب 😍,Positive


- dataset caracteristics

In [3]:
print(train_data.dtypes)
train_data.groupby('sentiment').describe()

campaign_id            object
comment_id              int64
comment_description    object
sentiment              object
dtype: object


Unnamed: 0_level_0,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Irrelevant,2.0,1.803449e+16,27272420000000.0,1.80152e+16,1.802484e+16,1.803449e+16,1.804413e+16,1.805377e+16
Negative,1082.0,1.799547e+16,121523200000000.0,1.78432e+16,1.791969e+16,1.795389e+16,1.802626e+16,1.840129e+16
Positive,4416.0,1.799213e+16,117728700000000.0,1.784217e+16,1.791955e+16,1.795206e+16,1.801864e+16,1.840678e+16


### Loading and building Bert Model

In [4]:
model_path= "./Bert_model/bert_cased"

def define_model(model_path):
    # Loading model and preprocessor    
    input_word_ids = Input(shape=(None,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(None,), dtype=tf.int32, name="input_mask")
    input_type_ids = Input(shape=(None,), dtype=tf.int32, name="input_type_ids")
    bert_layer = hub.KerasLayer(model_path, trainable=True,name='Bert_encoder')
    bert_outputs = bert_layer([input_word_ids,input_mask,input_type_ids])
    output_layer = Dense(1, activation='sigmoid',name='classifier')(bert_outputs[0])  # Binary classification, change units for multi-class
    model = Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output_layer)
    return model
bert_classifier=define_model(model_path)
bert_classifier.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 Bert_encoder (KerasLayer)      [(None, 768),        177853441   ['input_word_ids[0][0]',         
                                 (None, None, 768)]               'input_mask[0][0]',         

### Preprocessing Data

- Removing non arabic and non english comment from the dataset

In [5]:
# Remove sentiments with irrelevant class because they are only two
irrelevant_indexes=train_data[train_data['sentiment']=='Irrelevant'].index
train_data.drop(irrelevant_indexes,inplace=True)

# Setting the limit for comments 
long_indexes= train_data[train_data['comment_description'].str.len()>=100].index
train_data.drop(long_indexes,inplace=True)

# remove non english and non arabic comments 
i=0
while i <= len(train_data):
    try:
        lang= detect(train_data.iloc[i,2])
        if lang !='ar' and lang !='eng':
            train_data.drop(i,inplace=True)
            i=i+1
        else:
            i=i+1
            continue
    except:
        i=i+1
        continue
train_data.groupby('sentiment').describe()

Unnamed: 0_level_0,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Negative,748.0,1.799555e+16,120789000000000.0,1.784369e+16,1.792342e+16,1.795275e+16,1.802685e+16,1.840129e+16
Positive,3424.0,1.799249e+16,117160200000000.0,1.784217e+16,1.791995e+16,1.795221e+16,1.801843e+16,1.840678e+16


- Cleaning Strings

In [6]:
# blance data
positive_data= train_data[train_data['sentiment']=='Positive'][:748]
negative_data= train_data[train_data['sentiment']=='Negative']
balanced_train_data=pd.concat([positive_data,negative_data])
balanced_train_data=balanced_train_data.sample(frac=1)
len(negative_data), len(positive_data),len(balanced_train_data)

(748, 748, 1496)

- Splitting comment and sentiments

In [7]:
train_comments=balanced_train_data['comment_description'].astype(str).values
#changing sentiment labels from (negativ, positive) to (0,1)
train_labels=balanced_train_data['sentiment'].astype(str)
train_labels,uniques=pd.factorize(train_labels.values)

- Tokenizing the data

In [8]:
preprocessor_path="./Bert_model/bert_multi_cased_preprocessor"
preprocessor = hub.load(preprocessor_path)

# Tokenize batches of both text inputs.
text_premises = tf.constant(train_comments)
tokenized_premises = preprocessor.tokenize(text_premises)

# Pack input sequences for the Transformer encoder.
seq_length = 64
# convert to word_id and adding padding and mask_id
def tokenize_data(tokenized_premises):
    with tf.device('/CPU:0'):
        encoder_inputs = preprocessor.bert_pack_inputs(
            [tokenized_premises],
            seq_length=seq_length)  # Optional argument.
    return [encoder_inputs['input_word_ids'],encoder_inputs['input_mask'],encoder_inputs['input_type_ids']]
tokenizeed_comments = tokenize_data(tokenized_premises)
tokenizeed_comments

[<tf.Tensor: shape=(1496, 64), dtype=int32, numpy=
 array([[   101,  71426, 100745, ...,      0,      0,      0],
        [   101,  79660,  16498, ...,      0,      0,      0],
        [   101,  15764,    791, ...,      0,      0,      0],
        ...,
        [   101,    776,  11145, ...,      0,      0,      0],
        [   101,  60844,  10429, ...,      0,      0,      0],
        [   101,    787,  12497, ...,      0,      0,      0]])>,
 <tf.Tensor: shape=(1496, 64), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]])>,
 <tf.Tensor: shape=(1496, 64), dtype=int32, numpy=
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])>]

### Evaluate pretrained Bert model

In [11]:
pred = bert_classifier.predict(['nice one'])

ValueError: in user code:

    File "c:\Users\kyorakuna\miniconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 2041, in predict_function  *
        return step_function(self, iterator)
    File "c:\Users\kyorakuna\miniconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 2027, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\kyorakuna\miniconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 2015, in run_step  **
        outputs = model.predict_step(data)
    File "c:\Users\kyorakuna\miniconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1983, in predict_step
        return self(x, training=False)
    File "c:\Users\kyorakuna\miniconda3\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\kyorakuna\miniconda3\envs\tf\lib\site-packages\keras\engine\input_spec.py", line 216, in assert_input_compatibility
        raise ValueError(

    ValueError: Layer "model" expects 3 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>]


### Fine tune bert model

- Model compiling and Hyperparameters congiguration

In [9]:
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
opt= tf.keras.optimizers.Adam(learning_rate=0.1)

bert_classifier.compile(optimizer=opt,loss='binary_crossentropy', metrics=metrics)


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.10.1 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


- Training the model

In [11]:
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Set to use CPU only
# model fit on the data
epochs=3
batch_size = 1

bert_classifier.fit(
    x={'input_word_ids':tokenizeed_comments[0],'input_mask':tokenizeed_comments[1],'input_type_ids':tokenizeed_comments[2]},
    y=train_labels,
    epochs=epochs,
    batch_size=batch_size
)

### Evaluate fine tuned model

### Export the model

In [None]:
bert_classifier.save('../backend/')