### Import libraries

In [None]:
import os
#Colab configuration for tpu usage
os.environ["TFHUB_MODEL_LOAD_FORMAT"] = "UNCOMPRESSED"

import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib as plt

import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
import tensorflow_text
from langdetect import detect


### TPU initialisation
for colab only

In [None]:
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))

All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU')]


In [None]:
strategy = tf.distribute.TPUStrategy(resolver)

# Data preparation
- Load data
- Clean data
- Prepare data for model training

### Load data

In [None]:
# For colab usage
from google.colab import drive
drive.mount('/content/drive')
path_to_train_data="./train_data.csv"
path_to_test_data="./test_data.csv"

#For local machine usage
#path_to_train_data="../Dataset/train_data.csv"
#path_to_test_data="../Dataset/test_data.csv"

input_data= pd.read_csv(path_to_train_data)
inference_data= pd.read_csv(path_to_test_data)
input_data.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,campaign_id,comment_id,comment_description,sentiment
0,2212,17908351952371091,لخسارة الوزن الزائد والكرش بمدة قياسية مع عدم ...,Negative
1,2217,17935944230085744,🔥🔥🔥,Positive
2,2215S,17899518356507020,This is so good😍 would be great it If you add ...,Negative
3,2214,18014766136389857,😍,Positive
4,2203,17924318627206870,طبق رائع ومميز تبارك الرحمن تسلم ايدك يارب 😍,Positive


- dataset caracteristics

In [None]:
print(input_data.dtypes)
input_data.groupby('sentiment').describe()

campaign_id            object
comment_id              int64
comment_description    object
sentiment              object
dtype: object


Unnamed: 0_level_0,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Irrelevant,2.0,1.803449e+16,27272420000000.0,1.80152e+16,1.802484e+16,1.803449e+16,1.804413e+16,1.805377e+16
Negative,1082.0,1.799547e+16,121523200000000.0,1.78432e+16,1.791969e+16,1.795389e+16,1.802626e+16,1.840129e+16
Positive,4416.0,1.799213e+16,117728700000000.0,1.784217e+16,1.791955e+16,1.795206e+16,1.801864e+16,1.840678e+16


### Data Cleaning and Data Transformation
- Remove NAN Values
- Removing Irrelevant class comments
- Remove hashtags and mentions
- Removing too long and too short comments
- Removing non arabic and non english comment from the dataset

- Remove NAN Values

In [None]:
input_data=input_data.dropna()
input_data.isna().value_counts()

campaign_id  comment_id  comment_description  sentiment
False        False       False                False        5497
dtype: int64

- Remove sentiments with irrelevant class because they negligeable

In [None]:
irrelevant_indexes=input_data[input_data['sentiment']=='Irrelevant'].index
input_data.drop(irrelevant_indexes,inplace=True)

- Remove mentions and hashtags and numbers
- Remove repetitif letters, charachters and imojis

In [None]:
# remove mentions and hashtags and numbers
import re
def clean_comments(text):
    words= text.split()
    cleaned_text=""
    for word in words:
        if word.startswith("#"):   # remove hashtags
            cleaned_text=cleaned_text + ""
        elif word.startswith("@"):  # remove mentions
            cleaned_text=cleaned_text + ""
        else:
            cleaned_text=cleaned_text +" "+ word
    cleaned_text = re.sub("\d+", " ", cleaned_text) # remove number
    cleaned_text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', cleaned_text)  # remove punctuation
    cleaned_text = re.sub('\s+', ' ', cleaned_text) # remove extra space
    return cleaned_text

# remove repetitif letters and repetitif imojis
def remove_repetion(text):
    i=2
    text_lenght=len(text)
    while i < text_lenght:
        # compare every character with two characters before
        if text[i]==text[i-1] and text[i] == text[i-2]:
            text = text[:i]+text[i+1:]
            text_lenght=text_lenght-1
        else:
            i=i+1
    return text

In [None]:
input_data['comment_description']=input_data['comment_description'].apply(clean_comments)
input_data['comment_description']=input_data['comment_description'].apply(remove_repetion)

- Removing Comments that are too long or too short

In [None]:
# Setting a limit lenght of min=3 and max=200 characters for comments
long_indexes= input_data[input_data['comment_description'].str.len()>=200].index
short_indexes= input_data[input_data['comment_description'].str.len()<=3].index
input_data.drop(long_indexes,inplace=True)
input_data.drop(short_indexes,inplace=True)
print(input_data.dtypes)
input_data.groupby('sentiment').describe()

campaign_id            object
comment_id              int64
comment_description    object
sentiment              object
dtype: object


Unnamed: 0_level_0,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Negative,991.0,1.799642e+16,122436900000000.0,1.78432e+16,1.792139e+16,1.79538e+16,1.802712e+16,1.840129e+16
Positive,3455.0,1.799235e+16,117588000000000.0,1.784217e+16,1.791992e+16,1.795201e+16,1.801803e+16,1.840678e+16


- Remove non english and non arabic comments

In [None]:
i=0
while i <= len(input_data):
    try:
        lang= detect(input_data.iloc[i,2])
        if lang !='ar' and lang !='eng':
            input_data.drop(i,inplace=True)
            i=i+1
        else:
            i=i+1
            continue
    except:
        i=i+1
        continue
input_data.groupby('sentiment').describe()

Unnamed: 0_level_0,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Negative,808.0,1.799668e+16,122791000000000.0,1.78432e+16,1.792172e+16,1.795377e+16,1.802513e+16,1.840129e+16
Positive,2785.0,1.799179e+16,117111500000000.0,1.784217e+16,1.791989e+16,1.795097e+16,1.801752e+16,1.840678e+16


### Preparing data as model inputs
- Classe balancing
- Splitting dataframe into comments and sentiments
- Encoding sentiment labels into numerical values (negative=0, positive=1)
- Tokenizing comments

In [None]:
# blance data
negative_data_lenght=len(input_data[input_data['sentiment']=='Negative'])
positive_data= input_data[input_data['sentiment']=='Positive'][:negative_data_lenght]
negative_data= input_data[input_data['sentiment']=='Negative']

# test/train split
split_percentage= int(0.85*len(positive_data))

balanced_train_data=pd.concat([positive_data[:split_percentage],negative_data[:split_percentage]])
balanced_test_data=pd.concat([positive_data[split_percentage:],negative_data[split_percentage:]])

val_split_percentage= int(0.1*len(balanced_train_data))
balanced_validation_data = pd.concat([positive_data[:val_split_percentage],negative_data[:val_split_percentage]])
balanced_train_data=pd.concat([positive_data[val_split_percentage:],negative_data[val_split_percentage:]])

balanced_train_data=balanced_train_data.sample(frac=1)
balanced_validation_data=balanced_validation_data.sample(frac=1)
balanced_test_data=balanced_test_data.sample(frac=1)

len(negative_data), len(positive_data),len(balanced_train_data),len(balanced_test_data)

(808, 808, 1342, 244)

- Splitting comment and sentiments and encoding sentiment labels

In [None]:
train_comments = balanced_train_data['comment_description'].astype(str).values
validation_comments= balanced_validation_data['comment_description'].astype(str).values
test_comments = balanced_test_data['comment_description'].astype(str).values

#changing sentiment labels from (negativ, positive) to (0,1)
train_labels=balanced_train_data['sentiment'].astype(str)
validation_labels= balanced_validation_data['sentiment'].astype(str)
train_labels,train_uniques=pd.factorize(train_labels.values)
validation_labels,validation_uniques=pd.factorize(validation_labels.values)

test_labels=balanced_test_data['sentiment'].astype(str)
test_labels,test_uniques=pd.factorize(test_labels.values)


- Tokenizing the data

In [None]:
# Colab path
preprocessor_path="https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/multi-cased-preprocess/versions/3"

# Local machine path
#preprocessor_path="./Bert_model/bert_multi_cased_preprocessor"

def tokenize_data(preprocessor_path,train_comments):
    # loading Tokenizer model
    preprocessor = hub.load(preprocessor_path)

    # Tokenize text inputs.
    text_premises = tf.constant(train_comments)
    tokenized_premises = preprocessor.tokenize(text_premises)
    seq_length = 64
    # convert to word_id and adding padding and mask_id
    with tf.device('/CPU:0'):
        encoder_inputs = preprocessor.bert_pack_inputs(
            [tokenized_premises],
            seq_length=seq_length)  # Optional argument.
    return [encoder_inputs['input_word_ids'],encoder_inputs['input_mask'],encoder_inputs['input_type_ids']]
tokenized_comments = tokenize_data(preprocessor_path,train_comments)
validation_tokenized_comments=tokenize_data(preprocessor_path,validation_comments)
tokenized_comments

[<tf.Tensor: shape=(1342, 64), dtype=int32, numpy=
 array([[   101,  10747,  10134, ...,      0,      0,      0],
        [   101,    100,  59901, ...,      0,      0,      0],
        [   101,  60844, 104317, ...,      0,      0,      0],
        ...,
        [   101,    766,  32219, ...,      0,      0,      0],
        [   101,  37282,  46267, ...,      0,      0,      0],
        [   101,  14269,  26894, ...,      0,      0,      0]], dtype=int32)>,
 <tf.Tensor: shape=(1342, 64), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>,
 <tf.Tensor: shape=(1342, 64), dtype=int32, numpy=
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], d

# Fine tune bert model
- Model Building
- Model compile
- Model training
- Model Evaluation
- Model Saving
- Inference


### Loading and building Bert Model

In [None]:
# Colab path for bert model
model_path= "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/multi-cased-l-12-h-768-a-12/versions/4"

# Local machine path for bert path
#model_path= "./Bert_model/bert_cased"

def define_model(model_path):
    # Input of the model (tokenized data)
    input_word_ids = Input(shape=(None,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(None,), dtype=tf.int32, name="input_mask")
    input_type_ids = Input(shape=(None,), dtype=tf.int32, name="input_type_ids")
    # Loading the bert model as Keras layer using tensorflow_hub
    bert_layer = hub.KerasLayer(model_path, trainable=True,name='Bert_encoder')
    bert_outputs = bert_layer({'input_word_ids':input_word_ids,'input_mask':input_mask,'input_type_ids':input_type_ids})

    # Output layer that contains sigmoid activation to turn outputs values between 0 and 1
    output_layer = Dense(1, activation='sigmoid',name='classifier')(bert_outputs["pooled_output"])  # Binary classification, change units for multi-class
    model = Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output_layer)
    return model

### Model compiling and Hyperparameters congiguration

In [None]:
opt= tf.keras.optimizers.Adam(learning_rate=0.00005)

with strategy.scope():
  bert_classifier=define_model(model_path)
  bert_classifier.compile(optimizer=opt,loss='binary_crossentropy', metrics=["binary_accuracy"])

### Training the model

In [None]:
# model fit on the data
epochs=15
batch_size = 32

history = bert_classifier.fit(
    x={'input_word_ids':tokenized_comments[0],'input_mask':tokenized_comments[1],'input_type_ids':tokenized_comments[2]},
    y=train_labels,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=({'input_word_ids': validation_tokenized_comments[0],
         'input_mask': validation_tokenized_comments[1],
         'input_type_ids': validation_tokenized_comments[2]},
        validation_labels)
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


### Model Evaluation

In [None]:
test_tokenized_comments = tokenize_data(preprocessor_path,test_comments)

bert_classifier.evaluate({'input_word_ids':test_tokenized_comments[0],'input_mask':test_tokenized_comments[1],'input_type_ids':test_tokenized_comments[2]},test_labels)



[0.07290926575660706, 0.9795082211494446]

### Export the model

In [None]:
save_options = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
bert_classifier.save('./classifier2/', include_optimizer=False,
                      options=save_options)

# local machine saving
#bert_classifier.save('../backend/')

### Model inference

- CSV File prediciton

In [None]:
# process Test input
inference_data['comment_description']=inference_data['comment_description'].apply(clean_comments)
inference_data['comment_description']=inference_data['comment_description'].apply(remove_repetion)

# Tokenizing the text to match model input
inference_tokenized_comments = tokenize_data(preprocessor_path,train_comments)

# predict class
classe_probability=classifier.predict({'input_word_ids':inference_tokenized_comments[0],'input_mask':inference_tokenized_comments[1],'input_type_ids':inference_tokenized_comments[2]})

if classe_probability<=0.5:
    sentiment='positive'
    print(sentiment)
else:
    sentiment='negative'
    print(sentiment)

- Text input inferenec

In [None]:
input_text='write here the text you want to analyse'

# process Test input
input_text=clean_comments()
input_text=remove_repetion()

# Tokenizing the text to match model input
inference_tokenized_comment = tokenize_data(preprocessor_path,input_text)

# predict class
classe_probability=classifier.predict({'input_word_ids':inference_tokenized_comments[0],'input_mask':inference_tokenized_comments[1],'input_type_ids':inference_tokenized_comments[2]})

if classe_probability<=0.5:
    sentiment='positive'
    print(sentiment)
else:
    sentiment='negative'
    print(sentiment)