# Fine Tune the AlephBert model for Hebrew Sentence Similarity

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Setting the path for our model and dataset

In [2]:
#path to location where the notebook and the dataset is saved
%cd /content/drive/MyDrive/ML_projects_work/AlephBert

/content/drive/MyDrive/ML_projects_work/AlephBert


## Install the transformers



In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 29.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 62.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 65.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [4]:
!pwd

/content/drive/MyDrive/ML_projects_work/AlephBert


## Importing Dependencies

In [5]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None 
import tensorflow as tf
import transformers
from transformers import logging
logging.set_verbosity_error()
import warnings
warnings.filterwarnings('ignore')

## Exploratory Data Analysis (EDA) and the basic pre-process

In [6]:
max_length = 128  # Maximum length of input sentence to the model.
batch_size = 32
epochs = 5
# Labels in our dataset.
labels = ["negative", "positive"]

In [7]:
train_df = pd.read_csv('train_data.csv')

# =================for validation data ========================
# val_df = pd.read_csv("val data frame name here")

In [8]:
train_df.head(10)
#valid_df.head()

Unnamed: 0,sentence1,sentence2,similarity
0,מה אתה יודע לעשות?,מה הפעולות שאתה יודע לעשות?,positive
1,מה את יודעת לעשות?,מה הפעולות שאתה יודע לעשות?,positive
2,מה הפעולות שאתה יודע לעשות?,מה הפעולות שאתה יודע לעשות?,positive
3,מה הפעולות שאת יודעת לעשות?,מה הפעולות שאתה יודע לעשות?,positive
4,מה אתה יודע לבצע?,מה הפעולות שאתה יודע לעשות?,positive
5,מה את יודעת לבצע?,מה הפעולות שאתה יודע לעשות?,positive
6,מה הפעולות שאתה יודע לבצע?,מה הפעולות שאתה יודע לעשות?,positive
7,מה הפעולות שאת יודעת לבצע?,מה הפעולות שאתה יודע לעשות?,positive
8,איזה פעולות אתה יודע לבצע?,מה הפעולות שאתה יודע לעשות?,positive
9,איזה פעולות את יודעת לבצע?,מה הפעולות שאתה יודע לעשות?,positive


In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4356 entries, 0 to 4355
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   sentence1   4356 non-null   object
 1   sentence2   4356 non-null   object
 2   similarity  4356 non-null   object
dtypes: object(3)
memory usage: 102.2+ KB


In [10]:
print(f"Sentence1: {train_df.loc[1, 'sentence1']}")
print(f"Sentence2: {train_df.loc[1, 'sentence2']}")
print(f"Similarity: {train_df.loc[1, 'similarity']}")

Sentence1: מה את יודעת לעשות?
Sentence2: מה הפעולות שאתה יודע לעשות?
Similarity: positive


In [11]:
print("Number of missing values")
print(train_df.isnull().sum())

Number of missing values
sentence1     0
sentence2     0
similarity    0
dtype: int64


In [12]:
# Distribution of the targets
# For better and optimal results, all the classes of the data must be equally distributed
# For uniform distribution of the data, the model is un-biased and the results are fair
print("Train Target Distribution")
print(train_df.similarity.value_counts())

Train Target Distribution
negative    3906
positive     450
Name: similarity, dtype: int64


In [13]:
# Adding a new columns for the for the labels
# We have two categories -> 0 for the negatives and 1 for the positive
train_df["label"] = train_df["similarity"].apply(
    lambda x: 0 if x == "negative" else 1
)
y_train = tf.keras.utils.to_categorical(train_df.label, num_classes=2)


### For Validation data

In [None]:
####### ============= uncomment below portion for validation data

# val_df["label"] = val_df["similarity"].apply(
#     lambda x: 0 if x == "negative" else 1
# )
# y_val = tf.keras.utils.to_categorical(val_df.label, num_classes=2)

In [14]:
train_df.head()

Unnamed: 0,sentence1,sentence2,similarity,label
0,מה אתה יודע לעשות?,מה הפעולות שאתה יודע לעשות?,positive,1
1,מה את יודעת לעשות?,מה הפעולות שאתה יודע לעשות?,positive,1
2,מה הפעולות שאתה יודע לעשות?,מה הפעולות שאתה יודע לעשות?,positive,1
3,מה הפעולות שאת יודעת לעשות?,מה הפעולות שאתה יודע לעשות?,positive,1
4,מה אתה יודע לבצע?,מה הפעולות שאתה יודע לעשות?,positive,1


### Custom Data Generator

In [15]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of data.

    Args:
        sentence_pairs: Array of premise and hypothesis input sentences.
        labels: Array of labels.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_targets: boolean, whether to incude the labels.

    Returns:
        Tuples `([input_ids, attention_mask, `token_type_ids], labels)`
        (or just `[input_ids, attention_mask, `token_type_ids]`
         if `include_targets=False`)
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets

        # Load our BERT Tokenizer to encode the text.
        # We will use onlplab/alephbert-base pretrained model.
        self.tokenizer = transformers.BertTokenizerFast.from_pretrained('onlplab/alephbert-base')
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

## Build Model

In [16]:
# Create the model under a distribution strategy scope.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Encoded token ids from BERT tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="token_type_ids"
    )
    # Loading pretrained BERT model.
    bert_model = transformers.TFBertModel.from_pretrained('onlplab/alephbert-base')
    # Freeze the BERT model to reuse the pretrained features without modifying them.
    bert_model.trainable = False

    bert_output = bert_model.bert(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )
    sequence_output = bert_output.last_hidden_state
    pooled_output = bert_output.pooler_output
    # Add trainable layers on top of frozen layers to adapt the pretrained features on the new data.
    bi_lstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    )(sequence_output)
    # Applying hybrid pooling approach to bi_lstm sequence output.
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm)
    concat = tf.keras.layers.concatenate([avg_pool, max_pool])
    dropout = tf.keras.layers.Dropout(0.3)(concat)
    output = tf.keras.layers.Dense(2, activation="softmax")(dropout)
    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=output
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="categorical_crossentropy",
        metrics=["acc"],
    )


print(f"Strategy: {strategy}")
model.summary()

Downloading:   0%|          | 0.00/565 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/666M [00:00<?, ?B/s]

Strategy: <tensorflow.python.distribute.mirrored_strategy.MirroredStrategy object at 0x7f699b5fe990>
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_masks (InputLayer)   [(None, 128)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  125976576   ['input_ids[0][0]',        

### Training Data Generator

In [17]:
train_data = BertSemanticDataGenerator(
    train_df[["sentence1", "sentence2"]].values.astype("str"),
    y_train,
    batch_size=batch_size,
    shuffle=True,
)

Downloading:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/545k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

### Validation Data Generator

In [18]:
################## Uncomment this portion for the validation

# val_data = BertSemanticDataGenerator(
#     val_df[["sentence1", "sentence2"]].values.astype("str"),
#     y_val,
#     batch_size=batch_size,
#     shuffle=False,
# )

### Feature extraction:

All the other layers are freezed, only top layers are trained to do the "feature extraction", We can fine-tune the model after this step

In [19]:
##### uncomment the validation data for the validation dataset
history = model.fit(
    train_data,
    # validation_data = val_data,
    epochs=epochs, 
    use_multiprocessing=True,
    workers=-1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Fine Tuning the Aleph-bert Model

In [20]:
# Unfreeze the bert_model.
bert_model.trainable = True
# Recompile the model to make the change effective.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-5),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_masks (InputLayer)   [(None, 128)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  125976576   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_masks[0][0]',    

In [21]:
############# Uncomment the validation data_data for the validation dataset
history = model.fit(
    train_data,
    # validation_data = val_data,
    epochs=epochs,
    use_multiprocessing=True,
    workers=-1,
)

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Inference on Custom Dataset

In [22]:
def check_similarity(sentence1, sentence2):
    sentence_pairs = np.array([[str(sentence1), str(sentence2)]])
    test_data = BertSemanticDataGenerator(
        sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
    )

    proba = model.predict(test_data[0])[0]
    print("All probass",proba)
    idx = np.argmax(proba)
    print(idx)
    proba = f"{proba[idx]: .2f}%"
    pred = labels[idx]
    return pred, proba

In [23]:
# sentences are positive
sentence1 = "מה אתה יודע לעשות?"
sentence2 = "מה הפעולות שאתה יודע לעשות?"
check_similarity(sentence1, sentence2)

All probass [4.093462e-04 9.995907e-01]
1


('positive', ' 1.00%')

In [24]:
# sentences are negative
sentence1 = "איך אתה מרגיש?"
sentence2 = "מה הביצועים שאת תומכת בהן?"
check_similarity(sentence1, sentence2)

All probass [0.99894804 0.00105197]
0


('negative', ' 1.00%')

In [25]:
# sentences are negative
sentence1 = "אני חשה בודדה בעולם"
sentence2 =  "איזה חג אתה אוהב לחגוג?"
check_similarity(sentence1, sentence2)

All probass [9.9999917e-01 8.8519914e-07]
0


('negative', ' 1.00%')

In [26]:
# sentences are positive
sentence1 = "תספרי לי איזו פעולות את יודעת לבצע?"
sentence2 = "מה הפעולות שאתה יודע לעשות?"
check_similarity(sentence1, sentence2)

All probass [3.2900242e-04 9.9967098e-01]
1


('positive', ' 1.00%')

In [27]:
# sentences are negative
sentence1 = "איזה פעולות את יודעת לבצע?"
sentence2 =  "איך אתה מרגיש?"
check_similarity(sentence1, sentence2)

All probass [9.9992132e-01 7.8694575e-05]
0


('negative', ' 1.00%')

## Saving the Trained Model along with the Architecture

In [28]:
model.save('alephbert_finetuned_model')

