In [1]:
# Colab only: mount Google Drive so the project folder under /content/drive is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Work from the project folder: the relative paths used later in this notebook
# ('alephbert_finetuned_model', 'default_sentence_list.csv') resolve against it.
%cd /content/drive/MyDrive/ML_projects_work/AlephBert

/content/drive/MyDrive/ML_projects_work/AlephBert


In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 5.0 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 69.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 47.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [4]:
import numpy as np
import keras
import pandas as pd
pd.options.mode.chained_assignment = None 
import tensorflow as tf
import transformers
from transformers import logging
logging.set_verbosity_error()
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetch the pretrained AlephBERT tokenizer and base TF model.
# `bert_base` is passed to `load_model` as the 'TFBertModel' custom object when
# the fine-tuned model is loaded.
tokenizer = transformers.BertTokenizerFast.from_pretrained('onlplab/alephbert-base')
bert_base = transformers.TFBertModel.from_pretrained('onlplab/alephbert-base')

Downloading:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/545k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/565 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/666M [00:00<?, ?B/s]

In [None]:
# Load the fine-tuned model from the working directory. compile=False loads it
# without compiling, which is enough here because the notebook only calls predict().
model = tf.keras.models.load_model('alephbert_finetuned_model',custom_objects={'TFBertModel':bert_base},compile = False)

In [None]:
# Inference configuration read by BertSemanticDataGenerator and check_similarity.
max_length = 128  # Maximum number of tokens per encoded sentence pair (passed to the tokenizer).
batch_size = 32  # Default batch size of BertSemanticDataGenerator.
# epochs = 2

# Class names, indexed by the argmax of the model's output probabilities.
labels = ["negative", "positive"]

In [None]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of tokenized sentence pairs for the BERT model.

    Args:
        sentence_pairs: NumPy array of ``[sentence1, sentence2]`` rows (it is
            indexed with an integer index array and converted via ``.tolist()``).
        labels: Array of labels aligned with ``sentence_pairs``. Only read when
            ``include_targets`` is True, so it may be None for inference.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_targets: boolean, whether to include the labels.

    Returns:
        Tuples ``([input_ids, attention_mask, token_type_ids], labels)``
        (or just ``[input_ids, attention_mask, token_type_ids]``
        if ``include_targets=False``).
    """

    # Tokenizer shared by all instances. It is loaded on first construction only,
    # because callers such as check_similarity build one generator per sentence
    # pair and reloading the tokenizer each time is needlessly slow.
    _tokenizer = None

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        super().__init__()
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets

        # Load the onlplab/alephbert-base tokenizer once and reuse it afterwards.
        if BertSemanticDataGenerator._tokenizer is None:
            BertSemanticDataGenerator._tokenizer = (
                transformers.BertTokenizerFast.from_pretrained('onlplab/alephbert-base')
            )
        self.tokenizer = BertSemanticDataGenerator._tokenizer

        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Number of batches per epoch. Round up so a trailing partial batch is
        # still served; flooring silently dropped those samples and yielded
        # zero batches whenever there were fewer pairs than batch_size.
        return (len(self.sentence_pairs) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        # Select the rows that belong to batch `idx`.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # Each pair is encoded as one sequence, the two sentences separated by
        # [SEP]. `padding="max_length"` + `truncation=True` replace the
        # deprecated `pad_to_max_length=True`: every row is padded or truncated
        # to exactly `max_length` tokens (module-level setting).
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy arrays.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Targets are attached only when the generator feeds training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        # A fresh RandomState(42) is created on every call, so the same
        # permutation is applied to the current order each time.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

In [None]:
def check_similarity(sentence1, sentence2):
    """Classify a single sentence pair with the fine-tuned model.

    Both inputs are converted with ``str()`` before encoding.

    Returns:
        ``(pred, proba)``: the entry of ``labels`` with the highest predicted
        probability, and the model's output row for the pair.
    """
    pair = np.array([[str(sentence1), str(sentence2)]])
    generator = BertSemanticDataGenerator(
        pair,
        labels=None,
        batch_size=1,
        shuffle=False,
        include_targets=False,
    )
    # The generator holds exactly one batch containing the one pair.
    first_batch = generator[0]
    proba = model.predict(first_batch)[0]
    pred = labels[np.argmax(proba)]
    return pred, proba

In [None]:
# Sanity check on a pair that is expected to be labelled "positive".
sentence1 = "מה אתה יודע לעשות?"
sentence2 = "מה הפעולות שאתה יודע לעשות?"
check_similarity(sentence1, sentence2)

InvalidArgumentError: ignored

In [None]:
# Load the reference sentences from CSV and preview the first rows.
default_list = pd.read_csv('default_sentence_list.csv')
default_list.head()

# Comparing user sentence with the reference sentences

In [None]:
def reference_similarity(user_sentence,threshold,default_list):
  """Find the reference sentence most similar to ``user_sentence``.

  Args:
    user_sentence: Sentence to compare against every reference sentence.
    threshold: Minimum score for the best match to count as similar.
    default_list: Iterable of reference sentences (e.g. a list or a pandas
      Series). It is consumed positionally, so a Series with a non-default
      index works as well.

  Returns:
    Tuple ``(most_similar_sentence, similarity_type, max_val)`` where
    ``max_val`` is the highest score (element 1 of the probabilities returned
    by ``check_similarity``) and ``similarity_type`` is 1 when
    ``max_val >= threshold``, otherwise 0. The best sentence is returned even
    when its score is below the threshold.

  Raises:
    ValueError: If ``default_list`` is empty.
  """
  # Materialise once so indexing below is positional, not label-based.
  references = list(default_list)
  if not references:
    raise ValueError("default_list must contain at least one reference sentence")

  # Score the user sentence against every reference sentence.
  all_score = [
    check_similarity(user_sentence, reference_sentence)[1][1]
    for reference_sentence in references
  ]

  # argmax returns the first maximum. The previous `np.where(list == value)`
  # only worked while the scores happened to be NumPy scalars.
  item_idx = int(np.argmax(all_score))
  max_val = all_score[item_idx]
  most_similar_sentence = references[item_idx]

  # Flag whether the best score reaches the threshold.
  similarity_type = 1 if max_val >= threshold else 0

  return (most_similar_sentence, similarity_type, max_val,)

In [None]:
# Find the reference sentence closest to the user's sentence.
user_sentence = "מה אתה יודע לעשות?"
threshold = 0.8  # minimum score for similarity_type to be 1
default_lst = default_list['default sentence list']  # column holding the reference sentences
most_sim_sentence = reference_similarity(user_sentence, threshold, default_lst)
print(most_sim_sentence)

('מה הפעולות שאתה יודע לעשות?', 1, 0.9995907)


In [None]:
!pip install session_info

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting session_info
  Downloading session_info-1.0.0.tar.gz (24 kB)
Collecting stdlib_list
  Downloading stdlib_list-0.8.0-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 2.3 MB/s 
[?25hBuilding wheels for collected packages: session-info
  Building wheel for session-info (setup.py) ... [?25l[?25hdone
  Created wheel for session-info: filename=session_info-1.0.0-py3-none-any.whl size=8048 sha256=4cf0b64dfbf7af123703f79f98dd45ccdacf0a7508a76fda8c2c60cbf38b0927
  Stored in directory: /root/.cache/pip/wheels/bd/ad/14/6a42359351a18337a8683854cfbba99dd782271f2d1767f87f
Successfully built session-info
Installing collected packages: stdlib-list, session-info
Successfully installed session-info-1.0.0 stdlib-list-0.8.0


In [None]:
# Show session information for this run (presumably the versions of the
# imported packages — confirm against the session_info documentation).
import session_info
session_info.show()

In [25]:
import numpy as np
import keras
import tensorflow as tf
import transformers
import os



# Scoring Script for the model
def init():
    """Load the tokenizer and the fine-tuned model into module-level globals.

    Sets ``tokenizer`` (AlephBERT fast tokenizer) and ``model`` (the Keras model
    stored in 'alephbert_finetuned_model', loaded without compiling).
    """
    global model, tokenizer

    pretrained_name = 'onlplab/alephbert-base'
    tokenizer = transformers.BertTokenizerFast.from_pretrained(pretrained_name)

    # The base model is supplied as the 'TFBertModel' custom object so the
    # saved fine-tuned model can be deserialised.
    bert_base = transformers.TFBertModel.from_pretrained(pretrained_name)
    model = tf.keras.models.load_model(
        'alephbert_finetuned_model',
        custom_objects={'TFBertModel': bert_base},
        compile=False,
    )
    

# Called when a request is received
def run(raw_data):
    """Score one request: find the default sentence most similar to the user's.

    Relies on the module-level ``model`` and ``tokenizer`` set by ``init()``.

    Args:
        raw_data: Indexable pair ``(user_sentence, threshold)``.

    Returns:
        A one-element list ``[[most_similar_sentence, similarity_type, max_val]]``.
        ``max_val`` is the highest "positive" probability over the default
        sentences and ``similarity_type`` is 1 when ``max_val >= threshold``,
        otherwise 0. The best sentence is returned even below the threshold.
    """
    # raw_data = json.loads(raw_data)['data']
    max_length = 128
    labels = ["negative", "positive"]

    def encode_pairs(sentence_pairs):
        # Encode each [sentence1, sentence2] pair as one sequence separated by
        # [SEP], padded/truncated to exactly `max_length` tokens. Uses the
        # tokenizer loaded once by init() instead of reloading it per pair.
        encoded = tokenizer.batch_encode_plus(
            sentence_pairs,
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_tensors="tf",
        )
        return [
            np.array(encoded["input_ids"], dtype="int32"),
            np.array(encoded["attention_mask"], dtype="int32"),
            np.array(encoded["token_type_ids"], dtype="int32"),
        ]

    def check_similarity(sentence1, sentence2):
        # Returns (predicted label, probability row) for a single pair.
        proba = model.predict(encode_pairs([[str(sentence1), str(sentence2)]]))[0]
        return labels[np.argmax(proba)], proba

    def reference_similarity(user_sentence, threshold, default_list):
        # Score the user sentence against every reference sentence
        # (index 1 of the probabilities is the "positive" class).
        # NOTE(review): all pairs could be scored in a single predict() call.
        all_score = [
            check_similarity(user_sentence, reference_sentence)[1][1]
            for reference_sentence in default_list
        ]

        # First maximum wins; the best sentence is reported even if it is
        # below the threshold.
        item_idx = int(np.argmax(all_score))
        max_val = all_score[item_idx]
        similarity_type = 1 if max_val >= threshold else 0

        # Returned unconditionally: previously this return sat inside the
        # threshold branch, so below-threshold requests produced None.
        return [default_list[item_idx], similarity_type, max_val]

    # Reference sentences the user sentence is compared against.
    default_list = [
        'מה הפעולות שאתה יודע לעשות?',
        'איך אתה מרגיש?',
        'האם עשית משהו מעניין היום?',
        'אני מרגיש טוב',
        'בגיל המבוגר חשוב במיוחד לשמור על הבריאות ולאמץ אורח חיים בריא',
        'מה קרה לך?',
        'על מה אתה מדבר?',
        'מה התחביבים שלך?',
        'מה תרצה ללמוד?',
        'איך פוגשים ומכירים חברים חדשים?',
        'מתי ביקרת אצל הרופא?',
        'מה קראת לאחרונה?',
        'תספר לי על המשפחה שלך',
        'במה עבדת?',
        'אילו בעלי חיים אתה אוהב?',
        'מה אתה אוהב לאכול?',
        'איזו מוזיקה אתה אוהב?',
        'איפה טיילת מחוץ לארץ ?',
        'מה זכרונות ילדות שלך?',
        'איזה חג אתה אוהב לחגוג?',
        'איפה אתה מבלה בטבע?',
        'איפה ביקרת במדינה?',
        'איך אתה מסתדר עם מזג האוויר?',
        'מה המשמעות של השם שלך?',
        'מה היא לדעתך הזדקנות מוצלחת?'
    ]

    user_sentence = raw_data[0]
    threshold = raw_data[1]
    most_sim_sentence = reference_similarity(user_sentence, threshold, default_list)

    # NOTE(review): max_val is the raw model output element (a NumPy scalar);
    # convert it (e.g. float(...)) if the caller must JSON-serialise the result.
    return [most_sim_sentence]

In [26]:
# Local smoke test of the scoring script: load the model via init(), then score
# a single (user_sentence, threshold) request with run().
# default_list = pd.read_csv('default_sentence_list.csv')
user_sentence = "מה אתה יודע לעשות?"
threshold = 0.8
# default_lst = default_list['default sentence list']
raw_data = (user_sentence,threshold)
init()
most_sim_sentence = run(raw_data)
print(most_sim_sentence)

[['מה הפעולות שאתה יודע לעשות?', 1, 0.9995907]]


In [18]:
# Reload the reference sentences from CSV and display the table.
default_list = pd.read_csv('default_sentence_list.csv')
default_list

Unnamed: 0,default sentence list
0,מה הפעולות שאתה יודע לעשות?
1,איך אתה מרגיש?
2,האם עשית משהו מעניין היום?
3,אני מרגיש טוב
4,בגיל המבוגר חשוב במיוחד לשמור על הבריאות ולאמץ...
5,מה קרה לך?
6,על מה אתה מדבר?
7,מה התחביבים שלך?
8,מה תרצה ללמוד?
9,איך פוגשים ומכירים חברים חדשים?


In [None]:
import requests
import json

# Request payload: one (user_sentence, threshold) pair, serialised as JSON.
# NOTE(review): `row_data` was never defined in this notebook. The
# {"data": [...]} envelope mirrors the commented-out
# `json.loads(raw_data)['data']` line in run() — confirm it matches what the
# deployed scoring script expects.
row_data = json.dumps({"data": ["מה אתה יודע לעשות?", 0.8]})

# Set the content type in the request headers
# NOTE(review): the endpoint may also require an Authorization header — confirm.
request_headers = { 'Content-Type':'application/json' }

# Call the service (the timeout keeps a hung endpoint from blocking the notebook)
endpoint = 'https://aleph-bert-endpoint.eastus2.inference.ml.azure.com/score'
response = requests.post(url = endpoint,
                         data = row_data,
                         headers = request_headers,
                         timeout = 120)

# Surface HTTP errors explicitly instead of failing later while decoding a
# non-JSON error body.
response.raise_for_status()

# Get the predictions from the JSON response. Some scoring scripts return a
# JSON-encoded string (needing a second decode), others return the object itself.
body = response.json()
predictions = json.loads(body) if isinstance(body, str) else body
print(predictions)

ValueError: ignored