In [3]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

import matplotlib.pyplot as plt
import scipy.io
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

from sklearn.manifold import TSNE
import seaborn as sns

import pandas as pd
import xlsxwriter
import numpy as np

# Only let TensorFlow log messages at ERROR level so the cell outputs stay readable.
tf_logger = tf.get_logger()
tf_logger.setLevel('ERROR')

# Report how many GPUs TensorFlow can see before any model is loaded.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [4]:
# Load the saved Keras model.  The path is assembled with os.path.join instead
# of a single 'models\...' literal: a backslash followed by "m" is an invalid
# escape sequence in a normal string (SyntaxWarning on recent Python versions),
# and a hard-coded backslash separator only resolves on Windows.
loaded_model = tf.keras.models.load_model(
    os.path.join('models', 'model-improvement-34-0.98_noDense768_0.05Val'))

# Sub-model that goes from the "text" input layer straight to the output of the
# "BERT_encoder" layer, so calling it returns the encoder's outputs rather than
# the full model's predictions.
intermediate_layer_model = tf.keras.Model(
    inputs=loaded_model.get_layer("text").input,
    outputs=loaded_model.get_layer("BERT_encoder").output,
)

# Print the layer table as a quick check that the right layers were wired up.
intermediate_layer_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [5]:
# Load the dataset; later cells read its "class" and "text" columns.
# NOTE(review): this is an absolute, machine-specific path — the notebook will not
# run elsewhere until it is pointed at a local copy of the CSV.
data = pd.read_csv(r'C:\Users\MOHSEN-ASUS\JupyterProjects\bert classification\final_dataset_for_bert.csv')

In [6]:
# Show the distinct values of the "class" column, in order of first appearance.
data["class"].unique()

array(['Antelope', 'grizzly+bear', 'killer+whale', 'beaver', 'dalmatian',
       'persian+cat', 'horse', 'german+shepherd', 'blue+whale',
       'siamese+cat', 'skunk', 'mole', 'tiger', 'hippopotamus', 'leopard',
       'moose', 'spider+monkey', 'humpback+whale', 'elephant', 'gorilla',
       'ox', 'fox', 'sheep', 'seal', 'chimpanzee', 'hamster', 'squirrel',
       'rhinoceros', 'rabbit', 'bat', 'giraffe', 'wolf', 'chihuahua',
       'rat', 'weasel', 'otter', 'buffalo', 'zebra', 'giant+panda',
       'deer', 'bobcat', 'pig', 'lion', 'mouse', 'polar+bear', 'collie',
       'Walrus', 'raccoon', 'cow', 'dolphin'], dtype=object)

In [7]:
def get_BERTembbedings(text, model):
    """Run one text through ``model`` and return its pooled embedding.

    Args:
        text: a single string to embed.
        model: callable taking a string tensor and returning a mapping that
            contains a "pooled_output" entry.

    Returns:
        The "pooled_output" entry of the model output for a batch holding only
        ``text`` (presumably shaped (1, embedding_size) — confirm against the model).
    """
    # The model is called on a batch, so the lone string becomes a batch of one.
    single_item_batch = tf.convert_to_tensor([text], dtype=tf.string)
    return model(single_item_batch)["pooled_output"]

In [8]:
def embbedings_labels_generator(dataset, selected_classes, BERT, embbeding_size=768):
    """Embed every text of the selected classes and pair each row with its label.

    Args:
        dataset: DataFrame with a "class" column and a "text" column.
        selected_classes: iterable of values to look up in ``dataset["class"]``.
        BERT: model handed to ``get_BERTembbedings`` for every text.
        embbeding_size: number of columns each embedding row must have.

    Returns:
        A tuple ``(embeddings, labels)``:
        * ``embeddings`` is a float64 array of shape (n_texts, embbeding_size),
          one row per text, in the order the classes and texts were visited;
        * ``labels`` is a 1-D array with the class of each row (empty when no
          text matched).

    Raises:
        ValueError: if an embedding does not have ``embbeding_size`` columns.
    """
    # Rows and labels are collected in plain lists and stacked once at the end;
    # growing a NumPy array with np.append inside the loop copies the whole
    # array on every iteration.
    embedding_rows = []
    labels = []

    for selected_class in selected_classes:
        # all texts that belong to the current class
        class_texts = dataset.loc[dataset["class"] == selected_class, "text"]

        for class_text in class_texts:
            embedding_rows.append(np.asarray(get_BERTembbedings(class_text, BERT)))
            labels.append(selected_class)

    # The leading (0, embbeding_size) float64 block contributes no rows. It keeps
    # the result float64, yields an empty (0, embbeding_size) array when nothing
    # matched, and makes a width mismatch fail loudly.
    empty_head = np.zeros([0, embbeding_size])
    class_texts_embbedings = np.concatenate([empty_head] + embedding_rows, axis=0)

    return class_texts_embbedings, np.array(labels)

In [9]:
# Try the pipeline on a single class first.
class_name = ["Antelope"]

# Embed every "Antelope" text with the encoder sub-model; labels come back alongside.
my_embbedings, my_labels = embbedings_labels_generator(data, class_name, intermediate_layer_model)

In [10]:
# Exploratory clustering of the single-class embeddings into three groups.
# KMeans starts from random centroids, so random_state is pinned to make the
# cluster assignment shown below reproducible across kernel restarts.
km = KMeans(n_clusters=3, random_state=0)
cluster_prediction = km.fit_predict(my_embbedings)
cluster_prediction

array([2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 0, 2,
       2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2,
       2, 2, 2, 1, 2, 2, 0, 0, 2, 2, 0, 2, 2, 1, 0])

In [11]:
def att_generator(embbedings, num_clusters=3, random_state=None):
    """Build one attribute vector from a set of embeddings.

    The embeddings are clustered with KMeans, the cluster holding the most rows
    is selected, and the mean of that cluster's rows is returned.

    Args:
        embbedings: 2-D array-like of shape (n_samples, n_features).
        num_clusters: number of KMeans clusters requested; capped at the number
            of samples, because KMeans cannot fit more clusters than samples.
        random_state: forwarded to ``KMeans`` to make the clustering
            reproducible; ``None`` keeps the library default (unseeded).

    Returns:
        1-D array of length n_features: the mean of the dominant cluster.

    Raises:
        ValueError: if ``embbedings`` is not a non-empty 2-D array.
    """
    embbedings = np.asarray(embbedings)
    if embbedings.ndim != 2 or embbedings.shape[0] == 0:
        raise ValueError("att_generator needs a non-empty 2-D array of embeddings")

    # clustering embeddings -- the function argument, not a notebook-level global
    effective_clusters = min(num_clusters, embbedings.shape[0])
    km = KMeans(n_clusters=effective_clusters, random_state=random_state)
    cluster_prediction = km.fit_predict(embbedings)

    # finding the dominant cluster (ties go to the lowest cluster index)
    dominant_cluster = np.argmax(np.bincount(cluster_prediction))

    # averaging the rows of the dominant cluster to produce the attribute
    return embbedings[cluster_prediction == dominant_cluster].mean(axis=0)

In [12]:
def NormalizeData(data):
    """Min-max scale ``data`` into [0, 1] using its overall minimum and maximum.

    The minimum and maximum are taken over the whole input (no axis is given),
    so every element is scaled with the same offset and range.

    Args:
        data: non-empty numeric array-like.

    Returns:
        ``(data - min) / (max - min)``.  When all values are equal the range is
        zero; an all-zero float array of the same shape is returned instead of
        the NaNs a 0/0 division would produce.
    """
    lowest = np.min(data)
    value_range = np.max(data) - lowest
    shifted = data - lowest

    if value_range == 0:
        # Constant input: avoid dividing zero by zero.
        return np.zeros_like(shifted, dtype=float)

    return shifted / value_range

In [13]:
# Attribute vector for the embeddings currently held in my_embbedings.
myatt= att_generator(my_embbedings)
# Display its shape as a sanity check.
myatt.shape

(768,)

In [14]:
# Scratch check: turn the attribute into a single row and stack it under a
# zero row to confirm that row-wise stacking gives the expected shape.
myatt = np.reshape(myatt, (1, 768))
attribute_array = np.zeros((1, 768))
asghar = np.concatenate((attribute_array, myatt), axis=0)

asghar.shape

(2, 768)

In [15]:
# Build one attribute vector per class, in the order the classes first appear
# in the dataset.
all_classes = data["class"].unique()

# The per-class rows are collected in a list and joined once after the loop;
# appending to a NumPy array on every iteration copies the whole array each time.
class_attribute_rows = []

for current_class in all_classes:

    my_embbedings, my_labels = embbedings_labels_generator(data, [current_class], intermediate_layer_model)
    myatt = att_generator(my_embbedings)
    myatt = myatt.reshape(1, 768)

    class_attribute_rows.append(myatt)

# The leading (0, 768) float64 block adds no rows: it keeps the result float64
# and still yields a valid (empty) array when there are no classes.
attribute_array = np.concatenate([np.zeros([0, 768])] + class_attribute_rows, axis=0)

# Rows are classes at this point; transpose so each column is one class.
attribute_array = attribute_array.transpose()

In [16]:
# Display the shape of the transposed attribute matrix as a sanity check.
attribute_array.shape

(768, 50)

In [17]:
# Load the existing splits file and overwrite only its "att" entry with the
# generated attributes; every other entry read from the file stays as loaded.
awa2_splits_path = 'att_splits_AWA2.mat'
the_mat_file = scipy.io.loadmat(awa2_splits_path)
the_mat_file["att"] = attribute_array

# Confirm the shape of what is now stored under "att".
print(the_mat_file["att"].shape)

(768, 50)


In [18]:
# Write the modified dictionary to a new .mat file.
scipy.io.savemat('myatt_splits768_0.05Val.mat', the_mat_file)

In [19]:
# Reload the file written above, min-max scale its "att" entry and save the
# result under a separate name.  NormalizeData uses the minimum and maximum of
# the whole matrix, so all entries share one scale.
the_mat_file = scipy.io.loadmat('myatt_splits768_0.05Val.mat')
normalized_att = NormalizeData(the_mat_file["att"])
the_mat_file["att"] = normalized_att

scipy.io.savemat('myatt_splits768norm_0.05Val.mat', the_mat_file)