In [1]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

import matplotlib.pyplot as plt
import scipy.io
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

from sklearn.manifold import TSNE
import seaborn as sns

import pandas as pd
import xlsxwriter
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

tf.get_logger().setLevel('ERROR')

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [2]:
#read the data into a pandas dataframe
df = pd.read_csv("bert_data_withoutClassWord.csv")
print(df.shape)
df.head(5)

(7115, 3)


Unnamed: 0.1,Unnamed: 0,text,class
0,0,this article is about the herbivorous mammals....,Antelope
1,1,"one new world species, the pronghorn of north ...",Antelope
2,2,"the english word ""animal"" first appeared in 14...",Antelope
3,3,"the word talopus and calopus, from latin, came...",Antelope
4,4,animal are not a cladistic or taxonomically de...,Antelope


In [3]:
#Add the new column which gives a unique number to each of these labels 

j = 0
for i in df['class'].unique():
    df.loc[df['class'] == i, ['class_num']] = j
    j += 1

#checking the results 
df.head(50000)

Unnamed: 0.1,Unnamed: 0,text,class,class_num
0,0,this article is about the herbivorous mammals....,Antelope,0.0
1,1,"one new world species, the pronghorn of north ...",Antelope,0.0
2,2,"the english word ""animal"" first appeared in 14...",Antelope,0.0
3,3,"the word talopus and calopus, from latin, came...",Antelope,0.0
4,4,animal are not a cladistic or taxonomically de...,Antelope,0.0
...,...,...,...,...
7110,7110,technology to use sponges as mouth protection ...,dolphin,49.0
7111,7111,"pesticides, heavy metals, plastics, and other ...",dolphin,49.0
7112,7112,"hundreds of orcas, animals and other members o...",dolphin,49.0
7113,7113,captured orcas and animals are confined to tan...,dolphin,49.0


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.text, 
    df.class_num, 
    test_size=0.1, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df.class_num
)

In [12]:
embbeding_size = 8000
clustering_count = 3
numof_embb = 2
att_size = embbeding_size * clustering_count

In [13]:
acc = 0

while 0.535 > acc:
    
    vectorizer = TfidfVectorizer(max_features = embbeding_size).fit(X_train)
    
    len(vectorizer.get_feature_names_out())
    
    X_train_vectorized = vectorizer.transform(X_train)
    
    clf = RandomForestClassifier()
    clf.fit(X_train_vectorized, y_train)
    
    predictions = clf.predict(vectorizer.transform(X_test))
    acc = accuracy_score(y_test, predictions)
    print("ACC:", accuracy_score(y_test, predictions))

ACC: 0.5140449438202247
ACC: 0.5252808988764045
ACC: 0.5154494382022472
ACC: 0.5042134831460674
ACC: 0.5182584269662921
ACC: 0.5407303370786517


In [14]:
# vectorizer.transform() is our emb creator

all_atts = vectorizer.transform(df.text).toarray()
print(type(all_atts))
print(all_atts[0])
all_labels = df['class']

<class 'numpy.ndarray'>
[0. 0. 0. ... 0. 0. 0.]


In [15]:
all_atts[0].shape

(8000,)

In [16]:
#print(all_atts)
data = pd.DataFrame(all_atts)

data['labels'] = all_labels
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7991,7992,7993,7994,7995,7996,7997,7998,7999,labels
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antelope
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antelope
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antelope
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antelope
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antelope
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dolphin
7111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dolphin
7112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dolphin
7113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dolphin


In [17]:
def att_generator(embbedings, numof_embb=numof_embb, num_clusters=clustering_count):

    #clustering embbedings
    km = KMeans(n_clusters=num_clusters)
    clustering_predictions = km.fit_predict(embbedings)
    
    #convert to dataframe for ease of use
    df = pd.DataFrame(embbedings)
    
    #include the cluster labels
    df["clusters"] = clustering_predictions
    
    #creating the attribute array
    the_attribute = np.array([])
    
    for i in np.unique(clustering_predictions):
        
        cluster_embbedings = df[df["clusters"] == i]
        cluster_embbedings = cluster_embbedings.drop(["clusters"], axis=1) 
        cluster_embbedings_nparray = cluster_embbedings.to_numpy()
        
        cluster_mean = np.mean(cluster_embbedings_nparray, axis=0)
        emb_distances = np.array([])
        
        #calculating each embbeding distance from its cluster mean and selecting the closest one to the mean
        for i in range(cluster_embbedings_nparray.shape[0]):
            
            dist = np.linalg.norm(cluster_mean - cluster_embbedings_nparray[i])
            emb_distances = np.append(emb_distances, dist)
            
            
        
        emb_distances = np.reshape(emb_distances, (-1, 1))
        #print(cluster_embbedings_nparray.shape, emb_distances.shape)
        temp = np.concatenate((cluster_embbedings_nparray, emb_distances), axis=1)
        #print(temp.shape)
        temp = temp[temp[:, -1].argsort()]
        #print(temp[:, -1])
        sorted_embbedings = np.delete(temp, -1, 1)
        #print(sorted_embbedings.shape)
        
        the_attribute = np.concatenate((the_attribute, sorted_embbedings[0]), axis=0)
        print(the_attribute.shape)

    
    return the_attribute

In [18]:
unique_classes = data["labels"].unique()
attribute_array = np.zeros([1, att_size])

for i in unique_classes:
    
    class_embbedings = data.loc[data['labels'] == i]
    class_embbedings = class_embbedings.drop(['labels'], axis=1)    

    myatt = att_generator(class_embbedings)
    myatt = myatt.reshape(1, att_size)
    
    attribute_array = np.append(attribute_array, myatt, axis=0)

attribute_array = np.delete(attribute_array, 0, 0)
attribute_array = attribute_array.transpose()



(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)
(8000,)




(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)
(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)




(8000,)
(16000,)
(24000,)


In [19]:
the_mat_file = scipy.io.loadmat('att_splits_AWA2.mat')
the_mat_file["att"] = attribute_array

print(the_mat_file["att"].shape)

(24000, 50)


In [20]:
scipy.io.savemat('tf-idf_8000_embClust(3,1).mat', the_mat_file)

In [214]:
#data = data.rename(columns={"0": "embbedings"})
#data
data.loc[data['labels'] == 'Antelope']

Unnamed: 0,embbedings,labels
0,"(0, 3815)\t0.2116568711329839\n (0, 3798)\t...",Antelope
1,"(0, 3972)\t0.190382191693978\n (0, 3725)\t0...",Antelope
2,"(0, 3966)\t0.36810011017665584\n (0, 3925)\...",Antelope
3,"(0, 3966)\t0.189929209485221\n (0, 3878)\t0...",Antelope
4,"(0, 3954)\t0.1671060571283198\n (0, 3758)\t...",Antelope
...,...,...
185,"(0, 3796)\t0.15071770983014868\n (0, 3758)\...",Antelope
186,"(0, 3942)\t0.13666177470169633\n (0, 3595)\...",Antelope
187,"(0, 3883)\t0.16809701621305914\n (0, 3595)\...",Antelope
188,"(0, 3798)\t0.26935198636538843\n (0, 3747)\...",Antelope


# obtain att

In [163]:
data = pd.read_csv(r'bert_data_withoutClassWord.csv')

In [164]:
data['class']

0       Antelope
1       Antelope
2       Antelope
3       Antelope
4       Antelope
          ...   
7110     dolphin
7111     dolphin
7112     dolphin
7113     dolphin
7114     dolphin
Name: class, Length: 7115, dtype: object

In [134]:
the_mat_file = scipy.io.loadmat('att_splits_AWA2.mat')
the_mat_file['att'].shape

(85, 50)

# sum/mean

In [21]:
data = pd.DataFrame(all_atts)

data['labels'] = all_labels
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7991,7992,7993,7994,7995,7996,7997,7998,7999,labels
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antelope
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antelope
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antelope
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antelope
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antelope
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dolphin
7111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dolphin
7112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dolphin
7113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dolphin


In [28]:
att_size4sum = embbeding_size

In [33]:
unique_classes = data["labels"].unique()
attribute_array = np.zeros([1, att_size4sum])

for i in unique_classes:
    
    class_embbedings = data.loc[data['labels'] == i]
    class_embbedings = class_embbedings.drop(['labels'], axis=1)    

    myatt = np.sum(class_embbedings, axis=0)
    myatt = myatt.to_numpy()
    myatt = myatt.reshape(1, att_size4sum)
    
    attribute_array = np.append(attribute_array, myatt, axis=0)

attribute_array = np.delete(attribute_array, 0, 0)
attribute_array = attribute_array.transpose()

In [34]:
attribute_array.shape

(8000, 50)

In [35]:
the_mat_file = scipy.io.loadmat('att_splits_AWA2.mat')
the_mat_file["att"] = attribute_array

print(the_mat_file["att"].shape)


scipy.io.savemat('tf-idf_8000_sum.mat', the_mat_file)

(8000, 50)


In [36]:
def NormalizeData(data):
    return (data - np.min(data)) / (np.max(data) - np.min(data))

In [37]:
the_mat_file = scipy.io.loadmat('tf-idf_8000_sum.mat')
the_mat_file["att"] = NormalizeData(the_mat_file["att"])

scipy.io.savemat('tf-idf_8000norm_sum.mat', the_mat_file)

# C3

In [38]:
data = pd.DataFrame(all_atts)

data['labels'] = all_labels
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7991,7992,7993,7994,7995,7996,7997,7998,7999,labels
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antelope
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antelope
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antelope
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antelope
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Antelope
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dolphin
7111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dolphin
7112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dolphin
7113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dolphin


In [41]:
def att_generator(embbedings, num_clusters=3):
    
    #clustering embbedings
    km = KMeans(n_clusters=num_clusters)
    clustering_predictions = km.fit_predict(embbedings)
    
    #convert to dataframe for ease of use
    df = pd.DataFrame(embbedings)
    
    #include the cluster labels
    df["clusters"] = clustering_predictions
    
    #creating the attribute array
    the_attribute = np.array([])
    
    for i in np.unique(clustering_predictions):
        
        cluster_embbedings = df[df["clusters"] == i]
        cluster_embbedings = cluster_embbedings.drop(["clusters"], axis=1)
        cluster_embbedings_nparray = cluster_embbedings.to_numpy()
        
        attribute_part = np.mean(cluster_embbedings_nparray, axis=0)

        the_attribute = np.concatenate((the_attribute, attribute_part))


    return the_attribute

In [42]:
unique_classes = data["labels"].unique()
attribute_array = np.zeros([1, att_size])

for i in unique_classes:
    
    class_embbedings = data.loc[data['labels'] == i]
    class_embbedings = class_embbedings.drop(['labels'], axis=1)    

    myatt = att_generator(class_embbedings)
    myatt = myatt.reshape(1, att_size)
    
    attribute_array = np.append(attribute_array, myatt, axis=0)

attribute_array = np.delete(attribute_array, 0, 0)
attribute_array = attribute_array.transpose()





In [45]:
the_mat_file = scipy.io.loadmat('att_splits_AWA2.mat')
the_mat_file["att"] = attribute_array

print(the_mat_file["att"].shape)


scipy.io.savemat('tf-idf_8000_C3.mat', the_mat_file)

(24000, 50)


In [46]:
the_mat_file = scipy.io.loadmat('tf-idf_8000_C3.mat')
the_mat_file["att"] = NormalizeData(the_mat_file["att"])

scipy.io.savemat('tf-idf_8000norm_C3.mat', the_mat_file)