In [1]:
import os
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten
from transformers import DistilBertTokenizer, TFDistilBertModel, TFBertModel, BertTokenizer, AutoModel, AutoTokenizer

from sklearn.metrics import classification_report

2022-07-19 15:10:09.681994: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-19 15:10:10.709048: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-19 15:10:10.709432: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-19 15:10:10.734884: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

# Loading and preprocessing data

In [2]:
path = '/home/petar/Fakultet/Semester 7/NLP/Datasets/fake_news/'

In [3]:
train = pd.read_csv(path + 'full_train_df.csv', index_col=0)
test = pd.read_csv(path + 'full_test_df.csv', index_col=0)

In [4]:
x_train = train['tweet'].copy()
y_train = train['label'].copy()

x_test = test['tweet'].copy()
y_test = test['label'].copy()

In [15]:
test_input_ids, test_attention_masks, test_outputs = [], [], []

In [9]:
bt = BertTokenizer.from_pretrained('bert-base-uncased')

In [16]:
for sentence, label in zip(x_test, y_test):
    
    sentence_tokens = bt.encode_plus(sentence, max_length=240, padding='max_length', truncation=True)
    
    test_input_ids.append(sentence_tokens['input_ids'])
    test_attention_masks.append(sentence_tokens['attention_mask'])
    test_outputs.append(label)

# Trial

# Predicting on an unsaved integrated model

In [6]:
bt = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [25]:
distilbert_base_uncased = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

input_ids = Input(shape=(100,), name='input_ids', dtype='int32')
att_masks = Input(shape=(100,), name='masked_tokens', dtype='int32')

bert_in = distilbert_base_uncased(input_ids, attention_mask=att_masks)[0]

dense_1 = Dense(8, name='dense_1', activation=tf.keras.layers.LeakyReLU(alpha=.1))(bert_in)

flatten = Flatten()(dense_1)

out = Dense(1, name='output', activation='sigmoid')(flatten)

distilbert_base_model2 = Model(inputs=[input_ids, att_masks], outputs=[out])

distilbert_base_model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'], run_eagerly=True)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_projector', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [26]:
distilbert_base_model2.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 100)]        0           []                               
                                                                                                  
 masked_tokens (InputLayer)     [(None, 100)]        0           []                               
                                                                                                  
 tf_distil_bert_model_4 (TFDist  TFBaseModelOutput(l  66362880   ['input_ids[0][0]',              
 ilBertModel)                   ast_hidden_state=(N               'masked_tokens[0][0]']          
                                one, 100, 768),                                                   
                                 hidden_states=None                                         

In [41]:
tmp_test_input_ids = test_input_ids[:20]
tmp_test_attention_masks = test_attention_masks[:20]

In [26]:
pred = distilbert_base_model2.predict(x=[np.array(tmp_test_input_ids), np.array(tmp_test_attention_masks)])



In [29]:
tmp_test_outputs = test_outputs[:20]

In [31]:
pred = pred.round()

In [37]:
print(classification_report(pred, tmp_test_outputs))

              precision    recall  f1-score   support

         0.0       0.65      1.00      0.79        13
         1.0       0.00      0.00      0.00         7

    accuracy                           0.65        20
   macro avg       0.33      0.50      0.39        20
weighted avg       0.42      0.65      0.51        20



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Predicting on a saved integrated model

In [11]:
bt.save_pretrained('./example_save/')
distilbert_base_model2.save('./example_save/')





INFO:tensorflow:Assets written to: ./example_save/assets


INFO:tensorflow:Assets written to: ./example_save/assets


In [23]:
tf.keras.models.save_model(distilbert_base_model2, './example_save/')

























INFO:tensorflow:Assets written to: ./example_save/assets


INFO:tensorflow:Assets written to: ./example_save/assets


In [31]:
tf.saved_model.save(distilbert_base_model2, './example_save/')

























INFO:tensorflow:Assets written to: ./example_save/assets


INFO:tensorflow:Assets written to: ./example_save/assets


In [37]:
distilbert_base_model2.save('./example_save/distilbert_base_model2.h5')

In [39]:
model = tf.keras.models.load_model('./example_save/distilbert_base_model2.h5', custom_objects={'TFDistilBertModel': TFDistilBertModel})

In [42]:
model.predict(x=[np.array(tmp_test_input_ids), np.array(tmp_test_attention_masks)])



array([[0.32461348],
       [0.25297165],
       [0.35704926],
       [0.26863098],
       [0.3177724 ],
       [0.24089561],
       [0.33566555],
       [0.22709437],
       [0.2577569 ],
       [0.28885096],
       [0.25422773],
       [0.22825019],
       [0.37697196],
       [0.29535285],
       [0.31589705],
       [0.20709507],
       [0.18448254],
       [0.3311483 ],
       [0.27977544],
       [0.21882963]], dtype=float32)

# Modelo mora da e zacuvan kako h5 fajl, i koga ke se loadne, mora da se predatat site klasi od transformers kako custom_object vo dict

# Loading pretrained models

In [5]:
bert = tf.keras.models.load_model('./tmp/all/BERT_base', custom_objects={'TFBertModel': TFBertModel})



In [17]:
bert.predict(x=[np.array(test_input_ids[:100]), np.array(test_attention_masks[:100])])



array([[0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.68253124],
       [0.682