In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split

sms = pd.read_csv("preprocessed_sms.csv", encoding='latin-1')

sms['label'] = sms['label'].map({'ham': 0, 'spam': 1})

X_train, X_test, y_train, y_test = train_test_split(sms['message'], sms['label'], test_size=0.3, random_state=42)

In [17]:
# Токенізація тексту за допомогою BERT Tokenizer
from transformers import BertTokenizer

# Завантаження токенайзера BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Токенізація тексту
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)

In [21]:
import tensorflow as tf

# Створення тензорів для міток
train_labels = tf.convert_to_tensor(y_train.values)
test_labels = tf.convert_to_tensor(y_test.values)

# Створення TensorFlow Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(len(X_train)).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(16)

In [22]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

# Завантаження моделі BERT для класифікації
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Налаштування оптимізатора
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=5e-5)

# Визначення функції втрат
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Компіляція моделі
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Навчання моделі
history = model.fit(train_dataset, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [24]:
import datetime

timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
model_filename = f'bert_sms_classifier_{timestamp}'

model.save(f'./models/{model_filename}')
model.save(f'./models/{model_filename}.h5')

INFO:tensorflow:Assets written to: ./models/bert_sms_classifier_20240613-001212/assets


INFO:tensorflow:Assets written to: ./models/bert_sms_classifier_20240613-001212/assets
  saving_api.save_model(


NotImplementedError: Saving the model to HDF5 format requires the model to be a Functional model or a Sequential model. It does not work for subclassed models, because such models are defined via the body of a Python method, which isn't safely serializable. Consider saving to the Tensorflow SavedModel format (by setting save_format="tf") or using `save_weights`.

In [26]:
loaded_model = tf.keras.models.load_model(f'./models/{model_filename}')

# Або завантаження моделі у форматі HDF5
# loaded_model = tf.keras.models.load_model(f'./models/{model_filename}.h5')


In [27]:
from sklearn.metrics import accuracy_score

# Прогнозування на тестових даних
predictions = loaded_model.predict(test_dataset)



AttributeError: 'dict' object has no attribute 'logits'

In [29]:
pred_labels = tf.argmax(predictions['logits'], axis=1).numpy()

accuracy = accuracy_score(y_test, pred_labels)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9946172248803827


In [34]:
from sklearn.metrics import classification_report

print(classification_report(y_test, pred_labels, target_names=['ham', 'spam']))

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      1453
        spam       0.99      0.97      0.98       219

    accuracy                           0.99      1672
   macro avg       0.99      0.98      0.99      1672
weighted avg       0.99      0.99      0.99      1672



In [33]:
import gradio as gr

def classify_message(message):
    inputs = tokenizer(message, return_tensors="tf", truncation=True, padding=True, max_length=128)
    test_predictions = loaded_model(inputs)
    test_logits = test_predictions['logits']
    probabilities = tf.nn.softmax(test_logits, axis=1).numpy()
    pred_label = tf.argmax(probabilities, axis=1).numpy()
    # label = 'spam' if pred_label[0] == 1 else 'ham'
    return {'Ham': probabilities[0][0],
            'Spam': probabilities[0][1]}
    # return f"Label: {label}\nHam Probability: {probabilities[0][0]:.4f}\nSpam Probability: {probabilities[0][1]:.4f}"

interface = gr.Interface(
    fn=classify_message,
    inputs=gr.Textbox(lines=2, placeholder="Enter SMS message..."),
    outputs=gr.Textbox(),
    title="SMS Ham/Spam Classifier",
    description="Enter an SMS message to classify it as Ham or Spam."
)

interface.launch(debug=True, inbrowser=True)

Running on local URL:  http://127.0.0.1:7860

Thanks for being a Gradio user! If you have questions or feedback, please join our Discord server and chat with us: https://discord.gg/feTf9x3ZSB

To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.


