In [None]:
from sklearn.model_selection import train_test_split
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
import tensorflow as tf
import matplotlib.pyplot as plt
from plotly.offline import iplot
from tqdm import tqdm
import seaborn
import pandas as pd
import os

In [None]:
# Print the installed transformers version for reference.
print(transformers.__version__)

In [5]:
# Relative path of the CSV file that is loaded into `df` with pd.read_csv.
root_path = '../bbc-text.csv'

In [None]:
# Load the CSV at `root_path` into a DataFrame and preview its first rows.
df = pd.read_csv(root_path)
df.head()


In [None]:
# (rows, columns) of the loaded DataFrame.
df.shape

In [None]:
# Distinct values of the `category` column; these are the classes that get
# integer-encoded into the label column.
df['category'].unique()

In [None]:
# Convert the category names to pandas categorical codes and store them as the
# integer label column.
# NOTE: despite its name, `encoded_text` holds the encoded *category*, not the text.
category_values = df['category'].astype('category')
df['encoded_text'] = category_values.cat.codes

# Preview the frame with the new label column.
df.head(10)

In [None]:
log_dir = "logs/fit"

# Create the TensorBoard log directory. `exist_ok=True` replaces the separate
# exists-check (no check-then-create race) and keeps the cell safe to re-run.
os.makedirs(log_dir, exist_ok=True)

# Write training/validation logs for TensorBoard under `log_dir`.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
# Stop training when val_loss has not improved for 3 consecutive epochs
# (patience=3) and restore the weights from the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

### Fine-tuning

In [11]:
from transformers import TFDistilBertForSequenceClassification, AutoTokenizer 
import tensorflow as tf

In [None]:
# Load tokenizer and model from a single checkpoint name so the vocabulary and
# the weights can never drift apart.
# ("distilbert-base-uncased" is the alternative checkpoint previously tried here.)
checkpoint = "distilbert-base-multilingual-cased"

# Size the classification head from the data instead of hard-coding 5: one
# output logit per distinct category in `df`.
num_labels = int(df['category'].nunique())

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFDistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

In [None]:
# Print the Keras summary of the loaded model.
model.summary()

In [None]:
# Inspect the model's top-level Keras layers.
model.layers

In [15]:
# Pull the raw texts and their integer labels out of the DataFrame as plain lists.
data_texts = df['text'].tolist()
data_labels = df['encoded_text'].tolist()

# First hold out 20% of all examples for validation ...
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data_texts,
    data_labels,
    test_size=0.2,
    random_state=0,
)
# ... then carve 1% of the remaining training examples off as a small test split.
# Both splits use a fixed random_state so they are reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.01,
    random_state=0,
)

In [16]:
# Define the input pipelines
max_length = 128
batch_size = 32


def _encode(texts):
    """Tokenize `texts` with the notebook's tokenizer.

    Sequences longer than `max_length` tokens are truncated, and the batch is
    padded (padding=True) so the result can be sliced into a tf.data.Dataset.
    """
    return tokenizer(list(texts), truncation=True, padding=True, max_length=max_length)


train_encodings = _encode(train_texts)
val_encodings = _encode(val_texts)
test_encodings = _encode(test_texts)

# Keras ignores fit(shuffle=...) for tf.data inputs, so the training examples
# are shuffled here. The buffer spans the whole split and, by default, is
# reshuffled on every epoch; the seed keeps runs reproducible.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
    .shuffle(buffer_size=len(train_labels), seed=0)
    .batch(batch_size)
)
# Validation and test data are only evaluated, so their order is left as is.
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels)).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels)).batch(batch_size)

In [17]:
# Training configuration: the optimizer, loss and metric handed to model.compile.
learning_rate = 2e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
# from_logits=True: the loss is fed the model's raw logits, not probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Accuracy against integer (sparse) labels.
metric = tf.keras.metrics.SparseCategoricalAccuracy()

In [18]:
# Keras documents `metrics` as a list of metrics, so wrap the single metric
# object; the logged names (`sparse_categorical_accuracy`, ...) are unchanged.
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# Fine-tune for at most 10 epochs, validating after each one; the callbacks log
# to TensorBoard and may end training early.
training_callbacks = [tensorboard_callback, early_stopping]
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=training_callbacks,
)

# Persist the trained model to ./results.
model.save_pretrained("./results")

### Plotting loss and accuracy

In [None]:
# Plot the training and validation curves side by side on one figure.
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 6))
metrics_log = history.history  # per-epoch values recorded by model.fit

# Left panel: loss
loss_ax.plot(metrics_log['loss'], label='Training Loss')
loss_ax.plot(metrics_log['val_loss'], label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend(loc='upper right')
loss_ax.set_title('Training and Validation Loss')

# Right panel: accuracy
accuracy_ax.plot(metrics_log['sparse_categorical_accuracy'], label='Training Accuracy')
accuracy_ax.plot(metrics_log['val_sparse_categorical_accuracy'], label='Validation Accuracy')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend(loc='lower right')
accuracy_ax.set_title('Training and Validation Accuracy')

fig.tight_layout()
plt.show()

In [None]:
# Save the tokenizer into ./results, the same directory the model was saved to,
# so both can be reloaded from one place.
tokenizer.save_pretrained("./results")

In [None]:
# Evaluate on the held-out test split; the result unpacks into the loss
# followed by the compiled accuracy metric.
evaluation = model.evaluate(test_dataset)
test_loss, test_accuracy = evaluation
print(f"Test Loss {test_loss}")
print(f"Test Accuracy {test_accuracy}")

### Loading the fine-tuned model

In [None]:
# Reload the tokenizer and the fine-tuned model from the directory they were
# saved to.
results_dir = "./results"
tokenizer_fine_tuned = AutoTokenizer.from_pretrained(results_dir)
model_fine_tuned = TFDistilBertForSequenceClassification.from_pretrained(results_dir)


In [24]:
# Pick one example (index 5) from the test split to classify by hand.
test_text = test_texts[5]

In [None]:
# Display the selected text.
test_text

In [26]:
# Tokenize the single example with the reloaded tokenizer, truncating to
# `max_length` and returning TensorFlow tensors ready to feed to the model.
encoding = tokenizer_fine_tuned(test_text, truncation=True, padding=True, max_length=max_length, return_tensors='tf')

In [27]:
# Run the reloaded model on the encoded example and keep its raw class scores.
logits = model_fine_tuned(**encoding).logits

In [None]:
# Index of the highest-scoring class for each row of `logits`; only one example
# was encoded, so take the first entry.
class_ids = tf.argmax(logits, axis=-1).numpy()
predicted_class = class_ids[0]
print("predicted_class:", predicted_class)

In [None]:
# Ground-truth label of the same test example (index 5), for comparison with
# the predicted class.
test_labels[5]